diff mbox series

[bpf-next,1/2] bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map

Message ID 20180118230851.1533009-2-yhs@fb.com
State Accepted, archived
Delegated to: BPF Maintainers
Headers show
Series bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map | expand

Commit Message

Yonghong Song Jan. 18, 2018, 11:08 p.m. UTC
Current LPM_TRIE map type does not implement MAP_GET_NEXT_KEY
command. This command is handy when users want to enumerate
keys. Otherwise, a different map which supports key
enumeration may be required to store the keys. If the
map data is sparse and all map data are to be deleted without
closing file descriptor, using MAP_GET_NEXT_KEY to find
all keys is much faster than enumerating all key space.

This patch implements MAP_GET_NEXT_KEY command for LPM_TRIE map.
If user provided key pointer is NULL or the key does not have
an exact match in the trie, the first key will be returned.
Otherwise, the next key will be returned.

In this implemenation, key enumeration follows a postorder
traversal of internal trie. More specific keys
will be returned first than less specific ones, given
a sequence of MAP_GET_NEXT_KEY syscalls.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/lpm_trie.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 2 deletions(-)

Comments

Eric Dumazet Jan. 22, 2018, 7:28 p.m. UTC | #1
On Thu, 2018-01-18 at 15:08 -0800, Yonghong Song wrote:
> Current LPM_TRIE map type does not implement MAP_GET_NEXT_KEY
> command. This command is handy when users want to enumerate
> keys. Otherwise, a different map which supports key
> enumeration may be required to store the keys. If the
> map data is sparse and all map data are to be deleted without
> closing file descriptor, using MAP_GET_NEXT_KEY to find
> all keys is much faster than enumerating all key space.
> 
> This patch implements MAP_GET_NEXT_KEY command for LPM_TRIE map.
> If user provided key pointer is NULL or the key does not have
> an exact match in the trie, the first key will be returned.
> Otherwise, the next key will be returned.
> 
> In this implemenation, key enumeration follows a postorder
> traversal of internal trie. More specific keys
> will be returned first than less specific ones, given
> a sequence of MAP_GET_NEXT_KEY syscalls.
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  kernel/bpf/lpm_trie.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 93 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
> index 584e022..d7ea962 100644
> --- a/kernel/bpf/lpm_trie.c
> +++ b/kernel/bpf/lpm_trie.c
> @@ -591,9 +591,100 @@ static void trie_free(struct bpf_map *map)
>  	raw_spin_unlock(&trie->lock);
>  }
>  
> -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
> +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
>  {
> -	return -ENOTSUPP;
> +	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
> +	struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
> +	struct lpm_trie_node *node, *next_node = NULL, *parent;
> +	struct lpm_trie_node **node_stack = NULL;
> +	struct lpm_trie_node __rcu **root;
> +	int err = 0, stack_ptr = -1;
> +	unsigned int next_bit;
> +	size_t matchlen;
> +
> +	/* The get_next_key follows postorder. For the 4 node example in
> +	 * the top of this file, the trie_get_next_key() returns the following
> +	 * one after another:
> +	 *   192.168.0.0/24
> +	 *   192.168.1.0/24
> +	 *   192.168.128.0/24
> +	 *   192.168.0.0/16
> +	 *
> +	 * The idea is to return more specific keys before less specific ones.
> +	 */
> +
> +	/* Empty trie */
> +	if (!rcu_dereference(trie->root))
> +		return -ENOENT;
> +
> +	/* For invalid key, find the leftmost node in the trie */
> +	if (!key || key->prefixlen > trie->max_prefixlen) {
> +		root = &trie->root;
> +		goto find_leftmost;
> +	}
> +
> +	node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
> +			     GFP_USER | __GFP_NOWARN);

It is illegal to use a blocking kmalloc() while holding RCU.

CONFIG_DEBUG_ATOMIC_SLEEP=y is your friend

> +	if (!node_stack)
> +		return -ENOMEM;
> +
> +	/* Try to find the exact node for the given key */
> +	for (node = rcu_dereference(trie->root); node;) {
> +		node_stack[++stack_ptr] = node;
> +		matchlen = longest_prefix_match(trie, node, key);
> +		if (node->prefixlen != matchlen ||
> +		    node->prefixlen == key->prefixlen)
> +			break;
> +
> +		next_bit = extract_bit(key->data, node->prefixlen);
> +		node = rcu_dereference(node->child[next_bit]);
> +	}
> +	if (!node || node->prefixlen != key->prefixlen ||
> +	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
> +		root = &trie->root;
> +		goto find_leftmost;
> +	}
> +
> +	/* The node with the exactly-matching key has been found,
> +	 * find the first node in postorder after the matched node.
> +	 */
> +	node = node_stack[stack_ptr];
> +	while (stack_ptr > 0) {
> +		parent = node_stack[stack_ptr - 1];
> +		if (rcu_dereference(parent->child[0]) == node &&
> +		    rcu_dereference(parent->child[1])) {
> +			root = &parent->child[1];
> +			goto find_leftmost;
> +		}
> +		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
> +			next_node = parent;
> +			goto do_copy;
> +		}
> +
> +		node = parent;
> +		stack_ptr--;
> +	}
> +
> +	/* did not find anything */
> +	err = -ENOENT;
> +	goto free_stack;
> +
> +find_leftmost:
> +	/* Find the leftmost non-intermediate node, all intermediate nodes
> +	 * have exact two children, so this function will never return NULL.
> +	 */
> +	for (node = rcu_dereference(*root); node;) {
> +		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
> +			next_node = node;
> +		node = rcu_dereference(node->child[0]);
> +	}
> +do_copy:
> +	next_key->prefixlen = next_node->prefixlen;
> +	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
> +	       next_node->data, trie->data_size);
> +free_stack:
> +	kfree(node_stack);
> +	return err;
>  }
>  
>  const struct bpf_map_ops trie_map_ops = {
Yonghong Song Jan. 23, 2018, 12:05 a.m. UTC | #2
On 1/22/18 11:28 AM, Eric Dumazet wrote:
> On Thu, 2018-01-18 at 15:08 -0800, Yonghong Song wrote:
>> Current LPM_TRIE map type does not implement MAP_GET_NEXT_KEY
>> command. This command is handy when users want to enumerate
>> keys. Otherwise, a different map which supports key
>> enumeration may be required to store the keys. If the
>> map data is sparse and all map data are to be deleted without
>> closing file descriptor, using MAP_GET_NEXT_KEY to find
>> all keys is much faster than enumerating all key space.
>>
>> This patch implements MAP_GET_NEXT_KEY command for LPM_TRIE map.
>> If user provided key pointer is NULL or the key does not have
>> an exact match in the trie, the first key will be returned.
>> Otherwise, the next key will be returned.
>>
>> In this implemenation, key enumeration follows a postorder
>> traversal of internal trie. More specific keys
>> will be returned first than less specific ones, given
>> a sequence of MAP_GET_NEXT_KEY syscalls.
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   kernel/bpf/lpm_trie.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 93 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
>> index 584e022..d7ea962 100644
>> --- a/kernel/bpf/lpm_trie.c
>> +++ b/kernel/bpf/lpm_trie.c
>> @@ -591,9 +591,100 @@ static void trie_free(struct bpf_map *map)
>>   	raw_spin_unlock(&trie->lock);
>>   }
>>   
>> -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
>> +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
>>   {
>> -	return -ENOTSUPP;
>> +	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
>> +	struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
>> +	struct lpm_trie_node *node, *next_node = NULL, *parent;
>> +	struct lpm_trie_node **node_stack = NULL;
>> +	struct lpm_trie_node __rcu **root;
>> +	int err = 0, stack_ptr = -1;
>> +	unsigned int next_bit;
>> +	size_t matchlen;
>> +
>> +	/* The get_next_key follows postorder. For the 4 node example in
>> +	 * the top of this file, the trie_get_next_key() returns the following
>> +	 * one after another:
>> +	 *   192.168.0.0/24
>> +	 *   192.168.1.0/24
>> +	 *   192.168.128.0/24
>> +	 *   192.168.0.0/16
>> +	 *
>> +	 * The idea is to return more specific keys before less specific ones.
>> +	 */
>> +
>> +	/* Empty trie */
>> +	if (!rcu_dereference(trie->root))
>> +		return -ENOENT;
>> +
>> +	/* For invalid key, find the leftmost node in the trie */
>> +	if (!key || key->prefixlen > trie->max_prefixlen) {
>> +		root = &trie->root;
>> +		goto find_leftmost;
>> +	}
>> +
>> +	node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
>> +			     GFP_USER | __GFP_NOWARN);
> 
> It is illegal to use a blocking kmalloc() while holding RCU.
> 
> CONFIG_DEBUG_ATOMIC_SLEEP=y is your friend

Thanks Eric for spotting the problem. Will fix the issue soon.

> 
>> +	if (!node_stack)
>> +		return -ENOMEM;
>> +
>> +	/* Try to find the exact node for the given key */
>> +	for (node = rcu_dereference(trie->root); node;) {
>> +		node_stack[++stack_ptr] = node;
>> +		matchlen = longest_prefix_match(trie, node, key);
>> +		if (node->prefixlen != matchlen ||
>> +		    node->prefixlen == key->prefixlen)
>> +			break;
>> +
>> +		next_bit = extract_bit(key->data, node->prefixlen);
>> +		node = rcu_dereference(node->child[next_bit]);
>> +	}
>> +	if (!node || node->prefixlen != key->prefixlen ||
>> +	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
>> +		root = &trie->root;
>> +		goto find_leftmost;
>> +	}
>> +
>> +	/* The node with the exactly-matching key has been found,
>> +	 * find the first node in postorder after the matched node.
>> +	 */
>> +	node = node_stack[stack_ptr];
>> +	while (stack_ptr > 0) {
>> +		parent = node_stack[stack_ptr - 1];
>> +		if (rcu_dereference(parent->child[0]) == node &&
>> +		    rcu_dereference(parent->child[1])) {
>> +			root = &parent->child[1];
>> +			goto find_leftmost;
>> +		}
>> +		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
>> +			next_node = parent;
>> +			goto do_copy;
>> +		}
>> +
>> +		node = parent;
>> +		stack_ptr--;
>> +	}
>> +
>> +	/* did not find anything */
>> +	err = -ENOENT;
>> +	goto free_stack;
>> +
>> +find_leftmost:
>> +	/* Find the leftmost non-intermediate node, all intermediate nodes
>> +	 * have exact two children, so this function will never return NULL.
>> +	 */
>> +	for (node = rcu_dereference(*root); node;) {
>> +		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
>> +			next_node = node;
>> +		node = rcu_dereference(node->child[0]);
>> +	}
>> +do_copy:
>> +	next_key->prefixlen = next_node->prefixlen;
>> +	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
>> +	       next_node->data, trie->data_size);
>> +free_stack:
>> +	kfree(node_stack);
>> +	return err;
>>   }
>>   
>>   const struct bpf_map_ops trie_map_ops = {
Eric Dumazet Jan. 26, 2018, 4:47 a.m. UTC | #3
On Thu, 2018-01-18 at 15:08 -0800, Yonghong Song wrote:

> +find_leftmost:
> +	/* Find the leftmost non-intermediate node, all intermediate nodes
> +	 * have exact two children, so this function will never return NULL.
> +	 */

syzbot [1] disagrees violently with this comment.

> +	for (node = rcu_dereference(*root); node;) {
> +		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
> +			next_node = node;
> +		node = rcu_dereference(node->child[0]);
> +	}
> +do_copy:
> +	next_key->prefixlen = next_node->prefixlen;
> +	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
> +	       next_node->data, trie->data_size);

[1]

syzbot hit the following crash on e9dcd80b9d77a92bfae6ce42a451f5c5fd318832
git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: https://syzkaller-buganizer.googleplex.com/text?tag=Config&id=b2216f04db2aa337e2bbf5ebd233919c3e2aa05f
compiler: gcc (GCC) 7.1.1 20170620


Unfortunately, I don't have any reproducer for this bug yet.
raw crash log: https://syzkaller-buganizer.googleplex.com/text?tag=CrashLog&id=4f78be02e2cd37040b8796322e02b147caae6024
dashboard link: https://syzkaller.appspot.com/bug?extid=148b56534d9269ab7433

See http://go/syzbot for details on how to handle this bug.

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 1 PID: 8033 Comm: syz-executor3 Not tainted 4.15.0-rc8+ #4
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682
RSP: 0018:ffff8801aa44f628 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff81829a9d
RDX: 0000000000000004 RSI: ffffc90003b7b000 RDI: 0000000000000020
RBP: ffff8801aa44f8b0 R08: ffffffff817e8f95 R09: 0000000000000002
R10: ffff8801aa44f790 R11: 0000000000000000 R12: 0000000000000000
R13: 1ffff10035489f01 R14: fffffffffffffff4 R15: 0000000000000000
FS:  00007fbb3b39b700(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000002057a000 CR3: 00000001c26e4005 CR4: 00000000001606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 map_get_next_key kernel/bpf/syscall.c:842 [inline]
 SYSC_bpf kernel/bpf/syscall.c:1881 [inline]
 SyS_bpf+0x11b4/0x4860 kernel/bpf/syscall.c:1846
 entry_SYSCALL_64_fastpath+0x29/0xa0
RIP: 0033:0x452f19
RSP: 002b:00007fbb3b39ac58 EFLAGS: 00000212 ORIG_RAX: 0000000000000141
RAX: ffffffffffffffda RBX: 000000000071bea0 RCX: 0000000000452f19
RDX: 0000000000000018 RSI: 0000000020daf000 RDI: 0000000000000004
RBP: 000000000000003e R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000212 R12: 00000000006ef670
R13: 00000000ffffffff R14: 00007fbb3b39b6d4 R15: 0000000000000000
Code: 19 d3 ff e8 81 98 ed ff 4d 85 e4 0f 85 30 ff ff ff e8 73 98 ed ff 49 8d 7f 20 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e f2 0a 00 00 48 8b b5 98 fd 
RIP: trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682 RSP: ffff8801aa44f628
---[ end trace b4eb675edf4c4059 ]---
Kernel panic - not syncing: Fatal exception
Dumping ftrace buffer:
   (ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..
Yonghong Song Jan. 26, 2018, 4:50 p.m. UTC | #4
On 1/25/18 8:47 PM, Eric Dumazet wrote:
> On Thu, 2018-01-18 at 15:08 -0800, Yonghong Song wrote:
> 
>> +find_leftmost:
>> +	/* Find the leftmost non-intermediate node, all intermediate nodes
>> +	 * have exact two children, so this function will never return NULL.
>> +	 */
> 
> syzbot [1] disagrees violently with this comment.
> 
>> +	for (node = rcu_dereference(*root); node;) {
>> +		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
>> +			next_node = node;
>> +		node = rcu_dereference(node->child[0]);
>> +	}
>> +do_copy:
>> +	next_key->prefixlen = next_node->prefixlen;
>> +	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
>> +	       next_node->data, trie->data_size);

Thanks for reporting the issue. I looked at the update and delete code 
in lpm_trie.c, it looks the comment that each intermediate node has two 
children still holds. But I do find a problem:

616         /* Empty trie */
617         if (!rcu_dereference(trie->root))
618                 return -ENOENT;
619
620         /* For invalid key, find the leftmost node in the trie */
621         if (!key || key->prefixlen > trie->max_prefixlen) {
622                 root = &trie->root;
623                 goto find_leftmost;
624         }
......
672 find_leftmost:
673         /* Find the leftmost non-intermediate node, all intermediate 
nodes
674          * have exact two children, so this function will never 
return NULL.
675          */
676         for (node = rcu_dereference(*root); node;) {
677                 if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
678                         next_node = node;
679                 node = rcu_dereference(node->child[0]);
680         }

It is possible that at line 617, trie->root is not NULL, but
later at line 676 is NULL. This will lead to next_node is NULL.

Will write a test to demonstrate this and examine intermediate node
property and propose a fix soon.

> 
> [1]
> 
> syzbot hit the following crash on e9dcd80b9d77a92bfae6ce42a451f5c5fd318832
> git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> config: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller-2Dbuganizer.googleplex.com_text-3Ftag-3DConfig-26id-3Db2216f04db2aa337e2bbf5ebd233919c3e2aa05f&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=aHeuptD-ytpgZCGZK_koCI1pUdJBACYveZ3RmN35g90&e=
> compiler: gcc (GCC) 7.1.1 20170620
> 
> 
> Unfortunately, I don't have any reproducer for this bug yet.
> raw crash log: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller-2Dbuganizer.googleplex.com_text-3Ftag-3DCrashLog-26id-3D4f78be02e2cd37040b8796322e02b147caae6024&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=RS0jaMd3n5ipeDtuFCL8B0152mZoHBx0b8xNq9AjDes&e=
> dashboard link: https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_bug-3Fextid-3D148b56534d9269ab7433&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=6L_1a0P1zZJmhAN49_cgfs8_LjnBW7h4L-SXEFF4orc&e=
> 
> See https://urldefense.proofpoint.com/v2/url?u=http-3A__go_syzbot&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=i6WobKxbeG3slzHSIOxTVtYIJw7qjCE6S0spDTKL-J4&m=RlBFk8FZhgGNl21aXGyx6UTYgjRdx0JLPJJXuREGo_c&s=iajwDJDDZzMqC0CxPLANoybk0lVIpuh2r7pz0rcD8kI&e= for details on how to handle this bug.
> 
> kasan: CONFIG_KASAN_INLINE enabled
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> general protection fault: 0000 [#1] SMP KASAN
> Dumping ftrace buffer:
>     (ftrace buffer empty)
> Modules linked in:
> CPU: 1 PID: 8033 Comm: syz-executor3 Not tainted 4.15.0-rc8+ #4
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> RIP: 0010:trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682
> RSP: 0018:ffff8801aa44f628 EFLAGS: 00010202
> RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff81829a9d
> RDX: 0000000000000004 RSI: ffffc90003b7b000 RDI: 0000000000000020
> RBP: ffff8801aa44f8b0 R08: ffffffff817e8f95 R09: 0000000000000002
> R10: ffff8801aa44f790 R11: 0000000000000000 R12: 0000000000000000
> R13: 1ffff10035489f01 R14: fffffffffffffff4 R15: 0000000000000000
> FS:  00007fbb3b39b700(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000000002057a000 CR3: 00000001c26e4005 CR4: 00000000001606e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>   map_get_next_key kernel/bpf/syscall.c:842 [inline]
>   SYSC_bpf kernel/bpf/syscall.c:1881 [inline]
>   SyS_bpf+0x11b4/0x4860 kernel/bpf/syscall.c:1846
>   entry_SYSCALL_64_fastpath+0x29/0xa0
> RIP: 0033:0x452f19
> RSP: 002b:00007fbb3b39ac58 EFLAGS: 00000212 ORIG_RAX: 0000000000000141
> RAX: ffffffffffffffda RBX: 000000000071bea0 RCX: 0000000000452f19
> RDX: 0000000000000018 RSI: 0000000020daf000 RDI: 0000000000000004
> RBP: 000000000000003e R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000212 R12: 00000000006ef670
> R13: 00000000ffffffff R14: 00007fbb3b39b6d4 R15: 0000000000000000
> Code: 19 d3 ff e8 81 98 ed ff 4d 85 e4 0f 85 30 ff ff ff e8 73 98 ed ff 49 8d 7f 20 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e f2 0a 00 00 48 8b b5 98 fd
> RIP: trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682 RSP: ffff8801aa44f628
> ---[ end trace b4eb675edf4c4059 ]---
> Kernel panic - not syncing: Fatal exception
> Dumping ftrace buffer:
>     (ftrace buffer empty)
> Kernel Offset: disabled
> Rebooting in 86400 seconds..
>
diff mbox series

Patch

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 584e022..d7ea962 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -591,9 +591,100 @@  static void trie_free(struct bpf_map *map)
 	raw_spin_unlock(&trie->lock);
 }
 
-static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key)
 {
-	return -ENOTSUPP;
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key, *next_key = _next_key;
+	struct lpm_trie_node *node, *next_node = NULL, *parent;
+	struct lpm_trie_node **node_stack = NULL;
+	struct lpm_trie_node __rcu **root;
+	int err = 0, stack_ptr = -1;
+	unsigned int next_bit;
+	size_t matchlen;
+
+	/* The get_next_key follows postorder. For the 4 node example in
+	 * the top of this file, the trie_get_next_key() returns the following
+	 * one after another:
+	 *   192.168.0.0/24
+	 *   192.168.1.0/24
+	 *   192.168.128.0/24
+	 *   192.168.0.0/16
+	 *
+	 * The idea is to return more specific keys before less specific ones.
+	 */
+
+	/* Empty trie */
+	if (!rcu_dereference(trie->root))
+		return -ENOENT;
+
+	/* For invalid key, find the leftmost node in the trie */
+	if (!key || key->prefixlen > trie->max_prefixlen) {
+		root = &trie->root;
+		goto find_leftmost;
+	}
+
+	node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *),
+			     GFP_USER | __GFP_NOWARN);
+	if (!node_stack)
+		return -ENOMEM;
+
+	/* Try to find the exact node for the given key */
+	for (node = rcu_dereference(trie->root); node;) {
+		node_stack[++stack_ptr] = node;
+		matchlen = longest_prefix_match(trie, node, key);
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		node = rcu_dereference(node->child[next_bit]);
+	}
+	if (!node || node->prefixlen != key->prefixlen ||
+	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+		root = &trie->root;
+		goto find_leftmost;
+	}
+
+	/* The node with the exactly-matching key has been found,
+	 * find the first node in postorder after the matched node.
+	 */
+	node = node_stack[stack_ptr];
+	while (stack_ptr > 0) {
+		parent = node_stack[stack_ptr - 1];
+		if (rcu_dereference(parent->child[0]) == node &&
+		    rcu_dereference(parent->child[1])) {
+			root = &parent->child[1];
+			goto find_leftmost;
+		}
+		if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) {
+			next_node = parent;
+			goto do_copy;
+		}
+
+		node = parent;
+		stack_ptr--;
+	}
+
+	/* did not find anything */
+	err = -ENOENT;
+	goto free_stack;
+
+find_leftmost:
+	/* Find the leftmost non-intermediate node, all intermediate nodes
+	 * have exact two children, so this function will never return NULL.
+	 */
+	for (node = rcu_dereference(*root); node;) {
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			next_node = node;
+		node = rcu_dereference(node->child[0]);
+	}
+do_copy:
+	next_key->prefixlen = next_node->prefixlen;
+	memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data),
+	       next_node->data, trie->data_size);
+free_stack:
+	kfree(node_stack);
+	return err;
 }
 
 const struct bpf_map_ops trie_map_ops = {