diff mbox series

[net-next,1/3] bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE

Message ID 20170918193057.37644-2-kraigatgoog@gmail.com
State Accepted, archived
Delegated to: David Miller
Headers show
Series Implement delete for BPF LPM trie | expand

Commit Message

Craig Gallek Sept. 18, 2017, 7:30 p.m. UTC
From: Craig Gallek <kraig@google.com>

This is a simple non-recursive delete operation.  It prunes paths
of empty nodes in the tree, but it does not try to further compress
the tree as nodes are removed.

Signed-off-by: Craig Gallek <kraig@google.com>
---
 kernel/bpf/lpm_trie.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 3 deletions(-)

Comments

Alexei Starovoitov Sept. 18, 2017, 10:53 p.m. UTC | #1
On 9/18/17 12:30 PM, Craig Gallek wrote:
> From: Craig Gallek <kraig@google.com>
>
> This is a simple non-recursive delete operation.  It prunes paths
> of empty nodes in the tree, but it does not try to further compress
> the tree as nodes are removed.
>
> Signed-off-by: Craig Gallek <kraig@google.com>
> ---
>  kernel/bpf/lpm_trie.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 77 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
> index 1b767844a76f..9d58a576b2ae 100644
> --- a/kernel/bpf/lpm_trie.c
> +++ b/kernel/bpf/lpm_trie.c
> @@ -389,10 +389,84 @@ static int trie_update_elem(struct bpf_map *map,
>  	return ret;
>  }
>
> -static int trie_delete_elem(struct bpf_map *map, void *key)
> +/* Called from syscall or from eBPF program */
> +static int trie_delete_elem(struct bpf_map *map, void *_key)
>  {
> -	/* TODO */
> -	return -ENOSYS;
> +	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
> +	struct bpf_lpm_trie_key *key = _key;
> +	struct lpm_trie_node __rcu **trim;
> +	struct lpm_trie_node *node;
> +	unsigned long irq_flags;
> +	unsigned int next_bit;
> +	size_t matchlen = 0;
> +	int ret = 0;
> +
> +	if (key->prefixlen > trie->max_prefixlen)
> +		return -EINVAL;
> +
> +	raw_spin_lock_irqsave(&trie->lock, irq_flags);
> +
> +	/* Walk the tree looking for an exact key/length match and keeping
> +	 * track of where we could begin trimming the tree.  The trim-point
> +	 * is the sub-tree along the walk consisting of only single-child
> +	 * intermediate nodes and ending at a leaf node that we want to
> +	 * remove.
> +	 */
> +	trim = &trie->root;
> +	node = rcu_dereference_protected(
> +		trie->root, lockdep_is_held(&trie->lock));
> +	while (node) {
> +		matchlen = longest_prefix_match(trie, node, key);
> +
> +		if (node->prefixlen != matchlen ||
> +		    node->prefixlen == key->prefixlen)
> +			break;

curious why there is no need to do
'node->prefixlen == trie->max_prefixlen' in the above
like update/lookup do?

> +
> +		next_bit = extract_bit(key->data, node->prefixlen);
> +		/* If we hit a node that has more than one child or is a valid
> +		 * prefix itself, do not remove it. Reset the root of the trim
> +		 * path to its descendant on our path.
> +		 */
> +		if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
> +		    (node->child[0] && node->child[1]))
> +			trim = &node->child[next_bit];
> +		node = rcu_dereference_protected(
> +			node->child[next_bit], lockdep_is_held(&trie->lock));
> +	}
> +
> +	if (!node || node->prefixlen != key->prefixlen ||
> +	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +
> +	trie->n_entries--;
> +
> +	/* If the node we are removing is not a leaf node, simply mark it
> +	 * as intermediate and we are done.
> +	 */
> +	if (rcu_access_pointer(node->child[0]) ||
> +	    rcu_access_pointer(node->child[1])) {
> +		node->flags |= LPM_TREE_NODE_FLAG_IM;
> +		goto out;
> +	}
> +
> +	/* trim should now point to the slot holding the start of a path from
> +	 * zero or more intermediate nodes to our leaf node for deletion.
> +	 */
> +	while ((node = rcu_dereference_protected(
> +			*trim, lockdep_is_held(&trie->lock)))) {
> +		RCU_INIT_POINTER(*trim, NULL);
> +		trim = rcu_access_pointer(node->child[0]) ?
> +			&node->child[0] :
> +			&node->child[1];
> +		kfree_rcu(node, rcu);

can it be that some of the nodes this loop walks have
both child[0] and [1] ?
Craig Gallek Sept. 19, 2017, 3:08 p.m. UTC | #2
On Mon, Sep 18, 2017 at 6:53 PM, Alexei Starovoitov <ast@fb.com> wrote:
Thanks for the review!  Please correct me if I'm wrong...

> On 9/18/17 12:30 PM, Craig Gallek wrote:
>>
>> From: Craig Gallek <kraig@google.com>
>>
>> This is a simple non-recursive delete operation.  It prunes paths
>> of empty nodes in the tree, but it does not try to further compress
>> the tree as nodes are removed.
>>
>> Signed-off-by: Craig Gallek <kraig@google.com>
>> ---
>>  kernel/bpf/lpm_trie.c | 80
>> +++++++++++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 77 insertions(+), 3 deletions(-)
>>
>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
>> index 1b767844a76f..9d58a576b2ae 100644
>> --- a/kernel/bpf/lpm_trie.c
>> +++ b/kernel/bpf/lpm_trie.c
>> @@ -389,10 +389,84 @@ static int trie_update_elem(struct bpf_map *map,
>>         return ret;
>>  }
>>
>> -static int trie_delete_elem(struct bpf_map *map, void *key)
>> +/* Called from syscall or from eBPF program */
>> +static int trie_delete_elem(struct bpf_map *map, void *_key)
>>  {
>> -       /* TODO */
>> -       return -ENOSYS;
>> +       struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
>> +       struct bpf_lpm_trie_key *key = _key;
>> +       struct lpm_trie_node __rcu **trim;
>> +       struct lpm_trie_node *node;
>> +       unsigned long irq_flags;
>> +       unsigned int next_bit;
>> +       size_t matchlen = 0;
>> +       int ret = 0;
>> +
>> +       if (key->prefixlen > trie->max_prefixlen)
>> +               return -EINVAL;
>> +
>> +       raw_spin_lock_irqsave(&trie->lock, irq_flags);
>> +
>> +       /* Walk the tree looking for an exact key/length match and keeping
>> +        * track of where we could begin trimming the tree.  The
>> trim-point
>> +        * is the sub-tree along the walk consisting of only single-child
>> +        * intermediate nodes and ending at a leaf node that we want to
>> +        * remove.
>> +        */
>> +       trim = &trie->root;
>> +       node = rcu_dereference_protected(
>> +               trie->root, lockdep_is_held(&trie->lock));
>> +       while (node) {
>> +               matchlen = longest_prefix_match(trie, node, key);
>> +
>> +               if (node->prefixlen != matchlen ||
>> +                   node->prefixlen == key->prefixlen)
>> +                       break;
>
>
> curious why there is no need to do
> 'node->prefixlen == trie->max_prefixlen' in the above
> like update/lookup do?
I don't believe the node->prefixlen == trie->max_prefixlen check in
trie_update_elem is necessary. In order to get to this third clause,
it implies that the first two clauses evaluated false.  Which happens
when we find an exact prefix match for the current node, but the
to-be-inserted key prefix is different.  If the node we are comparing
against had a prefix of max_prefixlen, it would not be possible to
have both a full prefix match but different prefix lengths.  This
assumes that there are no nodes in the tree with > max_prefixlen
prefixes, but that is handled earlier in the update function.

There's a similar (I believe) unnecessary max_prefixlen check in
trie_lookup_elem.  The function should behave the same way without
that check, but at least in this case it's used as an early-out and
saves a few lines of execution.

>
>> +
>> +               next_bit = extract_bit(key->data, node->prefixlen);
>> +               /* If we hit a node that has more than one child or is a
>> valid
>> +                * prefix itself, do not remove it. Reset the root of the
>> trim
>> +                * path to its descendant on our path.
>> +                */
>> +               if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
>> +                   (node->child[0] && node->child[1]))
>> +                       trim = &node->child[next_bit];
>> +               node = rcu_dereference_protected(
>> +                       node->child[next_bit],
>> lockdep_is_held(&trie->lock));
>> +       }
>> +
>> +       if (!node || node->prefixlen != key->prefixlen ||
>> +           (node->flags & LPM_TREE_NODE_FLAG_IM)) {
>> +               ret = -ENOENT;
>> +               goto out;
>> +       }
>> +
>> +       trie->n_entries--;
>> +
>> +       /* If the node we are removing is not a leaf node, simply mark it
>> +        * as intermediate and we are done.
>> +        */
>> +       if (rcu_access_pointer(node->child[0]) ||
>> +           rcu_access_pointer(node->child[1])) {
>> +               node->flags |= LPM_TREE_NODE_FLAG_IM;
>> +               goto out;
>> +       }
>> +
>> +       /* trim should now point to the slot holding the start of a path
>> from
>> +        * zero or more intermediate nodes to our leaf node for deletion.
>> +        */
>> +       while ((node = rcu_dereference_protected(
>> +                       *trim, lockdep_is_held(&trie->lock)))) {
>> +               RCU_INIT_POINTER(*trim, NULL);
>> +               trim = rcu_access_pointer(node->child[0]) ?
>> +                       &node->child[0] :
>> +                       &node->child[1];
>> +               kfree_rcu(node, rcu);
>
>
> can it be that some of the nodes this loop walks have
> both child[0] and [1] ?
No, the loop above will push trim down the walk every time it
encounters a node with two children.  The only other trim assignment
is the initial trim = &trie->root.  But the only time we would skip
the assignment in the loop is if the node being removed is the root.
If the root had multiple children and is being removed, it would be
handled by the case that turns the node into an intermediate node
rather than walking the trim path freeing things.
Daniel Borkmann Sept. 19, 2017, 4:12 p.m. UTC | #3
On 09/19/2017 05:08 PM, Craig Gallek wrote:
> On Mon, Sep 18, 2017 at 6:53 PM, Alexei Starovoitov <ast@fb.com> wrote:
>> On 9/18/17 12:30 PM, Craig Gallek wrote:
[...]
>>> +
>>> +               next_bit = extract_bit(key->data, node->prefixlen);
>>> +               /* If we hit a node that has more than one child or is a
>>> valid
>>> +                * prefix itself, do not remove it. Reset the root of the
>>> trim
>>> +                * path to its descendant on our path.
>>> +                */
>>> +               if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
>>> +                   (node->child[0] && node->child[1]))
>>> +                       trim = &node->child[next_bit];
>>> +               node = rcu_dereference_protected(
>>> +                       node->child[next_bit],
>>> lockdep_is_held(&trie->lock));
>>> +       }
>>> +
>>> +       if (!node || node->prefixlen != key->prefixlen ||
>>> +           (node->flags & LPM_TREE_NODE_FLAG_IM)) {
>>> +               ret = -ENOENT;
>>> +               goto out;
>>> +       }
>>> +
>>> +       trie->n_entries--;
>>> +
>>> +       /* If the node we are removing is not a leaf node, simply mark it
>>> +        * as intermediate and we are done.
>>> +        */
>>> +       if (rcu_access_pointer(node->child[0]) ||
>>> +           rcu_access_pointer(node->child[1])) {
>>> +               node->flags |= LPM_TREE_NODE_FLAG_IM;
>>> +               goto out;
>>> +       }
>>> +
>>> +       /* trim should now point to the slot holding the start of a path
>>> from
>>> +        * zero or more intermediate nodes to our leaf node for deletion.
>>> +        */
>>> +       while ((node = rcu_dereference_protected(
>>> +                       *trim, lockdep_is_held(&trie->lock)))) {
>>> +               RCU_INIT_POINTER(*trim, NULL);
>>> +               trim = rcu_access_pointer(node->child[0]) ?
>>> +                       &node->child[0] :
>>> +                       &node->child[1];
>>> +               kfree_rcu(node, rcu);
>>
>> can it be that some of the nodes this loop walks have
>> both child[0] and [1] ?
> No, the loop above will push trim down the walk every time it
> encounters a node with two children.  The only other trim assignment
> is the initial trim = &trie->root.  But the only time we would skip
> the assignment in the loop is if the node being removed is the root.
> If the root had multiple children and is being removed, it would be
> handled by the case that turns the node into an intermediate node
> rather than walking the trim path freeing things.

Looks good to me. We should probably still merge nodes once we turn
a real node into an im which just has a single child attached to it;
parent can be im or real node. Thus, we don't need to traverse this
extra one on lookup.

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Daniel Mack Sept. 19, 2017, 7:48 p.m. UTC | #4
Hi,

Thanks for working on this, Craig!

On 09/19/2017 06:12 PM, Daniel Borkmann wrote:
> On 09/19/2017 05:08 PM, Craig Gallek wrote:
>> On Mon, Sep 18, 2017 at 6:53 PM, Alexei Starovoitov <ast@fb.com> wrote:
>>> On 9/18/17 12:30 PM, Craig Gallek wrote:
> [...]
>>>> +
>>>> +               next_bit = extract_bit(key->data, node->prefixlen);
>>>> +               /* If we hit a node that has more than one child or is a
>>>> valid
>>>> +                * prefix itself, do not remove it. Reset the root of the
>>>> trim
>>>> +                * path to its descendant on our path.
>>>> +                */
>>>> +               if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
>>>> +                   (node->child[0] && node->child[1]))
>>>> +                       trim = &node->child[next_bit];
>>>> +               node = rcu_dereference_protected(
>>>> +                       node->child[next_bit],
>>>> lockdep_is_held(&trie->lock));
>>>> +       }
>>>> +
>>>> +       if (!node || node->prefixlen != key->prefixlen ||
>>>> +           (node->flags & LPM_TREE_NODE_FLAG_IM)) {
>>>> +               ret = -ENOENT;
>>>> +               goto out;
>>>> +       }
>>>> +
>>>> +       trie->n_entries--;
>>>> +
>>>> +       /* If the node we are removing is not a leaf node, simply mark it
>>>> +        * as intermediate and we are done.
>>>> +        */
>>>> +       if (rcu_access_pointer(node->child[0]) ||
>>>> +           rcu_access_pointer(node->child[1])) {
>>>> +               node->flags |= LPM_TREE_NODE_FLAG_IM;
>>>> +               goto out;
>>>> +       }
>>>> +
>>>> +       /* trim should now point to the slot holding the start of a path
>>>> from
>>>> +        * zero or more intermediate nodes to our leaf node for deletion.
>>>> +        */
>>>> +       while ((node = rcu_dereference_protected(
>>>> +                       *trim, lockdep_is_held(&trie->lock)))) {
>>>> +               RCU_INIT_POINTER(*trim, NULL);
>>>> +               trim = rcu_access_pointer(node->child[0]) ?
>>>> +                       &node->child[0] :
>>>> +                       &node->child[1];
>>>> +               kfree_rcu(node, rcu);
>>>
>>> can it be that some of the nodes this loop walks have
>>> both child[0] and [1] ?
>> No, the loop above will push trim down the walk every time it
>> encounters a node with two children.  The only other trim assignment
>> is the initial trim = &trie->root.  But the only time we would skip
>> the assignment in the loop is if the node being removed is the root.
>> If the root had multiple children and is being removed, it would be
>> handled by the case that turns the node into an intermediate node
>> rather than walking the trim path freeing things.
> 
> Looks good to me. We should probably still merge nodes once we turn
> a real node into an im which just has a single child attached to it;
> parent can be im or real node. Thus, we don't need to traverse this
> extra one on lookup.

Right, but only if the the parent of the node allows us to do that,
because the 'next bit' in the lookup key has to match the slot index.

To illustrate, consider the following trie with no IM nodes:

                      +----------------+
                      |       (1)  (R) |
                      | 192.168.0.0/16 |
                      |    value: 1    |
                      |   [0]    [1]   |
                      +----------------+
                           |      |
            +----------------+  +------------------+
            |       (2)      |  |        (3)       |
            | 192.168.0.0/23 |  | 192.168.128.0/24 |
            |    value: 2    |  |     value: 3     |
            |   [0]    [1]   |  |    [0]    [1]    |
            +----------------+  +------------------+
                        |
                      +----------------+
                      |       (4)      |
                      | 192.168.1.0/24 |
                      |     value: 4   |
                      |   [0]    [1]   |
                      +----------------+

If you now try to delete (2), the node has to stay around because (3)
and (4) share the same value in bit 17 (1). If, however, (4) had a
prefix of 192.168.0.0/24, then (2) should be removed completely, and (4)
should be directly attached to (1) as child[0].

With this implementation, a situation in which multiple IM nodes appear
in a chain cannot emerge. And that again should make your trimming
algorithm simpler, because you only need to find an exact match, and
then handle three distinct cases:

a) the node as 0 children: simply remove it and nullify the pointer in
the parent

b) the node has 1 child: apply logic I described above

c) the node has 2 children: turn the node into an IM


Makes sense?


Thanks,
Daniel
diff mbox series

Patch

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b767844a76f..9d58a576b2ae 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -389,10 +389,84 @@  static int trie_update_elem(struct bpf_map *map,
 	return ret;
 }
 
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
 {
-	/* TODO */
-	return -ENOSYS;
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key;
+	struct lpm_trie_node __rcu **trim;
+	struct lpm_trie_node *node;
+	unsigned long irq_flags;
+	unsigned int next_bit;
+	size_t matchlen = 0;
+	int ret = 0;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+	/* Walk the tree looking for an exact key/length match and keeping
+	 * track of where we could begin trimming the tree.  The trim-point
+	 * is the sub-tree along the walk consisting of only single-child
+	 * intermediate nodes and ending at a leaf node that we want to
+	 * remove.
+	 */
+	trim = &trie->root;
+	node = rcu_dereference_protected(
+		trie->root, lockdep_is_held(&trie->lock));
+	while (node) {
+		matchlen = longest_prefix_match(trie, node, key);
+
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		/* If we hit a node that has more than one child or is a valid
+		 * prefix itself, do not remove it. Reset the root of the trim
+		 * path to its descendant on our path.
+		 */
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
+		    (node->child[0] && node->child[1]))
+			trim = &node->child[next_bit];
+		node = rcu_dereference_protected(
+			node->child[next_bit], lockdep_is_held(&trie->lock));
+	}
+
+	if (!node || node->prefixlen != key->prefixlen ||
+	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	trie->n_entries--;
+
+	/* If the node we are removing is not a leaf node, simply mark it
+	 * as intermediate and we are done.
+	 */
+	if (rcu_access_pointer(node->child[0]) ||
+	    rcu_access_pointer(node->child[1])) {
+		node->flags |= LPM_TREE_NODE_FLAG_IM;
+		goto out;
+	}
+
+	/* trim should now point to the slot holding the start of a path from
+	 * zero or more intermediate nodes to our leaf node for deletion.
+	 */
+	while ((node = rcu_dereference_protected(
+			*trim, lockdep_is_held(&trie->lock)))) {
+		RCU_INIT_POINTER(*trim, NULL);
+		trim = rcu_access_pointer(node->child[0]) ?
+			&node->child[0] :
+			&node->child[1];
+		kfree_rcu(node, rcu);
+	}
+
+out:
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
 }
 
 #define LPM_DATA_SIZE_MAX	256