
[ovs-dev,v9,08/11] netdev-offload-tc: Introduce group ID management API

Message ID 20201215033812.145975-9-cmi@nvidia.com
State Changes Requested
Series Add offload support for sFlow

Commit Message

Chris Mi Dec. 15, 2020, 3:38 a.m. UTC
When offloading sample action to TC, userspace creates a unique ID
to map sFlow action and tunnel info and passes this ID to kernel instead
of the sFlow info. psample will send this ID and sampled packet to
userspace. Using the ID, userspace can recover the sFlow info and send
sampled packet to the right sFlow monitoring host.

Signed-off-by: Chris Mi <cmi@nvidia.com>
Reviewed-by: Eli Britstein <elibr@nvidia.com>
---
 lib/netdev-offload-tc.c | 260 ++++++++++++++++++++++++++++++++++++++++
 lib/tc.h                |   1 +
 2 files changed, 261 insertions(+)

Comments

Eelco Chaudron Jan. 12, 2021, 7:49 p.m. UTC | #1
On 15 Dec 2020, at 4:38, Chris Mi wrote:

> When offloading sample action to TC, userspace creates a unique ID
> to map sFlow action and tunnel info and passes this ID to kernel 
> instead
> of the sFlow info. psample will send this ID and sampled packet to
> userspace. Using the ID, userspace can recover the sFlow info and send
> sampled packet to the right sFlow monitoring host.
>
> Signed-off-by: Chris Mi <cmi@nvidia.com>
> Reviewed-by: Eli Britstein <elibr@nvidia.com>
> ---
>  lib/netdev-offload-tc.c | 260 
> ++++++++++++++++++++++++++++++++++++++++
>  lib/tc.h                |   1 +
>  2 files changed, 261 insertions(+)
>
> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> index 9c5be9f92..5c3c811a2 100644
> --- a/lib/netdev-offload-tc.c
> +++ b/lib/netdev-offload-tc.c
> @@ -39,6 +39,7 @@
>  #include "unaligned.h"
>  #include "util.h"
>  #include "dpif-provider.h"
> +#include "cmap.h"
>
>  VLOG_DEFINE_THIS_MODULE(netdev_offload_tc);
>
> @@ -57,6 +58,255 @@ struct netlink_field {
>      int size;
>  };
>
> +/* This maps a psample group ID to struct dpif_sflow_attr for sFlow 
> */
> +struct gid_node {
> +    struct ovs_list exp_node OVS_GUARDED;
> +    struct cmap_node metadata_node;
> +    struct cmap_node id_node;
> +    struct ovs_refcount refcount;
> +    uint32_t hash;
> +    uint32_t id;
> +    const struct dpif_sflow_attr sflow;
> +};

All your variables/functions etc. use gid (group ID), but this seems 
a bit general. Maybe you can make it more sample specific, i.e. sgid?

> +
> +static struct ovs_rwlock gid_rwlock = OVS_RWLOCK_INITIALIZER;
> +
> +static long long int gid_last_run OVS_GUARDED_BY(gid_rwlock);
> +
> +static struct cmap gid_map = CMAP_INITIALIZER;
> +static struct cmap gid_metadata_map = CMAP_INITIALIZER;
> +
> +static struct ovs_list gid_expiring OVS_GUARDED_BY(gid_rwlock)
> +    = OVS_LIST_INITIALIZER(&gid_expiring);
> +static struct ovs_list gid_expired OVS_GUARDED_BY(gid_rwlock)
> +    = OVS_LIST_INITIALIZER(&gid_expired);
> +
> +static uint32_t next_group_id OVS_GUARDED_BY(gid_rwlock) = 1;
> +
> +#define GID_RUN_INTERVAL   250     /* msec */
> +
> +static void
> +gid_node_free(struct gid_node *node)
> +{
> +    if (node->sflow.tunnel) {
free() does the null check, so this can just be free(node->sflow.tunnel);

> +        free(node->sflow.tunnel);
> +    }
> +    free(CONST_CAST(void *, node->sflow.sflow));
> +    free(node->sflow.userdata);
> +    free(node);
> +}
> +
> +static void
> +gid_cleanup(void)
> +{
> +    long long int now = time_msec();
> +    struct gid_node *node;
> +
> +    /* Do maintenance at most 4 times / sec. */
> +    ovs_rwlock_rdlock(&gid_rwlock);
> +    if (now - gid_last_run < GID_RUN_INTERVAL) {
> +        ovs_rwlock_unlock(&gid_rwlock);
> +        return;
> +    }
> +    ovs_rwlock_unlock(&gid_rwlock);
> +
> +    ovs_rwlock_wrlock(&gid_rwlock);
> +    gid_last_run = now;
> +
> +    LIST_FOR_EACH_POP (node, exp_node, &gid_expired) {
> +        cmap_remove(&gid_map, &node->id_node, node->id);
> +        ovsrcu_postpone(gid_node_free, node);
> +    }
> +
> +    if (!ovs_list_is_empty(&gid_expiring)) {
> +        /* 'gid_expired' is now empty, move nodes in
> +         * 'gid_expiring' to it. */
> +        ovs_list_splice(&gid_expired,
> +                        ovs_list_front(&gid_expiring),
> +                        &gid_expiring);
> +    }

Trying to understand why we have two lists. Why can't we clean up 
directly from the expiring list?

> +    ovs_rwlock_unlock(&gid_rwlock);
> +}
> +
> +/* Lockless RCU protected lookup.  If node is needed accross RCU 
> quiescent
> + * state, caller should copy the contents. */
> +static const struct gid_node *
> +gid_find(uint32_t id)
> +{
> +    const struct cmap_node *node = cmap_find(&gid_map, id);
> +
> +    return node
> +        ? CONTAINER_OF(node, const struct gid_node, id_node)
> +        : NULL;
> +}
> +
> +static uint32_t
> +dpif_sflow_attr_hash(const struct dpif_sflow_attr *sflow)
> +{
> +    uint32_t hash1 = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 
> 0);
> +    uint32_t hash2;
> +
> +    if (!sflow->tunnel) {
> +        return hash1;
> +    }
> +
> +    hash2 = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, 0);
> +    return hash_add(hash1, hash2);

I think this can be:
     uint32_t hash = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);

     if (sflow->tunnel) {
         hash = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, hash);
     }
     return hash;


> +}
> +
> +static bool
> +dpif_sflow_attr_equal(const struct dpif_sflow_attr *a,
> +                      const struct dpif_sflow_attr *b)
> +{
> +    if (a->sflow->nla_len != b->sflow->nla_len
> +        || memcmp(a->sflow, b->sflow, a->sflow->nla_len)) {
> +        return false;
> +    }
> +    if (!a->tunnel && !b->tunnel) {
> +        return true;
> +    }
> +    if (a->tunnel && b->tunnel) {
> +        return !memcmp(a->tunnel, b->tunnel, sizeof *a->tunnel);
> +    }
> +
> +    return false;
> +}
> +
> +/* Lockless RCU protected lookup.  If node is needed accross RCU 
> quiescent
> + * state, caller should take a reference. */
> +static struct gid_node *
> +gid_find_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> +    struct gid_node *node;
> +
> +    CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash, 
> &gid_metadata_map) {
> +        if (dpif_sflow_attr_equal(&node->sflow, target)) {
> +            return node;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +static struct gid_node *
> +gid_ref_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> +    struct gid_node *node;
> +
> +    do {
> +        node = gid_find_equal(target, hash);
> +        /* Try again if the node was released before we get the 
> reference. */
> +    } while (node && !ovs_refcount_try_ref_rcu(&node->refcount));
> +
> +    return node;
> +}
> +
> +static void
> +dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
> +                      const struct dpif_sflow_attr *old)
> +{
> +    new->sflow = xmalloc(old->sflow->nla_len);
> +    memcpy(CONST_CAST(void *, new->sflow), old->sflow, 
> old->sflow->nla_len);
> +

There is xmemdup() that you could use here, and below.
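
For illustration, a minimal sketch of what the clone could look like with
xmemdup() (field names are taken from the patch above; the exact const
handling may still need a CONST_CAST or two):

    static void
    dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
                          const struct dpif_sflow_attr *old)
    {
        new->sflow = xmemdup(old->sflow, old->sflow->nla_len);
        new->userdata_len = old->userdata_len;
        new->userdata = xmemdup(old->userdata, old->userdata_len);
        new->tunnel = old->tunnel
                      ? xmemdup(old->tunnel, sizeof *old->tunnel)
                      : NULL;
    }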

> +    new->userdata_len = old->userdata_len;
> +    new->userdata = xmalloc(new->userdata_len);
> +    memcpy(new->userdata, old->userdata, new->userdata_len);
> +
> +    if (old->tunnel) {
> +        new->tunnel = xzalloc(sizeof *new->tunnel);
> +        memcpy(new->tunnel, old->tunnel, sizeof *new->tunnel);
> +    } else {
> +        new->tunnel = NULL;
> +    }
> +}
> +
> +/* We use the id as the hash value, which works due to cmap internal 
> rehashing.
> + * We also only insert nodes with unique IDs, so all possible hash 
> collisions
> + * remain internal to the cmap. */
> +static struct gid_node *
> +gid_find__(uint32_t id)
> +    OVS_REQUIRES(gid_rwlock)
> +{
> +    struct cmap_node *node = cmap_find_protected(&gid_map, id);
> +
> +    return node ? CONTAINER_OF(node, struct gid_node, id_node) : 
> NULL;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata.
> + * The ID space is 2^^32, so there should never be a situation in 
> which all
> + * the IDs are used up.  We loop until we find a free one. */

I don't agree that we should assume we will find an entry, because if there 
is a bug we will loop forever. A simple fix would be to check whether we have 
wrapped around and, if so, quit with NULL. What might be even better is to 
have a fixed number of tries (8K, 64K, or something), as I'm wondering how 
long 2^32 - 1 iterations would take ;) Or print some warning message if it 
takes more than x K retries? Just something so that we know we might need to 
re-work this function.
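
For example, a bounded search along those lines might look like this,
replacing the for (;;) loop in gid_alloc__() (SGID_ALLOC_RETRY_MAX is a
made-up limit, purely for illustration):

    #define SGID_ALLOC_RETRY_MAX 8192   /* hypothetical retry limit */

        uint32_t i;

        for (i = 0; i < SGID_ALLOC_RETRY_MAX; i++) {
            node->id = next_group_id++;
            if (OVS_UNLIKELY(!node->id)) {
                next_group_id = 1;
                node->id = next_group_id++;
            }
            /* Stop as soon as we find a free id. */
            if (OVS_LIKELY(!gid_find__(node->id))) {
                break;
            }
        }
        if (i == SGID_ALLOC_RETRY_MAX) {
            /* No free id found; give up instead of looping forever. */
            ovs_rwlock_unlock(&gid_rwlock);
            gid_node_free(node);
            return NULL;
        }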

> +static struct gid_node *
> +gid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
> +{
> +    struct gid_node *node = xzalloc(sizeof *node);
> +
> +    node->hash = hash;
> +    ovs_refcount_init(&node->refcount);
> +    dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *, 
> &node->sflow),
> +                          sflow);
> +
> +    ovs_rwlock_wrlock(&gid_rwlock);
> +    for (;;) {
> +        node->id = next_group_id++;
> +        if (OVS_UNLIKELY(!node->id)) {
> +            next_group_id = 1;
> +            node->id = next_group_id++;
> +        }
> +        /* Find if the id is free. */
> +        if (OVS_LIKELY(!gid_find__(node->id))) {
> +            break;
> +        }
> +    }
> +    cmap_insert(&gid_map, &node->id_node, node->id);
> +    cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
> +    ovs_rwlock_unlock(&gid_rwlock);
> +    return node;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata and
> +   optional actions. */
> +static uint32_t
> +gid_alloc_ctx(const struct dpif_sflow_attr *sflow)
> +{
> +    uint32_t hash = dpif_sflow_attr_hash(sflow);
> +    struct gid_node *node = gid_ref_equal(sflow, hash);
> +
> +    if (!node) {
> +        node = gid_alloc__(sflow, hash);
> +    }

I think gid_alloc__() should be allowed to fail (see above), so check 
for it:
        return node ? node->id : 0;

> +    return node->id;
> +}
> +
> +static void
> +gid_node_unref(const struct gid_node *node_)
> +    OVS_EXCLUDED(gid_rwlock)
> +{
> +    struct gid_node *node = CONST_CAST(struct gid_node *, node_);
> +
> +    ovs_rwlock_wrlock(&gid_rwlock);
> +    if (node && ovs_refcount_unref(&node->refcount) == 1) {
> +        /* Prevent re-use of this node by removing the node from
> +         * gid_metadata_map' */
> +        cmap_remove(&gid_metadata_map, &node->metadata_node, 
> node->hash);
> +        /* We keep the node in the 'gid_map' so that it can be found 
> as
> +         * long as it lingers, and add it to the 'gid_expiring' list. 
> */
> +        ovs_list_insert(&gid_expiring, &node->exp_node);

Why do we do a delayed remove here? Any sFlow packets still in the pipeline 
can simply be dropped when sgid_find() fails.

The only concern might be ID reuse. But I think with 2^32 IDs, and the 
current allocator scheme, this is unlikely to happen.

It would make the code cleaner, and the rwlock could be a normal lock.
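
For what it's worth, a rough sketch of that direct-removal variant (gid_mutex
here is a hypothetical plain mutex replacing gid_rwlock, not something in the
patch):

    static void
    gid_node_unref(const struct gid_node *node_)
        OVS_EXCLUDED(gid_mutex)
    {
        struct gid_node *node = CONST_CAST(struct gid_node *, node_);

        ovs_mutex_lock(&gid_mutex);
        if (node && ovs_refcount_unref(&node->refcount) == 1) {
            /* Remove from both maps right away; RCU keeps the node alive
             * for readers that already hold a pointer to it. */
            cmap_remove(&gid_metadata_map, &node->metadata_node, node->hash);
            cmap_remove(&gid_map, &node->id_node, node->id);
            ovsrcu_postpone(gid_node_free, node);
        }
        ovs_mutex_unlock(&gid_mutex);
    }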

> +    }
> +    ovs_rwlock_unlock(&gid_rwlock);
> +}
> +
> +static void
> +gid_free(uint32_t id)
> +{
> +    const struct gid_node *node;

All callers of gid_free() check for 0 before calling; maybe we can 
include that check here and remove it from the other places. This makes it 
behave like free().
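
Something along these lines, so callers can pass 0 unconditionally, just like
free() accepts NULL:

    static void
    gid_free(uint32_t id)
    {
        const struct gid_node *node;

        if (!id) {
            return;
        }

        node = gid_find(id);
        if (node) {
            gid_node_unref(node);
        } else {
            VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
        }
    }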

> +
> +    node = gid_find(id);
> +    if (node) {
> +        gid_node_unref(node);
> +    } else {
> +        VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
> +    }
> +}
> +
>  static bool
>  is_internal_port(const char *type)
>  {
> @@ -203,6 +453,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id, 
> const ovs_u128 *ufid)
>      if (!err) {
>          del_ufid_tc_mapping(ufid);
>      }
> +    if (id->sflow_group_id) {
> +        gid_free(id->sflow_group_id);
> +    }
>      return err;
>  }
>
> @@ -398,6 +651,8 @@ netdev_tc_flow_dump_create(struct netdev *netdev,
>
>      *dump_out = dump;
>
> +    gid_cleanup();
> +

I guess this can be removed if we can delete the gids directly; if not, 
we might want to put a comment here explaining why this is called from 
this function.

>      return 0;
>  }
>
> @@ -1797,6 +2052,11 @@ netdev_tc_flow_put(struct netdev *netdev, 
> struct match *match,
>              action->type = TC_ACT_GOTO;
>              action->chain = 0;  /* 0 is reserved and not used by 
> recirc. */
>              flower.action_count++;
> +        } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) {
> +            struct dpif_sflow_attr sflow_attr;
> +
> +            memset(&sflow_attr, 0, sizeof sflow_attr);
> +            gid_alloc_ctx(&sflow_attr);

I guess here we should report an error if gid_alloc_ctx() fails:
  if (!gid_alloc_ctx(&sflow_attr)) {
     VLOG_DBG_RL(&rl, "Failed allocating group id for sample action");

  }

>          } else {
>              VLOG_DBG_RL(&rl, "unsupported put action type: %d",
>                          nl_attr_type(nla));
> diff --git a/lib/tc.h b/lib/tc.h
> index 281231c0d..cc2ad025d 100644
> --- a/lib/tc.h
> +++ b/lib/tc.h
> @@ -273,6 +273,7 @@ struct tcf_id {
>      uint32_t chain;
>      uint16_t prio;
>      uint32_t handle;
> +    uint32_t sflow_group_id;
>  };
>
>  static inline struct tcf_id
> -- 
> 2.26.2
Chris Mi Jan. 22, 2021, 3:27 a.m. UTC | #2
On 1/13/2021 3:49 AM, Eelco Chaudron wrote:
>
>
> On 15 Dec 2020, at 4:38, Chris Mi wrote:
>
>> When offloading sample action to TC, userspace creates a unique ID
>> to map sFlow action and tunnel info and passes this ID to kernel instead
>> of the sFlow info. psample will send this ID and sampled packet to
>> userspace. Using the ID, userspace can recover the sFlow info and send
>> sampled packet to the right sFlow monitoring host.
>>
>> Signed-off-by: Chris Mi <cmi@nvidia.com>
>> Reviewed-by: Eli Britstein <elibr@nvidia.com>
>> ---
>>  lib/netdev-offload-tc.c | 260 ++++++++++++++++++++++++++++++++++++++++
>>  lib/tc.h                |   1 +
>>  2 files changed, 261 insertions(+)
>>
>> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
>> index 9c5be9f92..5c3c811a2 100644
>> --- a/lib/netdev-offload-tc.c
>> +++ b/lib/netdev-offload-tc.c
>> @@ -39,6 +39,7 @@
>>  #include "unaligned.h"
>>  #include "util.h"
>>  #include "dpif-provider.h"
>> +#include "cmap.h"
>>
>>  VLOG_DEFINE_THIS_MODULE(netdev_offload_tc);
>>
>> @@ -57,6 +58,255 @@ struct netlink_field {
>>      int size;
>>  };
>>
>> +/* This maps a psample group ID to struct dpif_sflow_attr for sFlow */
>> +struct gid_node {
>> +    struct ovs_list exp_node OVS_GUARDED;
>> +    struct cmap_node metadata_node;
>> +    struct cmap_node id_node;
>> +    struct ovs_refcount refcount;
>> +    uint32_t hash;
>> +    uint32_t id;
>> +    const struct dpif_sflow_attr sflow;
>> +};
>
> All your variables/functions etc. use gid (group ID), but this 
> seems a bit general. Maybe you can make it more sample specific, i.e. 
> sgid?
Done.
>
>> +
>> +static struct ovs_rwlock gid_rwlock = OVS_RWLOCK_INITIALIZER;
>> +
>> +static long long int gid_last_run OVS_GUARDED_BY(gid_rwlock);
>> +
>> +static struct cmap gid_map = CMAP_INITIALIZER;
>> +static struct cmap gid_metadata_map = CMAP_INITIALIZER;
>> +
>> +static struct ovs_list gid_expiring OVS_GUARDED_BY(gid_rwlock)
>> +    = OVS_LIST_INITIALIZER(&gid_expiring);
>> +static struct ovs_list gid_expired OVS_GUARDED_BY(gid_rwlock)
>> +    = OVS_LIST_INITIALIZER(&gid_expired);
>> +
>> +static uint32_t next_group_id OVS_GUARDED_BY(gid_rwlock) = 1;
>> +
>> +#define GID_RUN_INTERVAL   250     /* msec */
>> +
>> +static void
>> +gid_node_free(struct gid_node *node)
>> +{
>> +    if (node->sflow.tunnel) {
> free() does the null check, so this can just be free(node->sflow.tunnel);
Done.
>> +        free(node->sflow.tunnel);
>> +    }
>> +    free(CONST_CAST(void *, node->sflow.sflow));
>> +    free(node->sflow.userdata);
>> +    free(node);
>> +}
>> +
>> +static void
>> +gid_cleanup(void)
>> +{
>> +    long long int now = time_msec();
>> +    struct gid_node *node;
>> +
>> +    /* Do maintenance at most 4 times / sec. */
>> +    ovs_rwlock_rdlock(&gid_rwlock);
>> +    if (now - gid_last_run < GID_RUN_INTERVAL) {
>> +        ovs_rwlock_unlock(&gid_rwlock);
>> +        return;
>> +    }
>> +    ovs_rwlock_unlock(&gid_rwlock);
>> +
>> +    ovs_rwlock_wrlock(&gid_rwlock);
>> +    gid_last_run = now;
>> +
>> +    LIST_FOR_EACH_POP (node, exp_node, &gid_expired) {
>> +        cmap_remove(&gid_map, &node->id_node, node->id);
>> +        ovsrcu_postpone(gid_node_free, node);
>> +    }
>> +
>> +    if (!ovs_list_is_empty(&gid_expiring)) {
>> +        /* 'gid_expired' is now empty, move nodes in
>> +         * 'gid_expiring' to it. */
>> +        ovs_list_splice(&gid_expired,
>> +                        ovs_list_front(&gid_expiring),
>> +                        &gid_expiring);
>> +    }
>
> Trying to understand why we have two lists. Why can't we clean up 
> directly from the expiring list?
The code follows the recirc id management as an example (ofproto-dpif-rid.c).
I'm not 100% sure the two lists are really needed here, but the recirc
id code has existed for a long time. I think it is mature, and I don't
want to re-invent something.

And when the psample thread receives the sampled packets, it needs to
look up the sgid_node based on the group id. But the sgid_node could be
deleted at any time by the revalidator. So I think the two lists resolve
this issue: we have a 250 ms interval to wait for the sampled packets
still in flight to be processed.
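
To make the timing dependency concrete, the consumer side will look roughly
like this (process_sampled_packet() is just an illustrative name; the real
psample handler comes later in this series):

    /* In the psample receive path: */
    const struct gid_node *node = gid_find(group_id);

    if (node) {
        /* The node stays in gid_map for roughly one GID_RUN_INTERVAL after
         * its flow is deleted, which gives in-flight samples a chance to be
         * resolved before the node is freed. */
        process_sampled_packet(&node->sflow, packet);
    }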
>> +    ovs_rwlock_unlock(&gid_rwlock);
>> +}
>> +
>> +/* Lockless RCU protected lookup.  If node is needed accross RCU 
>> quiescent
>> + * state, caller should copy the contents. */
>> +static const struct gid_node *
>> +gid_find(uint32_t id)
>> +{
>> +    const struct cmap_node *node = cmap_find(&gid_map, id);
>> +
>> +    return node
>> +        ? CONTAINER_OF(node, const struct gid_node, id_node)
>> +        : NULL;
>> +}
>> +
>> +static uint32_t
>> +dpif_sflow_attr_hash(const struct dpif_sflow_attr *sflow)
>> +{
>> +    uint32_t hash1 = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 
>> 0);
>> +    uint32_t hash2;
>> +
>> +    if (!sflow->tunnel) {
>> +        return hash1;
>> +    }
>> +
>> +    hash2 = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, 0);
>> +    return hash_add(hash1, hash2);
>
> I think this can be:
>     uint32_t hash = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);
>
>     if (sflow->tunnel) {
>         hash = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, hash);
>     }
>     return hash;
>
Done. Thanks.
>
>> +}
>> +
>> +static bool
>> +dpif_sflow_attr_equal(const struct dpif_sflow_attr *a,
>> +                      const struct dpif_sflow_attr *b)
>> +{
>> +    if (a->sflow->nla_len != b->sflow->nla_len
>> +        || memcmp(a->sflow, b->sflow, a->sflow->nla_len)) {
>> +        return false;
>> +    }
>> +    if (!a->tunnel && !b->tunnel) {
>> +        return true;
>> +    }
>> +    if (a->tunnel && b->tunnel) {
>> +        return !memcmp(a->tunnel, b->tunnel, sizeof *a->tunnel);
>> +    }
>> +
>> +    return false;
>> +}
>> +
>> +/* Lockless RCU protected lookup.  If node is needed accross RCU 
>> quiescent
>> + * state, caller should take a reference. */
>> +static struct gid_node *
>> +gid_find_equal(const struct dpif_sflow_attr *target, uint32_t hash)
>> +{
>> +    struct gid_node *node;
>> +
>> +    CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash, 
>> &gid_metadata_map) {
>> +        if (dpif_sflow_attr_equal(&node->sflow, target)) {
>> +            return node;
>> +        }
>> +    }
>> +    return NULL;
>> +}
>> +
>> +static struct gid_node *
>> +gid_ref_equal(const struct dpif_sflow_attr *target, uint32_t hash)
>> +{
>> +    struct gid_node *node;
>> +
>> +    do {
>> +        node = gid_find_equal(target, hash);
>> +        /* Try again if the node was released before we get the 
>> reference. */
>> +    } while (node && !ovs_refcount_try_ref_rcu(&node->refcount));
>> +
>> +    return node;
>> +}
>> +
>> +static void
>> +dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
>> +                      const struct dpif_sflow_attr *old)
>> +{
>> +    new->sflow = xmalloc(old->sflow->nla_len);
>> +    memcpy(CONST_CAST(void *, new->sflow), old->sflow, 
>> old->sflow->nla_len);
>> +
>
> There is xmemdup() you could use here, and below
Done. Thanks. The code is greatly simplified.
>
>> +    new->userdata_len = old->userdata_len;
>> +    new->userdata = xmalloc(new->userdata_len);
>> +    memcpy(new->userdata, old->userdata, new->userdata_len);
>> +
>> +    if (old->tunnel) {
>> +        new->tunnel = xzalloc(sizeof *new->tunnel);
>> +        memcpy(new->tunnel, old->tunnel, sizeof *new->tunnel);
>> +    } else {
>> +        new->tunnel = NULL;
>> +    }
>> +}
>> +
>> +/* We use the id as the hash value, which works due to cmap internal 
>> rehashing.
>> + * We also only insert nodes with unique IDs, so all possible hash 
>> collisions
>> + * remain internal to the cmap. */
>> +static struct gid_node *
>> +gid_find__(uint32_t id)
>> +    OVS_REQUIRES(gid_rwlock)
>> +{
>> +    struct cmap_node *node = cmap_find_protected(&gid_map, id);
>> +
>> +    return node ? CONTAINER_OF(node, struct gid_node, id_node) : NULL;
>> +}
>> +
>> +/* Allocate a unique group id for the given set of flow metadata.
>> + * The ID space is 2^^32, so there should never be a situation in 
>> which all
>> + * the IDs are used up.  We loop until we find a free one. */
>
> I don't agree that we should assume we will find an entry, because if there 
> is a bug we will loop forever. A simple fix would be to check whether we 
> have wrapped around and, if so, quit with NULL. What might be even better 
> is to have a fixed number of tries (8K, 64K, or something), as I'm 
> wondering how long 2^32 - 1 iterations would take ;) Or print some warning 
> message if it takes more than x K retries? Just something so that we know 
> we might need to re-work this function.
Done.
>
>> +static struct gid_node *
>> +gid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
>> +{
>> +    struct gid_node *node = xzalloc(sizeof *node);
>> +
>> +    node->hash = hash;
>> +    ovs_refcount_init(&node->refcount);
>> +    dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *, 
>> &node->sflow),
>> +                          sflow);
>> +
>> +    ovs_rwlock_wrlock(&gid_rwlock);
>> +    for (;;) {
>> +        node->id = next_group_id++;
>> +        if (OVS_UNLIKELY(!node->id)) {
>> +            next_group_id = 1;
>> +            node->id = next_group_id++;
>> +        }
>> +        /* Find if the id is free. */
>> +        if (OVS_LIKELY(!gid_find__(node->id))) {
>> +            break;
>> +        }
>> +    }
>> +    cmap_insert(&gid_map, &node->id_node, node->id);
>> +    cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
>> +    ovs_rwlock_unlock(&gid_rwlock);
>> +    return node;
>> +}
>> +
>> +/* Allocate a unique group id for the given set of flow metadata and
>> +   optional actions. */
>> +static uint32_t
>> +gid_alloc_ctx(const struct dpif_sflow_attr *sflow)
>> +{
>> +    uint32_t hash = dpif_sflow_attr_hash(sflow);
>> +    struct gid_node *node = gid_ref_equal(sflow, hash);
>> +
>> +    if (!node) {
>> +        node = gid_alloc__(sflow, hash);
>> +    }
>
> I think gid_alloc__() should be allowed to fail (see above), so check 
> for it:
>        return node ? node->id : 0
Done.
>
>> +    return node->id;
>> +}
>> +
>> +static void
>> +gid_node_unref(const struct gid_node *node_)
>> +    OVS_EXCLUDED(gid_rwlock)
>> +{
>> +    struct gid_node *node = CONST_CAST(struct gid_node *, node_);
>> +
>> +    ovs_rwlock_wrlock(&gid_rwlock);
>> +    if (node && ovs_refcount_unref(&node->refcount) == 1) {
>> +        /* Prevent re-use of this node by removing the node from
>> +         * gid_metadata_map' */
>> +        cmap_remove(&gid_metadata_map, &node->metadata_node, 
>> node->hash);
>> +        /* We keep the node in the 'gid_map' so that it can be found as
>> +         * long as it lingers, and add it to the 'gid_expiring' 
>> list. */
>> +        ovs_list_insert(&gid_expiring, &node->exp_node);
>
> Why do we do a delayed remove here? Any sFlow packets still in the pipeline 
> can simply be dropped when sgid_find() fails.
Please see my comment above about the two-list issue.
> The only concern might be ID reuse. But I think with 2^32 IDs, 
> and the current allocator scheme, this is unlikely to happen.
>
> It would make the code cleaner, and the rwlock can be a normal lock.
>
>> +    }
>> +    ovs_rwlock_unlock(&gid_rwlock);
>> +}
>> +
>> +static void
>> +gid_free(uint32_t id)
>> +{
>> +    const struct gid_node *node;
>
> All callers of gid_free() check for 0 before calling; maybe we can 
> include that check here and remove it from the other places. This makes it 
> behave like free().
Done.
>
>> +
>> +    node = gid_find(id);
>> +    if (node) {
>> +        gid_node_unref(node);
>> +    } else {
>> +        VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
>> +    }
>> +}
>> +
>>  static bool
>>  is_internal_port(const char *type)
>>  {
>> @@ -203,6 +453,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id, 
>> const ovs_u128 *ufid)
>>      if (!err) {
>>          del_ufid_tc_mapping(ufid);
>>      }
>> +    if (id->sflow_group_id) {
>> +        gid_free(id->sflow_group_id);
>> +    }
>>      return err;
>>  }
>>
>> @@ -398,6 +651,8 @@ netdev_tc_flow_dump_create(struct netdev *netdev,
>>
>>      *dump_out = dump;
>>
>> +    gid_cleanup();
>> +
>
> I guess this can be removed if we can delete the gids directly; if 
> not, we might want to put a comment here explaining why this is called 
> from this function.
OK, I'll put a comment here.
>
>>      return 0;
>>  }
>>
>> @@ -1797,6 +2052,11 @@ netdev_tc_flow_put(struct netdev *netdev, 
>> struct match *match,
>>              action->type = TC_ACT_GOTO;
>>              action->chain = 0;  /* 0 is reserved and not used by 
>> recirc. */
>>              flower.action_count++;
>> +        } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) {
>> +            struct dpif_sflow_attr sflow_attr;
>> +
>> +            memset(&sflow_attr, 0, sizeof sflow_attr);
>> +            gid_alloc_ctx(&sflow_attr);
>
> Guess here we should report an error if gid_alloc_ctx() fails.
>  if (!gid_alloc_ctx(&sflow_attr)) {
>     VLOG_DBG_RL(&rl, "Failed allocating group id for sample action");
>
>  }
Will fix it in the last patch.
>
>>          } else {
>>              VLOG_DBG_RL(&rl, "unsupported put action type: %d",
>>                          nl_attr_type(nla));
>> diff --git a/lib/tc.h b/lib/tc.h
>> index 281231c0d..cc2ad025d 100644
>> --- a/lib/tc.h
>> +++ b/lib/tc.h
>> @@ -273,6 +273,7 @@ struct tcf_id {
>>      uint32_t chain;
>>      uint16_t prio;
>>      uint32_t handle;
>> +    uint32_t sflow_group_id;
>>  };
>>
>>  static inline struct tcf_id
>> -- 
>> 2.26.2
>
Eelco Chaudron Jan. 25, 2021, 9:29 a.m. UTC | #3
On 22 Jan 2021, at 4:27, Chris Mi wrote:

> On 1/13/2021 3:49 AM, Eelco Chaudron wrote:
>>
>>
>> On 15 Dec 2020, at 4:38, Chris Mi wrote:

<SNIP>

>>> +
>>> +static void
>>> +gid_cleanup(void)
>>> +{
>>> +    long long int now = time_msec();
>>> +    struct gid_node *node;
>>> +
>>> +    /* Do maintenance at most 4 times / sec. */
>>> +    ovs_rwlock_rdlock(&gid_rwlock);
>>> +    if (now - gid_last_run < GID_RUN_INTERVAL) {
>>> +        ovs_rwlock_unlock(&gid_rwlock);
>>> +        return;
>>> +    }
>>> +    ovs_rwlock_unlock(&gid_rwlock);
>>> +
>>> +    ovs_rwlock_wrlock(&gid_rwlock);
>>> +    gid_last_run = now;
>>> +
>>> +    LIST_FOR_EACH_POP (node, exp_node, &gid_expired) {
>>> +        cmap_remove(&gid_map, &node->id_node, node->id);
>>> +        ovsrcu_postpone(gid_node_free, node);
>>> +    }
>>> +
>>> +    if (!ovs_list_is_empty(&gid_expiring)) {
>>> +        /* 'gid_expired' is now empty, move nodes in
>>> +         * 'gid_expiring' to it. */
>>> +        ovs_list_splice(&gid_expired,
>>> +                        
>>> ovs_list_front(&gid_expiring),
>>> +                        &gid_expiring);
>>> +    }
>>
>> Trying to understand why we have two lists. Why can't we clean up 
>> directly from the expiring list?
> The code follows the recirc id management as an example 
> (ofproto-dpif-rid.c).
> I'm not 100% sure the two lists are really needed here, but the recirc 
> id code has existed for a long time. I think it is mature, and I don't 
> want to re-invent something.
>
> And when the psample thread receives the sampled packets, it needs to 
> look up the sgid_node based on the group id. But the sgid_node could be 
> deleted at any time by the revalidator. So I think the two lists resolve 
> this issue: we have a 250 ms interval to wait for the sampled packets 
> still in flight to be processed.

Just adding code because it exists somewhere else does not make sense to 
me. If you do not have a valid reason, why add all this complexity?

I would like some other reviewers' comments on this; maybe I missed 
something, Ilya?

>>> +    ovs_rwlock_unlock(&gid_rwlock);
>>> +}
>>> +

<SNIP>

>>> +
>>> +static void
>>> +gid_node_unref(const struct gid_node *node_)
>>> +    OVS_EXCLUDED(gid_rwlock)
>>> +{
>>> +    struct gid_node *node = CONST_CAST(struct gid_node *, 
>>> node_);
>>> +
>>> +    ovs_rwlock_wrlock(&gid_rwlock);
>>> +    if (node && ovs_refcount_unref(&node->refcount) == 1) {
>>> +        /* Prevent re-use of this node by removing the node 
>>> from
>>> +         * gid_metadata_map' */
>>> +        cmap_remove(&gid_metadata_map, &node->metadata_node, 
>>> node->hash);
>>> +        /* We keep the node in the 'gid_map' so that it can 
>>> be found as
>>> +         * long as it lingers, and add it to the 
>>> 'gid_expiring' list. */
>>> +        ovs_list_insert(&gid_expiring, &node->exp_node);
>>
>> Why do we do a delayed remove here? Any sFlow packets still in the pipeline 
>> can simply be dropped when sgid_find() fails.
> Please see my comment above about the two-list issue.
>> The only concern might be ID reuse. But I think with 2^32 IDs, 
>> and the current allocator scheme, this is unlikely to happen.
>>
>> It would make the code cleaner, and the rwlock can be a normal lock.
>>
>>> +    }
>>> +    ovs_rwlock_unlock(&gid_rwlock);
>>> +}
>>> +
>>> +static void
>>> +gid_free(uint32_t id)
>>> +{
>>> +    const struct gid_node *node;
>>

<SNIP>

>>>
>>> @@ -398,6 +651,8 @@ netdev_tc_flow_dump_create(struct netdev 
>>> *netdev,
>>>
>>>      *dump_out = dump;
>>>
>>> +    gid_cleanup();
>>> +
>>
>> I guess this can be removed if we can delete the gids directly; if 
>> not, we might want to put a comment here explaining why this is 
>> called from this function.
> OK, I'll put a comment here.
>>
>>>      return 0;
>>>  }
>>>
>>> @@ -1797,6 +2052,11 @@ netdev_tc_flow_put(struct netdev *netdev, 
>>> struct match *match,
>>>              action->type = TC_ACT_GOTO;
>>>              action->chain = 0;  /* 0 is reserved and 
>>> not used by recirc. */
>>>              flower.action_count++;
>>> +        } else if (nl_attr_type(nla) == 
>>> OVS_ACTION_ATTR_SAMPLE) {
>>> +            struct dpif_sflow_attr sflow_attr;
>>> +
>>> +            memset(&sflow_attr, 0, sizeof sflow_attr);
>>> +            gid_alloc_ctx(&sflow_attr);
>>

Patch

diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
index 9c5be9f92..5c3c811a2 100644
--- a/lib/netdev-offload-tc.c
+++ b/lib/netdev-offload-tc.c
@@ -39,6 +39,7 @@ 
 #include "unaligned.h"
 #include "util.h"
 #include "dpif-provider.h"
+#include "cmap.h"
 
 VLOG_DEFINE_THIS_MODULE(netdev_offload_tc);
 
@@ -57,6 +58,255 @@  struct netlink_field {
     int size;
 };
 
+/* This maps a psample group ID to struct dpif_sflow_attr for sFlow */
+struct gid_node {
+    struct ovs_list exp_node OVS_GUARDED;
+    struct cmap_node metadata_node;
+    struct cmap_node id_node;
+    struct ovs_refcount refcount;
+    uint32_t hash;
+    uint32_t id;
+    const struct dpif_sflow_attr sflow;
+};
+
+static struct ovs_rwlock gid_rwlock = OVS_RWLOCK_INITIALIZER;
+
+static long long int gid_last_run OVS_GUARDED_BY(gid_rwlock);
+
+static struct cmap gid_map = CMAP_INITIALIZER;
+static struct cmap gid_metadata_map = CMAP_INITIALIZER;
+
+static struct ovs_list gid_expiring OVS_GUARDED_BY(gid_rwlock)
+    = OVS_LIST_INITIALIZER(&gid_expiring);
+static struct ovs_list gid_expired OVS_GUARDED_BY(gid_rwlock)
+    = OVS_LIST_INITIALIZER(&gid_expired);
+
+static uint32_t next_group_id OVS_GUARDED_BY(gid_rwlock) = 1;
+
+#define GID_RUN_INTERVAL   250     /* msec */
+
+static void
+gid_node_free(struct gid_node *node)
+{
+    if (node->sflow.tunnel) {
+        free(node->sflow.tunnel);
+    }
+    free(CONST_CAST(void *, node->sflow.sflow));
+    free(node->sflow.userdata);
+    free(node);
+}
+
+static void
+gid_cleanup(void)
+{
+    long long int now = time_msec();
+    struct gid_node *node;
+
+    /* Do maintenance at most 4 times / sec. */
+    ovs_rwlock_rdlock(&gid_rwlock);
+    if (now - gid_last_run < GID_RUN_INTERVAL) {
+        ovs_rwlock_unlock(&gid_rwlock);
+        return;
+    }
+    ovs_rwlock_unlock(&gid_rwlock);
+
+    ovs_rwlock_wrlock(&gid_rwlock);
+    gid_last_run = now;
+
+    LIST_FOR_EACH_POP (node, exp_node, &gid_expired) {
+        cmap_remove(&gid_map, &node->id_node, node->id);
+        ovsrcu_postpone(gid_node_free, node);
+    }
+
+    if (!ovs_list_is_empty(&gid_expiring)) {
+        /* 'gid_expired' is now empty, move nodes in
+         * 'gid_expiring' to it. */
+        ovs_list_splice(&gid_expired,
+                        ovs_list_front(&gid_expiring),
+                        &gid_expiring);
+    }
+    ovs_rwlock_unlock(&gid_rwlock);
+}
+
+/* Lockless RCU protected lookup.  If node is needed accross RCU quiescent
+ * state, caller should copy the contents. */
+static const struct gid_node *
+gid_find(uint32_t id)
+{
+    const struct cmap_node *node = cmap_find(&gid_map, id);
+
+    return node
+        ? CONTAINER_OF(node, const struct gid_node, id_node)
+        : NULL;
+}
+
+static uint32_t
+dpif_sflow_attr_hash(const struct dpif_sflow_attr *sflow)
+{
+    uint32_t hash1 = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);
+    uint32_t hash2;
+
+    if (!sflow->tunnel) {
+        return hash1;
+    }
+
+    hash2 = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, 0);
+    return hash_add(hash1, hash2);
+}
+
+static bool
+dpif_sflow_attr_equal(const struct dpif_sflow_attr *a,
+                      const struct dpif_sflow_attr *b)
+{
+    if (a->sflow->nla_len != b->sflow->nla_len
+        || memcmp(a->sflow, b->sflow, a->sflow->nla_len)) {
+        return false;
+    }
+    if (!a->tunnel && !b->tunnel) {
+        return true;
+    }
+    if (a->tunnel && b->tunnel) {
+        return !memcmp(a->tunnel, b->tunnel, sizeof *a->tunnel);
+    }
+
+    return false;
+}
+
+/* Lockless RCU protected lookup.  If node is needed accross RCU quiescent
+ * state, caller should take a reference. */
+static struct gid_node *
+gid_find_equal(const struct dpif_sflow_attr *target, uint32_t hash)
+{
+    struct gid_node *node;
+
+    CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash, &gid_metadata_map) {
+        if (dpif_sflow_attr_equal(&node->sflow, target)) {
+            return node;
+        }
+    }
+    return NULL;
+}
+
+static struct gid_node *
+gid_ref_equal(const struct dpif_sflow_attr *target, uint32_t hash)
+{
+    struct gid_node *node;
+
+    do {
+        node = gid_find_equal(target, hash);
+        /* Try again if the node was released before we get the reference. */
+    } while (node && !ovs_refcount_try_ref_rcu(&node->refcount));
+
+    return node;
+}
+
+static void
+dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
+                      const struct dpif_sflow_attr *old)
+{
+    new->sflow = xmalloc(old->sflow->nla_len);
+    memcpy(CONST_CAST(void *, new->sflow), old->sflow, old->sflow->nla_len);
+
+    new->userdata_len = old->userdata_len;
+    new->userdata = xmalloc(new->userdata_len);
+    memcpy(new->userdata, old->userdata, new->userdata_len);
+
+    if (old->tunnel) {
+        new->tunnel = xzalloc(sizeof *new->tunnel);
+        memcpy(new->tunnel, old->tunnel, sizeof *new->tunnel);
+    } else {
+        new->tunnel = NULL;
+    }
+}
+
+/* We use the id as the hash value, which works due to cmap internal rehashing.
+ * We also only insert nodes with unique IDs, so all possible hash collisions
+ * remain internal to the cmap. */
+static struct gid_node *
+gid_find__(uint32_t id)
+    OVS_REQUIRES(gid_rwlock)
+{
+    struct cmap_node *node = cmap_find_protected(&gid_map, id);
+
+    return node ? CONTAINER_OF(node, struct gid_node, id_node) : NULL;
+}
+
+/* Allocate a unique group id for the given set of flow metadata.
+ * The ID space is 2^^32, so there should never be a situation in which all
+ * the IDs are used up.  We loop until we find a free one. */
+static struct gid_node *
+gid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
+{
+    struct gid_node *node = xzalloc(sizeof *node);
+
+    node->hash = hash;
+    ovs_refcount_init(&node->refcount);
+    dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *, &node->sflow),
+                          sflow);
+
+    ovs_rwlock_wrlock(&gid_rwlock);
+    for (;;) {
+        node->id = next_group_id++;
+        if (OVS_UNLIKELY(!node->id)) {
+            next_group_id = 1;
+            node->id = next_group_id++;
+        }
+        /* Find if the id is free. */
+        if (OVS_LIKELY(!gid_find__(node->id))) {
+            break;
+        }
+    }
+    cmap_insert(&gid_map, &node->id_node, node->id);
+    cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
+    ovs_rwlock_unlock(&gid_rwlock);
+    return node;
+}
+
+/* Allocate a unique group id for the given set of flow metadata and
+   optional actions. */
+static uint32_t
+gid_alloc_ctx(const struct dpif_sflow_attr *sflow)
+{
+    uint32_t hash = dpif_sflow_attr_hash(sflow);
+    struct gid_node *node = gid_ref_equal(sflow, hash);
+
+    if (!node) {
+        node = gid_alloc__(sflow, hash);
+    }
+    return node->id;
+}
+
+static void
+gid_node_unref(const struct gid_node *node_)
+    OVS_EXCLUDED(gid_rwlock)
+{
+    struct gid_node *node = CONST_CAST(struct gid_node *, node_);
+
+    ovs_rwlock_wrlock(&gid_rwlock);
+    if (node && ovs_refcount_unref(&node->refcount) == 1) {
+        /* Prevent re-use of this node by removing the node from
+         * gid_metadata_map' */
+        cmap_remove(&gid_metadata_map, &node->metadata_node, node->hash);
+        /* We keep the node in the 'gid_map' so that it can be found as
+         * long as it lingers, and add it to the 'gid_expiring' list. */
+        ovs_list_insert(&gid_expiring, &node->exp_node);
+    }
+    ovs_rwlock_unlock(&gid_rwlock);
+}
+
+static void
+gid_free(uint32_t id)
+{
+    const struct gid_node *node;
+
+    node = gid_find(id);
+    if (node) {
+        gid_node_unref(node);
+    } else {
+        VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
+    }
+}
+
 static bool
 is_internal_port(const char *type)
 {
@@ -203,6 +453,9 @@  del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid)
     if (!err) {
         del_ufid_tc_mapping(ufid);
     }
+    if (id->sflow_group_id) {
+        gid_free(id->sflow_group_id);
+    }
     return err;
 }
 
@@ -398,6 +651,8 @@  netdev_tc_flow_dump_create(struct netdev *netdev,
 
     *dump_out = dump;
 
+    gid_cleanup();
+
     return 0;
 }
 
@@ -1797,6 +2052,11 @@  netdev_tc_flow_put(struct netdev *netdev, struct match *match,
             action->type = TC_ACT_GOTO;
             action->chain = 0;  /* 0 is reserved and not used by recirc. */
             flower.action_count++;
+        } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) {
+            struct dpif_sflow_attr sflow_attr;
+
+            memset(&sflow_attr, 0, sizeof sflow_attr);
+            gid_alloc_ctx(&sflow_attr);
         } else {
             VLOG_DBG_RL(&rl, "unsupported put action type: %d",
                         nl_attr_type(nla));
diff --git a/lib/tc.h b/lib/tc.h
index 281231c0d..cc2ad025d 100644
--- a/lib/tc.h
+++ b/lib/tc.h
@@ -273,6 +273,7 @@  struct tcf_id {
     uint32_t chain;
     uint16_t prio;
     uint32_t handle;
+    uint32_t sflow_group_id;
 };
 
 static inline struct tcf_id