Message ID: 20201215033812.145975-9-cmi@nvidia.com
State: Changes Requested
Series: Add offload support for sFlow
On 15 Dec 2020, at 4:38, Chris Mi wrote:

> When offloading sample action to TC, userspace creates a unique ID
> to map sFlow action and tunnel info and passes this ID to kernel instead
> of the sFlow info. psample will send this ID and sampled packet to
> userspace. Using the ID, userspace can recover the sFlow info and send
> sampled packet to the right sFlow monitoring host.
>
> Signed-off-by: Chris Mi <cmi@nvidia.com>
> Reviewed-by: Eli Britstein <elibr@nvidia.com>
> ---
>  lib/netdev-offload-tc.c | 260 ++++++++++++++++++++++++++++++++++++++++
>  lib/tc.h                |   1 +
>  2 files changed, 261 insertions(+)
>
> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> index 9c5be9f92..5c3c811a2 100644
> --- a/lib/netdev-offload-tc.c
> +++ b/lib/netdev-offload-tc.c
> @@ -39,6 +39,7 @@
>  #include "unaligned.h"
>  #include "util.h"
>  #include "dpif-provider.h"
> +#include "cmap.h"
>
>  VLOG_DEFINE_THIS_MODULE(netdev_offload_tc);
>
> @@ -57,6 +58,255 @@ struct netlink_field {
>      int size;
>  };
>
> +/* This maps a psample group ID to struct dpif_sflow_attr for sFlow. */
> +struct gid_node {
> +    struct ovs_list exp_node OVS_GUARDED;
> +    struct cmap_node metadata_node;
> +    struct cmap_node id_node;
> +    struct ovs_refcount refcount;
> +    uint32_t hash;
> +    uint32_t id;
> +    const struct dpif_sflow_attr sflow;
> +};

All your variables/functions use gid (group ID), but this seems a bit
general. Maybe you can make it more sample specific, i.e. sgid?

> +
> +static struct ovs_rwlock gid_rwlock = OVS_RWLOCK_INITIALIZER;
> +
> +static long long int gid_last_run OVS_GUARDED_BY(gid_rwlock);
> +
> +static struct cmap gid_map = CMAP_INITIALIZER;
> +static struct cmap gid_metadata_map = CMAP_INITIALIZER;
> +
> +static struct ovs_list gid_expiring OVS_GUARDED_BY(gid_rwlock)
> +    = OVS_LIST_INITIALIZER(&gid_expiring);
> +static struct ovs_list gid_expired OVS_GUARDED_BY(gid_rwlock)
> +    = OVS_LIST_INITIALIZER(&gid_expired);
> +
> +static uint32_t next_group_id OVS_GUARDED_BY(gid_rwlock) = 1;
> +
> +#define GID_RUN_INTERVAL 250 /* msec */
> +
> +static void
> +gid_node_free(struct gid_node *node)
> +{
> +    if (node->sflow.tunnel) {

free() does the NULL check, so this can just be free(node->sflow.tunnel);

> +        free(node->sflow.tunnel);
> +    }
> +    free(CONST_CAST(void *, node->sflow.sflow));
> +    free(node->sflow.userdata);
> +    free(node);
> +}
> +
> +static void
> +gid_cleanup(void)
> +{
> +    long long int now = time_msec();
> +    struct gid_node *node;
> +
> +    /* Do maintenance at most 4 times / sec. */
> +    ovs_rwlock_rdlock(&gid_rwlock);
> +    if (now - gid_last_run < GID_RUN_INTERVAL) {
> +        ovs_rwlock_unlock(&gid_rwlock);
> +        return;
> +    }
> +    ovs_rwlock_unlock(&gid_rwlock);
> +
> +    ovs_rwlock_wrlock(&gid_rwlock);
> +    gid_last_run = now;
> +
> +    LIST_FOR_EACH_POP (node, exp_node, &gid_expired) {
> +        cmap_remove(&gid_map, &node->id_node, node->id);
> +        ovsrcu_postpone(gid_node_free, node);
> +    }
> +
> +    if (!ovs_list_is_empty(&gid_expiring)) {
> +        /* 'gid_expired' is now empty, move nodes in
> +         * 'gid_expiring' to it. */
> +        ovs_list_splice(&gid_expired,
> +                        ovs_list_front(&gid_expiring),
> +                        &gid_expiring);
> +    }

Trying to understand why we have two lists? Why can't we directly clean
up from the expiring list?

> +    ovs_rwlock_unlock(&gid_rwlock);
> +}
> +
> +/* Lockless RCU protected lookup.  If node is needed across RCU quiescent
> + * state, caller should copy the contents. */
> +static const struct gid_node *
> +gid_find(uint32_t id)
> +{
> +    const struct cmap_node *node = cmap_find(&gid_map, id);
> +
> +    return node
> +           ? CONTAINER_OF(node, const struct gid_node, id_node)
> +           : NULL;
> +}
> +
> +static uint32_t
> +dpif_sflow_attr_hash(const struct dpif_sflow_attr *sflow)
> +{
> +    uint32_t hash1 = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);
> +    uint32_t hash2;
> +
> +    if (!sflow->tunnel) {
> +        return hash1;
> +    }
> +
> +    hash2 = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, 0);
> +    return hash_add(hash1, hash2);

I think this can be:

    uint32_t hash = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);

    if (sflow->tunnel) {
        hash = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, hash);
    }
    return hash;

> +}
> +
> +static bool
> +dpif_sflow_attr_equal(const struct dpif_sflow_attr *a,
> +                      const struct dpif_sflow_attr *b)
> +{
> +    if (a->sflow->nla_len != b->sflow->nla_len
> +        || memcmp(a->sflow, b->sflow, a->sflow->nla_len)) {
> +        return false;
> +    }
> +    if (!a->tunnel && !b->tunnel) {
> +        return true;
> +    }
> +    if (a->tunnel && b->tunnel) {
> +        return !memcmp(a->tunnel, b->tunnel, sizeof *a->tunnel);
> +    }
> +
> +    return false;
> +}
> +
> +/* Lockless RCU protected lookup.  If node is needed across RCU quiescent
> + * state, caller should take a reference. */
> +static struct gid_node *
> +gid_find_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> +    struct gid_node *node;
> +
> +    CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash, &gid_metadata_map) {
> +        if (dpif_sflow_attr_equal(&node->sflow, target)) {
> +            return node;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +static struct gid_node *
> +gid_ref_equal(const struct dpif_sflow_attr *target, uint32_t hash)
> +{
> +    struct gid_node *node;
> +
> +    do {
> +        node = gid_find_equal(target, hash);
> +        /* Try again if the node was released before we get the reference. */
> +    } while (node && !ovs_refcount_try_ref_rcu(&node->refcount));
> +
> +    return node;
> +}
> +
> +static void
> +dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
> +                      const struct dpif_sflow_attr *old)
> +{
> +    new->sflow = xmalloc(old->sflow->nla_len);
> +    memcpy(CONST_CAST(void *, new->sflow), old->sflow, old->sflow->nla_len);

There is xmemdup() you could use here, and below.
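To make that concrete, a clone based on xmemdup() might look roughly like
this (a sketch only, mirroring the field usage in the patch above, not the
code Chris ended up committing):

    static void
    dpif_sflow_attr_clone(struct dpif_sflow_attr *new,
                          const struct dpif_sflow_attr *old)
    {
        /* xmemdup() allocates and copies in a single step, which removes
         * the separate xmalloc()/memcpy() pairs and the CONST_CAST. */
        new->sflow = xmemdup(old->sflow, old->sflow->nla_len);
        new->userdata_len = old->userdata_len;
        new->userdata = xmemdup(old->userdata, old->userdata_len);
        new->tunnel = old->tunnel
                      ? xmemdup(old->tunnel, sizeof *old->tunnel)
                      : NULL;
    }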
> +
> +    new->userdata_len = old->userdata_len;
> +    new->userdata = xmalloc(new->userdata_len);
> +    memcpy(new->userdata, old->userdata, new->userdata_len);
> +
> +    if (old->tunnel) {
> +        new->tunnel = xzalloc(sizeof *new->tunnel);
> +        memcpy(new->tunnel, old->tunnel, sizeof *new->tunnel);
> +    } else {
> +        new->tunnel = NULL;
> +    }
> +}
> +
> +/* We use the id as the hash value, which works due to cmap internal
> + * rehashing.  We also only insert nodes with unique IDs, so all possible
> + * hash collisions remain internal to the cmap. */
> +static struct gid_node *
> +gid_find__(uint32_t id)
> +    OVS_REQUIRES(gid_rwlock)
> +{
> +    struct cmap_node *node = cmap_find_protected(&gid_map, id);
> +
> +    return node ? CONTAINER_OF(node, struct gid_node, id_node) : NULL;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata.
> + * The ID space is 2^32, so there should never be a situation in which all
> + * the IDs are used up.  We loop until we find a free one. */

I don't agree that we should assume we find an entry, because if there is
a bug we will loop forever. A simple fix would be to make sure we have
wrapped around; if so, quit with NULL.

What might even be better is to have a fixed amount of tries (8K, 64K, or
something), as I'm wondering how long 2^32 - 1 tries would take ;) Or some
warning message if it takes more than xK retries? Just something so that
we know we might need to re-work this function.
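A sketch of such a bounded loop, for illustration only (the retry budget
is an arbitrary example value; the rest follows gid_alloc__() from the
patch, and the caller would then have to handle a NULL return):

    #define SGID_ALLOC_RETRY_MAX 8192   /* Example retry budget. */

    static struct gid_node *
    gid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
    {
        struct gid_node *node = xzalloc(sizeof *node);
        int retries = 0;

        node->hash = hash;
        ovs_refcount_init(&node->refcount);
        dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *,
                                         &node->sflow), sflow);

        ovs_rwlock_wrlock(&gid_rwlock);
        for (;;) {
            if (++retries > SGID_ALLOC_RETRY_MAX) {
                /* The ID space looks exhausted, or there is a bug; give
                 * up instead of looping forever. */
                ovs_rwlock_unlock(&gid_rwlock);
                gid_node_free(node);
                return NULL;
            }
            node->id = next_group_id++;
            if (OVS_UNLIKELY(!node->id)) {
                next_group_id = 1;
                node->id = next_group_id++;
            }
            /* Check whether the id is free. */
            if (OVS_LIKELY(!gid_find__(node->id))) {
                break;
            }
        }
        cmap_insert(&gid_map, &node->id_node, node->id);
        cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
        ovs_rwlock_unlock(&gid_rwlock);
        return node;
    }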
> +static struct gid_node *
> +gid_alloc__(const struct dpif_sflow_attr *sflow, uint32_t hash)
> +{
> +    struct gid_node *node = xzalloc(sizeof *node);
> +
> +    node->hash = hash;
> +    ovs_refcount_init(&node->refcount);
> +    dpif_sflow_attr_clone(CONST_CAST(struct dpif_sflow_attr *, &node->sflow),
> +                          sflow);
> +
> +    ovs_rwlock_wrlock(&gid_rwlock);
> +    for (;;) {
> +        node->id = next_group_id++;
> +        if (OVS_UNLIKELY(!node->id)) {
> +            next_group_id = 1;
> +            node->id = next_group_id++;
> +        }
> +        /* Find if the id is free. */
> +        if (OVS_LIKELY(!gid_find__(node->id))) {
> +            break;
> +        }
> +    }
> +    cmap_insert(&gid_map, &node->id_node, node->id);
> +    cmap_insert(&gid_metadata_map, &node->metadata_node, node->hash);
> +    ovs_rwlock_unlock(&gid_rwlock);
> +    return node;
> +}
> +
> +/* Allocate a unique group id for the given set of flow metadata and
> +   optional actions. */
> +static uint32_t
> +gid_alloc_ctx(const struct dpif_sflow_attr *sflow)
> +{
> +    uint32_t hash = dpif_sflow_attr_hash(sflow);
> +    struct gid_node *node = gid_ref_equal(sflow, hash);
> +
> +    if (!node) {
> +        node = gid_alloc__(sflow, hash);
> +    }

I think gid_alloc__() should be allowed to fail (see above), so check
for it:

    return node ? node->id : 0;

> +    return node->id;
> +}
> +
> +static void
> +gid_node_unref(const struct gid_node *node_)
> +    OVS_EXCLUDED(gid_rwlock)
> +{
> +    struct gid_node *node = CONST_CAST(struct gid_node *, node_);
> +
> +    ovs_rwlock_wrlock(&gid_rwlock);
> +    if (node && ovs_refcount_unref(&node->refcount) == 1) {
> +        /* Prevent re-use of this node by removing the node from
> +         * 'gid_metadata_map'. */
> +        cmap_remove(&gid_metadata_map, &node->metadata_node, node->hash);
> +        /* We keep the node in the 'gid_map' so that it can be found as
> +         * long as it lingers, and add it to the 'gid_expiring' list. */
> +        ovs_list_insert(&gid_expiring, &node->exp_node);

Why do we do a delayed remove here? Any sFlow packets still in the
pipeline can be silently dropped when sgid_find() fails. The only concern
might be ID reuse, but I think with 2^32 IDs and the current allocator
scheme this is unlikely to happen.

It would make the code cleaner, and the rwlock could be a normal lock.

> +    }
> +    ovs_rwlock_unlock(&gid_rwlock);
> +}
> +
> +static void
> +gid_free(uint32_t id)
> +{
> +    const struct gid_node *node;

All use cases of gid_free() check for 0 before calling; maybe we can
include that check here and remove it from the other places. This makes
it behave like free().
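That is, something along these lines (sketch):

    static void
    gid_free(uint32_t id)
    {
        const struct gid_node *node;

        if (!id) {
            return;   /* Like free(NULL), freeing ID 0 is a no-op. */
        }

        node = gid_find(id);
        if (node) {
            gid_node_unref(node);
        } else {
            VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
        }
    }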
> +
> +    node = gid_find(id);
> +    if (node) {
> +        gid_node_unref(node);
> +    } else {
> +        VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id);
> +    }
> +}
> +
>  static bool
>  is_internal_port(const char *type)
>  {
> @@ -203,6 +453,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid)
>      if (!err) {
>          del_ufid_tc_mapping(ufid);
>      }
> +    if (id->sflow_group_id) {
> +        gid_free(id->sflow_group_id);
> +    }
>      return err;
>  }
>
> @@ -398,6 +651,8 @@ netdev_tc_flow_dump_create(struct netdev *netdev,
>
>      *dump_out = dump;
>
> +    gid_cleanup();
> +

I guess this can be removed if we can delete the gids directly; if not,
we might want to put a comment here explaining why this is called from
this function.

>      return 0;
>  }
>
> @@ -1797,6 +2052,11 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match,
>              action->type = TC_ACT_GOTO;
>              action->chain = 0;  /* 0 is reserved and not used by recirc. */
>              flower.action_count++;
> +        } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) {
> +            struct dpif_sflow_attr sflow_attr;
> +
> +            memset(&sflow_attr, 0, sizeof sflow_attr);
> +            gid_alloc_ctx(&sflow_attr);

Guess here we should report an error if gid_alloc_ctx() fails:

    if (!gid_alloc_ctx(&sflow_attr)) {
        VLOG_DBG_RL(&rl, "Failed allocating group id for sample action");
    }

>          } else {
>              VLOG_DBG_RL(&rl, "unsupported put action type: %d",
>                          nl_attr_type(nla));
> diff --git a/lib/tc.h b/lib/tc.h
> index 281231c0d..cc2ad025d 100644
> --- a/lib/tc.h
> +++ b/lib/tc.h
> @@ -273,6 +273,7 @@ struct tcf_id {
>      uint32_t chain;
>      uint16_t prio;
>      uint32_t handle;
> +    uint32_t sflow_group_id;
>  };
>
>  static inline struct tcf_id
> --
> 2.26.2
On 1/13/2021 3:49 AM, Eelco Chaudron wrote:
> On 15 Dec 2020, at 4:38, Chris Mi wrote:
>
<SNIP>
>
> All your variables/functions use gid (group ID), but this seems a bit
> general. Maybe you can make it more sample specific, i.e. sgid?

Done.

<SNIP>
>> +    if (node->sflow.tunnel) {
>
> free() does the NULL check, so this can just be free(node->sflow.tunnel);

Done.

<SNIP>
>
> Trying to understand why we have two lists? Why can't we directly clean
> up from the expiring list?

The code takes the recirc id management as an example
(ofproto-dpif-rid.c). I'm not 100% sure the two lists are really needed
here, but the recirc id code has existed for a long time. I think it is
mature, and I don't want to re-invent something.

Also, when the psample thread receives sampled packets, it needs to look
up the sgid_node based on the group id, but the sgid_node could be
deleted at any time by the revalidator. The two lists resolve this: they
give a 250 ms interval in which sampled packets still on the fly can be
processed.
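To make the timing argument concrete, the psample-side lookup might look
roughly like this (a hypothetical sketch, not part of this patch;
sgid_lookup_sflow() is an illustrative helper built on the functions
from the patch):

    /* Because an unreferenced node lingers in 'gid_map' for at least one
     * GID_RUN_INTERVAL before gid_cleanup() frees it, a sampled packet
     * that was already in flight when its flow was deleted can still
     * resolve its group ID here. */
    static bool
    sgid_lookup_sflow(uint32_t group_id, struct dpif_sflow_attr *copy)
    {
        const struct gid_node *node = gid_find(group_id);

        if (!node) {
            return false;   /* Flow deleted and node already expired. */
        }
        /* Copy the contents before the next RCU quiescent state, as the
         * comment on gid_find() requires. */
        dpif_sflow_attr_clone(copy, &node->sflow);
        return true;
    }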
<SNIP>
>
> I think this can be:
>
>     uint32_t hash = hash_bytes(sflow->sflow, sflow->sflow->nla_len, 0);
>
>     if (sflow->tunnel) {
>         hash = hash_bytes(sflow->tunnel, sizeof *sflow->tunnel, hash);
>     }
>     return hash;

Done. Thanks.

<SNIP>
>
> There is xmemdup() you could use here, and below.

Done. Thanks. The code is greatly simplified.

<SNIP>
>
> I don't agree that we should assume we find an entry, because if there
> is a bug we will loop forever. A simple fix would be to make sure we
> have wrapped around; if so, quit with NULL. What might even be better
> is to have a fixed amount of tries (8K, 64K, or something), as I'm
> wondering how long 2^32 - 1 tries would take ;) Or some warning message
> if it takes more than xK retries? Just something so that we know we
> might need to re-work this function.

Done.

<SNIP>
>
> I think gid_alloc__() should be allowed to fail (see above), so check
> for it:
>
>     return node ? node->id : 0;

Done.

<SNIP>
>
> Why do we do a delayed remove here? Any sFlow packets still in the
> pipeline can be silently dropped when sgid_find() fails.

Please see my comment above about the two lists.

> The only concern might be ID reuse, but I think with 2^32 IDs and the
> current allocator scheme this is unlikely to happen.
>
> It would make the code cleaner, and the rwlock could be a normal lock.

<SNIP>
>
> All use cases of gid_free() check for 0 before calling; maybe we can
> include that check here and remove it from the other places. This makes
> it behave like free().

Done.

<SNIP>
>
> I guess this can be removed if we can delete the gids directly; if not,
> we might want to put a comment here explaining why this is called from
> this function.

OK, I'll put a comment here.

<SNIP>
>
> Guess here we should report an error if gid_alloc_ctx() fails:
>
>     if (!gid_alloc_ctx(&sflow_attr)) {
>         VLOG_DBG_RL(&rl, "Failed allocating group id for sample action");
>     }

Will fix it in the last patch.
On 22 Jan 2021, at 4:27, Chris Mi wrote:

> On 1/13/2021 3:49 AM, Eelco Chaudron wrote:
>> On 15 Dec 2020, at 4:38, Chris Mi wrote:

<SNIP>

>> Trying to understand why we have two lists? Why can't we directly
>> clean up from the expiring list?
>
> The code takes the recirc id management as an example
> (ofproto-dpif-rid.c). I'm not 100% sure the two lists are really needed
> here, but the recirc id code has existed for a long time. I think it is
> mature, and I don't want to re-invent something.
>
> Also, when the psample thread receives sampled packets, it needs to
> look up the sgid_node based on the group id, but the sgid_node could be
> deleted at any time by the revalidator. The two lists resolve this:
> they give a 250 ms interval in which sampled packets still on the fly
> can be processed.

Just adding code because it exists somewhere else does not make sense to
me. If you do not have a valid reason, why add all this complexity?

I would like some other reviewer's comments on this; maybe I missed
something. Ilya?

<SNIP>