| Message ID | 20260401091318.2671624-3-elibr@nvidia.com |
|---|---|
| State | New |
| Delegated to: | Eelco Chaudron |
| Series | netdev-doca |
| Context | Check | Description |
|---|---|---|
| ovsrobot/apply-robot | success | apply and check: success |
| ovsrobot/github-robot-_Build_and_Test | fail | github build: failed |
On 1 Apr 2026, at 11:13, Eli Britstein wrote: > From: Gaetan Rivet <gaetanr@nvidia.com> > > Add a reference-counted key-value map. > > Duplicates take a reference on the original entry within the map, > leaving it in place. To be able to execute an entry creation after > determining whether it is already present or not in the map, > store relevant initialization and de-initialization functions. > > Signed-off-by: Gaetan Rivet <gaetanr@nvidia.com> > Co-authored-by: Eli Britstein <elibr@nvidia.com> > Signed-off-by: Eli Britstein <elibr@nvidia.com> Thanks Eli/Gaetan for following up with the v3. Find some comments below. Cheers, Eelco > --- > lib/automake.mk | 2 + > lib/refmap.c | 485 ++++++++++++++++++ > lib/refmap.h | 130 +++++ > tests/automake.mk | 1 + > tests/library.at | 5 + > tests/test-refmap.c | 894 ++++++++++++++++++++++++++++++++++ > utilities/checkpatch_dict.txt | 2 + > 7 files changed, 1519 insertions(+) > create mode 100644 lib/refmap.c > create mode 100644 lib/refmap.h > create mode 100644 tests/test-refmap.c > > diff --git a/lib/automake.mk b/lib/automake.mk > index c6e988906..cb6458b0d 100644 > --- a/lib/automake.mk > +++ b/lib/automake.mk > @@ -323,6 +323,8 @@ lib_libopenvswitch_la_SOURCES = \ > lib/rculist.h \ > lib/reconnect.c \ > lib/reconnect.h \ > + lib/refmap.c \ > + lib/refmap.h \ > lib/rstp.c \ > lib/rstp.h \ > lib/rstp-common.h \ > diff --git a/lib/refmap.c b/lib/refmap.c > new file mode 100644 > index 000000000..c2c435238 > --- /dev/null > +++ b/lib/refmap.c > @@ -0,0 +1,485 @@ > +/* > + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. > + * All rights reserved. > + * SPDX-License-Identifier: Apache-2.0 > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#include <config.h> > + > +#include "cmap.h" > +#include "hash.h" > +#include "fatal-signal.h" Should be second include. > +#include "ovs-atomic.h" > +#include "ovs-thread.h" > +#include "refmap.h" > +#include "timeval.h" > + > +#include "openvswitch/list.h" > +#include "openvswitch/vlog.h" > + > +VLOG_DEFINE_THIS_MODULE(refmap); > +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(600, 600); > + > +static struct ovs_mutex refmap_destroy_lock = OVS_MUTEX_INITIALIZER; > +static struct ovs_list refmap_destroy_list > + OVS_GUARDED_BY(refmap_destroy_lock) = > + OVS_LIST_INITIALIZER(&refmap_destroy_list); > + > +struct refmap { > + struct cmap map; > + struct ovs_mutex map_lock; > + size_t key_size; > + size_t value_size; > + refmap_value_init value_init; > + refmap_value_uninit value_uninit; > + refmap_value_format value_format; > + char *name; > + struct ovs_list in_destroy_list; > +}; > + > +struct refmap_node { > + struct ovsrcu_node rcu_node; > + /* CMAP related: */ > + struct cmap_node map_node; > + uint32_t hash; > + /* Content: */ > + struct ovs_refcount refcount; > + /* Key, then Value follows. 
*/ > +}; > + > +static void > +refmap_destroy__(struct refmap *rfm, bool global_destroy); This could simple become: +refmap_destroy__(struct refmap *, bool global_destroy); > + > +static void > +refmap_destroy_unregister_protected(struct refmap *rfm) > + OVS_REQUIRES(refmap_destroy_lock) > +{ > + ovs_list_remove(&rfm->in_destroy_list); > +} > + > +static void > +refmap_destroy_unregister(struct refmap *rfm) > + OVS_EXCLUDED(refmap_destroy_lock) > +{ > + ovs_mutex_lock(&refmap_destroy_lock); > + refmap_destroy_unregister_protected(rfm); > + ovs_mutex_unlock(&refmap_destroy_lock); > +} > + > +static void > +refmap_destroy_register(struct refmap *rfm) > + OVS_EXCLUDED(refmap_destroy_lock) > +{ > + ovs_mutex_lock(&refmap_destroy_lock); > + ovs_list_push_back(&refmap_destroy_list, &rfm->in_destroy_list); > + ovs_mutex_unlock(&refmap_destroy_lock); > +} > + > +static void > +refmap_destroy_all(void *aux OVS_UNUSED) > +{ > + struct refmap *rfm; > + > + ovs_mutex_lock(&refmap_destroy_lock); > + LIST_FOR_EACH_SAFE (rfm, in_destroy_list, &refmap_destroy_list) { > + refmap_destroy_unregister_protected(rfm); > + refmap_destroy__(rfm, true); > + } > + ovs_mutex_unlock(&refmap_destroy_lock); > + ovs_mutex_destroy(&refmap_destroy_lock); > +} > + > +static void > +refmap_fatal_signal_hook(void *aux OVS_UNUSED) > +{ > + /* This argument is only for the type check in 'ovsrcu_postpone', > + * it is not otherwise used. */ > + static int dummy_arg; > + > + /* Do not run all destroys right in the signal handler. > + * Let other modules execute their own cleanup, and then > + * iterate over any remaining to warn about leaks. */ > + ovsrcu_postpone(refmap_destroy_all, &dummy_arg); > +} > + > +struct refmap * > +refmap_create(const char *name, > + size_t key_size, > + size_t value_size, > + refmap_value_init value_init, > + refmap_value_uninit value_uninit, > + refmap_value_format value_format) > +{ > + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; > + struct refmap *rfm; > + > + ovs_assert(value_init && value_uninit); > + > + if (ovsthread_once_start(&once)) { > + fatal_signal_add_hook(refmap_fatal_signal_hook, NULL, NULL, true); > + ovsthread_once_done(&once); > + } > + > + rfm = xzalloc(sizeof *rfm); > + rfm->name = xstrdup(name); > + rfm->key_size = key_size; > + rfm->value_size = value_size; > + rfm->value_init = value_init; > + rfm->value_uninit = value_uninit; > + rfm->value_format = value_format; > + > + ovs_mutex_init(&rfm->map_lock); > + cmap_init(&rfm->map); > + > + refmap_destroy_register(rfm); > + > + return rfm; > +} > + > +static void > +refmap_destroy__(struct refmap *rfm, bool global_destroy) > +{ > + bool leaks_detected = false; > + > + if (!rfm) { > + return; > + } > + > + VLOG_DBG("%s: destroying the map", rfm->name); > + > + ovs_mutex_lock(&rfm->map_lock); > + if (!cmap_is_empty(&rfm->map)) { > + struct refmap_node *node; > + > + VLOG_WARN("%s: %s called with elements remaining in the map", > + rfm->name, __func__); > + leaks_detected = true; > + CMAP_FOR_EACH (node, map_node, &rfm->map) { > + /* No need to remove the node from the CMAP, it will > + * be destroyed immediately. */ > + ovsrcu_postpone_embedded(free, node, rcu_node); > + } > + } > + cmap_destroy(&rfm->map); > + ovs_mutex_unlock(&rfm->map_lock); > + > + ovs_mutex_destroy(&rfm->map_lock); > + free(rfm->name); > + free(rfm); > + > + /* During the very last stage of execution of RCU callbacks, > + * the VLOG subsystem has been disabled. All logs are thus muted. 
> + * If leaks are detected, abort the process, even though we were > + * exiting due to a fatal signal. The SIGABRT generated will still > + * be visible. */ > + if (global_destroy && leaks_detected) { > + ovs_abort(-1, "Refmap values leak detected."); Looke like none of the ovs_abort() messages end with a dot. ovs_abort(-1, "Refmap values leak detected"); From the earlier discussion; EC> Do we need to call value_uninit? EB> Those nodes were leaks in the refmap, at this point they are destroyed as OVS is stopping, it is not possible to call value_uninit without crashing. EC> But is this not only true for 'global_destroy == true'. What about the genuine refmap_destroy() call? If this remains the case, we should clearly add this to the API documentation. You chose not to call value_uninit() in the non-global_destroy case. Should we document this more explicitly, or should we call ovs_assert() to prevent this bad programming behavior? * WARNING: The caller MUST ensure the map is empty before calling * refmap_destroy(). If the map contains remaining elements, their * values will NOT have value_uninit() called, which will leak any * resources managed by those values (file descriptors, allocated * memory, etc.). The node memory will be freed but resource cleanup * will not occur. > + } > +} > + > +void > +refmap_destroy(struct refmap *rfm) > +{ > + if (!rfm) { > + return; > + } > + > + refmap_destroy_unregister(rfm); > + refmap_destroy__(rfm, false); We had an earlier discussion, where I suggested this should be done under RCU. However you mentioned this is by design, and it's MT-unsafe. We need to clearly document this with an example in the documentation on how this should be synchronized. Something along the lines of: refmap_destroy() MUST NOT be called while any other thread might be accessing the refmap, even through MT-safe operations like refmap_for_each() or refmap_try_ref(). Or make it RCU safe and avoid any headaches in the future :) > +} > + > +static size_t > +refmap_aligned_key_size(struct refmap *rfm) > +{ > + return ROUND_UP(rfm->key_size, 8); > +} > + > +static void * > +refmap_node_key(struct refmap_node *node) > +{ > + if (!node) { > + return NULL; > + } > + > + return node + 1; > +} > + > +static void * > +refmap_node_value(struct refmap *rfm, struct refmap_node *node) > +{ > + if (!node) { > + return NULL; > + } > + > + return ((char *) refmap_node_key(node)) + refmap_aligned_key_size(rfm); > +} > + > +static size_t > +refmap_node_total_size(struct refmap *rfm) > +{ > + return sizeof(struct refmap_node) + > + refmap_aligned_key_size(rfm) + rfm->value_size; > +} > + > +static struct refmap_node * > +refmap_node_from_value(struct refmap *rfm, void *value) > +{ > + size_t offset = sizeof(struct refmap_node) + refmap_aligned_key_size(rfm); > + > + if ((uintptr_t) value < offset) { > + return NULL; > + } Add a new line, similar to refmap_node_value() > + return (void *) (((char *) value) - offset); > +} > + > +static void > +log_node(struct refmap *rfm, const char *prefix, struct refmap_node *node) > +{ > + void *key, *value; > + struct ds s; > + > + if (OVS_LIKELY(VLOG_DROP_DBG(&rl) || !rfm->value_format)) { > + return; > + } > + > + key = refmap_node_key(node); > + value = refmap_node_value(rfm, node); > + > + ds_init(&s); > + ds_put_cstr(&s, ", '"); > + rfm->value_format(&s, key, value); Isn't the format option callback optional? If not, the ovs_assert in refmap_create() needs updating. 
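If it is meant to be mandatory instead, the create-time check could simply be extended; untested, just to illustrate (the current assert only covers the init/uninit callbacks):

    ovs_assert(value_init && value_uninit && value_format);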
> + ds_put_cstr(&s, "'"); > + VLOG_DBG("%s: %p %s, refcnt=%d%s", rfm->name, value, prefix, The refcnt is unsigned, so "%u" Why not use "%s: value=%p %s, refcnt=~%u, '%s'" and avoid the ds_put_cstr() calls? Note the value=%p as else it's just a random pointer, or maybe a bit cleaner, like: myrefmap[unref]: value=0xffffffere, refcnt=~1, 'k,v=eelco,redhat' "%s[%s]: value=%p, refcnt=~%u, '%s'" > + ovs_refcount_read(&node->refcount), ds_cstr(&s)); Should we include the refcount? As it might cause false security? The value could be different than the value that is actually used in the function. Maybe print it like "refcnt=~%d," so when people use the message for debugging they notice the ~ which might trigger some additional thinking. > + ds_destroy(&s); > +} > + > +/* Increments 'refcount', but only if it is over one. > + * > + * Returns false if the refcount was zero or one. > + * In refmap, the last reference is only used to synchronize between > + * value init and uninit in case of contention. In such state, the > + * object is not valid anymore for external readers, until the > + * value_{init,uninit} critical section is completed. > + */ > +static inline bool > +refmap_refcount_try_ref_one(struct ovs_refcount *refcount) I assume this is the attempt to avoid the add/delete raise callback. As you mentioned this is not fixed, I'm not reviewing the 'complex' reference count attempt. Not sure but could this maybe be fixed simpler doing something like this (not tested or well thought trought): This is the old code with some annotations:: static inline bool ovs_refcount_unref_if_not_last(struct ovs_refcount *refcount) { unsigned int count; atomic_read_explicit(&refcount->count, &count, memory_order_acquire); ovs_assert(count > 0); while (count > 1) { if (atomic_compare_exchange_weak_explicit( &refcount->count, &count, count - 1, memory_order_release, memory_order_relaxed)) { return true; } } return false; } bool refmap_unref(struct refmap *rfm, void *value) { struct refmap_node *node; unsigned int old_refcount; if (!value) { return false; } node = refmap_node_from_value(rfm, value); if (!node) { return false; } if (ovs_refcount_unref_if_not_last(&node->refcount)) { return false; } ovs_mutex_lock(&rfm->map_lock); old_refcount = ovs_refcount_unref(&node->refcount); if (old_refcount == 1) { /* We transitioned 1→0 under lock. Safe to cleanup. */ rfm->value_uninit(refmap_node_value(rfm, node)); cmap_remove(&rfm->map, &node->map_node, node->hash); ovs_mutex_unlock(&rfm->map_lock); ovsrcu_postpone_inline(free, node, rcu_node); return true; } ovs_mutex_unlock(&rfm->map_lock); return false; } > +{ > + unsigned int count; > + > + atomic_read_explicit(&refcount->count, &count, memory_order_relaxed); > + do { > + if (count <= 1) { > + return false; > + } > + } while (!atomic_compare_exchange_weak_explicit(&refcount->count, &count, > + count + 1, > + memory_order_relaxed, > + memory_order_relaxed)); > + return true; > +} > + > +void > +refmap_for_each(struct refmap *rfm, > + void (*cb)(void *value, void *key, void *arg), void *arg) From our earlier discussion: EC> A macro for this would be more in line with existing implementations and EC> avoids the requirement for a callback function for simple operations, i.e., EC> something like REFMAP_FOR_EACH(). EB> We want to call the cb only under a reference count that is internal in EB> the refmap. 
EB> We can ask the user to explicitly call ref/unref before/after the "cb" EB> code, but I think it's better to have this as part of the EB> infrastructure, with the downside of a "cb" code that is not inline with EB> the loop. EB> Do you have a suggestion how to do it in a macro? I did some quick/dirty coding, and this is a potential diff (might need some cleanup bit it looks nicer with the code inline): diff --git a/lib/refmap.c b/lib/refmap.c index c2c435238..86b5850e1 100644 --- a/lib/refmap.c +++ b/lib/refmap.c @@ -291,6 +291,38 @@ refmap_refcount_try_ref_one(struct ovs_refcount *refcount) return true; } +/* Helper function for REFMAP_FOR_EACH macro. + * Advances to the next refmap entry, handling ref/unref automatically. + * Returns true if a value was found, false when iteration is complete. */ +bool +refmap_iter_next(struct refmap_iter *iter, void **value, void **key) +{ + struct refmap_node *node; + + /* Unref previous value if any. */ + if (iter->prev_value) { + refmap_unref(iter->rfm, iter->prev_value); + iter->prev_value = NULL; + } + + /* Initialize on first call (cursor.impl is NULL from zero-init). */ + if (!iter->cursor.impl) { + iter->cursor = cmap_cursor_start(&iter->rfm->map); + } + + /* Find next refable node. */ + CMAP_CURSOR_FOR_EACH_CONTINUE (node, map_node, &iter->cursor) { + if (refmap_refcount_try_ref_one(&node->refcount)) { + *key = refmap_node_key(node); + *value = refmap_node_value(iter->rfm, node); + iter->prev_value = *value; + return true; + } + } + + return false; +} + void refmap_for_each(struct refmap *rfm, void (*cb)(void *value, void *key, void *arg), void *arg) diff --git a/lib/refmap.h b/lib/refmap.h index e41f1d888..2d7b6a265 100644 --- a/lib/refmap.h +++ b/lib/refmap.h @@ -23,7 +23,9 @@ #include <stddef.h> #include <stdint.h> +#include <stdbool.h> +#include "cmap.h" + #include "openvswitch/dynamic-string.h" /* @@ -86,6 +88,31 @@ typedef void (*refmap_value_uninit)(void *value); typedef struct ds *(*refmap_value_format)(struct ds *s, void *key, void *value); +/* Iterator context for REFMAP_FOR_EACH macro. */ +struct refmap_iter { + struct refmap *rfm; + struct cmap_cursor cursor; + void *prev_value; +}; + +/* Helper function for REFMAP_FOR_EACH macro. + * Advances to the next refmap entry, handling ref/unref automatically. + * Returns true if a value was found, false when iteration is complete. */ +bool refmap_iter_next(struct refmap_iter *, void **value, void **key); + +/* Iterates through all key-value pairs in REFMAP. Each iteration assigns + * VALUE (void *) and KEY (void *) to the current entry's value and key. + * + * The iterator holds a reference during each iteration, automatically + * calling refmap_unref() after the loop body completes. + * + * If you break out of the loop, then you need to manually call + * refmap_unref(REFMAP, VALUE) to release the reference. */ +#define REFMAP_FOR_EACH(VALUE, KEY, REFMAP) \ + for (struct refmap_iter refmap_iter__ = { (REFMAP), {0}, NULL }; \ + refmap_iter_next(&refmap_iter__, &(VALUE), &(KEY)); \ + ) + /* Allocate and return a map handle. 
* * The user must ensure the 'key' is fully initialized, including potential diff --git a/tests/test-refmap.c b/tests/test-refmap.c index 4639088bc..533df5407 100644 --- a/tests/test-refmap.c +++ b/tests/test-refmap.c @@ -126,20 +126,6 @@ struct check_refmap_ctx { int count; }; -static void -check_refmap_cb(void *value_, void *key_, void *arg_) -{ - struct check_refmap_ctx *ctx = arg_; - struct key *key = key_; - - ovs_assert(key->idx < N); - if (ctx->values) { - ovs_assert(ctx->values[key->idx] == value_); - } - - ctx->count++; -} - static void check_refmap(struct refmap *rfm, struct value **values, int n_expected) { @@ -148,8 +134,19 @@ check_refmap(struct refmap *rfm, struct value **values, int n_expected) .values = values, .count = 0, }; + void *value, *key; + + REFMAP_FOR_EACH (value, key, rfm) { + struct key *k = key; + + ovs_assert(k->idx < N); + if (ctx.values) { + ovs_assert(ctx.values[k->idx] == value); + } + + ctx.count++; + } - refmap_for_each(rfm, check_refmap_cb, &ctx); ovs_assert(ctx.count == n_expected); } @@ -161,25 +158,6 @@ struct iter_modify_ctx { int unref_count; }; -static void -iter_ref_cb(void *value_, void *key_, void *arg_) -{ - struct iter_modify_ctx *ctx = arg_; - struct key *key = key_; - - ovs_assert(refmap_try_ref_value(ctx->rfm, value_)); - ctx->extra_refs[key->idx] = value_; - ctx->ref_count++; -} - -static void -iter_unref_cb(void *value_, void *key_ OVS_UNUSED, void *arg_) -{ - struct iter_modify_ctx *ctx = arg_; - - refmap_unref(ctx->rfm, value_); - ctx->unref_count++; -} struct try_ref_race_ctx { struct refmap *rfm; @@ -187,14 +165,6 @@ struct try_ref_race_ctx { atomic_bool stop; }; -static void -race_for_each_cb(void *value_, void *key_ OVS_UNUSED, void *arg_ OVS_UNUSED) -{ - struct value *value = value_; - - ovs_assert(value->hdl); -} - static void * try_ref_racer(void *arg) { @@ -216,7 +186,13 @@ try_ref_racer(void *arg) refmap_unref(ctx->rfm, value); } - refmap_for_each(ctx->rfm, race_for_each_cb, NULL); + void *iter_value, *iter_key; + + REFMAP_FOR_EACH (iter_value, iter_key, ctx->rfm) { + struct value *v = iter_value; + + ovs_assert(v->hdl); + } } return NULL; @@ -269,6 +245,7 @@ run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) struct key keys[N]; struct refmap *rfm; uint32_t args[N]; + void *value, *key; rfm = refmap_create("check-rfm", sizeof(struct key), sizeof(struct value), value_init, value_uninit, value_format); @@ -280,7 +257,6 @@ run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) struct arg arg = { .ptr = &args[i], }; - struct value *value; keys[i].idx = i; args[i] = i; @@ -333,7 +309,15 @@ run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) im_ctx.keys = keys; im_ctx.extra_refs = (struct value **) extra_refs; memset(extra_refs, 0, sizeof extra_refs); - refmap_for_each(rfm, iter_ref_cb, &im_ctx); + + REFMAP_FOR_EACH (value, key, rfm) { + struct key *k = key; + + ovs_assert(refmap_try_ref_value(im_ctx.rfm, value)); + im_ctx.extra_refs[k->idx] = value; + im_ctx.ref_count++; + } + ovs_assert(im_ctx.ref_count == N); for (int i = 0; i < N; i++) { ovs_assert(extra_refs[i] == values[i]); @@ -345,7 +329,12 @@ run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) /* Drop extra refs from within refmap_for_each callback. 
*/ memset(&im_ctx, 0, sizeof im_ctx); im_ctx.rfm = rfm; - refmap_for_each(rfm, iter_unref_cb, &im_ctx); + + REFMAP_FOR_EACH (value, key, rfm) { + refmap_unref(im_ctx.rfm, value); + im_ctx.unref_count++; + } + ovs_assert(im_ctx.unref_count == N); for (int i = 0; i < N; i++) { ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 1); > + struct refmap_node *node; > + > + CMAP_FOR_EACH (node, map_node, &rfm->map) { > + void *value; > + > + if (!refmap_refcount_try_ref_one(&node->refcount)) { > + continue; > + } > + log_node(rfm, "foreach", node); > + value = refmap_node_value(rfm, node); > + cb(value, refmap_node_key(node), arg); > + refmap_unref(rfm, value); > + } > +} > + > +static uint32_t > +refmap_key_hash(const struct refmap *rfm, const void *key) > +{ > + return hash_bytes(key, rfm->key_size, 0); > +} > + > +static void * > +refmap_lookup_protected(struct refmap *rfm, void *key, uint32_t hash) This should return the actual node struct, i.e. 'static struct refmap_node *' > +{ > + struct refmap_node *node; > + > + CMAP_FOR_EACH_WITH_HASH_PROTECTED (node, map_node, hash, &rfm->map) { > + if (!memcmp(key, refmap_node_key(node), rfm->key_size) && > + ovs_refcount_read(&node->refcount) > 1) { > + return node; > + } > + } > + > + return NULL; > +} > + > +static void * > +refmap_lookup(struct refmap *rfm, void *key, uint32_t hash) This should return the actual node struct, i.e. 'static struct refmap_node *' > +{ > + struct refmap_node *node; > + > + CMAP_FOR_EACH_WITH_HASH (node, map_node, hash, &rfm->map) { > + if (!memcmp(key, refmap_node_key(node), rfm->key_size) && > + ovs_refcount_read(&node->refcount) > 1) { > + return node; > + } > + } > + > + return NULL; > +} > + > +void * > +refmap_try_ref(struct refmap *rfm, void *key) > +{ > + struct refmap_node *node; > + > + node = refmap_lookup(rfm, key, refmap_key_hash(rfm, key)); > + if (!node || !refmap_refcount_try_ref_one(&node->refcount)) { > + return NULL; > + } > + > + log_node(rfm, "try_ref", node); > + return refmap_node_value(rfm, node); > +} > + > +void * > +refmap_ref(struct refmap *rfm, void *key, void *arg) > +{ > + struct refmap_node *node; > + bool error = false; > + uint32_t hash; > + void *value; > + > + hash = refmap_key_hash(rfm, key); > + > + node = refmap_lookup(rfm, key, hash); > + if (node && refmap_refcount_try_ref_one(&node->refcount)) { > + value = refmap_node_value(rfm, node); > + goto out; > + } > + > + ovs_mutex_lock(&rfm->map_lock); > + > + node = refmap_lookup_protected(rfm, key, hash); > + if (node && refmap_refcount_try_ref_one(&node->refcount)) { > + ovs_mutex_unlock(&rfm->map_lock); > + value = refmap_node_value(rfm, node); > + goto out; > + } > + > + node = xzalloc(refmap_node_total_size(rfm)); > + node->hash = hash; > + ovs_refcount_init(&node->refcount); > + memcpy(refmap_node_key(node), key, rfm->key_size); > + value = refmap_node_value(rfm, node); > + if (rfm->value_init(value, arg) == 0) { > + cmap_insert(&rfm->map, &node->map_node, node->hash); > + ovs_refcount_ref(&node->refcount); > + } else { > + value = NULL; > + error = true; > + VLOG_WARN("%s: value_init failed", rfm->name); > + } > + ovs_mutex_unlock(&rfm->map_lock); > + > +out: > + if (error) { > + free(node); > + return NULL; > + } > + > + log_node(rfm, "ref", node); > + > + return value; > +} > + > +bool > +refmap_try_ref_value(struct refmap *rfm, void *value) > +{ > + struct refmap_node *node; > + > + if (!value) { > + return false; > + } > + > + node = refmap_node_from_value(rfm, value); > + if (!node || 
!refmap_refcount_try_ref_one(&node->refcount)) { Should we log_node() the refmap_refcount_() failure here? > + return false; > + } > + > + log_node(rfm, "try_ref_value", node); > + return true; > +} > + > +bool > +refmap_unref(struct refmap *rfm, void *value) > +{ > + struct refmap_node *node; > + > + if (!value) { > + return false; > + } > + > + node = refmap_node_from_value(rfm, value); > + if (!node) { > + return false; > + } > + > + log_node(rfm, "unref", node); > + > + if (ovs_refcount_unref(&node->refcount) == 2) { > + ovs_mutex_lock(&rfm->map_lock); > + if (ovs_refcount_read(&node->refcount) > 1) { > + ovs_mutex_unlock(&rfm->map_lock); > + return false; > + } > + rfm->value_uninit(refmap_node_value(rfm, node)); > + cmap_remove(&rfm->map, &node->map_node, node->hash); > + ovs_assert(ovs_refcount_unref(&node->refcount) == 1); > + ovs_mutex_unlock(&rfm->map_lock); > + ovsrcu_postpone_embedded(free, node, rcu_node); > + return true; > + } > + return false; > +} > + > +void * > +refmap_key_from_value(struct refmap *rfm, void *value) > +{ > + return refmap_node_key(refmap_node_from_value(rfm, value)); > +} > + > +unsigned int > +refmap_value_refcount_read(struct refmap *rfm, void *value) > +{ > + struct refmap_node *node; > + > + if (!value) { > + return 0; > + } > + > + node = refmap_node_from_value(rfm, value); > + if (node) { > + return ovs_refcount_read(&node->refcount) - 1; > + } > + > + return 0; > +} > diff --git a/lib/refmap.h b/lib/refmap.h > new file mode 100644 > index 000000000..e41f1d888 > --- /dev/null > +++ b/lib/refmap.h > @@ -0,0 +1,130 @@ > +/* > + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. > + * All rights reserved. > + * SPDX-License-Identifier: Apache-2.0 > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifndef REFMAP_H > +#define REFMAP_H > + > +#include <config.h> > + > +#include <stddef.h> > +#include <stdint.h> > + > +#include "openvswitch/dynamic-string.h" > + > +/* > + * Reference map > + * ============= > + * > + * This key-value store acts like a regular concurrent hashmap, > + * except that insertion takes a reference on the value if already > + * present. > + * The key provided must be fully initialized, including potential pad bytes. > + * > + * As the value creation is dependent on it being already present > + * within the structure and the user cannot predict that, this structure > + * requires definitions for value_init and value_uninit functions, > + * that will be called only at creation (first reference taken) and > + * destruction (last reference released). > + * > + * Example: > + * 1. struct key key; > + * 2. memset(&key, 0, sizeof key); > + * 3. refmap_create() > + * 4. value = refmap_ref(key); > + * Since it's the first reference for <key>, value_init is called. > + * 5. refmap_ref(key); > + * This is not the first reference for <key>. Only ref-count is updated. > + * 6. refmap_unref(value); > + * This is not the last reference released. Only ref-count is updated. 
> + * 7. refmap_unref(value); > + * This is the last reference released. value_uninit is immediatelly immediatelly -> immediately > + * called, while the value memory is freed after RCU grace period. > + * > + * Thread safety > + * ============= > + * > + * MT-unsafe: > + * * refmap_create > + * * refmap_destroy See earlier comment about refmap_destroy(). I would prefer to make this MT-safe, maybe with something like this (not tested or thought trough): static void refmap_destroy_rcu__(void *rfm_) { struct refmap *rfm = rfm_; refmap_destroy__(rfm, false); } struct refmap { atomic_bool destroyed; // Add this field ... }; void refmap_destroy(struct refmap *rfm) { if (!rfm) { return; } atomic_store(&rfm->destroyed, true); // Mark as destroyed refmap_destroy_unregister(rfm); ovsrcu_postpone(refmap_destroy_rcu__, rfm); } void *refmap_ref(struct refmap *rfm, const void *key, void *arg) { if (atomic_load(&rfm->destroyed)) { return NULL; } // ... rest of existing code } Also need similar checks in refmap_try_ref() and possibly refmap_for_each(). > + * > + * MT-safe: > + * * refmap_for_each > + * * refmap_ref > + * * refmap_try_ref > + * * refmap_try_ref_value > + * * refmap_unref > + * > + */ > + > +struct refmap; > + > +/* Called once on a newly created 'value', i.e. when the first > + * reference is taken. */ > +typedef int (*refmap_value_init)(void *value, void *arg); > + > +/* Called once on the last dereference to value. */ > +typedef void (*refmap_value_uninit)(void *value); > + > +/* Format a (key, value, arg) tuple in 's'. This is an optional (can be NULL) arg value is no longer present. > + * callback, used for debug log purposes. > + */ > +typedef struct ds *(*refmap_value_format)(struct ds *s, void *key, > + void *value); > + > +/* Allocate and return a map handle. > + * > + * The user must ensure the 'key' is fully initialized, including potential > + * pad bytes. > + */ > +struct refmap *refmap_create(const char *name, > + size_t key_size, > + size_t value_size, > + refmap_value_init value_init, > + refmap_value_uninit value_uninit, > + refmap_value_format value_format); refmap_value_init, refmap_value_uninit, refmap_value_format); No need to add variable names if it's clear what the argument is. > + > +/* Frees the map memory. > + * > + * The client is responsible for unreferencing any data previously held in > + * the map. */ See earlier comment on adding a more explicit comment. > +void refmap_destroy(struct refmap *rfm); Same as earlier, refmap_destroy(struct refmap *), same for all *rfm's below. > + > +/* refmap_try_ref takes a reference for the found value upon success. > + * It's the user's responsibility to unref it. */ This help might need to be more general as it applies to all three functions below. > +void *refmap_try_ref(struct refmap *rfm, void *key); > +void *refmap_ref(struct refmap *rfm, void *key, void *arg); > +bool refmap_try_ref_value(struct refmap *rfm, void *value); New line here. > +void refmap_for_each(struct refmap *rfm, > + void (*cb)(void *value, void *key, void *arg), > + void *arg); > +/* The refmap_value_refcount_read() API requires the caller to hold a > + * reference, so a returned value of 1 only indicates you were the sole owner > + * at the moment of the read, but may no longer be by the time you receive the > + * value. This makes it unsuitable for logic decisions and only useful for > + * debug logging. > + */ This help text should move down to the function definition. 
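While at it, a short caller-side sketch in the header would make the lifecycle and this caveat concrete. Something along these lines, untested, where 'struct my_key' and the init argument are made-up user types and not part of this patch:

    /* Made-up user key type, for illustration only. */
    struct my_key {
        int id;
    };

    static void
    example_user(struct refmap *rfm, void *init_arg)
    {
        struct my_key key;
        unsigned int refcnt;
        void *value;

        /* The key must be fully initialized, pad bytes included. */
        memset(&key, 0, sizeof key);
        key.id = 42;

        /* First reference for this key: value_init() is called. */
        value = refmap_ref(rfm, &key, init_arg);
        if (!value) {
            return;   /* value_init() failed. */
        }

        /* Snapshot only: another thread may take a reference right after
         * this read, so it is suitable for debug logging at most. */
        refcnt = refmap_value_refcount_read(rfm, value);
        (void) refcnt;

        /* Last reference released here: value_uninit() runs immediately,
         * the node memory is freed after an RCU grace period. */
        refmap_unref(rfm, value);
    }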
> +void *refmap_key_from_value(struct refmap *rfm, void *value); > + > +/* Return 'true' if it was the last 'value' dereference and > + * 'value_uninit' has been called. */ > +bool refmap_unref(struct refmap *rfm, void *value); > + > +unsigned int > +refmap_value_refcount_read(struct refmap *rfm, void *value); > + > +#endif /* REFMAP_H */ > diff --git a/tests/automake.mk b/tests/automake.mk > index a9d972a86..a74e56454 100644 > --- a/tests/automake.mk > +++ b/tests/automake.mk > @@ -502,6 +502,7 @@ tests_ovstest_SOURCES = \ > tests/test-rcu.c \ > tests/test-rculist.c \ > tests/test-reconnect.c \ > + tests/test-refmap.c \ > tests/test-rstp.c \ > tests/test-sflow.c \ > tests/test-sha1.c \ > diff --git a/tests/library.at b/tests/library.at > index 449f15fd5..3662d488e 100644 > --- a/tests/library.at > +++ b/tests/library.at > @@ -44,6 +44,11 @@ AT_CHECK([ovstest test-ccmap check 1], [0], [... > ]) > AT_CLEANUP > > +AT_SETUP([refmap]) > +AT_KEYWORDS([refmap]) > +AT_CHECK([ovstest test-refmap check], [0], []) > +AT_CLEANUP > + > AT_SETUP([atomic operations]) > AT_CHECK([ovstest test-atomic]) > AT_CLEANUP > diff --git a/tests/test-refmap.c b/tests/test-refmap.c > new file mode 100644 > index 000000000..4639088bc > --- /dev/null > +++ b/tests/test-refmap.c I did not look at the test cases in details, but it looks like most case are covered. However it might be nice to add a test case were the value_init() function is failing. Also a test for NULL value_format() would show the existing bug. > @@ -0,0 +1,894 @@ > +/* > + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. > + * All rights reserved. > + * SPDX-License-Identifier: Apache-2.0 > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. 
> + */ > + > +#include <config.h> > + > +#undef NDEBUG > +#include <assert.h> > +#include <errno.h> > +#include <getopt.h> > +#include <string.h> > + > +#include "ovs-atomic.h" > +#include "ovs-numa.h" > +#include "ovs-rcu.h" > +#include "ovs-thread.h" > +#include "ovstest.h" > +#include "random.h" > +#include "refmap.h" > +#include "timeval.h" > +#include "util.h" > + > +#include "openvswitch/util.h" > +#include "openvswitch/vlog.h" > + > +#define N 100 > + > +static struct refmap_test_params { > + unsigned int n_threads; > + unsigned int n_ids; > + int step_idx; > + bool debug; > + bool csv_format; > +} params = { > + .n_threads = 1, > + .n_ids = N, > + .debug = false, > + .csv_format = false, > +}; > + > +DEFINE_STATIC_PER_THREAD_DATA(unsigned int, thread_id, OVSTHREAD_ID_UNSET); > + > +static unsigned int > +thread_id(void) > +{ > + static atomic_count next_id = ATOMIC_COUNT_INIT(0); > + unsigned int id = *thread_id_get(); > + > + if (OVS_UNLIKELY(id == OVSTHREAD_ID_UNSET)) { > + id = atomic_count_inc(&next_id); > + *thread_id_get() = id; > + } > + > + return id; > +} > + > +struct key { > + size_t idx; > + bool b; > + uint8_t pad[7]; > +}; > + > +struct value { > + uint32_t *hdl; > +}; > + > +struct arg { > + uint32_t *ptr; > +}; > + > +static int > +value_init(void *value_, void *arg_) > +{ > + struct value *value = value_; > + struct arg *arg = arg_; > + > + /* Verify that we don't double-init value. */ > + ovs_assert(!value->hdl); > + > + *arg->ptr = 1; > + value->hdl = arg->ptr; > + return 0; > +} > + > +static void > +value_uninit(void *value_) > +{ > + struct value *value = value_; > + > + /* Verify that we don't double-uninit value. */ > + ovs_assert(value->hdl); > + > + *value->hdl = 2; > + value->hdl = NULL; > +} > + > +static struct ds * > +value_format(struct ds *s, void *key_, void *value_) > +{ > + struct key *key = key_; > + struct value *value = value_; > + > + ds_put_format(s, "idx=%"PRIuSIZE", b=%s, hdl=%p", > + key->idx, key->b ? 
"1" : "0", > + value->hdl); > + return s; > +} > + > +struct check_refmap_ctx { > + struct refmap *rfm; > + struct value **values; > + int count; > +}; > + > +static void > +check_refmap_cb(void *value_, void *key_, void *arg_) > +{ > + struct check_refmap_ctx *ctx = arg_; > + struct key *key = key_; > + > + ovs_assert(key->idx < N); > + if (ctx->values) { > + ovs_assert(ctx->values[key->idx] == value_); > + } > + > + ctx->count++; > +} > + > +static void > +check_refmap(struct refmap *rfm, struct value **values, int n_expected) > +{ > + struct check_refmap_ctx ctx = { > + .rfm = rfm, > + .values = values, > + .count = 0, > + }; > + > + refmap_for_each(rfm, check_refmap_cb, &ctx); > + ovs_assert(ctx.count == n_expected); > +} > + > +struct iter_modify_ctx { > + struct refmap *rfm; > + struct key *keys; > + struct value **extra_refs; > + int ref_count; > + int unref_count; > +}; > + > +static void > +iter_ref_cb(void *value_, void *key_, void *arg_) > +{ > + struct iter_modify_ctx *ctx = arg_; > + struct key *key = key_; > + > + ovs_assert(refmap_try_ref_value(ctx->rfm, value_)); > + ctx->extra_refs[key->idx] = value_; > + ctx->ref_count++; > +} > + > +static void > +iter_unref_cb(void *value_, void *key_ OVS_UNUSED, void *arg_) > +{ > + struct iter_modify_ctx *ctx = arg_; > + > + refmap_unref(ctx->rfm, value_); > + ctx->unref_count++; > +} > + > +struct try_ref_race_ctx { > + struct refmap *rfm; > + struct key key; > + atomic_bool stop; > +}; > + > +static void > +race_for_each_cb(void *value_, void *key_ OVS_UNUSED, void *arg_ OVS_UNUSED) > +{ > + struct value *value = value_; > + > + ovs_assert(value->hdl); > +} > + > +static void * > +try_ref_racer(void *arg) > +{ > + struct try_ref_race_ctx *ctx = arg; > + > + for (;;) { > + bool stop_; > + > + atomic_read(&ctx->stop, &stop_); > + if (stop_) { > + break; > + } > + > + void *value = refmap_try_ref(ctx->rfm, &ctx->key); > + if (value) { > + struct value *v = value; > + > + ovs_assert(v->hdl); > + refmap_unref(ctx->rfm, value); > + } > + > + refmap_for_each(ctx->rfm, race_for_each_cb, NULL); > + } > + > + return NULL; > +} > + > +/* Stress-test that try_ref rejects entries at refcount 1 (the internal > + * reference used during value init/uninit synchronization). > + * > + * Thread A repeatedly creates and destroys the same entry. > + * Thread B continuously calls try_ref and for_each. 
*/ > +static void > +check_try_ref_race(void) > +{ > + struct try_ref_race_ctx race_ctx; > + pthread_t worker; > + struct refmap *rfm; > + > + rfm = refmap_create("try-ref-race", sizeof(struct key), > + sizeof(struct value), value_init, value_uninit, > + value_format); > + > + memset(&race_ctx.key, 0, sizeof race_ctx.key); > + race_ctx.key.idx = 0; > + race_ctx.rfm = rfm; > + atomic_init(&race_ctx.stop, false); > + > + worker = ovs_thread_create("try-ref-racer", try_ref_racer, &race_ctx); > + > + for (int i = 0; i < 10000; i++) { > + uint32_t arg_val = 0; > + struct arg arg = { .ptr = &arg_val }; > + void *value; > + > + value = refmap_ref(rfm, &race_ctx.key, &arg); > + refmap_unref(rfm, value); > + } > + > + atomic_store(&race_ctx.stop, true); > + xpthread_join(worker, NULL); > + > + refmap_destroy(rfm); > +} > + > +static void > +run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) > +{ > + struct iter_modify_ctx im_ctx; > + struct value *extra_refs[N]; > + struct value *values[N]; > + struct key keys[N]; > + struct refmap *rfm; > + uint32_t args[N]; > + > + rfm = refmap_create("check-rfm", sizeof(struct key), sizeof(struct value), > + value_init, value_uninit, value_format); > + > + check_refmap(rfm, NULL, 0); > + > + memset(keys, 0, sizeof keys); > + for (int i = 0; i < N; i++) { > + struct arg arg = { > + .ptr = &args[i], > + }; > + struct value *value; > + > + keys[i].idx = i; > + args[i] = i; > + ovs_assert(!refmap_try_ref(rfm, &keys[i])); > + value = refmap_ref(rfm, &keys[i], &arg); > + ovs_assert(value); > + ovs_assert(value == refmap_ref(rfm, &keys[i], &arg)); > + refmap_unref(rfm, value); > + ovs_assert(value == refmap_try_ref(rfm, &keys[i])); > + refmap_unref(rfm, value); > + values[i] = value; > + } > + > + check_refmap(rfm, (struct value **) values, N); > + > + for (int i = 0; i < N; i++) { > + /* Verify that value_init is properly called. */ > + ovs_assert(values[i]->hdl == &args[i]); > + ovs_assert(args[i] == 1); > + } > + > + /* Verify refmap_value_refcount_read: each value has one user ref. */ > + for (int i = 0; i < N; i++) { > + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 1); > + } > + > + ovs_assert(refmap_value_refcount_read(rfm, NULL) == 0); > + > + /* Verify refmap_key_from_value. */ > + for (int i = 0; i < N; i++) { > + struct key *k = refmap_key_from_value(rfm, values[i]); > + ovs_assert(k->idx == keys[i].idx); > + } > + > + /* Verify refmap_try_ref_value and refcount changes. */ > + for (int i = 0; i < N; i++) { > + ovs_assert(refmap_try_ref_value(rfm, values[i])); > + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 2); > + refmap_unref(rfm, values[i]); > + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 1); > + } > + > + ovs_assert(!refmap_try_ref_value(rfm, NULL)); > + > + check_refmap(rfm, (struct value **) values, N); > + > + /* Take extra refs from within refmap_for_each callback. */ > + memset(&im_ctx, 0, sizeof im_ctx); > + im_ctx.rfm = rfm; > + im_ctx.keys = keys; > + im_ctx.extra_refs = (struct value **) extra_refs; > + memset(extra_refs, 0, sizeof extra_refs); > + refmap_for_each(rfm, iter_ref_cb, &im_ctx); > + ovs_assert(im_ctx.ref_count == N); > + for (int i = 0; i < N; i++) { > + ovs_assert(extra_refs[i] == values[i]); > + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 2); > + } > + > + check_refmap(rfm, (struct value **) values, N); > + > + /* Drop extra refs from within refmap_for_each callback. 
*/ > + memset(&im_ctx, 0, sizeof im_ctx); > + im_ctx.rfm = rfm; > + refmap_for_each(rfm, iter_unref_cb, &im_ctx); > + ovs_assert(im_ctx.unref_count == N); > + for (int i = 0; i < N; i++) { > + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 1); > + } > + > + check_refmap(rfm, (struct value **) values, N); > + > + for (int i = 0; i < N; i++) { > + refmap_unref(rfm, values[i]); > + } > + > + for (int i = 0; i < N; i++) { > + ovs_assert(!refmap_try_ref(rfm, &keys[i])); > + } > + > + for (int i = 0; i < N; i++) { > + /* Verify that value_uninit is executed. */ > + ovs_assert(args[i] == 2); > + } > + > + check_refmap(rfm, NULL, 0); > + > + refmap_destroy(rfm); > + > + check_try_ref_race(); > +} > + > +static uint32_t *ids; > +static void **values; > +static atomic_uint *thread_working_ms; /* Measured work time. */ > + > +static struct ovs_barrier barrier_outer; > +static struct ovs_barrier barrier_inner; > + > +static atomic_uint running_time_ms; > +static atomic_bool stop; > + > +static unsigned int > +elapsed(unsigned int start) > +{ > + unsigned int running_time_ms_; > + > + atomic_read(&running_time_ms, &running_time_ms_); > + > + return running_time_ms_ - start; > +} > + > +static void * > +clock_main(void *arg OVS_UNUSED) > +{ > + struct timeval start; > + struct timeval end; > + > + xgettimeofday(&start); > + for (;;) { > + bool stop_; > + > + atomic_read(&stop, &stop_); > + if (stop_) { > + break; > + } > + > + xgettimeofday(&end); > + atomic_store(&running_time_ms, > + timeval_to_msec(&end) - timeval_to_msec(&start)); > + xnanosleep(100 * 1000); > + } > + > + return NULL; > +} > + > +enum step_id { > + STEP_NONE, > + STEP_ALLOC, > + STEP_REF, > + STEP_UNREF, > + STEP_FREE, > + STEP_MIXED, > + STEP_POS_QUERY, > + STEP_NEG_QUERY, > +}; > + > +static const char *step_names[] = { > + [STEP_NONE] = "<bug>", > + [STEP_ALLOC] = "alloc", > + [STEP_REF] = "ref", > + [STEP_UNREF] = "unref", > + [STEP_FREE] = "free", > + [STEP_MIXED] = "mixed", > + [STEP_POS_QUERY] = "pos-query", > + [STEP_NEG_QUERY] = "neg-query", > +}; > + > +#define MAX_N_STEP 10 > + > +#define FOREACH_STEP(STEP_VAR, SCHEDULE) \ > + for (int __idx = 0, STEP_VAR = (SCHEDULE)[__idx]; \ > + (STEP_VAR = (SCHEDULE)[__idx]) != STEP_NONE; \ > + __idx++) > + > +struct test_case { > + int idx; > + enum step_id schedule[MAX_N_STEP]; > +}; > + > +static void > +print_header(void) > +{ > + if (params.csv_format) { > + return; > + } > + > + printf("Benchmarking n=%u on %u thread%s.\n", > + params.n_ids, params.n_threads, > + params.n_threads > 1 ? 
"s" : ""); > + > + printf(" step\\thread: "); > + printf(" Avg"); > + for (size_t i = 0; i < params.n_threads; i++) { > + printf(" %3" PRIuSIZE, i + 1); > + } > + > + printf("\n"); > +} > + > +static void > +print_test_header(struct test_case *test) > +{ > + if (params.csv_format) { > + return; > + } > + > + printf("[%d]---------------------------", test->idx); > + for (size_t i = 0; i < params.n_threads; i++) { > + printf("-------"); > + } > + > + printf("\n"); > +} > + > +static void > +print_test_result(struct test_case *test, enum step_id step, int step_idx) > +{ > + char test_name[50]; > + uint32_t *twm; > + uint32_t avg; > + size_t i; > + > + twm = xcalloc(params.n_threads, sizeof *twm); > + for (i = 0; i < params.n_threads; i++) { > + atomic_read(&thread_working_ms[i], &twm[i]); > + } > + > + avg = 0; > + for (i = 0; i < params.n_threads; i++) { > + avg += twm[i]; > + } > + > + ovs_assert(params.n_threads); > + avg /= params.n_threads; > + > + snprintf(test_name, sizeof test_name, "%d.%d-%s", > + test->idx, step_idx, > + step_names[step]); > + if (params.csv_format) { > + printf("%s,%" PRIu32, test_name, avg); > + } else { > + printf("%*s: ", 18, test_name); > + printf(" %6" PRIu32, avg); > + for (i = 0; i < params.n_threads; i++) { > + printf(" %6" PRIu32, twm[i]); > + } > + printf(" ms"); > + } > + > + printf("\n"); > + > + free(twm); > +} > + > +static struct test_case test_cases[] = { > + { > + .schedule = { > + STEP_ALLOC, > + STEP_FREE, > + }, > + }, > + { > + .schedule = { > + STEP_ALLOC, > + STEP_REF, > + STEP_UNREF, > + STEP_FREE, > + }, > + }, > + { > + .schedule = { > + STEP_MIXED, > + STEP_FREE, > + }, > + }, > + { > + .schedule = { > + STEP_ALLOC, > + STEP_POS_QUERY, > + /* Test negative query with map full. */ > + STEP_NEG_QUERY, > + STEP_FREE, > + /* Test negative query with map empty. */ > + STEP_NEG_QUERY, > + }, > + }, > +}; > + > +static void > +swap_ptr(void **a, void **b) > +{ > + void *t; > + t = *a; > + *a = *b; > + *b = t; > +} > + > +struct aux { > + struct test_case test; > + struct refmap *rfm; > +}; > + > +static void * > +benchmark_thread_worker(void *aux_) > +{ > + unsigned int tid = thread_id(); > + unsigned int n_ids_per_thread; > + unsigned int start_idx; > + struct aux *aux = aux_; > + struct refmap *rfm; > + unsigned int start; > + uint32_t *th_ids; > + void **th_privs; > + void *value; > + size_t i; > + > + n_ids_per_thread = params.n_ids / params.n_threads; > + start_idx = tid * n_ids_per_thread; > + th_privs = &values[start_idx]; > + th_ids = &ids[start_idx]; > + > + for (;;) { > + bool stop_; > + > + ovs_barrier_block(&barrier_outer); > + atomic_read(&stop, &stop_); > + if (stop_) { > + break; > + } > + > + /* Wait for main thread to finish initializing > + * rfm and step schedule. 
*/ > + ovs_barrier_block(&barrier_inner); > + rfm = aux->rfm; > + > + FOREACH_STEP(step, aux->test.schedule) { > + ovs_barrier_block(&barrier_inner); > + atomic_read(&running_time_ms, &start); > + switch (step) { > + case STEP_ALLOC: > + case STEP_REF: > + for (i = 0; i < n_ids_per_thread; i++) { > + struct key key = { > + .idx = start_idx + i, > + }; > + struct arg arg = { > + .ptr = &th_ids[i], > + }; > + > + th_privs[i] = refmap_ref(rfm, &key, &arg); > + } > + break; > + case STEP_POS_QUERY: > + for (i = 0; i < n_ids_per_thread; i++) { > + struct key key = { > + .idx = start_idx + i, > + }; > + value = refmap_try_ref(rfm, &key); > + refmap_unref(rfm, value); > + } > + break; > + case STEP_NEG_QUERY: > + for (i = 0; i < n_ids_per_thread; i++) { > + struct key key = { > + .idx = params.n_ids + 1, > + }; > + value = refmap_try_ref(rfm, &key); > + refmap_unref(rfm, value); > + } > + break; > + case STEP_UNREF: > + case STEP_FREE: > + for (i = 0; i < n_ids_per_thread; i++) { > + refmap_unref(rfm, th_privs[i]); > + } > + break; > + case STEP_MIXED: > + for (i = 0; i < n_ids_per_thread; i++) { > + struct arg arg; > + struct key key; > + int shuffled; > + > + /* Mixed mode is doing: > + * 1. Alloc. > + * 2. Shuffle two elements. > + * 3. Delete shuffled element. > + * 4. Alloc again. > + * The loop ends with all elements allocated. > + */ > + > + memset(&key, 0, sizeof key); > + key.idx = start_idx + i; > + shuffled = random_range(i + 1); > + > + arg.ptr = &th_ids[i]; > + th_privs[i] = refmap_ref(rfm, &key, &arg); > + swap_ptr(&th_privs[i], &th_privs[shuffled]); > + refmap_unref(rfm, th_privs[i]); > + arg.ptr = &th_ids[i]; > + th_privs[i] = refmap_ref(rfm, &key, &arg); > + } > + break; > + default: > + fprintf(stderr, "[%u]: Reached step %d\n", > + tid, step); > + OVS_NOT_REACHED(); > + break; > + } > + atomic_store(&thread_working_ms[tid], elapsed(start)); > + ovs_barrier_block(&barrier_inner); > + /* Main thread prints result now. */ > + } > + } > + > + return NULL; > +} > + > +static void > +benchmark_thread_main(struct aux *aux) > +{ > + int step_idx; > + > + memset(ids, 0, params.n_ids * sizeof *ids); > + memset(values, 0, params.n_ids * sizeof *values); > + > + aux->rfm = refmap_create("benchmark-rfm", sizeof(struct key), > + sizeof(struct value), value_init, value_uninit, > + value_format); ovs_assert(!aux->rfm); > + > + print_test_header(&aux->test); > + ovs_barrier_block(&barrier_inner); > + /* Init is done, worker can start preparing to work. */ > + step_idx = 0; > + FOREACH_STEP(step, aux->test.schedule) { > + ovs_barrier_block(&barrier_inner); > + /* Workers do the scheduled work now. 
*/ > + ovs_barrier_block(&barrier_inner); > + print_test_result(&aux->test, step, step_idx++); > + } > + > + refmap_destroy(aux->rfm); > +} > + > +static bool > +parse_benchmark_params(int argc, char *argv[]) > +{ > + long int l_threads = 0; > + long int l_ids = 0; > + bool valid = true; > + long int l; > + int i; > + > + params.step_idx = -1; > + for (i = 0; i < argc; i++) { > + if (!strcmp(argv[i], "benchmark") || > + !strcmp(argv[i], "debug")) { > + continue; > + } else if (!strcmp(argv[i], "csv")) { > + params.csv_format = true; > + } else if (!strncmp(argv[i], "step=", 5)) { > + if (!str_to_long(&argv[i][5], 10, &l)) { > + fprintf(stderr, > + "Invalid parameter '%s', expected positive integer.\n", > + argv[i]); > + valid = false; > + goto out; > + } > + > + params.step_idx = l; > + } else { > + if (!str_to_long(argv[i], 10, &l)) { > + fprintf(stderr, > + "Invalid parameter '%s', expected positive integer.\n", > + argv[i]); > + valid = false; > + goto out; > + } > + > + if (l_ids == 0) { > + l_ids = l; > + } else if (l_threads == 0) { > + l_threads = l; > + } else { > + fprintf(stderr, > + "Invalid parameter '%s', too many integer values.\n", > + argv[i]); > + valid = false; > + goto out; > + } > + } > + } > + > + if (l_ids != 0) { > + params.n_ids = l_ids; > + } else { > + fprintf(stderr, "Invalid parameters: no number of elements given.\n"); > + valid = false; > + } > + > + if (l_threads != 0) { > + params.n_threads = l_threads; > + } else { > + fprintf(stderr, "Invalid parameters: no number of threads given.\n"); > + valid = false; > + } > + > +out: > + return valid; > +} > + > +static void > +run_benchmark(struct ovs_cmdl_context *ctx) > +{ > + pthread_t *threads; > + pthread_t clock; > + struct aux aux; > + size_t i; > + > + if (!parse_benchmark_params(ctx->argc, ctx->argv)) { > + return; > + } > + > + ids = xcalloc(params.n_ids, sizeof *ids); > + values = xcalloc(params.n_ids, sizeof *values); > + thread_working_ms = xcalloc(params.n_threads, > + sizeof *thread_working_ms); > + for (i = 0; i < params.n_threads; i++) { > + atomic_init(&thread_working_ms[i], 0); > + } > + > + atomic_init(&stop, false); > + > + clock = ovs_thread_create("clock", clock_main, NULL); > + > + ovsrcu_quiesce_start(); > + ovs_barrier_init(&barrier_outer, params.n_threads + 1); > + ovs_barrier_init(&barrier_inner, params.n_threads + 1); > + threads = xmalloc(params.n_threads * sizeof *threads); > + for (i = 0; i < params.n_threads; i++) { > + threads[i] = ovs_thread_create("worker", > + benchmark_thread_worker, &aux); > + } > + > + print_header(); > + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { > + test_cases[i].idx = i; > + if (params.step_idx != -1 && > + params.step_idx != i) { > + continue; > + } > + /* If we don't block workers from progressing now, > + * there would be a race for access to aux.test, > + * leading to some workers not respecting the schedule. 
> + */ > + ovs_barrier_block(&barrier_outer); > + memcpy(&aux.test, &test_cases[i], sizeof aux.test); > + benchmark_thread_main(&aux); > + } > + > + atomic_store(&stop, true); > + ovs_barrier_block(&barrier_outer); > + > + for (i = 0; i < params.n_threads; i++) { > + xpthread_join(threads[i], NULL); > + } > + > + free(threads); > + > + ovs_barrier_destroy(&barrier_outer); > + ovs_barrier_destroy(&barrier_inner); > + free(ids); > + free(values); > + free(thread_working_ms); > + xpthread_join(clock, NULL); > +} > + > +static const struct ovs_cmdl_command commands[] = { > + {"check", "[debug]", 0, 1, run_check, OVS_RO}, > + {"benchmark", "<nb elem> <nb threads> [step=<uint>] [csv]", 0, 4, > + run_benchmark, OVS_RO}, > + {NULL, NULL, 0, 0, NULL, OVS_RO}, > +}; > + > +static void > +parse_test_params(int argc, char *argv[]) > +{ > + int i; > + > + for (i = 0; i < argc; i++) { > + if (!strcmp(argv[i], "debug")) { > + params.debug = true; > + } > + } > +} > + > +static void > +refmap_test_main(int argc, char *argv[]) > +{ > + struct ovs_cmdl_context ctx = { > + .argc = argc - optind, > + .argv = argv + optind, > + }; > + > + parse_test_params(argc - optind, argv + optind); > + > + vlog_set_levels(NULL, VLF_ANY_DESTINATION, VLL_OFF); > + if (params.debug) { > + vlog_set_levels_from_string_assert("refmap:console:dbg"); > + } > + > + /* Quiesce to start the RCU. */ > + ovsrcu_quiesce(); > + > + set_program_name(argv[0]); > + ovs_cmdl_run_command(&ctx, commands); > + > + ovsrcu_exit(); > +} > + > +OVSTEST_REGISTER("test-refmap", refmap_test_main); > diff --git a/utilities/checkpatch_dict.txt b/utilities/checkpatch_dict.txt > index c1f43e5af..5ad599c1d 100644 > --- a/utilities/checkpatch_dict.txt > +++ b/utilities/checkpatch_dict.txt > @@ -107,6 +107,7 @@ icmp > icmp4 > icmpv6 > idl > +ie > ifdef > ifindex > initializer > @@ -232,6 +233,7 @@ rebased > recirc > recirculation > recirculations > +refmap > revalidate > revalidation > revalidator > -- > 2.34.1
On 15/04/2026 12:23, Eelco Chaudron wrote: > External email: Use caution opening links or attachments > > > On 1 Apr 2026, at 11:13, Eli Britstein wrote: > >> From: Gaetan Rivet <gaetanr@nvidia.com> >> >> Add a reference-counted key-value map. >> >> Duplicates take a reference on the original entry within the map, >> leaving it in place. To be able to execute an entry creation after >> determining whether it is already present or not in the map, >> store relevant initialization and de-initialization functions. >> >> Signed-off-by: Gaetan Rivet <gaetanr@nvidia.com> >> Co-authored-by: Eli Britstein <elibr@nvidia.com> >> Signed-off-by: Eli Britstein <elibr@nvidia.com> > Thanks Eli/Gaetan for following up with the v3. > Find some comments below. > > Cheers, > > Eelco > >> --- >> lib/automake.mk | 2 + >> lib/refmap.c | 485 ++++++++++++++++++ >> lib/refmap.h | 130 +++++ >> tests/automake.mk | 1 + >> tests/library.at | 5 + >> tests/test-refmap.c | 894 ++++++++++++++++++++++++++++++++++ >> utilities/checkpatch_dict.txt | 2 + >> 7 files changed, 1519 insertions(+) >> create mode 100644 lib/refmap.c >> create mode 100644 lib/refmap.h >> create mode 100644 tests/test-refmap.c >> >> diff --git a/lib/automake.mk b/lib/automake.mk >> index c6e988906..cb6458b0d 100644 >> --- a/lib/automake.mk >> +++ b/lib/automake.mk >> @@ -323,6 +323,8 @@ lib_libopenvswitch_la_SOURCES = \ >> lib/rculist.h \ >> lib/reconnect.c \ >> lib/reconnect.h \ >> + lib/refmap.c \ >> + lib/refmap.h \ >> lib/rstp.c \ >> lib/rstp.h \ >> lib/rstp-common.h \ >> diff --git a/lib/refmap.c b/lib/refmap.c >> new file mode 100644 >> index 000000000..c2c435238 >> --- /dev/null >> +++ b/lib/refmap.c >> @@ -0,0 +1,485 @@ >> +/* >> + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. >> + * All rights reserved. >> + * SPDX-License-Identifier: Apache-2.0 >> + * >> + * Licensed under the Apache License, Version 2.0 (the "License"); >> + * you may not use this file except in compliance with the License. >> + * You may obtain a copy of the License at >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. >> + */ >> + >> +#include <config.h> >> + >> +#include "cmap.h" >> +#include "hash.h" >> +#include "fatal-signal.h" > Should be second include. 
Ack > >> +#include "ovs-atomic.h" >> +#include "ovs-thread.h" >> +#include "refmap.h" >> +#include "timeval.h" >> + >> +#include "openvswitch/list.h" >> +#include "openvswitch/vlog.h" >> + >> +VLOG_DEFINE_THIS_MODULE(refmap); >> +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(600, 600); >> + >> +static struct ovs_mutex refmap_destroy_lock = OVS_MUTEX_INITIALIZER; >> +static struct ovs_list refmap_destroy_list >> + OVS_GUARDED_BY(refmap_destroy_lock) = >> + OVS_LIST_INITIALIZER(&refmap_destroy_list); >> + >> +struct refmap { >> + struct cmap map; >> + struct ovs_mutex map_lock; >> + size_t key_size; >> + size_t value_size; >> + refmap_value_init value_init; >> + refmap_value_uninit value_uninit; >> + refmap_value_format value_format; >> + char *name; >> + struct ovs_list in_destroy_list; >> +}; >> + >> +struct refmap_node { >> + struct ovsrcu_node rcu_node; >> + /* CMAP related: */ >> + struct cmap_node map_node; >> + uint32_t hash; >> + /* Content: */ >> + struct ovs_refcount refcount; >> + /* Key, then Value follows. */ >> +}; >> + >> +static void >> +refmap_destroy__(struct refmap *rfm, bool global_destroy); > This could simple become: > > +refmap_destroy__(struct refmap *, bool global_destroy); Ack > >> + >> +static void >> +refmap_destroy_unregister_protected(struct refmap *rfm) >> + OVS_REQUIRES(refmap_destroy_lock) ... >> + if (global_destroy && leaks_detected) { >> + ovs_abort(-1, "Refmap values leak detected."); > Looke like none of the ovs_abort() messages end with a dot. > ovs_abort(-1, "Refmap values leak detected"); > > From the earlier discussion; > > EC> Do we need to call value_uninit? > EB> Those nodes were leaks in the refmap, at this point they are destroyed as > OVS is stopping, it is not possible to call value_uninit without crashing. > > EC> But is this not only true for 'global_destroy == true'. What about the > genuine refmap_destroy() call? If this remains the case, we should clearly > add this to the API documentation. > > You chose not to call value_uninit() in the non-global_destroy case. > Should we document this more explicitly, or should we call ovs_assert() > to prevent this bad programming behavior? > > * WARNING: The caller MUST ensure the map is empty before calling > * refmap_destroy(). If the map contains remaining elements, their > * values will NOT have value_uninit() called, which will leak any > * resources managed by those values (file descriptors, allocated > * memory, etc.). The node memory will be freed but resource cleanup > * will not occur. The documentation I put is the same as appears in cmap. I'll take the last paragraph to v4. As no assert in cmap, I prefer the same here. > >> + } >> +} >> + >> +void >> +refmap_destroy(struct refmap *rfm) >> +{ >> + if (!rfm) { >> + return; >> + } >> + >> + refmap_destroy_unregister(rfm); >> + refmap_destroy__(rfm, false); > We had an earlier discussion, where I suggested this should be done > under RCU. However you mentioned this is by design, and it's MT-unsafe. > > We need to clearly document this with an example in the documentation > on how this should be synchronized. Something along the lines of: > > refmap_destroy() MUST NOT be called while any other thread > might be accessing the refmap, even through MT-safe operations like > refmap_for_each() or refmap_try_ref(). > > Or make it RCU safe and avoid any headaches in the future :) I enhanced the doc about it, with the last paragraph. An example seems like an overkill. I find it as a needless complication. I think it's clear enough as-is. 
> >> +} >> + >> +static size_t >> +refmap_aligned_key_size(struct refmap *rfm) >> +{ >> + return ROUND_UP(rfm->key_size, 8); >> +} >> + >> +static void * >> +refmap_node_key(struct refmap_node *node) >> +{ >> + if (!node) { >> + return NULL; >> + } >> + >> + return node + 1; >> +} >> + >> +static void * >> +refmap_node_value(struct refmap *rfm, struct refmap_node *node) >> +{ >> + if (!node) { >> + return NULL; >> + } >> + >> + return ((char *) refmap_node_key(node)) + refmap_aligned_key_size(rfm); >> +} >> + >> +static size_t >> +refmap_node_total_size(struct refmap *rfm) >> +{ >> + return sizeof(struct refmap_node) + >> + refmap_aligned_key_size(rfm) + rfm->value_size; >> +} >> + >> +static struct refmap_node * >> +refmap_node_from_value(struct refmap *rfm, void *value) >> +{ >> + size_t offset = sizeof(struct refmap_node) + refmap_aligned_key_size(rfm); >> + >> + if ((uintptr_t) value < offset) { >> + return NULL; >> + } > Add a new line, similar to refmap_node_value() Ack > >> + return (void *) (((char *) value) - offset); >> +} >> + >> +static void >> +log_node(struct refmap *rfm, const char *prefix, struct refmap_node *node) >> +{ >> + void *key, *value; >> + struct ds s; >> + >> + if (OVS_LIKELY(VLOG_DROP_DBG(&rl) || !rfm->value_format)) { >> + return; >> + } >> + >> + key = refmap_node_key(node); >> + value = refmap_node_value(rfm, node); >> + >> + ds_init(&s); >> + ds_put_cstr(&s, ", '"); >> + rfm->value_format(&s, key, value); > Isn't the format callback optional? If not, the ovs_assert in > refmap_create() needs updating. It is. There is no assert there. > >> + ds_put_cstr(&s, "'"); >> + VLOG_DBG("%s: %p %s, refcnt=%d%s", rfm->name, value, prefix, > The refcnt is unsigned, so "%u" > > Why not use "%s: value=%p %s, refcnt=~%u, '%s'" and avoid the ds_put_cstr() calls? > > Note the value=%p as else it's just a random pointer, or maybe a bit cleaner, like: > > myrefmap[unref]: value=0xffffffere, refcnt=~1, 'k,v=eelco,redhat' > > "%s[%s]: value=%p, refcnt=~%u, '%s'" > >> + ovs_refcount_read(&node->refcount), ds_cstr(&s)); > Should we include the refcount? It might give a false sense of security: the value > could be different from the value that is actually used in the function. > Maybe print it like "refcnt=~%d," so when people use the message for > debugging they notice the ~ which might trigger some additional thinking. I took your format. The refcount value is important for debugging. When debugging, say, a single thread, we want to be able to see that all refs are matched by unrefs and that the code behaves correctly. > >> + ds_destroy(&s); >> +} >> + >> +/* Increments 'refcount', but only if it is over one. >> + * >> + * Returns false if the refcount was zero or one. >> + * In refmap, the last reference is only used to synchronize between >> + * value init and uninit in case of contention. In such state, the >> + * object is not valid anymore for external readers, until the >> + * value_{init,uninit} critical section is completed. >> + */ >> +static inline bool >> +refmap_refcount_try_ref_one(struct ovs_refcount *refcount) > I assume this is the attempt to avoid the add/delete raise callback. > As you mentioned this is not fixed, I'm not reviewing the 'complex' > reference count attempt.
Not sure, but could this maybe be fixed more simply by > doing something like this (not tested or well thought through): > > This is the old code with some annotations: > > > static inline bool > ovs_refcount_unref_if_not_last(struct ovs_refcount *refcount) > { > unsigned int count; > > atomic_read_explicit(&refcount->count, &count, > memory_order_acquire); > ovs_assert(count > 0); > while (count > 1) { > if (atomic_compare_exchange_weak_explicit( > &refcount->count, &count, count - 1, > memory_order_release, memory_order_relaxed)) { > return true; > } > } > return false; > } > > bool > refmap_unref(struct refmap *rfm, void *value) > { > struct refmap_node *node; > unsigned int old_refcount; > > if (!value) { > return false; > } > > node = refmap_node_from_value(rfm, value); > if (!node) { > return false; > } > > if (ovs_refcount_unref_if_not_last(&node->refcount)) { > return false; > } > > ovs_mutex_lock(&rfm->map_lock); > old_refcount = ovs_refcount_unref(&node->refcount); > > if (old_refcount == 1) { > /* We transitioned 1→0 under lock. Safe to clean up. */ > rfm->value_uninit(refmap_node_value(rfm, node)); > cmap_remove(&rfm->map, &node->map_node, node->hash); > ovs_mutex_unlock(&rfm->map_lock); > ovsrcu_postpone_inline(free, node, rcu_node); > return true; > } > > ovs_mutex_unlock(&rfm->map_lock); > return false; > } I took it. It works (added coverage in test-refmap). Thanks! > >> +{ >> + unsigned int count; >> + >> + atomic_read_explicit(&refcount->count, &count, memory_order_relaxed); >> + do { >> + if (count <= 1) { >> + return false; >> + } >> + } while (!atomic_compare_exchange_weak_explicit(&refcount->count, &count, >> + count + 1, >> + memory_order_relaxed, >> + memory_order_relaxed)); >> + return true; >> +} >> + >> +void >> +refmap_for_each(struct refmap *rfm, >> + void (*cb)(void *value, void *key, void *arg), void *arg) > From our earlier discussion: > > EC> A macro for this would be more in line with existing implementations and > EC> avoids the requirement for a callback function for simple operations, i.e., > EC> something like REFMAP_FOR_EACH(). > > EB> We want to call the cb only under a reference count that is internal in > EB> the refmap. > > EB> We can ask the user to explicitly call ref/unref before/after the "cb" > EB> code, but I think it's better to have this as part of the > EB> infrastructure, with the downside of a "cb" code that is not inline with > EB> the loop. > > EB> Do you have a suggestion how to do it in a macro? > > I did some quick/dirty coding, and this is a potential diff (might need > some cleanup but it looks nicer with the code inline): > > diff --git a/lib/refmap.c b/lib/refmap.c > index c2c435238..86b5850e1 100644 > --- a/lib/refmap.c > +++ b/lib/refmap.c > @@ -291,6 +291,38 @@ refmap_refcount_try_ref_one(struct ovs_refcount *refcount) > return true; > } > > +/* Helper function for REFMAP_FOR_EACH macro. > + * Advances to the next refmap entry, handling ref/unref automatically. > + * Returns true if a value was found, false when iteration is complete. */ > +bool > +refmap_iter_next(struct refmap_iter *iter, void **value, void **key) > +{ > + struct refmap_node *node; > + > + /* Unref previous value if any. */ > + if (iter->prev_value) { > + refmap_unref(iter->rfm, iter->prev_value); > + iter->prev_value = NULL; > + } > + > + /* Initialize on first call (cursor.impl is NULL from zero-init). */ > + if (!iter->cursor.impl) { > + iter->cursor = cmap_cursor_start(&iter->rfm->map); > + } > + > + /* Find next refable node. 
*/ > + CMAP_CURSOR_FOR_EACH_CONTINUE (node, map_node, &iter->cursor) { > + if (refmap_refcount_try_ref_one(&node->refcount)) { > + *key = refmap_node_key(node); > + *value = refmap_node_value(iter->rfm, node); > + iter->prev_value = *value; > + return true; > + } > + } > + > + return false; > +} > + > void > refmap_for_each(struct refmap *rfm, > void (*cb)(void *value, void *key, void *arg), void *arg) > diff --git a/lib/refmap.h b/lib/refmap.h > index e41f1d888..2d7b6a265 100644 > --- a/lib/refmap.h > +++ b/lib/refmap.h > @@ -23,7 +23,9 @@ > > #include <stddef.h> > #include <stdint.h> > +#include <stdbool.h> > > +#include "cmap.h" > + > #include "openvswitch/dynamic-string.h" > > /* > @@ -86,6 +88,31 @@ typedef void (*refmap_value_uninit)(void *value); > typedef struct ds *(*refmap_value_format)(struct ds *s, void *key, > void *value); > > +/* Iterator context for REFMAP_FOR_EACH macro. */ > +struct refmap_iter { > + struct refmap *rfm; > + struct cmap_cursor cursor; > + void *prev_value; > +}; > + > +/* Helper function for REFMAP_FOR_EACH macro. > + * Advances to the next refmap entry, handling ref/unref automatically. > + * Returns true if a value was found, false when iteration is complete. */ > +bool refmap_iter_next(struct refmap_iter *, void **value, void **key); > + > +/* Iterates through all key-value pairs in REFMAP. Each iteration assigns > + * VALUE (void *) and KEY (void *) to the current entry's value and key. > + * > + * The iterator holds a reference during each iteration, automatically > + * calling refmap_unref() after the loop body completes. > + * > + * If you break out of the loop, then you need to manually call > + * refmap_unref(REFMAP, VALUE) to release the reference. */ > +#define REFMAP_FOR_EACH(VALUE, KEY, REFMAP) \ > + for (struct refmap_iter refmap_iter__ = { (REFMAP), {0}, NULL }; \ > + refmap_iter_next(&refmap_iter__, &(VALUE), &(KEY)); \ > + ) > + > /* Allocate and return a map handle. 
> * > * The user must ensure the 'key' is fully initialized, including potential > diff --git a/tests/test-refmap.c b/tests/test-refmap.c > index 4639088bc..533df5407 100644 > --- a/tests/test-refmap.c > +++ b/tests/test-refmap.c > @@ -126,20 +126,6 @@ struct check_refmap_ctx { > int count; > }; > > -static void > -check_refmap_cb(void *value_, void *key_, void *arg_) > -{ > - struct check_refmap_ctx *ctx = arg_; > - struct key *key = key_; > - > - ovs_assert(key->idx < N); > - if (ctx->values) { > - ovs_assert(ctx->values[key->idx] == value_); > - } > - > - ctx->count++; > -} > - > static void > check_refmap(struct refmap *rfm, struct value **values, int n_expected) > { > @@ -148,8 +134,19 @@ check_refmap(struct refmap *rfm, struct value **values, int n_expected) > .values = values, > .count = 0, > }; > + void *value, *key; > + > + REFMAP_FOR_EACH (value, key, rfm) { > + struct key *k = key; > + > + ovs_assert(k->idx < N); > + if (ctx.values) { > + ovs_assert(ctx.values[k->idx] == value); > + } > + > + ctx.count++; > + } > > - refmap_for_each(rfm, check_refmap_cb, &ctx); > ovs_assert(ctx.count == n_expected); > } > > @@ -161,25 +158,6 @@ struct iter_modify_ctx { > int unref_count; > }; > > -static void > -iter_ref_cb(void *value_, void *key_, void *arg_) > -{ > - struct iter_modify_ctx *ctx = arg_; > - struct key *key = key_; > - > - ovs_assert(refmap_try_ref_value(ctx->rfm, value_)); > - ctx->extra_refs[key->idx] = value_; > - ctx->ref_count++; > -} > - > -static void > -iter_unref_cb(void *value_, void *key_ OVS_UNUSED, void *arg_) > -{ > - struct iter_modify_ctx *ctx = arg_; > - > - refmap_unref(ctx->rfm, value_); > - ctx->unref_count++; > -} > > struct try_ref_race_ctx { > struct refmap *rfm; > @@ -187,14 +165,6 @@ struct try_ref_race_ctx { > atomic_bool stop; > }; > > -static void > -race_for_each_cb(void *value_, void *key_ OVS_UNUSED, void *arg_ OVS_UNUSED) > -{ > - struct value *value = value_; > - > - ovs_assert(value->hdl); > -} > - > static void * > try_ref_racer(void *arg) > { > @@ -216,7 +186,13 @@ try_ref_racer(void *arg) > refmap_unref(ctx->rfm, value); > } > > - refmap_for_each(ctx->rfm, race_for_each_cb, NULL); > + void *iter_value, *iter_key; > + > + REFMAP_FOR_EACH (iter_value, iter_key, ctx->rfm) { > + struct value *v = iter_value; > + > + ovs_assert(v->hdl); > + } > } > > return NULL; > @@ -269,6 +245,7 @@ run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) > struct key keys[N]; > struct refmap *rfm; > uint32_t args[N]; > + void *value, *key; > > rfm = refmap_create("check-rfm", sizeof(struct key), sizeof(struct value), > value_init, value_uninit, value_format); > @@ -280,7 +257,6 @@ run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) > struct arg arg = { > .ptr = &args[i], > }; > - struct value *value; > > keys[i].idx = i; > args[i] = i; > @@ -333,7 +309,15 @@ run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) > im_ctx.keys = keys; > im_ctx.extra_refs = (struct value **) extra_refs; > memset(extra_refs, 0, sizeof extra_refs); > - refmap_for_each(rfm, iter_ref_cb, &im_ctx); > + > + REFMAP_FOR_EACH (value, key, rfm) { > + struct key *k = key; > + > + ovs_assert(refmap_try_ref_value(im_ctx.rfm, value)); > + im_ctx.extra_refs[k->idx] = value; > + im_ctx.ref_count++; > + } > + > ovs_assert(im_ctx.ref_count == N); > for (int i = 0; i < N; i++) { > ovs_assert(extra_refs[i] == values[i]); > @@ -345,7 +329,12 @@ run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) > /* Drop extra refs from within refmap_for_each callback. 
*/ > memset(&im_ctx, 0, sizeof im_ctx); > im_ctx.rfm = rfm; > - refmap_for_each(rfm, iter_unref_cb, &im_ctx); > + > + REFMAP_FOR_EACH (value, key, rfm) { > + refmap_unref(im_ctx.rfm, value); > + im_ctx.unref_count++; > + } > + > ovs_assert(im_ctx.unref_count == N); > for (int i = 0; i < N; i++) { > ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 1); I took it. It works. Thanks! > >> + struct refmap_node *node; >> + >> + CMAP_FOR_EACH (node, map_node, &rfm->map) { >> + void *value; >> + >> + if (!refmap_refcount_try_ref_one(&node->refcount)) { >> + continue; >> + } >> + log_node(rfm, "foreach", node); >> + value = refmap_node_value(rfm, node); >> + cb(value, refmap_node_key(node), arg); >> + refmap_unref(rfm, value); >> + } >> +} >> + >> +static uint32_t >> +refmap_key_hash(const struct refmap *rfm, const void *key) >> +{ >> + return hash_bytes(key, rfm->key_size, 0); >> +} >> + >> +static void * >> +refmap_lookup_protected(struct refmap *rfm, void *key, uint32_t hash) > This should return the actual node struct, i.e. 'static struct refmap_node *' Ack > >> +{ >> + struct refmap_node *node; >> + >> + CMAP_FOR_EACH_WITH_HASH_PROTECTED (node, map_node, hash, &rfm->map) { >> + if (!memcmp(key, refmap_node_key(node), rfm->key_size) && >> + ovs_refcount_read(&node->refcount) > 1) { >> + return node; >> + } >> + } >> + >> + return NULL; >> +} >> + >> +static void * >> +refmap_lookup(struct refmap *rfm, void *key, uint32_t hash) > This should return the actual node struct, i.e. 'static struct refmap_node *' Ack > >> +{ >> + struct refmap_node *node; >> + >> + CMAP_FOR_EACH_WITH_HASH (node, map_node, hash, &rfm->map) { >> + if (!memcmp(key, refmap_node_key(node), rfm->key_size) && ... >> + >> +bool >> +refmap_try_ref_value(struct refmap *rfm, void *value) >> +{ >> + struct refmap_node *node; >> + >> + if (!value) { >> + return false; >> + } >> + >> + node = refmap_node_from_value(rfm, value); >> + if (!node || !refmap_refcount_try_ref_one(&node->refcount)) { > Should we log_node() the refmap_refcount_() failure here? I don't see much value, but I can add it. > >> + return false; >> + } >> + >> + log_node(rfm, "try_ref_value", node); >> + return true; >> +} >> + >> +bool >> +refmap_unref(struct refmap *rfm, void *value) >> +{ >> + struct refmap_node *node; ... >> + * 7. refmap_unref(value); >> + * This is the last reference released. value_uninit is immediatelly > immediatelly -> immediately Ack > >> + * called, while the value memory is freed after RCU grace period. >> + * >> + * Thread safety >> + * ============= >> + * >> + * MT-unsafe: >> + * * refmap_create >> + * * refmap_destroy > See earlier comment about refmap_destroy(). I would prefer to make this MT-safe, > maybe with something like this (not tested or thought trough): > > static void refmap_destroy_rcu__(void *rfm_) > { > struct refmap *rfm = rfm_; > refmap_destroy__(rfm, false); > } > > struct refmap { > atomic_bool destroyed; // Add this field > ... > }; > > void refmap_destroy(struct refmap *rfm) > { > if (!rfm) { > return; > } > atomic_store(&rfm->destroyed, true); // Mark as destroyed > refmap_destroy_unregister(rfm); > ovsrcu_postpone(refmap_destroy_rcu__, rfm); > } > > void *refmap_ref(struct refmap *rfm, const void *key, void *arg) > { > if (atomic_load(&rfm->destroyed)) { > return NULL; > } > // ... rest of existing code > } > > Also need similar checks in refmap_try_ref() and possibly refmap_for_each(). Same as above. I think it over-complicates something that should be clear enough. 
> >> + * >> + * MT-safe: >> + * * refmap_for_each >> + * * refmap_ref >> + * * refmap_try_ref >> + * * refmap_try_ref_value >> + * * refmap_unref >> + * >> + */ >> + >> +struct refmap; >> + >> +/* Called once on a newly created 'value', i.e. when the first >> + * reference is taken. */ >> +typedef int (*refmap_value_init)(void *value, void *arg); >> + >> +/* Called once on the last dereference to value. */ >> +typedef void (*refmap_value_uninit)(void *value); >> + >> +/* Format a (key, value, arg) tuple in 's'. This is an optional (can be NULL) > arg value is no longer present. Ack > >> + * callback, used for debug log purposes. >> + */ >> +typedef struct ds *(*refmap_value_format)(struct ds *s, void *key, >> + void *value); >> + >> +/* Allocate and return a map handle. >> + * >> + * The user must ensure the 'key' is fully initialized, including potential >> + * pad bytes. >> + */ >> +struct refmap *refmap_create(const char *name, >> + size_t key_size, >> + size_t value_size, >> + refmap_value_init value_init, >> + refmap_value_uninit value_uninit, >> + refmap_value_format value_format); > refmap_value_init, > refmap_value_uninit, > refmap_value_format); > > No need to add variable names if it's clear what the argument is. Ack > >> + >> +/* Frees the map memory. >> + * >> + * The client is responsible for unreferencing any data previously held in >> + * the map. */ > See earlier comment on adding a more explicit comment. Ack > >> +void refmap_destroy(struct refmap *rfm); > Same as earlier, refmap_destroy(struct refmap *), same for all *rfm's below. >> + >> +/* refmap_try_ref takes a reference for the found value upon success. >> + * It's the user's responsibility to unref it. */ > This help might need to be more general as it applies to all three functions > below. > >> +void *refmap_try_ref(struct refmap *rfm, void *key); >> +void *refmap_ref(struct refmap *rfm, void *key, void *arg); >> +bool refmap_try_ref_value(struct refmap *rfm, void *value); > New line here. Ack > >> +void refmap_for_each(struct refmap *rfm, >> + void (*cb)(void *value, void *key, void *arg), >> + void *arg); >> +/* The refmap_value_refcount_read() API requires the caller to hold a >> + * reference, so a returned value of 1 only indicates you were the sole owner >> + * at the moment of the read, but may no longer be by the time you receive the >> + * value. This makes it unsuitable for logic decisions and only useful for >> + * debug logging. >> + */ > This help text should move down to the function definition. Ack > >> +void *refmap_key_from_value(struct refmap *rfm, void *value); >> + >> +/* Return 'true' if it was the last 'value' dereference and >> + * 'value_uninit' has been called. */ >> +bool refmap_unref(struct refmap *rfm, void *value); >> + >> +unsigned int >> +refmap_value_refcount_read(struct refmap *rfm, void *value); >> + >> +#endif /* REFMAP_H */ >> diff --git a/tests/automake.mk b/tests/automake.mk >> index a9d972a86..a74e56454 100644 >> --- a/tests/automake.mk >> +++ b/tests/automake.mk >> @@ -502,6 +502,7 @@ tests_ovstest_SOURCES = \ >> tests/test-rcu.c \ >> tests/test-rculist.c \ >> tests/test-reconnect.c \ >> + tests/test-refmap.c \ >> tests/test-rstp.c \ >> tests/test-sflow.c \ >> tests/test-sha1.c \ >> diff --git a/tests/library.at b/tests/library.at >> index 449f15fd5..3662d488e 100644 >> --- a/tests/library.at >> +++ b/tests/library.at >> @@ -44,6 +44,11 @@ AT_CHECK([ovstest test-ccmap check 1], [0], [... 
>> ]) >> AT_CLEANUP >> >> +AT_SETUP([refmap]) >> +AT_KEYWORDS([refmap]) >> +AT_CHECK([ovstest test-refmap check], [0], []) >> +AT_CLEANUP >> + >> AT_SETUP([atomic operations]) >> AT_CHECK([ovstest test-atomic]) >> AT_CLEANUP >> diff --git a/tests/test-refmap.c b/tests/test-refmap.c >> new file mode 100644 >> index 000000000..4639088bc >> --- /dev/null >> +++ b/tests/test-refmap.c > > I did not look at the test cases in detail, but it looks like most > cases are covered. However, it might be nice to add a test case where > the value_init() function fails. Also a test for NULL value_format() > would show the existing bug. Removed the value_format callback from test-refmap. It is not used anyway, and there is no bug with it. Added coverage for value_init failure. > >> @@ -0,0 +1,894 @@ >> +/* >> + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. >> + * All rights reserved. >> + * SPDX-License-Identifier: Apache-2.0 >> + * >> + * Licensed under the Apache License, Version 2.0 (the "License"); >> + * you may not use this file except in compliance with the License. >> + * You may obtain a copy of the License at ... >> + aux->rfm = refmap_create("benchmark-rfm", sizeof(struct key), >> + sizeof(struct value), value_init, value_uninit, >> + value_format); > ovs_assert(!aux->rfm); Ack (without the !). Done for all refmap_create occurrences. > >> + >> + print_test_header(&aux->test); >> + ovs_barrier_block(&barrier_inner); >> + /* Init is done, worker can start preparing to work. */ ...
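For the value_init failure coverage mentioned above, the new check is along these lines (a sketch only; the helper name failing_value_init and the exact assertions are illustrative and may differ in v4):

static int
failing_value_init(void *value OVS_UNUSED, void *arg OVS_UNUSED)
{
    /* Simulate a value whose initialization cannot complete. */
    return -1;
}

/* ... in run_check(): */
struct refmap *fail_rfm;
struct key fail_key;

memset(&fail_key, 0, sizeof fail_key);
fail_rfm = refmap_create("init-fail-rfm", sizeof(struct key),
                         sizeof(struct value), failing_value_init,
                         value_uninit, NULL);
ovs_assert(fail_rfm);
/* value_init fails, so no entry is inserted and no reference is taken. */
ovs_assert(!refmap_ref(fail_rfm, &fail_key, NULL));
ovs_assert(!refmap_try_ref(fail_rfm, &fail_key));
refmap_destroy(fail_rfm);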
diff --git a/lib/automake.mk b/lib/automake.mk index c6e988906..cb6458b0d 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -323,6 +323,8 @@ lib_libopenvswitch_la_SOURCES = \ lib/rculist.h \ lib/reconnect.c \ lib/reconnect.h \ + lib/refmap.c \ + lib/refmap.h \ lib/rstp.c \ lib/rstp.h \ lib/rstp-common.h \ diff --git a/lib/refmap.c b/lib/refmap.c new file mode 100644 index 000000000..c2c435238 --- /dev/null +++ b/lib/refmap.c @@ -0,0 +1,485 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <config.h> + +#include "cmap.h" +#include "hash.h" +#include "fatal-signal.h" +#include "ovs-atomic.h" +#include "ovs-thread.h" +#include "refmap.h" +#include "timeval.h" + +#include "openvswitch/list.h" +#include "openvswitch/vlog.h" + +VLOG_DEFINE_THIS_MODULE(refmap); +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(600, 600); + +static struct ovs_mutex refmap_destroy_lock = OVS_MUTEX_INITIALIZER; +static struct ovs_list refmap_destroy_list + OVS_GUARDED_BY(refmap_destroy_lock) = + OVS_LIST_INITIALIZER(&refmap_destroy_list); + +struct refmap { + struct cmap map; + struct ovs_mutex map_lock; + size_t key_size; + size_t value_size; + refmap_value_init value_init; + refmap_value_uninit value_uninit; + refmap_value_format value_format; + char *name; + struct ovs_list in_destroy_list; +}; + +struct refmap_node { + struct ovsrcu_node rcu_node; + /* CMAP related: */ + struct cmap_node map_node; + uint32_t hash; + /* Content: */ + struct ovs_refcount refcount; + /* Key, then Value follows. */ +}; + +static void +refmap_destroy__(struct refmap *rfm, bool global_destroy); + +static void +refmap_destroy_unregister_protected(struct refmap *rfm) + OVS_REQUIRES(refmap_destroy_lock) +{ + ovs_list_remove(&rfm->in_destroy_list); +} + +static void +refmap_destroy_unregister(struct refmap *rfm) + OVS_EXCLUDED(refmap_destroy_lock) +{ + ovs_mutex_lock(&refmap_destroy_lock); + refmap_destroy_unregister_protected(rfm); + ovs_mutex_unlock(&refmap_destroy_lock); +} + +static void +refmap_destroy_register(struct refmap *rfm) + OVS_EXCLUDED(refmap_destroy_lock) +{ + ovs_mutex_lock(&refmap_destroy_lock); + ovs_list_push_back(&refmap_destroy_list, &rfm->in_destroy_list); + ovs_mutex_unlock(&refmap_destroy_lock); +} + +static void +refmap_destroy_all(void *aux OVS_UNUSED) +{ + struct refmap *rfm; + + ovs_mutex_lock(&refmap_destroy_lock); + LIST_FOR_EACH_SAFE (rfm, in_destroy_list, &refmap_destroy_list) { + refmap_destroy_unregister_protected(rfm); + refmap_destroy__(rfm, true); + } + ovs_mutex_unlock(&refmap_destroy_lock); + ovs_mutex_destroy(&refmap_destroy_lock); +} + +static void +refmap_fatal_signal_hook(void *aux OVS_UNUSED) +{ + /* This argument is only for the type check in 'ovsrcu_postpone', + * it is not otherwise used. */ + static int dummy_arg; + + /* Do not run all destroys right in the signal handler. 
+ * Let other modules execute their own cleanup, and then + * iterate over any remaining to warn about leaks. */ + ovsrcu_postpone(refmap_destroy_all, &dummy_arg); +} + +struct refmap * +refmap_create(const char *name, + size_t key_size, + size_t value_size, + refmap_value_init value_init, + refmap_value_uninit value_uninit, + refmap_value_format value_format) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + struct refmap *rfm; + + ovs_assert(value_init && value_uninit); + + if (ovsthread_once_start(&once)) { + fatal_signal_add_hook(refmap_fatal_signal_hook, NULL, NULL, true); + ovsthread_once_done(&once); + } + + rfm = xzalloc(sizeof *rfm); + rfm->name = xstrdup(name); + rfm->key_size = key_size; + rfm->value_size = value_size; + rfm->value_init = value_init; + rfm->value_uninit = value_uninit; + rfm->value_format = value_format; + + ovs_mutex_init(&rfm->map_lock); + cmap_init(&rfm->map); + + refmap_destroy_register(rfm); + + return rfm; +} + +static void +refmap_destroy__(struct refmap *rfm, bool global_destroy) +{ + bool leaks_detected = false; + + if (!rfm) { + return; + } + + VLOG_DBG("%s: destroying the map", rfm->name); + + ovs_mutex_lock(&rfm->map_lock); + if (!cmap_is_empty(&rfm->map)) { + struct refmap_node *node; + + VLOG_WARN("%s: %s called with elements remaining in the map", + rfm->name, __func__); + leaks_detected = true; + CMAP_FOR_EACH (node, map_node, &rfm->map) { + /* No need to remove the node from the CMAP, it will + * be destroyed immediately. */ + ovsrcu_postpone_embedded(free, node, rcu_node); + } + } + cmap_destroy(&rfm->map); + ovs_mutex_unlock(&rfm->map_lock); + + ovs_mutex_destroy(&rfm->map_lock); + free(rfm->name); + free(rfm); + + /* During the very last stage of execution of RCU callbacks, + * the VLOG subsystem has been disabled. All logs are thus muted. + * If leaks are detected, abort the process, even though we were + * exiting due to a fatal signal. The SIGABRT generated will still + * be visible. 
*/ + if (global_destroy && leaks_detected) { + ovs_abort(-1, "Refmap values leak detected."); + } +} + +void +refmap_destroy(struct refmap *rfm) +{ + if (!rfm) { + return; + } + + refmap_destroy_unregister(rfm); + refmap_destroy__(rfm, false); +} + +static size_t +refmap_aligned_key_size(struct refmap *rfm) +{ + return ROUND_UP(rfm->key_size, 8); +} + +static void * +refmap_node_key(struct refmap_node *node) +{ + if (!node) { + return NULL; + } + + return node + 1; +} + +static void * +refmap_node_value(struct refmap *rfm, struct refmap_node *node) +{ + if (!node) { + return NULL; + } + + return ((char *) refmap_node_key(node)) + refmap_aligned_key_size(rfm); +} + +static size_t +refmap_node_total_size(struct refmap *rfm) +{ + return sizeof(struct refmap_node) + + refmap_aligned_key_size(rfm) + rfm->value_size; +} + +static struct refmap_node * +refmap_node_from_value(struct refmap *rfm, void *value) +{ + size_t offset = sizeof(struct refmap_node) + refmap_aligned_key_size(rfm); + + if ((uintptr_t) value < offset) { + return NULL; + } + return (void *) (((char *) value) - offset); +} + +static void +log_node(struct refmap *rfm, const char *prefix, struct refmap_node *node) +{ + void *key, *value; + struct ds s; + + if (OVS_LIKELY(VLOG_DROP_DBG(&rl) || !rfm->value_format)) { + return; + } + + key = refmap_node_key(node); + value = refmap_node_value(rfm, node); + + ds_init(&s); + ds_put_cstr(&s, ", '"); + rfm->value_format(&s, key, value); + ds_put_cstr(&s, "'"); + VLOG_DBG("%s: %p %s, refcnt=%d%s", rfm->name, value, prefix, + ovs_refcount_read(&node->refcount), ds_cstr(&s)); + ds_destroy(&s); +} + +/* Increments 'refcount', but only if it is over one. + * + * Returns false if the refcount was zero or one. + * In refmap, the last reference is only used to synchronize between + * value init and uninit in case of contention. In such state, the + * object is not valid anymore for external readers, until the + * value_{init,uninit} critical section is completed. 
+ */ +static inline bool +refmap_refcount_try_ref_one(struct ovs_refcount *refcount) +{ + unsigned int count; + + atomic_read_explicit(&refcount->count, &count, memory_order_relaxed); + do { + if (count <= 1) { + return false; + } + } while (!atomic_compare_exchange_weak_explicit(&refcount->count, &count, + count + 1, + memory_order_relaxed, + memory_order_relaxed)); + return true; +} + +void +refmap_for_each(struct refmap *rfm, + void (*cb)(void *value, void *key, void *arg), void *arg) +{ + struct refmap_node *node; + + CMAP_FOR_EACH (node, map_node, &rfm->map) { + void *value; + + if (!refmap_refcount_try_ref_one(&node->refcount)) { + continue; + } + log_node(rfm, "foreach", node); + value = refmap_node_value(rfm, node); + cb(value, refmap_node_key(node), arg); + refmap_unref(rfm, value); + } +} + +static uint32_t +refmap_key_hash(const struct refmap *rfm, const void *key) +{ + return hash_bytes(key, rfm->key_size, 0); +} + +static void * +refmap_lookup_protected(struct refmap *rfm, void *key, uint32_t hash) +{ + struct refmap_node *node; + + CMAP_FOR_EACH_WITH_HASH_PROTECTED (node, map_node, hash, &rfm->map) { + if (!memcmp(key, refmap_node_key(node), rfm->key_size) && + ovs_refcount_read(&node->refcount) > 1) { + return node; + } + } + + return NULL; +} + +static void * +refmap_lookup(struct refmap *rfm, void *key, uint32_t hash) +{ + struct refmap_node *node; + + CMAP_FOR_EACH_WITH_HASH (node, map_node, hash, &rfm->map) { + if (!memcmp(key, refmap_node_key(node), rfm->key_size) && + ovs_refcount_read(&node->refcount) > 1) { + return node; + } + } + + return NULL; +} + +void * +refmap_try_ref(struct refmap *rfm, void *key) +{ + struct refmap_node *node; + + node = refmap_lookup(rfm, key, refmap_key_hash(rfm, key)); + if (!node || !refmap_refcount_try_ref_one(&node->refcount)) { + return NULL; + } + + log_node(rfm, "try_ref", node); + return refmap_node_value(rfm, node); +} + +void * +refmap_ref(struct refmap *rfm, void *key, void *arg) +{ + struct refmap_node *node; + bool error = false; + uint32_t hash; + void *value; + + hash = refmap_key_hash(rfm, key); + + node = refmap_lookup(rfm, key, hash); + if (node && refmap_refcount_try_ref_one(&node->refcount)) { + value = refmap_node_value(rfm, node); + goto out; + } + + ovs_mutex_lock(&rfm->map_lock); + + node = refmap_lookup_protected(rfm, key, hash); + if (node && refmap_refcount_try_ref_one(&node->refcount)) { + ovs_mutex_unlock(&rfm->map_lock); + value = refmap_node_value(rfm, node); + goto out; + } + + node = xzalloc(refmap_node_total_size(rfm)); + node->hash = hash; + ovs_refcount_init(&node->refcount); + memcpy(refmap_node_key(node), key, rfm->key_size); + value = refmap_node_value(rfm, node); + if (rfm->value_init(value, arg) == 0) { + cmap_insert(&rfm->map, &node->map_node, node->hash); + ovs_refcount_ref(&node->refcount); + } else { + value = NULL; + error = true; + VLOG_WARN("%s: value_init failed", rfm->name); + } + ovs_mutex_unlock(&rfm->map_lock); + +out: + if (error) { + free(node); + return NULL; + } + + log_node(rfm, "ref", node); + + return value; +} + +bool +refmap_try_ref_value(struct refmap *rfm, void *value) +{ + struct refmap_node *node; + + if (!value) { + return false; + } + + node = refmap_node_from_value(rfm, value); + if (!node || !refmap_refcount_try_ref_one(&node->refcount)) { + return false; + } + + log_node(rfm, "try_ref_value", node); + return true; +} + +bool +refmap_unref(struct refmap *rfm, void *value) +{ + struct refmap_node *node; + + if (!value) { + return false; + } + + node = 
refmap_node_from_value(rfm, value); + if (!node) { + return false; + } + + log_node(rfm, "unref", node); + + if (ovs_refcount_unref(&node->refcount) == 2) { + ovs_mutex_lock(&rfm->map_lock); + if (ovs_refcount_read(&node->refcount) > 1) { + ovs_mutex_unlock(&rfm->map_lock); + return false; + } + rfm->value_uninit(refmap_node_value(rfm, node)); + cmap_remove(&rfm->map, &node->map_node, node->hash); + ovs_assert(ovs_refcount_unref(&node->refcount) == 1); + ovs_mutex_unlock(&rfm->map_lock); + ovsrcu_postpone_embedded(free, node, rcu_node); + return true; + } + return false; +} + +void * +refmap_key_from_value(struct refmap *rfm, void *value) +{ + return refmap_node_key(refmap_node_from_value(rfm, value)); +} + +unsigned int +refmap_value_refcount_read(struct refmap *rfm, void *value) +{ + struct refmap_node *node; + + if (!value) { + return 0; + } + + node = refmap_node_from_value(rfm, value); + if (node) { + return ovs_refcount_read(&node->refcount) - 1; + } + + return 0; +} diff --git a/lib/refmap.h b/lib/refmap.h new file mode 100644 index 000000000..e41f1d888 --- /dev/null +++ b/lib/refmap.h @@ -0,0 +1,130 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef REFMAP_H +#define REFMAP_H + +#include <config.h> + +#include <stddef.h> +#include <stdint.h> + +#include "openvswitch/dynamic-string.h" + +/* + * Reference map + * ============= + * + * This key-value store acts like a regular concurrent hashmap, + * except that insertion takes a reference on the value if already + * present. + * The key provided must be fully initialized, including potential pad bytes. + * + * As the value creation is dependent on it being already present + * within the structure and the user cannot predict that, this structure + * requires definitions for value_init and value_uninit functions, + * that will be called only at creation (first reference taken) and + * destruction (last reference released). + * + * Example: + * 1. struct key key; + * 2. memset(&key, 0, sizeof key); + * 3. refmap_create() + * 4. value = refmap_ref(key); + * Since it's the first reference for <key>, value_init is called. + * 5. refmap_ref(key); + * This is not the first reference for <key>. Only ref-count is updated. + * 6. refmap_unref(value); + * This is not the last reference released. Only ref-count is updated. + * 7. refmap_unref(value); + * This is the last reference released. value_uninit is immediatelly + * called, while the value memory is freed after RCU grace period. + * + * Thread safety + * ============= + * + * MT-unsafe: + * * refmap_create + * * refmap_destroy + * + * MT-safe: + * * refmap_for_each + * * refmap_ref + * * refmap_try_ref + * * refmap_try_ref_value + * * refmap_unref + * + */ + +struct refmap; + +/* Called once on a newly created 'value', i.e. when the first + * reference is taken. 
*/ +typedef int (*refmap_value_init)(void *value, void *arg); + +/* Called once on the last dereference to value. */ +typedef void (*refmap_value_uninit)(void *value); + +/* Format a (key, value, arg) tuple in 's'. This is an optional (can be NULL) + * callback, used for debug log purposes. + */ +typedef struct ds *(*refmap_value_format)(struct ds *s, void *key, + void *value); + +/* Allocate and return a map handle. + * + * The user must ensure the 'key' is fully initialized, including potential + * pad bytes. + */ +struct refmap *refmap_create(const char *name, + size_t key_size, + size_t value_size, + refmap_value_init value_init, + refmap_value_uninit value_uninit, + refmap_value_format value_format); + +/* Frees the map memory. + * + * The client is responsible for unreferencing any data previously held in + * the map. */ +void refmap_destroy(struct refmap *rfm); + +/* refmap_try_ref takes a reference for the found value upon success. + * It's the user's responsibility to unref it. */ +void *refmap_try_ref(struct refmap *rfm, void *key); +void *refmap_ref(struct refmap *rfm, void *key, void *arg); +bool refmap_try_ref_value(struct refmap *rfm, void *value); +void refmap_for_each(struct refmap *rfm, + void (*cb)(void *value, void *key, void *arg), + void *arg); +/* The refmap_value_refcount_read() API requires the caller to hold a + * reference, so a returned value of 1 only indicates you were the sole owner + * at the moment of the read, but may no longer be by the time you receive the + * value. This makes it unsuitable for logic decisions and only useful for + * debug logging. + */ +void *refmap_key_from_value(struct refmap *rfm, void *value); + +/* Return 'true' if it was the last 'value' dereference and + * 'value_uninit' has been called. */ +bool refmap_unref(struct refmap *rfm, void *value); + +unsigned int +refmap_value_refcount_read(struct refmap *rfm, void *value); + +#endif /* REFMAP_H */ diff --git a/tests/automake.mk b/tests/automake.mk index a9d972a86..a74e56454 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -502,6 +502,7 @@ tests_ovstest_SOURCES = \ tests/test-rcu.c \ tests/test-rculist.c \ tests/test-reconnect.c \ + tests/test-refmap.c \ tests/test-rstp.c \ tests/test-sflow.c \ tests/test-sha1.c \ diff --git a/tests/library.at b/tests/library.at index 449f15fd5..3662d488e 100644 --- a/tests/library.at +++ b/tests/library.at @@ -44,6 +44,11 @@ AT_CHECK([ovstest test-ccmap check 1], [0], [... ]) AT_CLEANUP +AT_SETUP([refmap]) +AT_KEYWORDS([refmap]) +AT_CHECK([ovstest test-refmap check], [0], []) +AT_CLEANUP + AT_SETUP([atomic operations]) AT_CHECK([ovstest test-atomic]) AT_CLEANUP diff --git a/tests/test-refmap.c b/tests/test-refmap.c new file mode 100644 index 000000000..4639088bc --- /dev/null +++ b/tests/test-refmap.c @@ -0,0 +1,894 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <config.h> + +#undef NDEBUG +#include <assert.h> +#include <errno.h> +#include <getopt.h> +#include <string.h> + +#include "ovs-atomic.h" +#include "ovs-numa.h" +#include "ovs-rcu.h" +#include "ovs-thread.h" +#include "ovstest.h" +#include "random.h" +#include "refmap.h" +#include "timeval.h" +#include "util.h" + +#include "openvswitch/util.h" +#include "openvswitch/vlog.h" + +#define N 100 + +static struct refmap_test_params { + unsigned int n_threads; + unsigned int n_ids; + int step_idx; + bool debug; + bool csv_format; +} params = { + .n_threads = 1, + .n_ids = N, + .debug = false, + .csv_format = false, +}; + +DEFINE_STATIC_PER_THREAD_DATA(unsigned int, thread_id, OVSTHREAD_ID_UNSET); + +static unsigned int +thread_id(void) +{ + static atomic_count next_id = ATOMIC_COUNT_INIT(0); + unsigned int id = *thread_id_get(); + + if (OVS_UNLIKELY(id == OVSTHREAD_ID_UNSET)) { + id = atomic_count_inc(&next_id); + *thread_id_get() = id; + } + + return id; +} + +struct key { + size_t idx; + bool b; + uint8_t pad[7]; +}; + +struct value { + uint32_t *hdl; +}; + +struct arg { + uint32_t *ptr; +}; + +static int +value_init(void *value_, void *arg_) +{ + struct value *value = value_; + struct arg *arg = arg_; + + /* Verify that we don't double-init value. */ + ovs_assert(!value->hdl); + + *arg->ptr = 1; + value->hdl = arg->ptr; + return 0; +} + +static void +value_uninit(void *value_) +{ + struct value *value = value_; + + /* Verify that we don't double-uninit value. */ + ovs_assert(value->hdl); + + *value->hdl = 2; + value->hdl = NULL; +} + +static struct ds * +value_format(struct ds *s, void *key_, void *value_) +{ + struct key *key = key_; + struct value *value = value_; + + ds_put_format(s, "idx=%"PRIuSIZE", b=%s, hdl=%p", + key->idx, key->b ? 
"1" : "0", + value->hdl); + return s; +} + +struct check_refmap_ctx { + struct refmap *rfm; + struct value **values; + int count; +}; + +static void +check_refmap_cb(void *value_, void *key_, void *arg_) +{ + struct check_refmap_ctx *ctx = arg_; + struct key *key = key_; + + ovs_assert(key->idx < N); + if (ctx->values) { + ovs_assert(ctx->values[key->idx] == value_); + } + + ctx->count++; +} + +static void +check_refmap(struct refmap *rfm, struct value **values, int n_expected) +{ + struct check_refmap_ctx ctx = { + .rfm = rfm, + .values = values, + .count = 0, + }; + + refmap_for_each(rfm, check_refmap_cb, &ctx); + ovs_assert(ctx.count == n_expected); +} + +struct iter_modify_ctx { + struct refmap *rfm; + struct key *keys; + struct value **extra_refs; + int ref_count; + int unref_count; +}; + +static void +iter_ref_cb(void *value_, void *key_, void *arg_) +{ + struct iter_modify_ctx *ctx = arg_; + struct key *key = key_; + + ovs_assert(refmap_try_ref_value(ctx->rfm, value_)); + ctx->extra_refs[key->idx] = value_; + ctx->ref_count++; +} + +static void +iter_unref_cb(void *value_, void *key_ OVS_UNUSED, void *arg_) +{ + struct iter_modify_ctx *ctx = arg_; + + refmap_unref(ctx->rfm, value_); + ctx->unref_count++; +} + +struct try_ref_race_ctx { + struct refmap *rfm; + struct key key; + atomic_bool stop; +}; + +static void +race_for_each_cb(void *value_, void *key_ OVS_UNUSED, void *arg_ OVS_UNUSED) +{ + struct value *value = value_; + + ovs_assert(value->hdl); +} + +static void * +try_ref_racer(void *arg) +{ + struct try_ref_race_ctx *ctx = arg; + + for (;;) { + bool stop_; + + atomic_read(&ctx->stop, &stop_); + if (stop_) { + break; + } + + void *value = refmap_try_ref(ctx->rfm, &ctx->key); + if (value) { + struct value *v = value; + + ovs_assert(v->hdl); + refmap_unref(ctx->rfm, value); + } + + refmap_for_each(ctx->rfm, race_for_each_cb, NULL); + } + + return NULL; +} + +/* Stress-test that try_ref rejects entries at refcount 1 (the internal + * reference used during value init/uninit synchronization). + * + * Thread A repeatedly creates and destroys the same entry. + * Thread B continuously calls try_ref and for_each. 
*/ +static void +check_try_ref_race(void) +{ + struct try_ref_race_ctx race_ctx; + pthread_t worker; + struct refmap *rfm; + + rfm = refmap_create("try-ref-race", sizeof(struct key), + sizeof(struct value), value_init, value_uninit, + value_format); + + memset(&race_ctx.key, 0, sizeof race_ctx.key); + race_ctx.key.idx = 0; + race_ctx.rfm = rfm; + atomic_init(&race_ctx.stop, false); + + worker = ovs_thread_create("try-ref-racer", try_ref_racer, &race_ctx); + + for (int i = 0; i < 10000; i++) { + uint32_t arg_val = 0; + struct arg arg = { .ptr = &arg_val }; + void *value; + + value = refmap_ref(rfm, &race_ctx.key, &arg); + refmap_unref(rfm, value); + } + + atomic_store(&race_ctx.stop, true); + xpthread_join(worker, NULL); + + refmap_destroy(rfm); +} + +static void +run_check(struct ovs_cmdl_context *ctx OVS_UNUSED) +{ + struct iter_modify_ctx im_ctx; + struct value *extra_refs[N]; + struct value *values[N]; + struct key keys[N]; + struct refmap *rfm; + uint32_t args[N]; + + rfm = refmap_create("check-rfm", sizeof(struct key), sizeof(struct value), + value_init, value_uninit, value_format); + + check_refmap(rfm, NULL, 0); + + memset(keys, 0, sizeof keys); + for (int i = 0; i < N; i++) { + struct arg arg = { + .ptr = &args[i], + }; + struct value *value; + + keys[i].idx = i; + args[i] = i; + ovs_assert(!refmap_try_ref(rfm, &keys[i])); + value = refmap_ref(rfm, &keys[i], &arg); + ovs_assert(value); + ovs_assert(value == refmap_ref(rfm, &keys[i], &arg)); + refmap_unref(rfm, value); + ovs_assert(value == refmap_try_ref(rfm, &keys[i])); + refmap_unref(rfm, value); + values[i] = value; + } + + check_refmap(rfm, (struct value **) values, N); + + for (int i = 0; i < N; i++) { + /* Verify that value_init is properly called. */ + ovs_assert(values[i]->hdl == &args[i]); + ovs_assert(args[i] == 1); + } + + /* Verify refmap_value_refcount_read: each value has one user ref. */ + for (int i = 0; i < N; i++) { + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 1); + } + + ovs_assert(refmap_value_refcount_read(rfm, NULL) == 0); + + /* Verify refmap_key_from_value. */ + for (int i = 0; i < N; i++) { + struct key *k = refmap_key_from_value(rfm, values[i]); + ovs_assert(k->idx == keys[i].idx); + } + + /* Verify refmap_try_ref_value and refcount changes. */ + for (int i = 0; i < N; i++) { + ovs_assert(refmap_try_ref_value(rfm, values[i])); + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 2); + refmap_unref(rfm, values[i]); + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 1); + } + + ovs_assert(!refmap_try_ref_value(rfm, NULL)); + + check_refmap(rfm, (struct value **) values, N); + + /* Take extra refs from within refmap_for_each callback. */ + memset(&im_ctx, 0, sizeof im_ctx); + im_ctx.rfm = rfm; + im_ctx.keys = keys; + im_ctx.extra_refs = (struct value **) extra_refs; + memset(extra_refs, 0, sizeof extra_refs); + refmap_for_each(rfm, iter_ref_cb, &im_ctx); + ovs_assert(im_ctx.ref_count == N); + for (int i = 0; i < N; i++) { + ovs_assert(extra_refs[i] == values[i]); + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 2); + } + + check_refmap(rfm, (struct value **) values, N); + + /* Drop extra refs from within refmap_for_each callback. 
*/ + memset(&im_ctx, 0, sizeof im_ctx); + im_ctx.rfm = rfm; + refmap_for_each(rfm, iter_unref_cb, &im_ctx); + ovs_assert(im_ctx.unref_count == N); + for (int i = 0; i < N; i++) { + ovs_assert(refmap_value_refcount_read(rfm, values[i]) == 1); + } + + check_refmap(rfm, (struct value **) values, N); + + for (int i = 0; i < N; i++) { + refmap_unref(rfm, values[i]); + } + + for (int i = 0; i < N; i++) { + ovs_assert(!refmap_try_ref(rfm, &keys[i])); + } + + for (int i = 0; i < N; i++) { + /* Verify that value_uninit is executed. */ + ovs_assert(args[i] == 2); + } + + check_refmap(rfm, NULL, 0); + + refmap_destroy(rfm); + + check_try_ref_race(); +} + +static uint32_t *ids; +static void **values; +static atomic_uint *thread_working_ms; /* Measured work time. */ + +static struct ovs_barrier barrier_outer; +static struct ovs_barrier barrier_inner; + +static atomic_uint running_time_ms; +static atomic_bool stop; + +static unsigned int +elapsed(unsigned int start) +{ + unsigned int running_time_ms_; + + atomic_read(&running_time_ms, &running_time_ms_); + + return running_time_ms_ - start; +} + +static void * +clock_main(void *arg OVS_UNUSED) +{ + struct timeval start; + struct timeval end; + + xgettimeofday(&start); + for (;;) { + bool stop_; + + atomic_read(&stop, &stop_); + if (stop_) { + break; + } + + xgettimeofday(&end); + atomic_store(&running_time_ms, + timeval_to_msec(&end) - timeval_to_msec(&start)); + xnanosleep(100 * 1000); + } + + return NULL; +} + +enum step_id { + STEP_NONE, + STEP_ALLOC, + STEP_REF, + STEP_UNREF, + STEP_FREE, + STEP_MIXED, + STEP_POS_QUERY, + STEP_NEG_QUERY, +}; + +static const char *step_names[] = { + [STEP_NONE] = "<bug>", + [STEP_ALLOC] = "alloc", + [STEP_REF] = "ref", + [STEP_UNREF] = "unref", + [STEP_FREE] = "free", + [STEP_MIXED] = "mixed", + [STEP_POS_QUERY] = "pos-query", + [STEP_NEG_QUERY] = "neg-query", +}; + +#define MAX_N_STEP 10 + +#define FOREACH_STEP(STEP_VAR, SCHEDULE) \ + for (int __idx = 0, STEP_VAR = (SCHEDULE)[__idx]; \ + (STEP_VAR = (SCHEDULE)[__idx]) != STEP_NONE; \ + __idx++) + +struct test_case { + int idx; + enum step_id schedule[MAX_N_STEP]; +}; + +static void +print_header(void) +{ + if (params.csv_format) { + return; + } + + printf("Benchmarking n=%u on %u thread%s.\n", + params.n_ids, params.n_threads, + params.n_threads > 1 ? 
"s" : ""); + + printf(" step\\thread: "); + printf(" Avg"); + for (size_t i = 0; i < params.n_threads; i++) { + printf(" %3" PRIuSIZE, i + 1); + } + + printf("\n"); +} + +static void +print_test_header(struct test_case *test) +{ + if (params.csv_format) { + return; + } + + printf("[%d]---------------------------", test->idx); + for (size_t i = 0; i < params.n_threads; i++) { + printf("-------"); + } + + printf("\n"); +} + +static void +print_test_result(struct test_case *test, enum step_id step, int step_idx) +{ + char test_name[50]; + uint32_t *twm; + uint32_t avg; + size_t i; + + twm = xcalloc(params.n_threads, sizeof *twm); + for (i = 0; i < params.n_threads; i++) { + atomic_read(&thread_working_ms[i], &twm[i]); + } + + avg = 0; + for (i = 0; i < params.n_threads; i++) { + avg += twm[i]; + } + + ovs_assert(params.n_threads); + avg /= params.n_threads; + + snprintf(test_name, sizeof test_name, "%d.%d-%s", + test->idx, step_idx, + step_names[step]); + if (params.csv_format) { + printf("%s,%" PRIu32, test_name, avg); + } else { + printf("%*s: ", 18, test_name); + printf(" %6" PRIu32, avg); + for (i = 0; i < params.n_threads; i++) { + printf(" %6" PRIu32, twm[i]); + } + printf(" ms"); + } + + printf("\n"); + + free(twm); +} + +static struct test_case test_cases[] = { + { + .schedule = { + STEP_ALLOC, + STEP_FREE, + }, + }, + { + .schedule = { + STEP_ALLOC, + STEP_REF, + STEP_UNREF, + STEP_FREE, + }, + }, + { + .schedule = { + STEP_MIXED, + STEP_FREE, + }, + }, + { + .schedule = { + STEP_ALLOC, + STEP_POS_QUERY, + /* Test negative query with map full. */ + STEP_NEG_QUERY, + STEP_FREE, + /* Test negative query with map empty. */ + STEP_NEG_QUERY, + }, + }, +}; + +static void +swap_ptr(void **a, void **b) +{ + void *t; + t = *a; + *a = *b; + *b = t; +} + +struct aux { + struct test_case test; + struct refmap *rfm; +}; + +static void * +benchmark_thread_worker(void *aux_) +{ + unsigned int tid = thread_id(); + unsigned int n_ids_per_thread; + unsigned int start_idx; + struct aux *aux = aux_; + struct refmap *rfm; + unsigned int start; + uint32_t *th_ids; + void **th_privs; + void *value; + size_t i; + + n_ids_per_thread = params.n_ids / params.n_threads; + start_idx = tid * n_ids_per_thread; + th_privs = &values[start_idx]; + th_ids = &ids[start_idx]; + + for (;;) { + bool stop_; + + ovs_barrier_block(&barrier_outer); + atomic_read(&stop, &stop_); + if (stop_) { + break; + } + + /* Wait for main thread to finish initializing + * rfm and step schedule. 
*/ + ovs_barrier_block(&barrier_inner); + rfm = aux->rfm; + + FOREACH_STEP(step, aux->test.schedule) { + ovs_barrier_block(&barrier_inner); + atomic_read(&running_time_ms, &start); + switch (step) { + case STEP_ALLOC: + case STEP_REF: + for (i = 0; i < n_ids_per_thread; i++) { + struct key key = { + .idx = start_idx + i, + }; + struct arg arg = { + .ptr = &th_ids[i], + }; + + th_privs[i] = refmap_ref(rfm, &key, &arg); + } + break; + case STEP_POS_QUERY: + for (i = 0; i < n_ids_per_thread; i++) { + struct key key = { + .idx = start_idx + i, + }; + value = refmap_try_ref(rfm, &key); + refmap_unref(rfm, value); + } + break; + case STEP_NEG_QUERY: + for (i = 0; i < n_ids_per_thread; i++) { + struct key key = { + .idx = params.n_ids + 1, + }; + value = refmap_try_ref(rfm, &key); + refmap_unref(rfm, value); + } + break; + case STEP_UNREF: + case STEP_FREE: + for (i = 0; i < n_ids_per_thread; i++) { + refmap_unref(rfm, th_privs[i]); + } + break; + case STEP_MIXED: + for (i = 0; i < n_ids_per_thread; i++) { + struct arg arg; + struct key key; + int shuffled; + + /* Mixed mode is doing: + * 1. Alloc. + * 2. Shuffle two elements. + * 3. Delete shuffled element. + * 4. Alloc again. + * The loop ends with all elements allocated. + */ + + memset(&key, 0, sizeof key); + key.idx = start_idx + i; + shuffled = random_range(i + 1); + + arg.ptr = &th_ids[i]; + th_privs[i] = refmap_ref(rfm, &key, &arg); + swap_ptr(&th_privs[i], &th_privs[shuffled]); + refmap_unref(rfm, th_privs[i]); + arg.ptr = &th_ids[i]; + th_privs[i] = refmap_ref(rfm, &key, &arg); + } + break; + default: + fprintf(stderr, "[%u]: Reached step %d\n", + tid, step); + OVS_NOT_REACHED(); + break; + } + atomic_store(&thread_working_ms[tid], elapsed(start)); + ovs_barrier_block(&barrier_inner); + /* Main thread prints result now. */ + } + } + + return NULL; +} + +static void +benchmark_thread_main(struct aux *aux) +{ + int step_idx; + + memset(ids, 0, params.n_ids * sizeof *ids); + memset(values, 0, params.n_ids * sizeof *values); + + aux->rfm = refmap_create("benchmark-rfm", sizeof(struct key), + sizeof(struct value), value_init, value_uninit, + value_format); + + print_test_header(&aux->test); + ovs_barrier_block(&barrier_inner); + /* Init is done, worker can start preparing to work. */ + step_idx = 0; + FOREACH_STEP(step, aux->test.schedule) { + ovs_barrier_block(&barrier_inner); + /* Workers do the scheduled work now. 
*/ + ovs_barrier_block(&barrier_inner); + print_test_result(&aux->test, step, step_idx++); + } + + refmap_destroy(aux->rfm); +} + +static bool +parse_benchmark_params(int argc, char *argv[]) +{ + long int l_threads = 0; + long int l_ids = 0; + bool valid = true; + long int l; + int i; + + params.step_idx = -1; + for (i = 0; i < argc; i++) { + if (!strcmp(argv[i], "benchmark") || + !strcmp(argv[i], "debug")) { + continue; + } else if (!strcmp(argv[i], "csv")) { + params.csv_format = true; + } else if (!strncmp(argv[i], "step=", 5)) { + if (!str_to_long(&argv[i][5], 10, &l)) { + fprintf(stderr, + "Invalid parameter '%s', expected positive integer.\n", + argv[i]); + valid = false; + goto out; + } + + params.step_idx = l; + } else { + if (!str_to_long(argv[i], 10, &l)) { + fprintf(stderr, + "Invalid parameter '%s', expected positive integer.\n", + argv[i]); + valid = false; + goto out; + } + + if (l_ids == 0) { + l_ids = l; + } else if (l_threads == 0) { + l_threads = l; + } else { + fprintf(stderr, + "Invalid parameter '%s', too many integer values.\n", + argv[i]); + valid = false; + goto out; + } + } + } + + if (l_ids != 0) { + params.n_ids = l_ids; + } else { + fprintf(stderr, "Invalid parameters: no number of elements given.\n"); + valid = false; + } + + if (l_threads != 0) { + params.n_threads = l_threads; + } else { + fprintf(stderr, "Invalid parameters: no number of threads given.\n"); + valid = false; + } + +out: + return valid; +} + +static void +run_benchmark(struct ovs_cmdl_context *ctx) +{ + pthread_t *threads; + pthread_t clock; + struct aux aux; + size_t i; + + if (!parse_benchmark_params(ctx->argc, ctx->argv)) { + return; + } + + ids = xcalloc(params.n_ids, sizeof *ids); + values = xcalloc(params.n_ids, sizeof *values); + thread_working_ms = xcalloc(params.n_threads, + sizeof *thread_working_ms); + for (i = 0; i < params.n_threads; i++) { + atomic_init(&thread_working_ms[i], 0); + } + + atomic_init(&stop, false); + + clock = ovs_thread_create("clock", clock_main, NULL); + + ovsrcu_quiesce_start(); + ovs_barrier_init(&barrier_outer, params.n_threads + 1); + ovs_barrier_init(&barrier_inner, params.n_threads + 1); + threads = xmalloc(params.n_threads * sizeof *threads); + for (i = 0; i < params.n_threads; i++) { + threads[i] = ovs_thread_create("worker", + benchmark_thread_worker, &aux); + } + + print_header(); + for (i = 0; i < ARRAY_SIZE(test_cases); i++) { + test_cases[i].idx = i; + if (params.step_idx != -1 && + params.step_idx != i) { + continue; + } + /* If we don't block workers from progressing now, + * there would be a race for access to aux.test, + * leading to some workers not respecting the schedule. 
+ */ + ovs_barrier_block(&barrier_outer); + memcpy(&aux.test, &test_cases[i], sizeof aux.test); + benchmark_thread_main(&aux); + } + + atomic_store(&stop, true); + ovs_barrier_block(&barrier_outer); + + for (i = 0; i < params.n_threads; i++) { + xpthread_join(threads[i], NULL); + } + + free(threads); + + ovs_barrier_destroy(&barrier_outer); + ovs_barrier_destroy(&barrier_inner); + free(ids); + free(values); + free(thread_working_ms); + xpthread_join(clock, NULL); +} + +static const struct ovs_cmdl_command commands[] = { + {"check", "[debug]", 0, 1, run_check, OVS_RO}, + {"benchmark", "<nb elem> <nb threads> [step=<uint>] [csv]", 0, 4, + run_benchmark, OVS_RO}, + {NULL, NULL, 0, 0, NULL, OVS_RO}, +}; + +static void +parse_test_params(int argc, char *argv[]) +{ + int i; + + for (i = 0; i < argc; i++) { + if (!strcmp(argv[i], "debug")) { + params.debug = true; + } + } +} + +static void +refmap_test_main(int argc, char *argv[]) +{ + struct ovs_cmdl_context ctx = { + .argc = argc - optind, + .argv = argv + optind, + }; + + parse_test_params(argc - optind, argv + optind); + + vlog_set_levels(NULL, VLF_ANY_DESTINATION, VLL_OFF); + if (params.debug) { + vlog_set_levels_from_string_assert("refmap:console:dbg"); + } + + /* Quiesce to start the RCU. */ + ovsrcu_quiesce(); + + set_program_name(argv[0]); + ovs_cmdl_run_command(&ctx, commands); + + ovsrcu_exit(); +} + +OVSTEST_REGISTER("test-refmap", refmap_test_main); diff --git a/utilities/checkpatch_dict.txt b/utilities/checkpatch_dict.txt index c1f43e5af..5ad599c1d 100644 --- a/utilities/checkpatch_dict.txt +++ b/utilities/checkpatch_dict.txt @@ -107,6 +107,7 @@ icmp icmp4 icmpv6 idl +ie ifdef ifindex initializer @@ -232,6 +233,7 @@ rebased recirc recirculation recirculations +refmap revalidate revalidation revalidator