[ovs-dev,v3,14/28] seq-pool: Module for faster ID generation

Message ID: bf9b4f254d497830f91411f0e395b8d7ed5601f8.1619351442.git.grive@u256.net
State: Changes Requested
Series: dpif-netdev: Parallel offload processing

Commit Message

Gaetan Rivet April 25, 2021, 11:55 a.m. UTC
The current id-pool module is slow at allocating the
next valid ID, and can be optimized by restricting
some properties of the pool.

Those restrictions are:

  * No ability to add a random ID to the pool.

  * A new ID is no longer guaranteed to be the smallest
    possible ID.  It is however guaranteed to be in the
    range [base, next_id].  Multiple users of the pool are
    registered, each with a thread-local cache for
    better scalability, and next_id is one past the
    latest ID added to any user cache.
    The allocation range can be written as:

       [base, last_alloc + nb-user * cache-size + 1].

  * A user should never free an ID that is not allocated.
    No checks are done, and doing so will duplicate the
    spurious ID.  Refcounting or another memory management
    scheme should be used to ensure an object and its ID
    are freed only once.

This pool is designed to scale reasonably well in a multi-threaded
setup.  As it is aimed at being a faster replacement for the
current id-pool, a benchmark has been implemented alongside
unit tests.
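
To make the scheme concrete, here is a minimal sketch of a per-user
cached allocator (illustration only: all names below are invented, the
actual interface and implementation are in lib/seq-pool.h and
lib/seq-pool.c of this patch):

    /* Each user keeps a small cache of free IDs.  Freed IDs go back to
     * the local cache and are reused before new IDs are taken from the
     * shared counter, which is why a new ID is only guaranteed to fall
     * within [base, last_alloc + nb-user * cache-size + 1]. */
    #include <stdbool.h>
    #include <stdint.h>

    #define CACHE_SIZE 32             /* Hypothetical per-user cache size. */

    struct user_cache {
        uint32_t ids[CACHE_SIZE];     /* Locally cached free IDs. */
        unsigned int n;               /* Number of cached IDs. */
    };

    struct toy_pool {
        uint32_t next;                /* Next never-allocated ID, starts at 'base'. */
    };

    static bool
    toy_new_id(struct toy_pool *pool, struct user_cache *cache, uint32_t *id)
    {
        if (cache->n > 0) {
            *id = cache->ids[--cache->n];   /* Reuse a locally freed ID first. */
        } else {
            /* The real module refills the cache in batches under a lock. */
            *id = pool->next++;
        }
        return true;
    }

    static void
    toy_free_id(struct user_cache *cache, uint32_t id)
    {
        /* No double-free check, as stated above: freeing an ID twice
         * would make it available twice.  A full cache would be flushed
         * back to the shared pool in the real module. */
        if (cache->n < CACHE_SIZE) {
            cache->ids[cache->n++] = id;
        }
    }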

The benchmark is composed of 4 rounds: 'new', 'del', 'mix', and 'rnd'.
Respectively:

  + 'new': only allocate IDs.
  + 'del': only free IDs.
  + 'mix': allocate, free sequentially, then allocate IDs again.
  + 'rnd': allocate, free randomly, then allocate IDs again.

Randomized freeing is done by swapping the latest allocated ID with any
ID from the range of currently allocated IDs, which is reminiscent of
the Fisher-Yates shuffle.  This evaluates freeing non-sequential IDs,
which is the more natural use-case.
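
For illustration, the 'rnd' inner loop boils down to the following
sketch (the same swap/free/alloc pattern appears in the test code
quoted further down; alloc_id()/free_id() here are simple stand-ins,
not the real pool API):

    /* Allocate an ID, swap the freshly filled slot with a random
     * already-filled slot, free the ID now in that slot, then allocate
     * again.  The swap is what makes the freed IDs non-sequential. */
    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t next_id;          /* Stand-in allocator state. */
    static uint32_t alloc_id(void) { return next_id++; }
    static void free_id(uint32_t id) { (void) id; }

    static void
    swap_u32(uint32_t *a, uint32_t *b)
    {
        uint32_t tmp = *a;

        *a = *b;
        *b = tmp;
    }

    static void
    rnd_round(uint32_t *ids, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            ids[i] = alloc_id();
            swap_u32(&ids[i], &ids[rand() % (i + 1)]);
            free_id(ids[i]);
            ids[i] = alloc_id();
        }
    }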

For this specific round, the id-pool performance is such that a timeout
of 10 seconds is added to the benchmark:

   $ ./tests/ovstest test-seq-pool benchmark 10000 1
   Benchmarking n=10000 on 1 thread.
    type\thread:       1    Avg
   seq-pool new:       1      1 ms
   seq-pool del:       0      0 ms
   seq-pool mix:       1      1 ms
   seq-pool rnd:       1      1 ms
    id-pool new:       0      0 ms
    id-pool del:       1      1 ms
    id-pool mix:       1      1 ms
    id-pool rnd:    1201   1201 ms

   $ ./tests/ovstest test-seq-pool benchmark 100000 1
   Benchmarking n=100000 on 1 thread.
    type\thread:       1    Avg
   seq-pool new:       2      2 ms
   seq-pool del:       5      5 ms
   seq-pool mix:       5      5 ms
   seq-pool rnd:       5      5 ms
    id-pool new:       8      8 ms
    id-pool del:       5      5 ms
    id-pool mix:      11     11 ms
    id-pool rnd:  10000+ ****** ms

   $ ./tests/ovstest test-seq-pool benchmark 1000000 1
   Benchmarking n=1000000 on 1 thread.
    type\thread:       1    Avg
   seq-pool new:      23     23 ms
   seq-pool del:      49     49 ms
   seq-pool mix:      53     53 ms
   seq-pool rnd:      53     53 ms
    id-pool new:     190    190 ms
    id-pool del:     173    173 ms
    id-pool mix:     273    273 ms
    id-pool rnd:  10042+ ****** ms

   $ ./tests/ovstest test-seq-pool benchmark 1000000 2
   Benchmarking n=1000000 on 2 threads.
    type\thread:       1      2    Avg
   seq-pool new:      40     39     39 ms
   seq-pool del:      33     33     33 ms
   seq-pool mix:      89     91     90 ms
   seq-pool rnd:     146    151    148 ms
    id-pool new:     485    485    485 ms
    id-pool del:     541    542    541 ms
    id-pool mix:     550    600    575 ms
    id-pool rnd:  10048+ 10003+ ****** ms

   $ ./tests/ovstest test-seq-pool benchmark 1000000 4
   Benchmarking n=1000000 on 4 threads.
    type\thread:       1      2      3      4    Avg
   seq-pool new:      40     39     40     40     39 ms
   seq-pool del:      24     28     28     30     27 ms
   seq-pool mix:      60     63     69     69     65 ms
   seq-pool rnd:     195    197    202    202    199 ms
    id-pool new:     478    471    482    485    479 ms
    id-pool del:     474    469    467    474    471 ms
    id-pool mix:     558    558    611    545    568 ms
    id-pool rnd:  10121+ 10076+ 10030+ 10167+ ****** ms

Signed-off-by: Gaetan Rivet <grive@u256.net>
Reviewed-by: Eli Britstein <elibr@nvidia.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/automake.mk       |   2 +
 lib/seq-pool.c        | 198 +++++++++++++++
 lib/seq-pool.h        |  66 +++++
 tests/automake.mk     |   1 +
 tests/library.at      |   5 +
 tests/test-seq-pool.c | 543 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 815 insertions(+)
 create mode 100644 lib/seq-pool.c
 create mode 100644 lib/seq-pool.h
 create mode 100644 tests/test-seq-pool.c

Comments

Ilya Maximets April 30, 2021, 5:31 p.m. UTC | #1
On 4/25/21 1:55 PM, Gaetan Rivet wrote:
> The current id-pool module is slow to allocate the
> next valid ID, and can be optimized when restricting
> some properties of the pool.
> 
> Those restrictions are:
> 
>   * No ability to add a random ID to the pool.
> 
>   * A new ID is no more the smallest possible ID.  It is
>     however guaranteed to be in the range of
>     [base, next_id].  Multiple users of the pool are
>     registered, each with a thread-local cache for
>     better scalability and the next_id is one after the
>     latest ID added to any user cache.
>     The allocation range can be written as:
> 
>        [base, last_alloc + nb-user * cache-size + 1].
> 
>   * A user should never free an ID that is not allocated.
>     No checks are done and doing so will duplicate the spurious
>     ID.  Refcounting or other memory management scheme should
>     be used to ensure an object and its ID are only freed once.
> 
> This pool is designed to scale reasonably well in multi-thread
> setup.  As it is aimed at being a faster replacement to the
> current id-pool, a benchmark has been implemented alongside
> unit tests.
> 
> The benchmark is composed of 4 rounds: 'new', 'del', 'mix', and 'rnd'.
> Respectively
> 
>   + 'new': only allocate IDs
>   + 'del': only free IDs
>   + 'mix': allocate, sequential free, then allocate ID.
>   + 'rnd': allocate, random free, allocate ID.
> 
> Randomized freeing is done by swapping the latest allocated ID with any
> from the range of currently allocated ID, which is reminiscent of the
> Fisher-Yates shuffle.  This evaluates freeing non-sequential IDs,
> which is the more natural use-case.
> 
> For this specific round, the id-pool performance is such that a timeout
> of 10 seconds is added to the benchmark:
> 
>    $ ./tests/ovstest test-seq-pool benchmark 10000 1
>    Benchmarking n=10000 on 1 thread.
>     type\thread:       1    Avg
>    seq-pool new:       1      1 ms
>    seq-pool del:       0      0 ms
>    seq-pool mix:       1      1 ms
>    seq-pool rnd:       1      1 ms
>     id-pool new:       0      0 ms
>     id-pool del:       1      1 ms
>     id-pool mix:       1      1 ms
>     id-pool rnd:    1201   1201 ms
> 
>    $ ./tests/ovstest test-seq-pool benchmark 100000 1
>    Benchmarking n=100000 on 1 thread.
>     type\thread:       1    Avg
>    seq-pool new:       2      2 ms
>    seq-pool del:       5      5 ms
>    seq-pool mix:       5      5 ms
>    seq-pool rnd:       5      5 ms
>     id-pool new:       8      8 ms
>     id-pool del:       5      5 ms
>     id-pool mix:      11     11 ms
>     id-pool rnd:  10000+ ****** ms
> 
>    $ ./tests/ovstest test-seq-pool benchmark 1000000 1
>    Benchmarking n=1000000 on 1 thread.
>     type\thread:       1    Avg
>    seq-pool new:      23     23 ms
>    seq-pool del:      49     49 ms
>    seq-pool mix:      53     53 ms
>    seq-pool rnd:      53     53 ms
>     id-pool new:     190    190 ms
>     id-pool del:     173    173 ms
>     id-pool mix:     273    273 ms
>     id-pool rnd:  10042+ ****** ms
> 
>    $ ./tests/ovstest test-seq-pool benchmark 1000000 2
>    Benchmarking n=1000000 on 2 threads.
>     type\thread:       1      2    Avg
>    seq-pool new:      40     39     39 ms
>    seq-pool del:      33     33     33 ms
>    seq-pool mix:      89     91     90 ms
>    seq-pool rnd:     146    151    148 ms
>     id-pool new:     485    485    485 ms
>     id-pool del:     541    542    541 ms
>     id-pool mix:     550    600    575 ms
>     id-pool rnd:  10048+ 10003+ ****** ms
> 
>    $ ./tests/ovstest test-seq-pool benchmark 1000000 4
>    Benchmarking n=1000000 on 4 threads.
>     type\thread:       1      2      3      4    Avg
>    seq-pool new:      40     39     40     40     39 ms
>    seq-pool del:      24     28     28     30     27 ms
>    seq-pool mix:      60     63     69     69     65 ms
>    seq-pool rnd:     195    197    202    202    199 ms
>     id-pool new:     478    471    482    485    479 ms
>     id-pool del:     474    469    467    474    471 ms
>     id-pool mix:     558    558    611    545    568 ms
>     id-pool rnd:  10121+ 10076+ 10030+ 10167+ ****** ms

Exactly the same question here.  The new data structure doesn't provide
any guarantees or checks.  It doesn't allocate the smallest id.
Then why not just an array + spinlock?  We're already using this
scheme in lib/netdev-afxdp-pool.c and it works fine.

I crafted a very simple implementation and added it to this benchmark.
Results are following:

$ ./tests/ovstest test-seq-pool benchmark 10000 1
Benchmarking n=10000 on 1 thread.
 type\thread:       1    Avg
seq-pool new:       1      1 ms
seq-pool del:       2      2 ms
seq-pool mix:       4      4 ms
seq-pool rnd:       4      4 ms
 my-pool new:       1      1 ms
 my-pool del:       1      1 ms
 my-pool mix:       1      1 ms
 my-pool rnd:       2      2 ms

$ ./tests/ovstest test-seq-pool benchmark 100000 1
Benchmarking n=100000 on 1 thread.
 type\thread:       1    Avg
seq-pool new:      13     13 ms
seq-pool del:      24     24 ms
seq-pool mix:      11     11 ms
seq-pool rnd:       8      8 ms
 my-pool new:       2      2 ms
 my-pool del:       1      1 ms
 my-pool mix:       4      4 ms
 my-pool rnd:       3      3 ms

$ ./tests/ovstest test-seq-pool benchmark 1000000 1
Benchmarking n=1000000 on 1 thread.
 type\thread:       1    Avg
seq-pool new:      43     43 ms
seq-pool del:      50     50 ms
seq-pool mix:      54     54 ms
seq-pool rnd:      54     54 ms
 my-pool new:      10     10 ms
 my-pool del:      10     10 ms
 my-pool mix:      28     28 ms
 my-pool rnd:      41     41 ms

$ ./tests/ovstest test-seq-pool benchmark 1000000 2
Benchmarking n=1000000 on 2 threads.
 type\thread:       1      2    Avg
seq-pool new:      72     71     71 ms
seq-pool del:      31     33     32 ms
seq-pool mix:      83     83     83 ms
seq-pool rnd:     144    147    145 ms
 my-pool new:      16     12     14 ms
 my-pool del:      16     12     14 ms
 my-pool mix:      66     70     68 ms
 my-pool rnd:      42     58     50 ms

$ ./tests/ovstest test-seq-pool benchmark 1000000 4
Benchmarking n=1000000 on 4 threads.
 type\thread:       1      2      3      4    Avg
seq-pool new:      71     69     69     67     69 ms
seq-pool del:      27     22     27     23     24 ms
seq-pool mix:      57     64     71     67     64 ms
seq-pool rnd:     182    186    189    191    187 ms
 my-pool new:      50     42     44     50     46 ms
 my-pool del:      46     39     37     45     41 ms
 my-pool mix:      79     45     55     81     65 ms
 my-pool rnd:     149    130    123    148    137 ms

So, this implementation is faster and doesn't require any new complex
data structures.

Sometimes high contention on a spin lock results in unfair distribution
and gives somewhat worse results, but they are still not too shabby, e.g.:

$ ./tests/ovstest test-seq-pool benchmark 1000000 4
Benchmarking n=1000000 on 4 threads.
 type\thread:       1      2      3      4    Avg
seq-pool new:      75     75     75     74     74 ms
seq-pool del:      25     22     23     26     24 ms
seq-pool mix:      56     68     63     69     64 ms
seq-pool rnd:     183    188    191    192    188 ms
 my-pool new:      52     52     53     39     49 ms
 my-pool del:      52     54     55     42     50 ms
 my-pool mix:     150    156    159    126    147 ms
 my-pool rnd:     160    175    177    139    162 ms

I didn't add any core affinity management to this benchmark.
And even in a scenario that is bad for a spinlock, with 16 threads on
4 physical cores with multi-threading, it doesn't look terrible:

$ ./tests/ovstest test-seq-pool benchmark 1000000 16
Benchmarking n=1000000 on 16 threads.
 type\thread:       1      ...     16    Avg
seq-pool new:      38      ...     40     40 ms
seq-pool del:      23      ...     20     20 ms
seq-pool mix:      66      ...     68     65 ms
seq-pool rnd:     200      ...    185    195 ms
 my-pool new:      68      ...     91     97 ms
 my-pool del:      87      ...    163    145 ms
 my-pool mix:     363      ...    359    343 ms
 my-pool rnd:     464      ...    432    428 ms

For the scenario with a few offload threads it should be perfectly fine.
What do you think?

Best regards, Ilya Maximets.


P.S. Here is an implementation that I used for tests:

diff --git a/tests/test-seq-pool.c b/tests/test-seq-pool.c
index a1e0d5500..53af7be20 100644
--- a/tests/test-seq-pool.c
+++ b/tests/test-seq-pool.c
@@ -24,6 +24,7 @@
 #include "command-line.h"
 #include "id-pool.h"
 #include "openvswitch/util.h"
+#include "openvswitch/vlog.h"
 #include "ovs-thread.h"
 #include "ovs-rcu.h"
 #include "ovstest.h"
@@ -468,6 +469,180 @@ benchmark_id_pool(void)
     free(threads);
 }
 
+struct id_unordered_spin_pool {
+    struct ovs_spin lock;
+    uint32_t *array;
+    size_t count;
+};
+
+static struct id_unordered_spin_pool *
+id_unordered_spin_pool_create(int start, int size)
+{
+    struct id_unordered_spin_pool *pool = xzalloc(sizeof *pool);
+
+    ovs_spin_init(&pool->lock);
+    ovs_spin_lock(&pool->lock);
+    pool->array = xzalloc(size * sizeof pool->array[0]);
+    for (int i = 0; i < size; i++) {
+        pool->array[pool->count++] = start + i;
+    }
+    ovs_spin_unlock(&pool->lock);
+    return pool;
+}
+
+static bool
+id_unordered_spin_pool_alloc_id(struct id_unordered_spin_pool *pool,
+                                uint32_t *id)
+{
+    bool res = false;
+
+    ovs_spin_lock(&pool->lock);
+    if (pool->count) {
+        *id = pool->array[--pool->count];
+        res = true;
+    }
+    ovs_spin_unlock(&pool->lock);
+    return res;
+}
+
+static void
+id_unordered_spin_pool_free_id(struct id_unordered_spin_pool *pool,
+                               uint32_t id)
+{
+    ovs_spin_lock(&pool->lock);
+    pool->array[pool->count++] = id;
+    ovs_spin_unlock(&pool->lock);
+}
+
+static void
+id_unordered_spin_pool_destroy(struct id_unordered_spin_pool *pool)
+{
+    ovs_spin_lock(&pool->lock);
+    free(pool->array);
+    ovs_spin_unlock(&pool->lock);
+    ovs_spin_destroy(&pool->lock);
+    free(pool);
+}
+
+struct id_unordered_spin_pool_aux {
+    struct id_unordered_spin_pool *pool;
+    atomic_uint thread_id;
+};
+
+static void *
+id_unordered_spin_pool_thread(void *aux_)
+{
+    unsigned int n_ids_per_thread;
+    struct id_unordered_spin_pool_aux *aux = aux_;
+    uint32_t *th_ids;
+    unsigned int tid;
+    int start;
+    size_t i;
+
+    atomic_add(&aux->thread_id, 1u, &tid);
+    n_ids_per_thread = n_ids / n_threads;
+    th_ids = &ids[tid * n_ids_per_thread];
+
+    /* NEW */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ovs_assert(id_unordered_spin_pool_alloc_id(aux->pool, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* DEL */
+
+    shuffle(th_ids, n_ids_per_thread);
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        id_unordered_spin_pool_free_id(aux->pool, th_ids[i]);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ignore(id_unordered_spin_pool_alloc_id(aux->pool, &th_ids[i]));
+        id_unordered_spin_pool_free_id(aux->pool, th_ids[i]);
+        ignore(id_unordered_spin_pool_alloc_id(aux->pool, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX SHUFFLED */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        if (elapsed(&start) >= TIMEOUT_MS) {
+            break;
+        }
+        ignore(id_unordered_spin_pool_alloc_id(aux->pool, &th_ids[i]));
+        swap_u32(&th_ids[i], &th_ids[random_range(i + 1)]);
+        id_unordered_spin_pool_free_id(aux->pool, th_ids[i]);
+        ignore(id_unordered_spin_pool_alloc_id(aux->pool, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    return NULL;
+}
+
+static void
+benchmark_id_unordered_spin_pool(void)
+{
+    pthread_t *threads;
+    struct id_unordered_spin_pool_aux aux;
+    size_t i;
+
+    memset(ids, 0, n_ids * sizeof *ids);
+    memset(thread_working_ms, 0, n_threads * sizeof *thread_working_ms);
+
+    aux.pool = id_unordered_spin_pool_create(0, n_ids);
+    atomic_store(&aux.thread_id, 0);
+
+    for (i = n_ids - (n_ids % n_threads); i < n_ids; i++) {
+        id_unordered_spin_pool_alloc_id(aux.pool, &ids[i]);
+    }
+
+    threads = xmalloc(n_threads * sizeof *threads);
+    ovs_barrier_init(&barrier, n_threads + 1);
+
+    for (i = 0; i < n_threads; i++) {
+        threads[i] = ovs_thread_create("id_unordered_spin_pool_alloc",
+                                       id_unordered_spin_pool_thread, &aux);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    print_result(" my-pool new");
+
+    ovs_barrier_block(&barrier);
+
+    print_result(" my-pool del");
+
+    ovs_barrier_block(&barrier);
+
+    print_result(" my-pool mix");
+
+    for (i = 0; i < n_threads; i++) {
+        xpthread_join(threads[i], NULL);
+    }
+
+    print_result(" my-pool rnd");
+
+    id_unordered_spin_pool_destroy(aux.pool);
+    ovs_barrier_destroy(&barrier);
+    free(threads);
+}
+
+
 static void *
 clock_main(void *arg OVS_UNUSED)
 {
@@ -492,6 +666,8 @@ run_benchmarks(struct ovs_cmdl_context *ctx)
     long int l_ids;
     size_t i;
 
+    vlog_set_levels(NULL, VLF_ANY_DESTINATION, VLL_OFF);
+
     l_ids = strtol(ctx->argv[1], NULL, 10);
     l_threads = strtol(ctx->argv[2], NULL, 10);
     ovs_assert(l_ids > 0 && l_threads > 0);
@@ -514,7 +690,8 @@ run_benchmarks(struct ovs_cmdl_context *ctx)
     printf("   Avg\n");
 
     benchmark_seq_pool();
-    benchmark_id_pool();
+    /* benchmark_id_pool(); */
+    benchmark_id_unordered_spin_pool();
 
     stop = true;
 
---
Gaetan Rivet May 1, 2021, 2:49 p.m. UTC | #2
On Fri, Apr 30, 2021, at 19:31, Ilya Maximets wrote:
> On 4/25/21 1:55 PM, Gaetan Rivet wrote:
> > The current id-pool module is slow to allocate the
> > next valid ID, and can be optimized when restricting
> > some properties of the pool.
> > 
> > Those restrictions are:
> > 
> >   * No ability to add a random ID to the pool.
> > 
> >   * A new ID is no more the smallest possible ID.  It is
> >     however guaranteed to be in the range of
> >     [base, next_id].  Multiple users of the pool are
> >     registered, each with a thread-local cache for
> >     better scalability and the next_id is one after the
> >     latest ID added to any user cache.
> >     The allocation range can be written as:
> > 
> >        [base, last_alloc + nb-user * cache-size + 1].
> > 
> >   * A user should never free an ID that is not allocated.
> >     No checks are done and doing so will duplicate the spurious
> >     ID.  Refcounting or other memory management scheme should
> >     be used to ensure an object and its ID are only freed once.
> > 
> > This pool is designed to scale reasonably well in multi-thread
> > setup.  As it is aimed at being a faster replacement to the
> > current id-pool, a benchmark has been implemented alongside
> > unit tests.
> > 
> > The benchmark is composed of 4 rounds: 'new', 'del', 'mix', and 'rnd'.
> > Respectively
> > 
> >   + 'new': only allocate IDs
> >   + 'del': only free IDs
> >   + 'mix': allocate, sequential free, then allocate ID.
> >   + 'rnd': allocate, random free, allocate ID.
> > 
> > Randomized freeing is done by swapping the latest allocated ID with any
> > from the range of currently allocated ID, which is reminiscent of the
> > Fisher-Yates shuffle.  This evaluates freeing non-sequential IDs,
> > which is the more natural use-case.
> > 
> > For this specific round, the id-pool performance is such that a timeout
> > of 10 seconds is added to the benchmark:
> > 
> >    $ ./tests/ovstest test-seq-pool benchmark 10000 1
> >    Benchmarking n=10000 on 1 thread.
> >     type\thread:       1    Avg
> >    seq-pool new:       1      1 ms
> >    seq-pool del:       0      0 ms
> >    seq-pool mix:       1      1 ms
> >    seq-pool rnd:       1      1 ms
> >     id-pool new:       0      0 ms
> >     id-pool del:       1      1 ms
> >     id-pool mix:       1      1 ms
> >     id-pool rnd:    1201   1201 ms
> > 
> >    $ ./tests/ovstest test-seq-pool benchmark 100000 1
> >    Benchmarking n=100000 on 1 thread.
> >     type\thread:       1    Avg
> >    seq-pool new:       2      2 ms
> >    seq-pool del:       5      5 ms
> >    seq-pool mix:       5      5 ms
> >    seq-pool rnd:       5      5 ms
> >     id-pool new:       8      8 ms
> >     id-pool del:       5      5 ms
> >     id-pool mix:      11     11 ms
> >     id-pool rnd:  10000+ ****** ms
> > 
> >    $ ./tests/ovstest test-seq-pool benchmark 1000000 1
> >    Benchmarking n=1000000 on 1 thread.
> >     type\thread:       1    Avg
> >    seq-pool new:      23     23 ms
> >    seq-pool del:      49     49 ms
> >    seq-pool mix:      53     53 ms
> >    seq-pool rnd:      53     53 ms
> >     id-pool new:     190    190 ms
> >     id-pool del:     173    173 ms
> >     id-pool mix:     273    273 ms
> >     id-pool rnd:  10042+ ****** ms
> > 
> >    $ ./tests/ovstest test-seq-pool benchmark 1000000 2
> >    Benchmarking n=1000000 on 2 threads.
> >     type\thread:       1      2    Avg
> >    seq-pool new:      40     39     39 ms
> >    seq-pool del:      33     33     33 ms
> >    seq-pool mix:      89     91     90 ms
> >    seq-pool rnd:     146    151    148 ms
> >     id-pool new:     485    485    485 ms
> >     id-pool del:     541    542    541 ms
> >     id-pool mix:     550    600    575 ms
> >     id-pool rnd:  10048+ 10003+ ****** ms
> > 
> >    $ ./tests/ovstest test-seq-pool benchmark 1000000 4
> >    Benchmarking n=1000000 on 4 threads.
> >     type\thread:       1      2      3      4    Avg
> >    seq-pool new:      40     39     40     40     39 ms
> >    seq-pool del:      24     28     28     30     27 ms
> >    seq-pool mix:      60     63     69     69     65 ms
> >    seq-pool rnd:     195    197    202    202    199 ms
> >     id-pool new:     478    471    482    485    479 ms
> >     id-pool del:     474    469    467    474    471 ms
> >     id-pool mix:     558    558    611    545    568 ms
> >     id-pool rnd:  10121+ 10076+ 10030+ 10167+ ****** ms
> 
> Exactly same question here.  The new data structure doesn't provide
> any guarantees and checks.  It doesn't allocate smallest id.
> When why not just an array + spinlock?   We're already using this
> schema in lib/netdev-afxdp-pool.c and it works fine.
> 
> I crafted a very simple implementation and added it to this benchmark.
> Results are following:
> 
> $ ./tests/ovstest test-seq-pool benchmark 10000 1
> Benchmarking n=10000 on 1 thread.
>  type\thread:       1    Avg
> seq-pool new:       1      1 ms
> seq-pool del:       2      2 ms
> seq-pool mix:       4      4 ms
> seq-pool rnd:       4      4 ms
>  my-pool new:       1      1 ms
>  my-pool del:       1      1 ms
>  my-pool mix:       1      1 ms
>  my-pool rnd:       2      2 ms
> 
> $ ./tests/ovstest test-seq-pool benchmark 100000 1
> Benchmarking n=100000 on 1 thread.
>  type\thread:       1    Avg
> seq-pool new:      13     13 ms
> seq-pool del:      24     24 ms
> seq-pool mix:      11     11 ms
> seq-pool rnd:       8      8 ms
>  my-pool new:       2      2 ms
>  my-pool del:       1      1 ms
>  my-pool mix:       4      4 ms
>  my-pool rnd:       3      3 ms
> 
> $ ./tests/ovstest test-seq-pool benchmark 1000000 1
> Benchmarking n=1000000 on 1 thread.
>  type\thread:       1    Avg
> seq-pool new:      43     43 ms
> seq-pool del:      50     50 ms
> seq-pool mix:      54     54 ms
> seq-pool rnd:      54     54 ms
>  my-pool new:      10     10 ms
>  my-pool del:      10     10 ms
>  my-pool mix:      28     28 ms
>  my-pool rnd:      41     41 ms
> 
> $ ./tests/ovstest test-seq-pool benchmark 1000000 2
> Benchmarking n=1000000 on 2 threads.
>  type\thread:       1      2    Avg
> seq-pool new:      72     71     71 ms
> seq-pool del:      31     33     32 ms
> seq-pool mix:      83     83     83 ms
> seq-pool rnd:     144    147    145 ms
>  my-pool new:      16     12     14 ms
>  my-pool del:      16     12     14 ms
>  my-pool mix:      66     70     68 ms
>  my-pool rnd:      42     58     50 ms
> 
> $ ./tests/ovstest test-seq-pool benchmark 1000000 4
> Benchmarking n=1000000 on 4 threads.
>  type\thread:       1      2      3      4    Avg
> seq-pool new:      71     69     69     67     69 ms
> seq-pool del:      27     22     27     23     24 ms
> seq-pool mix:      57     64     71     67     64 ms
> seq-pool rnd:     182    186    189    191    187 ms
>  my-pool new:      50     42     44     50     46 ms
>  my-pool del:      46     39     37     45     41 ms
>  my-pool mix:      79     45     55     81     65 ms
>  my-pool rnd:     149    130    123    148    137 ms
> 
> So, this implementation is faster and doesn't require any new complex
> data structures.
> 
> Sometimes high contention on a spin lock results in unfair distribution
> and gives somewhat worse results, but they are still not too shabby, e.g.:
> 
> $ ./tests/ovstest test-seq-pool benchmark 1000000 4
> Benchmarking n=1000000 on 4 threads.
>  type\thread:       1      2      3      4    Avg
> seq-pool new:      75     75     75     74     74 ms
> seq-pool del:      25     22     23     26     24 ms
> seq-pool mix:      56     68     63     69     64 ms
> seq-pool rnd:     183    188    191    192    188 ms
>  my-pool new:      52     52     53     39     49 ms
>  my-pool del:      52     54     55     42     50 ms
>  my-pool mix:     150    156    159    126    147 ms
>  my-pool rnd:     160    175    177    139    162 ms
> 
> I didn't add any core affinity management to this benchmark.
> And even in a bad for spinlock scenario with 16 threads on 4 physical
> cores with multi-threading, it doesn't look terrible:
> 
> $ ./tests/ovstest test-seq-pool benchmark 1000000 16
> Benchmarking n=1000000 on 16 threads.
>  type\thread:       1      ...     16    Avg
> seq-pool new:      38      ...     40     40 ms
> seq-pool del:      23      ...     20     20 ms
> seq-pool mix:      66      ...     68     65 ms
> seq-pool rnd:     200      ...    185    195 ms
>  my-pool new:      68      ...     91     97 ms
>  my-pool del:      87      ...    163    145 ms
>  my-pool mix:     363      ...    359    343 ms
>  my-pool rnd:     464      ...    432    428 ms
> 
> For the scenario with a few offload thread should be perfectly fine.
> What do you think?

Thanks, I hadn't seen the afxdp pool, it's interesting.

I see two issues, one trivial and one maybe more problematic.
The first is that the marks allocated using the id-pool have an
implicit dependency on remaining under a limit.  I can't speak for
other hardware vendors, but Nvidia NICs have a limitation: not all
marks in the full range used by offloads are supported.

It was a surprise for me; otherwise I would have gone for a fully
random ID generator (using malloc-generated addresses), as there are
simpler ways to get there.  For your pool the change is trivial anyway;
only the fill order of the ID values has to change, so that the lowest
IDs are popped first:

    for (int i = size; i > 0; i--) {
        pool->array[pool->count++] = start + i - 1;
    }

There is currently no way to query the hardware for rte_flow limits,
so I don't have a good way to set this limit dynamically, and I don't
see how it could work with port hotplug, which would mean downsizing a
shared range.  Maybe this is another gap in the rte_flow API that
should be addressed.

The other issue is that the full range of IDs is pre-allocated.
With the current mark range that means 16 GB of mostly unused data
(2^32 possible marks, 4 bytes each).  We can limit this to the part
that we actually use, considering that we should only get flow_limit
marks at most.

With a flow limit of 10k, or maybe 20k to deal with async add + delete,
it would be reduced to 80k bytes (20k IDs * 4 bytes) pre-allocated.

I very much prefer your implementation. I am not happy with the seq-pool.
I tried to make it a drop-in replacement for id-pool, so allowing very large
ranges was part of it. If it is okay to restrict the mark range I think it's
worth using yours instead.

Best regards,
Ilya Maximets May 3, 2021, 12:48 p.m. UTC | #3
On 5/1/21 4:49 PM, Gaëtan Rivet wrote:
> On Fri, Apr 30, 2021, at 19:31, Ilya Maximets wrote:
>> On 4/25/21 1:55 PM, Gaetan Rivet wrote:
>>> The current id-pool module is slow to allocate the
>>> next valid ID, and can be optimized when restricting
>>> some properties of the pool.
>>>
>>> Those restrictions are:
>>>
>>>   * No ability to add a random ID to the pool.
>>>
>>>   * A new ID is no more the smallest possible ID.  It is
>>>     however guaranteed to be in the range of
>>>     [base, next_id].  Multiple users of the pool are
>>>     registered, each with a thread-local cache for
>>>     better scalability and the next_id is one after the
>>>     latest ID added to any user cache.
>>>     The allocation range can be written as:
>>>
>>>        [base, last_alloc + nb-user * cache-size + 1].
>>>
>>>   * A user should never free an ID that is not allocated.
>>>     No checks are done and doing so will duplicate the spurious
>>>     ID.  Refcounting or other memory management scheme should
>>>     be used to ensure an object and its ID are only freed once.
>>>
>>> This pool is designed to scale reasonably well in multi-thread
>>> setup.  As it is aimed at being a faster replacement to the
>>> current id-pool, a benchmark has been implemented alongside
>>> unit tests.
>>>
>>> The benchmark is composed of 4 rounds: 'new', 'del', 'mix', and 'rnd'.
>>> Respectively
>>>
>>>   + 'new': only allocate IDs
>>>   + 'del': only free IDs
>>>   + 'mix': allocate, sequential free, then allocate ID.
>>>   + 'rnd': allocate, random free, allocate ID.
>>>
>>> Randomized freeing is done by swapping the latest allocated ID with any
>>> from the range of currently allocated ID, which is reminiscent of the
>>> Fisher-Yates shuffle.  This evaluates freeing non-sequential IDs,
>>> which is the more natural use-case.
>>>
>>> For this specific round, the id-pool performance is such that a timeout
>>> of 10 seconds is added to the benchmark:
>>>
>>>    $ ./tests/ovstest test-seq-pool benchmark 10000 1
>>>    Benchmarking n=10000 on 1 thread.
>>>     type\thread:       1    Avg
>>>    seq-pool new:       1      1 ms
>>>    seq-pool del:       0      0 ms
>>>    seq-pool mix:       1      1 ms
>>>    seq-pool rnd:       1      1 ms
>>>     id-pool new:       0      0 ms
>>>     id-pool del:       1      1 ms
>>>     id-pool mix:       1      1 ms
>>>     id-pool rnd:    1201   1201 ms
>>>
>>>    $ ./tests/ovstest test-seq-pool benchmark 100000 1
>>>    Benchmarking n=100000 on 1 thread.
>>>     type\thread:       1    Avg
>>>    seq-pool new:       2      2 ms
>>>    seq-pool del:       5      5 ms
>>>    seq-pool mix:       5      5 ms
>>>    seq-pool rnd:       5      5 ms
>>>     id-pool new:       8      8 ms
>>>     id-pool del:       5      5 ms
>>>     id-pool mix:      11     11 ms
>>>     id-pool rnd:  10000+ ****** ms
>>>
>>>    $ ./tests/ovstest test-seq-pool benchmark 1000000 1
>>>    Benchmarking n=1000000 on 1 thread.
>>>     type\thread:       1    Avg
>>>    seq-pool new:      23     23 ms
>>>    seq-pool del:      49     49 ms
>>>    seq-pool mix:      53     53 ms
>>>    seq-pool rnd:      53     53 ms
>>>     id-pool new:     190    190 ms
>>>     id-pool del:     173    173 ms
>>>     id-pool mix:     273    273 ms
>>>     id-pool rnd:  10042+ ****** ms
>>>
>>>    $ ./tests/ovstest test-seq-pool benchmark 1000000 2
>>>    Benchmarking n=1000000 on 2 threads.
>>>     type\thread:       1      2    Avg
>>>    seq-pool new:      40     39     39 ms
>>>    seq-pool del:      33     33     33 ms
>>>    seq-pool mix:      89     91     90 ms
>>>    seq-pool rnd:     146    151    148 ms
>>>     id-pool new:     485    485    485 ms
>>>     id-pool del:     541    542    541 ms
>>>     id-pool mix:     550    600    575 ms
>>>     id-pool rnd:  10048+ 10003+ ****** ms
>>>
>>>    $ ./tests/ovstest test-seq-pool benchmark 1000000 4
>>>    Benchmarking n=1000000 on 4 threads.
>>>     type\thread:       1      2      3      4    Avg
>>>    seq-pool new:      40     39     40     40     39 ms
>>>    seq-pool del:      24     28     28     30     27 ms
>>>    seq-pool mix:      60     63     69     69     65 ms
>>>    seq-pool rnd:     195    197    202    202    199 ms
>>>     id-pool new:     478    471    482    485    479 ms
>>>     id-pool del:     474    469    467    474    471 ms
>>>     id-pool mix:     558    558    611    545    568 ms
>>>     id-pool rnd:  10121+ 10076+ 10030+ 10167+ ****** ms
>>
>> Exactly same question here.  The new data structure doesn't provide
>> any guarantees and checks.  It doesn't allocate smallest id.
>> When why not just an array + spinlock?   We're already using this
>> schema in lib/netdev-afxdp-pool.c and it works fine.
>>
>> I crafted a very simple implementation and added it to this benchmark.
>> Results are following:
>>
>> $ ./tests/ovstest test-seq-pool benchmark 10000 1
>> Benchmarking n=10000 on 1 thread.
>>  type\thread:       1    Avg
>> seq-pool new:       1      1 ms
>> seq-pool del:       2      2 ms
>> seq-pool mix:       4      4 ms
>> seq-pool rnd:       4      4 ms
>>  my-pool new:       1      1 ms
>>  my-pool del:       1      1 ms
>>  my-pool mix:       1      1 ms
>>  my-pool rnd:       2      2 ms
>>
>> $ ./tests/ovstest test-seq-pool benchmark 100000 1
>> Benchmarking n=100000 on 1 thread.
>>  type\thread:       1    Avg
>> seq-pool new:      13     13 ms
>> seq-pool del:      24     24 ms
>> seq-pool mix:      11     11 ms
>> seq-pool rnd:       8      8 ms
>>  my-pool new:       2      2 ms
>>  my-pool del:       1      1 ms
>>  my-pool mix:       4      4 ms
>>  my-pool rnd:       3      3 ms
>>
>> $ ./tests/ovstest test-seq-pool benchmark 1000000 1
>> Benchmarking n=1000000 on 1 thread.
>>  type\thread:       1    Avg
>> seq-pool new:      43     43 ms
>> seq-pool del:      50     50 ms
>> seq-pool mix:      54     54 ms
>> seq-pool rnd:      54     54 ms
>>  my-pool new:      10     10 ms
>>  my-pool del:      10     10 ms
>>  my-pool mix:      28     28 ms
>>  my-pool rnd:      41     41 ms
>>
>> $ ./tests/ovstest test-seq-pool benchmark 1000000 2
>> Benchmarking n=1000000 on 2 threads.
>>  type\thread:       1      2    Avg
>> seq-pool new:      72     71     71 ms
>> seq-pool del:      31     33     32 ms
>> seq-pool mix:      83     83     83 ms
>> seq-pool rnd:     144    147    145 ms
>>  my-pool new:      16     12     14 ms
>>  my-pool del:      16     12     14 ms
>>  my-pool mix:      66     70     68 ms
>>  my-pool rnd:      42     58     50 ms
>>
>> $ ./tests/ovstest test-seq-pool benchmark 1000000 4
>> Benchmarking n=1000000 on 4 threads.
>>  type\thread:       1      2      3      4    Avg
>> seq-pool new:      71     69     69     67     69 ms
>> seq-pool del:      27     22     27     23     24 ms
>> seq-pool mix:      57     64     71     67     64 ms
>> seq-pool rnd:     182    186    189    191    187 ms
>>  my-pool new:      50     42     44     50     46 ms
>>  my-pool del:      46     39     37     45     41 ms
>>  my-pool mix:      79     45     55     81     65 ms
>>  my-pool rnd:     149    130    123    148    137 ms
>>
>> So, this implementation is faster and doesn't require any new complex
>> data structures.
>>
>> Sometimes high contention on a spin lock results in unfair distribution
>> and gives somewhat worse results, but they are still not too shabby, e.g.:
>>
>> $ ./tests/ovstest test-seq-pool benchmark 1000000 4
>> Benchmarking n=1000000 on 4 threads.
>>  type\thread:       1      2      3      4    Avg
>> seq-pool new:      75     75     75     74     74 ms
>> seq-pool del:      25     22     23     26     24 ms
>> seq-pool mix:      56     68     63     69     64 ms
>> seq-pool rnd:     183    188    191    192    188 ms
>>  my-pool new:      52     52     53     39     49 ms
>>  my-pool del:      52     54     55     42     50 ms
>>  my-pool mix:     150    156    159    126    147 ms
>>  my-pool rnd:     160    175    177    139    162 ms
>>
>> I didn't add any core affinity management to this benchmark.
>> And even in a bad for spinlock scenario with 16 threads on 4 physical
>> cores with multi-threading, it doesn't look terrible:
>>
>> $ ./tests/ovstest test-seq-pool benchmark 1000000 16
>> Benchmarking n=1000000 on 16 threads.
>>  type\thread:       1      ...     16    Avg
>> seq-pool new:      38      ...     40     40 ms
>> seq-pool del:      23      ...     20     20 ms
>> seq-pool mix:      66      ...     68     65 ms
>> seq-pool rnd:     200      ...    185    195 ms
>>  my-pool new:      68      ...     91     97 ms
>>  my-pool del:      87      ...    163    145 ms
>>  my-pool mix:     363      ...    359    343 ms
>>  my-pool rnd:     464      ...    432    428 ms
>>
>> For the scenario with a few offload thread should be perfectly fine.
>> What do you think?
> 
> Thanks, I hadn't seen the afxdp pool, it's interesting.
> 
> I see two issues, one trivial and one maybe more problematic.
> First is that there is an implicit dependency in the marks allocated
> using the id-pool, on remaining under a limit. I can't speak for other
> hardware vendors, but we have a limitation in Nvidia NICs, where
> marks in the full range of the ones used by offloads are not all supported.
> 
> It was a surprise for me, otherwise I was going for a fully random ID generator
> (using malloc-generated addresses), there are simpler ways to get there.
> For your pool the change is trivial anyway, only the ID values have to be
> set accordingly:
> 
> 487     for (int i = size; i > 0; i--) {
> 488         pool->array[pool->count++] = start + i - 1;
> 489     }
> 
> There is no way currently to query the hardware for rte_flow limits,
> so I don't have a good way to set this limit dynamically,
> and I don't see how it could work with port hotplug, downsizing a shared range.
> Maybe this is another gap in the rte_flow API that should be addressed.
> 
> The other issue is that the full range of ID is pre-allocated.
> With the current mark range it means 16 GB of mostly unused data.
> We can limit this to the part that we actually use, considering
> that we should only get flow_limit marks at most.
> 
> With a flow limit of 10k, 20k maybe to deal with async add + delete, it would
> be reduced to 80k bytes pre-allocated.
> 
> I very much prefer your implementation. I am not happy with the seq-pool.
> I tried to make it a drop-in replacement for id-pool, so allowing very large
> ranges was part of it. If it is okay to restrict the mark range I think it's
> worth using yours instead.

I think we can avoid pre-allocating the whole array and use a 2n-realloc
scheme if we run out of pre-allocated values.  It's not ideal to
realloc under the spinlock, but this should not be a very big issue if
it happens once or twice in the lifetime of a process.  We can probably
pre-allocate 200K values which will be ~1.5MB of data.  200K is the default
limit for the number of datapath flows, so we should almost never re-allocate.
Alternatively, we can pre-allocate 20K and have at most 4 re-allocations
in the lifetime of the process.

What do you think?

Something like this (not fully tested, only few benchmark runs without
big performance difference):

struct id_stack_pool {
    struct ovs_spin lock;
    uint32_t *array;
    uint32_t start;
    size_t count;
    size_t size;
    size_t allocated;
    size_t populated;
};

static struct id_stack_pool *
id_stack_pool_create(int start, int size, int prealloc)
{
    struct id_stack_pool *pool = xzalloc(sizeof *pool);

    if (prealloc > size) {
        prealloc = size;
    }

    ovs_spin_init(&pool->lock);
    ovs_spin_lock(&pool->lock);
    pool->array = xzalloc(prealloc * sizeof pool->array[0]);
    for (int i = 0; i < prealloc; i++) {
        pool->array[pool->count++] = start + i;
    }
    pool->start = start;
    pool->size = size;
    pool->allocated = prealloc;
    pool->populated = prealloc;
    ovs_spin_unlock(&pool->lock);
    return pool;
}

#define POOL_REPOPULATION_LIMIT 10000

static bool
id_stack_pool_alloc_id(struct id_stack_pool *pool, uint32_t *id)
{
    bool res = false;

    ovs_spin_lock(&pool->lock);

try_again:

    if (pool->count) {
        *id = pool->array[--pool->count];
        res = true;
    } else if (pool->populated < pool->allocated) {
        for (int i = 0; i < POOL_REPOPULATION_LIMIT; i++) {
            pool->array[pool->count++] = pool->start + pool->populated++;
            if (pool->populated == pool->allocated) {
                break;
            }
        }
        goto try_again;
    } else if (pool->allocated < pool->size) {
        pool->array = x2nrealloc(pool->array, &pool->allocated,
                                 sizeof pool->array[0]);
        if (pool->allocated > pool->size) {
            pool->allocated = pool->size;
        }
        goto try_again;
    }
    ovs_spin_unlock(&pool->lock);
    return res;
}
Gaetan Rivet May 5, 2021, 2:41 p.m. UTC | #4
On Mon, May 3, 2021, at 14:48, Ilya Maximets wrote:
> On 5/1/21 4:49 PM, Gaëtan Rivet wrote:
> > On Fri, Apr 30, 2021, at 19:31, Ilya Maximets wrote:
> >> On 4/25/21 1:55 PM, Gaetan Rivet wrote:
> >>> The current id-pool module is slow to allocate the
> >>> next valid ID, and can be optimized when restricting
> >>> some properties of the pool.
> >>>
> >>> Those restrictions are:
> >>>
> >>>   * No ability to add a random ID to the pool.
> >>>
> >>>   * A new ID is no more the smallest possible ID.  It is
> >>>     however guaranteed to be in the range of
> >>>     [base, next_id].  Multiple users of the pool are
> >>>     registered, each with a thread-local cache for
> >>>     better scalability and the next_id is one after the
> >>>     latest ID added to any user cache.
> >>>     The allocation range can be written as:
> >>>
> >>>        [base, last_alloc + nb-user * cache-size + 1].
> >>>
> >>>   * A user should never free an ID that is not allocated.
> >>>     No checks are done and doing so will duplicate the spurious
> >>>     ID.  Refcounting or other memory management scheme should
> >>>     be used to ensure an object and its ID are only freed once.
> >>>
> >>> This pool is designed to scale reasonably well in multi-thread
> >>> setup.  As it is aimed at being a faster replacement to the
> >>> current id-pool, a benchmark has been implemented alongside
> >>> unit tests.
> >>>
> >>> The benchmark is composed of 4 rounds: 'new', 'del', 'mix', and 'rnd'.
> >>> Respectively
> >>>
> >>>   + 'new': only allocate IDs
> >>>   + 'del': only free IDs
> >>>   + 'mix': allocate, sequential free, then allocate ID.
> >>>   + 'rnd': allocate, random free, allocate ID.
> >>>
> >>> Randomized freeing is done by swapping the latest allocated ID with any
> >>> from the range of currently allocated ID, which is reminiscent of the
> >>> Fisher-Yates shuffle.  This evaluates freeing non-sequential IDs,
> >>> which is the more natural use-case.
> >>>
> >>> For this specific round, the id-pool performance is such that a timeout
> >>> of 10 seconds is added to the benchmark:
> >>>
> >>>    $ ./tests/ovstest test-seq-pool benchmark 10000 1
> >>>    Benchmarking n=10000 on 1 thread.
> >>>     type\thread:       1    Avg
> >>>    seq-pool new:       1      1 ms
> >>>    seq-pool del:       0      0 ms
> >>>    seq-pool mix:       1      1 ms
> >>>    seq-pool rnd:       1      1 ms
> >>>     id-pool new:       0      0 ms
> >>>     id-pool del:       1      1 ms
> >>>     id-pool mix:       1      1 ms
> >>>     id-pool rnd:    1201   1201 ms
> >>>
> >>>    $ ./tests/ovstest test-seq-pool benchmark 100000 1
> >>>    Benchmarking n=100000 on 1 thread.
> >>>     type\thread:       1    Avg
> >>>    seq-pool new:       2      2 ms
> >>>    seq-pool del:       5      5 ms
> >>>    seq-pool mix:       5      5 ms
> >>>    seq-pool rnd:       5      5 ms
> >>>     id-pool new:       8      8 ms
> >>>     id-pool del:       5      5 ms
> >>>     id-pool mix:      11     11 ms
> >>>     id-pool rnd:  10000+ ****** ms
> >>>
> >>>    $ ./tests/ovstest test-seq-pool benchmark 1000000 1
> >>>    Benchmarking n=1000000 on 1 thread.
> >>>     type\thread:       1    Avg
> >>>    seq-pool new:      23     23 ms
> >>>    seq-pool del:      49     49 ms
> >>>    seq-pool mix:      53     53 ms
> >>>    seq-pool rnd:      53     53 ms
> >>>     id-pool new:     190    190 ms
> >>>     id-pool del:     173    173 ms
> >>>     id-pool mix:     273    273 ms
> >>>     id-pool rnd:  10042+ ****** ms
> >>>
> >>>    $ ./tests/ovstest test-seq-pool benchmark 1000000 2
> >>>    Benchmarking n=1000000 on 2 threads.
> >>>     type\thread:       1      2    Avg
> >>>    seq-pool new:      40     39     39 ms
> >>>    seq-pool del:      33     33     33 ms
> >>>    seq-pool mix:      89     91     90 ms
> >>>    seq-pool rnd:     146    151    148 ms
> >>>     id-pool new:     485    485    485 ms
> >>>     id-pool del:     541    542    541 ms
> >>>     id-pool mix:     550    600    575 ms
> >>>     id-pool rnd:  10048+ 10003+ ****** ms
> >>>
> >>>    $ ./tests/ovstest test-seq-pool benchmark 1000000 4
> >>>    Benchmarking n=1000000 on 4 threads.
> >>>     type\thread:       1      2      3      4    Avg
> >>>    seq-pool new:      40     39     40     40     39 ms
> >>>    seq-pool del:      24     28     28     30     27 ms
> >>>    seq-pool mix:      60     63     69     69     65 ms
> >>>    seq-pool rnd:     195    197    202    202    199 ms
> >>>     id-pool new:     478    471    482    485    479 ms
> >>>     id-pool del:     474    469    467    474    471 ms
> >>>     id-pool mix:     558    558    611    545    568 ms
> >>>     id-pool rnd:  10121+ 10076+ 10030+ 10167+ ****** ms
> >>
> >> Exactly same question here.  The new data structure doesn't provide
> >> any guarantees and checks.  It doesn't allocate smallest id.
> >> When why not just an array + spinlock?   We're already using this
> >> schema in lib/netdev-afxdp-pool.c and it works fine.
> >>
> >> I crafted a very simple implementation and added it to this benchmark.
> >> Results are following:
> >>
> >> $ ./tests/ovstest test-seq-pool benchmark 10000 1
> >> Benchmarking n=10000 on 1 thread.
> >>  type\thread:       1    Avg
> >> seq-pool new:       1      1 ms
> >> seq-pool del:       2      2 ms
> >> seq-pool mix:       4      4 ms
> >> seq-pool rnd:       4      4 ms
> >>  my-pool new:       1      1 ms
> >>  my-pool del:       1      1 ms
> >>  my-pool mix:       1      1 ms
> >>  my-pool rnd:       2      2 ms
> >>
> >> $ ./tests/ovstest test-seq-pool benchmark 100000 1
> >> Benchmarking n=100000 on 1 thread.
> >>  type\thread:       1    Avg
> >> seq-pool new:      13     13 ms
> >> seq-pool del:      24     24 ms
> >> seq-pool mix:      11     11 ms
> >> seq-pool rnd:       8      8 ms
> >>  my-pool new:       2      2 ms
> >>  my-pool del:       1      1 ms
> >>  my-pool mix:       4      4 ms
> >>  my-pool rnd:       3      3 ms
> >>
> >> $ ./tests/ovstest test-seq-pool benchmark 1000000 1
> >> Benchmarking n=1000000 on 1 thread.
> >>  type\thread:       1    Avg
> >> seq-pool new:      43     43 ms
> >> seq-pool del:      50     50 ms
> >> seq-pool mix:      54     54 ms
> >> seq-pool rnd:      54     54 ms
> >>  my-pool new:      10     10 ms
> >>  my-pool del:      10     10 ms
> >>  my-pool mix:      28     28 ms
> >>  my-pool rnd:      41     41 ms
> >>
> >> $ ./tests/ovstest test-seq-pool benchmark 1000000 2
> >> Benchmarking n=1000000 on 2 threads.
> >>  type\thread:       1      2    Avg
> >> seq-pool new:      72     71     71 ms
> >> seq-pool del:      31     33     32 ms
> >> seq-pool mix:      83     83     83 ms
> >> seq-pool rnd:     144    147    145 ms
> >>  my-pool new:      16     12     14 ms
> >>  my-pool del:      16     12     14 ms
> >>  my-pool mix:      66     70     68 ms
> >>  my-pool rnd:      42     58     50 ms
> >>
> >> $ ./tests/ovstest test-seq-pool benchmark 1000000 4
> >> Benchmarking n=1000000 on 4 threads.
> >>  type\thread:       1      2      3      4    Avg
> >> seq-pool new:      71     69     69     67     69 ms
> >> seq-pool del:      27     22     27     23     24 ms
> >> seq-pool mix:      57     64     71     67     64 ms
> >> seq-pool rnd:     182    186    189    191    187 ms
> >>  my-pool new:      50     42     44     50     46 ms
> >>  my-pool del:      46     39     37     45     41 ms
> >>  my-pool mix:      79     45     55     81     65 ms
> >>  my-pool rnd:     149    130    123    148    137 ms
> >>
> >> So, this implementation is faster and doesn't require any new complex
> >> data structures.
> >>
> >> Sometimes high contention on a spin lock results in unfair distribution
> >> and gives somewhat worse results, but they are still not too shabby, e.g.:
> >>
> >> $ ./tests/ovstest test-seq-pool benchmark 1000000 4
> >> Benchmarking n=1000000 on 4 threads.
> >>  type\thread:       1      2      3      4    Avg
> >> seq-pool new:      75     75     75     74     74 ms
> >> seq-pool del:      25     22     23     26     24 ms
> >> seq-pool mix:      56     68     63     69     64 ms
> >> seq-pool rnd:     183    188    191    192    188 ms
> >>  my-pool new:      52     52     53     39     49 ms
> >>  my-pool del:      52     54     55     42     50 ms
> >>  my-pool mix:     150    156    159    126    147 ms
> >>  my-pool rnd:     160    175    177    139    162 ms
> >>
> >> I didn't add any core affinity management to this benchmark.
> >> And even in a bad for spinlock scenario with 16 threads on 4 physical
> >> cores with multi-threading, it doesn't look terrible:
> >>
> >> $ ./tests/ovstest test-seq-pool benchmark 1000000 16
> >> Benchmarking n=1000000 on 16 threads.
> >>  type\thread:       1      ...     16    Avg
> >> seq-pool new:      38      ...     40     40 ms
> >> seq-pool del:      23      ...     20     20 ms
> >> seq-pool mix:      66      ...     68     65 ms
> >> seq-pool rnd:     200      ...    185    195 ms
> >>  my-pool new:      68      ...     91     97 ms
> >>  my-pool del:      87      ...    163    145 ms
> >>  my-pool mix:     363      ...    359    343 ms
> >>  my-pool rnd:     464      ...    432    428 ms
> >>
> >> For the scenario with a few offload thread should be perfectly fine.
> >> What do you think?
> > 
> > Thanks, I hadn't seen the afxdp pool, it's interesting.
> > 
> > I see two issues, one trivial and one maybe more problematic.
> > First is that there is an implicit dependency in the marks allocated
> > using the id-pool, on remaining under a limit. I can't speak for other
> > hardware vendors, but we have a limitation in Nvidia NICs, where
> > marks in the full range of the ones used by offloads are not all supported.
> > 
> > It was a surprise for me, otherwise I was going for a fully random ID generator
> > (using malloc-generated addresses), there are simpler ways to get there.
> > For your pool the change is trivial anyway, only the ID values have to be
> > set accordingly:
> > 
> > 487     for (int i = size; i > 0; i--) {
> > 488         pool->array[pool->count++] = start + i - 1;
> > 489     }
> > 
> > There is no way currently to query the hardware for rte_flow limits,
> > so I don't have a good way to set this limit dynamically,
> > and I don't see how it could work with port hotplug, downsizing a shared range.
> > Maybe this is another gap in the rte_flow API that should be addressed.
> > 
> > The other issue is that the full range of ID is pre-allocated.
> > With the current mark range it means 16 GB of mostly unused data.
> > We can limit this to the part that we actually use, considering
> > that we should only get flow_limit marks at most.
> > 
> > With a flow limit of 10k, 20k maybe to deal with async add + delete, it would
> > be reduced to 80k bytes pre-allocated.
> > 
> > I very much prefer your implementation. I am not happy with the seq-pool.
> > I tried to make it a drop-in replacement for id-pool, so allowing very large
> > ranges was part of it. If it is okay to restrict the mark range I think it's
> > worth using yours instead.
> 
> I think, we can avoid pre-allocation of the whole array and use 2n-realloc
> schema if we will run out of pre-allocated values.  It's not ideal to
> realloc under the spinlock, but this should not be a very big issue if
> it happens once or twice in a lifetime of a process.  We can, probably,
> pre-allocate 200K values which will be ~1.5MB of data.  200K is the default
> limit for the number of datapath flows, so we should almost never re-allocate.
> Alternatively, we can pre-allocate 20K and have at most 4 re-allocations
> in a lifetime of the process.
> 
> What do you think?

In principle I agree, deferring the allocation should work.

I'm not sure about the 2n-realloc.  At face value, exponential allocation
seems dangerous with such a range.  A growth constant would make the
hiccups more predictable; considering that the spinlock will block other
threads in a busy loop, this seems important to keep in mind.
Also, given that memory overcommit won't help, we could waste a lot of
memory with the last realloc.
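
As an illustration only, a constant-step grow under the lock could look
like this (made-up names, not code from either proposal; GROW_STEP and
the error handling are placeholders):

    /* Grow the ID array by a constant number of slots instead of
     * doubling, so each realloc done under the spinlock copies a
     * bounded, predictable amount of data and never overshoots the
     * ID range. */
    #include <stdint.h>
    #include <stdlib.h>

    #define GROW_STEP 20000           /* Hypothetical growth constant. */

    static uint32_t *
    grow_fixed(uint32_t *array, size_t *allocated, size_t max_size)
    {
        size_t new_size = *allocated + GROW_STEP;
        uint32_t *tmp;

        if (new_size > max_size) {
            new_size = max_size;      /* Never allocate past the full range. */
        }
        tmp = realloc(array, new_size * sizeof *tmp);
        if (!tmp) {
            abort();                  /* Keep the sketch simple. */
        }
        *allocated = new_size;
        return tmp;
    }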

I will respin the patch with a new version, probably under a new name
instead of seq-pool (I fear the name might otherwise be confused with
the existing struct 'seq').
Gaetan Rivet May 23, 2021, 5:56 p.m. UTC | #5
On Wed, May 5, 2021, at 16:41, Gaëtan Rivet wrote:
> On Mon, May 3, 2021, at 14:48, Ilya Maximets wrote:
> > On 5/1/21 4:49 PM, Gaëtan Rivet wrote:
> > > On Fri, Apr 30, 2021, at 19:31, Ilya Maximets wrote:
> > >> On 4/25/21 1:55 PM, Gaetan Rivet wrote:
> > >>> The current id-pool module is slow to allocate the
> > >>> next valid ID, and can be optimized when restricting
> > >>> some properties of the pool.
> > >>>
> > >>> Those restrictions are:
> > >>>
> > >>>   * No ability to add a random ID to the pool.
> > >>>
> > >>>   * A new ID is no more the smallest possible ID.  It is
> > >>>     however guaranteed to be in the range of
> > >>>     [base, next_id].  Multiple users of the pool are
> > >>>     registered, each with a thread-local cache for
> > >>>     better scalability and the next_id is one after the
> > >>>     latest ID added to any user cache.
> > >>>     The allocation range can be written as:
> > >>>
> > >>>        [base, last_alloc + nb-user * cache-size + 1].
> > >>>
> > >>>   * A user should never free an ID that is not allocated.
> > >>>     No checks are done and doing so will duplicate the spurious
> > >>>     ID.  Refcounting or other memory management scheme should
> > >>>     be used to ensure an object and its ID are only freed once.
> > >>>
> > >>> This pool is designed to scale reasonably well in multi-thread
> > >>> setup.  As it is aimed at being a faster replacement to the
> > >>> current id-pool, a benchmark has been implemented alongside
> > >>> unit tests.
> > >>>
> > >>> The benchmark is composed of 4 rounds: 'new', 'del', 'mix', and 'rnd'.
> > >>> Respectively
> > >>>
> > >>>   + 'new': only allocate IDs
> > >>>   + 'del': only free IDs
> > >>>   + 'mix': allocate, sequential free, then allocate ID.
> > >>>   + 'rnd': allocate, random free, allocate ID.
> > >>>
> > >>> Randomized freeing is done by swapping the latest allocated ID with any
> > >>> from the range of currently allocated ID, which is reminiscent of the
> > >>> Fisher-Yates shuffle.  This evaluates freeing non-sequential IDs,
> > >>> which is the more natural use-case.
> > >>>
> > >>> For this specific round, the id-pool performance is such that a timeout
> > >>> of 10 seconds is added to the benchmark:
> > >>>
> > >>>    $ ./tests/ovstest test-seq-pool benchmark 10000 1
> > >>>    Benchmarking n=10000 on 1 thread.
> > >>>     type\thread:       1    Avg
> > >>>    seq-pool new:       1      1 ms
> > >>>    seq-pool del:       0      0 ms
> > >>>    seq-pool mix:       1      1 ms
> > >>>    seq-pool rnd:       1      1 ms
> > >>>     id-pool new:       0      0 ms
> > >>>     id-pool del:       1      1 ms
> > >>>     id-pool mix:       1      1 ms
> > >>>     id-pool rnd:    1201   1201 ms
> > >>>
> > >>>    $ ./tests/ovstest test-seq-pool benchmark 100000 1
> > >>>    Benchmarking n=100000 on 1 thread.
> > >>>     type\thread:       1    Avg
> > >>>    seq-pool new:       2      2 ms
> > >>>    seq-pool del:       5      5 ms
> > >>>    seq-pool mix:       5      5 ms
> > >>>    seq-pool rnd:       5      5 ms
> > >>>     id-pool new:       8      8 ms
> > >>>     id-pool del:       5      5 ms
> > >>>     id-pool mix:      11     11 ms
> > >>>     id-pool rnd:  10000+ ****** ms
> > >>>
> > >>>    $ ./tests/ovstest test-seq-pool benchmark 1000000 1
> > >>>    Benchmarking n=1000000 on 1 thread.
> > >>>     type\thread:       1    Avg
> > >>>    seq-pool new:      23     23 ms
> > >>>    seq-pool del:      49     49 ms
> > >>>    seq-pool mix:      53     53 ms
> > >>>    seq-pool rnd:      53     53 ms
> > >>>     id-pool new:     190    190 ms
> > >>>     id-pool del:     173    173 ms
> > >>>     id-pool mix:     273    273 ms
> > >>>     id-pool rnd:  10042+ ****** ms
> > >>>
> > >>>    $ ./tests/ovstest test-seq-pool benchmark 1000000 2
> > >>>    Benchmarking n=1000000 on 2 threads.
> > >>>     type\thread:       1      2    Avg
> > >>>    seq-pool new:      40     39     39 ms
> > >>>    seq-pool del:      33     33     33 ms
> > >>>    seq-pool mix:      89     91     90 ms
> > >>>    seq-pool rnd:     146    151    148 ms
> > >>>     id-pool new:     485    485    485 ms
> > >>>     id-pool del:     541    542    541 ms
> > >>>     id-pool mix:     550    600    575 ms
> > >>>     id-pool rnd:  10048+ 10003+ ****** ms
> > >>>
> > >>>    $ ./tests/ovstest test-seq-pool benchmark 1000000 4
> > >>>    Benchmarking n=1000000 on 4 threads.
> > >>>     type\thread:       1      2      3      4    Avg
> > >>>    seq-pool new:      40     39     40     40     39 ms
> > >>>    seq-pool del:      24     28     28     30     27 ms
> > >>>    seq-pool mix:      60     63     69     69     65 ms
> > >>>    seq-pool rnd:     195    197    202    202    199 ms
> > >>>     id-pool new:     478    471    482    485    479 ms
> > >>>     id-pool del:     474    469    467    474    471 ms
> > >>>     id-pool mix:     558    558    611    545    568 ms
> > >>>     id-pool rnd:  10121+ 10076+ 10030+ 10167+ ****** ms
> > >>
> > >> Exactly same question here.  The new data structure doesn't provide
> > >> any guarantees and checks.  It doesn't allocate smallest id.
> > >> When why not just an array + spinlock?   We're already using this
> > >> schema in lib/netdev-afxdp-pool.c and it works fine.
> > >>
> > >> I crafted a very simple implementation and added it to this benchmark.
> > >> Results are following:
> > >>
> > >> $ ./tests/ovstest test-seq-pool benchmark 10000 1
> > >> Benchmarking n=10000 on 1 thread.
> > >>  type\thread:       1    Avg
> > >> seq-pool new:       1      1 ms
> > >> seq-pool del:       2      2 ms
> > >> seq-pool mix:       4      4 ms
> > >> seq-pool rnd:       4      4 ms
> > >>  my-pool new:       1      1 ms
> > >>  my-pool del:       1      1 ms
> > >>  my-pool mix:       1      1 ms
> > >>  my-pool rnd:       2      2 ms
> > >>
> > >> $ ./tests/ovstest test-seq-pool benchmark 100000 1
> > >> Benchmarking n=100000 on 1 thread.
> > >>  type\thread:       1    Avg
> > >> seq-pool new:      13     13 ms
> > >> seq-pool del:      24     24 ms
> > >> seq-pool mix:      11     11 ms
> > >> seq-pool rnd:       8      8 ms
> > >>  my-pool new:       2      2 ms
> > >>  my-pool del:       1      1 ms
> > >>  my-pool mix:       4      4 ms
> > >>  my-pool rnd:       3      3 ms
> > >>
> > >> $ ./tests/ovstest test-seq-pool benchmark 1000000 1
> > >> Benchmarking n=1000000 on 1 thread.
> > >>  type\thread:       1    Avg
> > >> seq-pool new:      43     43 ms
> > >> seq-pool del:      50     50 ms
> > >> seq-pool mix:      54     54 ms
> > >> seq-pool rnd:      54     54 ms
> > >>  my-pool new:      10     10 ms
> > >>  my-pool del:      10     10 ms
> > >>  my-pool mix:      28     28 ms
> > >>  my-pool rnd:      41     41 ms
> > >>
> > >> $ ./tests/ovstest test-seq-pool benchmark 1000000 2
> > >> Benchmarking n=1000000 on 2 threads.
> > >>  type\thread:       1      2    Avg
> > >> seq-pool new:      72     71     71 ms
> > >> seq-pool del:      31     33     32 ms
> > >> seq-pool mix:      83     83     83 ms
> > >> seq-pool rnd:     144    147    145 ms
> > >>  my-pool new:      16     12     14 ms
> > >>  my-pool del:      16     12     14 ms
> > >>  my-pool mix:      66     70     68 ms
> > >>  my-pool rnd:      42     58     50 ms
> > >>
> > >> $ ./tests/ovstest test-seq-pool benchmark 1000000 4
> > >> Benchmarking n=1000000 on 4 threads.
> > >>  type\thread:       1      2      3      4    Avg
> > >> seq-pool new:      71     69     69     67     69 ms
> > >> seq-pool del:      27     22     27     23     24 ms
> > >> seq-pool mix:      57     64     71     67     64 ms
> > >> seq-pool rnd:     182    186    189    191    187 ms
> > >>  my-pool new:      50     42     44     50     46 ms
> > >>  my-pool del:      46     39     37     45     41 ms
> > >>  my-pool mix:      79     45     55     81     65 ms
> > >>  my-pool rnd:     149    130    123    148    137 ms
> > >>
> > >> So, this implementation is faster and doesn't require any new complex
> > >> data structures.
> > >>
> > >> Sometimes high contention on a spin lock results in unfair distribution
> > >> and gives somewhat worse results, but they are still not too shabby, e.g.:
> > >>
> > >> $ ./tests/ovstest test-seq-pool benchmark 1000000 4
> > >> Benchmarking n=1000000 on 4 threads.
> > >>  type\thread:       1      2      3      4    Avg
> > >> seq-pool new:      75     75     75     74     74 ms
> > >> seq-pool del:      25     22     23     26     24 ms
> > >> seq-pool mix:      56     68     63     69     64 ms
> > >> seq-pool rnd:     183    188    191    192    188 ms
> > >>  my-pool new:      52     52     53     39     49 ms
> > >>  my-pool del:      52     54     55     42     50 ms
> > >>  my-pool mix:     150    156    159    126    147 ms
> > >>  my-pool rnd:     160    175    177    139    162 ms
> > >>
> > >> I didn't add any core affinity management to this benchmark.
> > >> And even in a bad for spinlock scenario with 16 threads on 4 physical
> > >> cores with multi-threading, it doesn't look terrible:
> > >>
> > >> $ ./tests/ovstest test-seq-pool benchmark 1000000 16
> > >> Benchmarking n=1000000 on 16 threads.
> > >>  type\thread:       1      ...     16    Avg
> > >> seq-pool new:      38      ...     40     40 ms
> > >> seq-pool del:      23      ...     20     20 ms
> > >> seq-pool mix:      66      ...     68     65 ms
> > >> seq-pool rnd:     200      ...    185    195 ms
> > >>  my-pool new:      68      ...     91     97 ms
> > >>  my-pool del:      87      ...    163    145 ms
> > >>  my-pool mix:     363      ...    359    343 ms
> > >>  my-pool rnd:     464      ...    432    428 ms
> > >>
> > >> For the scenario with a few offload thread should be perfectly fine.
> > >> What do you think?
> > > 
> > > Thanks, I hadn't seen the afxdp pool, it's interesting.
> > > 
> > > I see two issues, one trivial and one maybe more problematic.
> > > First is that there is an implicit dependency in the marks allocated
> > > using the id-pool, on remaining under a limit. I can't speak for other
> > > hardware vendors, but we have a limitation in Nvidia NICs, where
> > > marks in the full range of the ones used by offloads are not all supported.
> > > 
> > > It was a surprise for me, otherwise I was going for a fully random ID generator
> > > (using malloc-generated addresses), there are simpler ways to get there.
> > > For your pool the change is trivial anyway, only the ID values have to be
> > > set accordingly:
> > > 
> > > 487     for (int i = size; i > 0; i--) {
> > > 488         pool->array[pool->count++] = start + i - 1;
> > > 489     }
> > > 
> > > There is no way currently to query the hardware for rte_flow limits,
> > > so I don't have a good way to set this limit dynamically,
> > > and I don't see how it could work with port hotplug, downsizing a shared range.
> > > Maybe this is another gap in the rte_flow API that should be addressed.
> > > 
> > > The other issue is that the full range of ID is pre-allocated.
> > > With the current mark range it means 16 GB of mostly unused data.
> > > We can limit this to the part that we actually use, considering
> > > that we should only get flow_limit marks at most.
> > > 
> > > With a flow limit of 10k, 20k maybe to deal with async add + delete, it would
> > > be reduced to 80k bytes pre-allocated.
> > > 
> > > I very much prefer your implementation. I am not happy with the seq-pool.
> > > I tried to make it a drop-in replacement for id-pool, so allowing very large
> > > ranges was part of it. If it is okay to restrict the mark range I think it's
> > > worth using yours instead.
> > 
> > I think, we can avoid pre-allocation of the whole array and use 2n-realloc
> > schema if we will run out of pre-allocated values.  It's not ideal to
> > realloc under the spinlock, but this should not be a very big issue if
> > it happens once or twice in a lifetime of a process.  We can, probably,
> > pre-allocate 200K values which will be ~1.5MB of data.  200K is the default
> > limit for the number of datapath flows, so we should almost never re-allocate.
> > Alternatively, we can pre-allocate 20K and have at most 4 re-allocations
> > in a lifetime of the process.
> > 
> > What do you think?
> 
> In principle I agree, deferring the allocation should work.
> 
> I'm not sure about the 2n-realloc. At face value, exponential allocation seems dangerous
> with such a range. A growth constant would make the hiccups more predictable, which seems
> important to keep in mind since the spinlock will block other threads in a busy loop.
> Also, given that memory overcommit won't help, we could waste a lot of memory with
> the last realloc.
> 
> I will respin the patch with a new version, probably with a new name instead of
> seq-pool (I fear it might confuse readers regarding the struct 'seq' otherwise).
> 
> -- 
> Gaetan Rivet
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> 

Hi Ilya,

Finally I had some time to improve the pool implementation.
I tried your idea, which works. I was of course wrong in my previous email: you took care
of benefitting from memory overcommit by allocating with the 2n-realloc scheme while only
writing entries at a constant pace. With that in mind, there was no issue implementing
and testing it.
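
For reference, a minimal, self-contained sketch of the idea (locking and range bounds
are omitted, and the names are mine, it is not the id-queue code further below): the
backing array grows geometrically, but IDs are only written into it at a constant pace,
so the pages added by the realloc stay mostly untouched and overcommit keeps the real
cost proportional to what is actually used:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define GROWTH_STEP 10000

    struct sketch_pool {
        uint32_t *array;   /* Free IDs: indices [pos, populated) are available. */
        uint32_t next_id;  /* Next never-populated ID. */
        size_t pos;        /* Cursor of the next free ID. */
        size_t populated;  /* Number of IDs written into 'array'. */
        size_t allocated;  /* Size of 'array', in elements. */
    };

    static bool
    sketch_pool_alloc(struct sketch_pool *p, uint32_t *id)
    {
        if (p->pos == p->populated) {
            if (p->populated == p->allocated) {
                /* 2n-realloc: the new half is not written yet, so with
                 * memory overcommit it mostly costs virtual memory. */
                size_t n_alloc = p->allocated ? 2 * p->allocated : GROWTH_STEP;
                uint32_t *tmp = realloc(p->array, n_alloc * sizeof *tmp);

                if (!tmp) {
                    return false;
                }
                p->array = tmp;
                p->allocated = n_alloc;
            }
            /* Populate new IDs at a constant pace, not the whole allocation. */
            for (size_t n = GROWTH_STEP;
                 n > 0 && p->populated < p->allocated; n--) {
                p->array[p->populated++] = p->next_id++;
            }
        }
        *id = p->array[p->pos++];
        return true;
    }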

I modified it slightly, reversing the order of the pre-populated IDs so that allocation
starts from the lowest ID and increments. As it allocates IDs starting from the base,
I called it 'id-queue', although it is closer to an unordered pile of IDs. Maybe the final
structure should be called 'id-pile'.

During tests, I found an egregious error I made in the benchmark: between the 'MIX'
and 'MIX SHUFFLED' phases, IDs were not freed at all. The 'MIX SHUFFLED' phase thus
tested starvation, which explains why seq-pool performed so badly. With 'MIX SHUFFLED'
properly tested, here are the numbers:

./tests/ovstest test-id-cache benchmark  1000000 4
Benchmarking n=1000000 on 4 threads.
 type\thread:       1      2      3      4    Avg
seq-pool new:      34     35     34     35     34 ms
seq-pool del:      24     26     26     26     25 ms
seq-pool mix:      72     42     61     69     61 ms
seq-pool rnd:      97     95     98     98     97 ms
id-queue new:      90     96     99     99     96 ms
id-queue del:      79     81     83     84     81 ms
id-queue mix:     240    251    261    261    253 ms
id-queue rnd:     259    262    273    272    266 ms
 id-pool new:     466    464    463    456    462 ms
 id-pool del:     457    458    455    455    456 ms
 id-pool mix:     516    530    527    598    542 ms
 id-pool rnd:  10002+ 10001+ 10003+ 10000+ ****** ms

So the id-pool remains broken, but the comparison between seq-pool and id-queue becomes
more interesting. The example above with 4 threads shows one issue with the 2n-realloc
array of IDs: it scales very poorly as threads are increased. The only configuration
where the id-queue is better is with a single thread:

./tests/ovstest test-id-cache benchmark  1000000 1
Benchmarking n=1000000 on 1 thread.
 type\thread:       1    Avg
seq-pool new:      20     20 ms
seq-pool del:      49     49 ms
seq-pool mix:      46     46 ms
seq-pool rnd:      54     54 ms
id-queue new:      13     13 ms
id-queue del:      10     10 ms
id-queue mix:      31     31 ms
id-queue rnd:      52     52 ms
 id-pool new:     276    276 ms
 id-pool del:     288    288 ms
 id-pool mix:     445    445 ms
 id-pool rnd:  10000+ ****** ms

Given those results, a compromise between seq-pool and id-queue should be good:
it should be possible to get the id-queue numbers, or close to them, with a single thread,
while staying flat as threads are increased.

The working name is 'id-cache' because I wasn't able to find anything better,
but I'm not happy with it. Instead of allocating a single giant array, it splits the IDs
into cache-line-sized slabs, which allows managing each slab per-thread.
Concurrency is managed using spinlocks instead of a lockless ring.
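
To make the slab sizing concrete, assuming 64-byte cache lines and 8-byte pointers (the
real definition is N_ID_PER_SLAB in lib/id-cache.c below; the list node here is only a
stand-in for illustration):

    /* A slab fills exactly one cache line:
     *   64 bytes - 16 (list node) - 4 (cursor) = 44 bytes => 11 uint32_t IDs. */
    #define SKETCH_CACHE_LINE 64

    struct sketch_id_slab {
        struct { void *prev, *next; } node; /* Stand-in for 'struct ovs_list'. */
        uint32_t pos;                       /* Number of free IDs left in 'ids'. */
        uint32_t ids[(SKETCH_CACHE_LINE
                      - 2 * sizeof(void *)
                      - sizeof(uint32_t)) / sizeof(uint32_t)]; /* 11 IDs. */
    };

The shared cache lock is then only needed when a whole slab (11 IDs) is exchanged with
the free-slab list, not for every single allocation.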

For the measurements, since spinlocks are used, a proper comparison requires pinning:
I set the affinity of each of the 1 to 4 threads to 4 different physical CPUs, and
isolated the process using taskset. (id-pool is disabled.)

$ pool-stats.sh
20 times 'taskset -c 4,5,6,7 ./tests/ovstest test-id-cache benchmark  1000000 1':
id-cache new: avg   15.6 | stdev    1.5 | max    22 | min    15
id-cache del: avg   14.8 | stdev    0.5 | max    16 | min    14
id-cache mix: avg   34.5 | stdev    0.6 | max    36 | min    34
id-cache rnd: avg   46.6 | stdev    1.4 | max    51 | min    46
seq-pool new: avg   20.6 | stdev    0.5 | max    21 | min    20
seq-pool del: avg   47.7 | stdev    0.5 | max    48 | min    47
seq-pool mix: avg   45.5 | stdev    0.5 | max    46 | min    45
seq-pool rnd: avg   53.3 | stdev    0.7 | max    56 | min    53
id-queue new: avg   12.8 | stdev    0.4 | max    13 | min    12
id-queue del: avg    9.8 | stdev    0.4 | max    10 | min     9
id-queue mix: avg   30.4 | stdev    0.5 | max    31 | min    30
id-queue rnd: avg   42.4 | stdev    0.5 | max    43 | min    42

$ pool-stats.sh 1000000 2
20 times 'taskset -c 4,5,6,7 ./tests/ovstest test-id-cache benchmark  1000000 2':
id-cache new: avg   21.2 | stdev    2.1 | max    25 | min    18
id-cache del: avg   20.0 | stdev    1.3 | max    23 | min    18
id-cache mix: avg   44.2 | stdev    5.7 | max    54 | min    38
id-cache rnd: avg   46.5 | stdev    4.7 | max    57 | min    42
seq-pool new: avg   33.5 | stdev    1.0 | max    37 | min    32
seq-pool del: avg   30.3 | stdev    1.1 | max    35 | min    30
seq-pool mix: avg   85.4 | stdev    2.0 | max    92 | min    82
seq-pool rnd: avg   85.3 | stdev    1.1 | max    87 | min    83
id-queue new: avg   26.5 | stdev    3.9 | max    35 | min    20
id-queue del: avg   23.4 | stdev    3.3 | max    30 | min    18
id-queue mix: avg   70.2 | stdev   11.1 | max    95 | min    47
id-queue rnd: avg   81.6 | stdev   16.9 | max   106 | min    45

$ pool-stats.sh 1000000 3
20 times 'taskset -c 4,5,6,7 ./tests/ovstest test-id-cache benchmark  1000000 3':
id-cache new: avg   16.8 | stdev    0.5 | max    18 | min    16
id-cache del: avg   15.2 | stdev    0.6 | max    17 | min    14
id-cache mix: avg   25.6 | stdev    0.6 | max    27 | min    25
id-cache rnd: avg   28.2 | stdev    0.4 | max    29 | min    28
seq-pool new: avg   31.9 | stdev    0.8 | max    33 | min    31
seq-pool del: avg   25.3 | stdev    0.8 | max    28 | min    24
seq-pool mix: avg   74.1 | stdev    0.7 | max    76 | min    73
seq-pool rnd: avg   79.4 | stdev    1.1 | max    81 | min    77
id-queue new: avg   44.5 | stdev    8.1 | max    57 | min    29
id-queue del: avg   45.5 | stdev    6.7 | max    55 | min    30
id-queue mix: avg  147.7 | stdev   19.2 | max   173 | min   110
id-queue rnd: avg  165.1 | stdev   14.0 | max   188 | min   145

$ pool-stats.sh 1000000 4
20 times 'taskset -c 4,5,6,7 ./tests/ovstest test-id-cache benchmark  1000000 4':
id-cache new: avg   16.8 | stdev    4.1 | max    21 | min    10
id-cache del: avg   20.5 | stdev    1.0 | max    22 | min    19
id-cache mix: avg   31.6 | stdev    0.9 | max    35 | min    31
id-cache rnd: avg   33.3 | stdev    0.6 | max    35 | min    32
seq-pool new: avg   33.7 | stdev    0.6 | max    35 | min    33
seq-pool del: avg   25.6 | stdev    1.2 | max    30 | min    24
seq-pool mix: avg   67.4 | stdev    2.5 | max    73 | min    63
seq-pool rnd: avg   89.9 | stdev    2.8 | max    95 | min    84
id-queue new: avg   83.0 | stdev    6.3 | max    92 | min    66
id-queue del: avg   68.9 | stdev    6.9 | max    81 | min    59
id-queue mix: avg  215.1 | stdev   16.6 | max   250 | min   192
id-queue rnd: avg  247.6 | stdev    8.3 | max   259 | min   235

In the single-thread setup it is very close to the id-queue, especially in the more
'realistic' rounds (mix and rnd), while a little worse in the straight alloc
and free rounds. It is however better than seq-pool, and it also behaves well as
threads increase: it does not scale in the sense of getting faster, but the per-thread
time does not increase either.

I also ran the same tests without pinning the threads or using isolated CPUs.
The averages are close (slightly better, it seems) but with increased variance:

$ pool-stats.sh 1000000 4
20 times './tests/ovstest test-id-cache benchmark  1000000 4':
id-cache new: avg   17.5 | stdev    3.6 | max    28 | min    12
id-cache del: avg   17.4 | stdev    2.7 | max    21 | min    13
id-cache mix: avg   28.5 | stdev    3.6 | max    32 | min    21
id-cache rnd: avg   31.7 | stdev    3.5 | max    35 | min    24
seq-pool new: avg   32.7 | stdev    1.2 | max    37 | min    32
seq-pool del: avg   23.7 | stdev    0.9 | max    25 | min    23
seq-pool mix: avg   59.4 | stdev    4.4 | max    67 | min    50
seq-pool rnd: avg   71.1 | stdev    6.0 | max    86 | min    65
id-queue new: avg   62.5 | stdev   15.8 | max    91 | min    44
id-queue del: avg   55.8 | stdev   15.3 | max    89 | min    36
id-queue mix: avg  180.9 | stdev   46.4 | max   262 | min   119
id-queue rnd: avg  215.7 | stdev   37.8 | max   262 | min   141

So I think the 'id-cache' compromise should be preferred. The implementation is slightly
more complex as per-thread slabs are managed, but it's just fixed-size arrays used as
queues, protected by spinlocks.
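
For illustration, usage mirrors the id-cache.h API included below ('n_users', 'uid',
'base' and 'n_ids' are placeholders, not values from the patch):

    /* One 'uid' per registered user thread, in [0, n_users). */
    struct id_cache *cache = id_cache_create(n_users, base, n_ids);
    uint32_t mark;

    if (id_cache_new_id(cache, uid, &mark)) {
        /* ... use 'mark', e.g. as an offload flow mark ... */
        id_cache_free_id(cache, uid, mark);
    }

    id_cache_destroy(cache);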

What do you think?

As reference here are the implementations:
---
diff --git a/lib/automake.mk b/lib/automake.mk
index 3ca43b57a..685691e06 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -138,8 +138,12 @@ lib_libopenvswitch_la_SOURCES = \
 	lib/hmap.c \
 	lib/hmapx.c \
 	lib/hmapx.h \
+	lib/id-cache.c \
+	lib/id-cache.h \
 	lib/id-pool.c \
 	lib/id-pool.h \
+	lib/id-queue.c \
+	lib/id-queue.h \
 	lib/if-notifier-manual.c \
 	lib/if-notifier.h \
 	lib/ipf.c \
diff --git a/lib/id-cache.c b/lib/id-cache.c
new file mode 100644
index 000000000..95f4a6421
--- /dev/null
+++ b/lib/id-cache.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2021 NVIDIA Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "openvswitch/list.h"
+#include "openvswitch/thread.h"
+#include "openvswitch/util.h"
+#include "ovs-atomic.h"
+#include "id-cache.h"
+
+#define N_ID_PER_SLAB \
+    ((CACHE_LINE_SIZE \
+      - sizeof(struct ovs_list) \
+      - sizeof(uint32_t)) / sizeof(uint32_t))
+struct id_slab {
+    struct ovs_list node;
+    uint32_t pos;
+    uint32_t ids[N_ID_PER_SLAB];
+};
+BUILD_ASSERT_DECL(sizeof(struct id_slab) == CACHE_LINE_SIZE);
+
+struct per_user {
+PADDED_MEMBERS(CACHE_LINE_SIZE,
+    struct ovs_spin user_lock;
+    struct id_slab *slab;
+);};
+
+struct id_cache {
+    /* Constants */
+    uint32_t floor; /* IDs in the range of [floor, ceil). */
+    uint32_t ceil;
+    size_t nb_user; /* Number of user threads. */
+
+    /* Shared mutable data protected by cache lock. */
+    struct ovs_spin cache_lock;
+    struct ovs_list free_slabs;
+    uint32_t next_id;
+
+    /* Per-user mutable data protected by user locks. */
+    struct per_user per_users[0];
+};
+
+/* Lock precedence is
+ * 1: cache.per_users.user_lock
+ * 2: cache_lock
+ */
+
+static struct id_slab *
+id_slab_create(uint32_t *next_id, uint32_t max)
+{
+    struct id_slab *slab;
+    size_t n_ids;
+    size_t pos;
+
+    if (next_id[0] == max) {
+        return NULL;
+    }
+
+    n_ids = max - next_id[0];
+    slab = xmalloc(sizeof *slab);
+    ovs_list_init(&slab->node);
+    slab->pos = 0;
+
+    for (pos = MIN(n_ids, ARRAY_SIZE(slab->ids)); pos > 0; pos--) {
+        slab->ids[pos - 1] = next_id[0];
+        next_id[0]++;
+        slab->pos++;
+    }
+
+    return slab;
+}
+
+static bool
+id_slab_insert(struct id_slab *slab, uint32_t id)
+{
+    if (slab == NULL) {
+        return false;
+    }
+    if (slab->pos >= ARRAY_SIZE(slab->ids)) {
+        return false;
+    }
+    slab->ids[slab->pos++] = id;
+    return true;
+}
+
+static bool
+id_slab_remove(struct id_slab *slab, uint32_t *id)
+{
+    if (slab == NULL) {
+        return false;
+    }
+    if (slab->pos == 0) {
+        return false;
+    }
+    *id = slab->ids[--slab->pos];
+    return true;
+}
+
+#define per_user_lock(pu) do { ovs_spin_lock(&pu->user_lock); } while (0)
+#define per_user_unlock(pu) do { ovs_spin_unlock(&pu->user_lock); } while (0)
+
+static void
+per_user_init(struct per_user *pu, uint32_t *next_id, uint32_t max)
+{
+    ovs_spin_init(&pu->user_lock);
+    pu->slab = id_slab_create(next_id, max);
+}
+
+static void
+per_user_destroy(struct per_user *pu)
+{
+    per_user_lock(pu);
+    free(pu->slab);
+    pu->slab = NULL;
+    per_user_unlock(pu);
+    ovs_spin_destroy(&pu->user_lock);
+}
+
+struct id_cache *
+id_cache_create(unsigned int nb_user, uint32_t floor, uint32_t n_ids)
+{
+    struct id_cache *cache;
+    size_t i;
+
+    ovs_assert(nb_user != 0);
+    ovs_assert(floor <= UINT32_MAX - n_ids);
+
+    cache = xmalloc(sizeof *cache + nb_user * sizeof(struct per_user));
+    cache->next_id = floor;
+    cache->floor = floor;
+    cache->ceil = floor + n_ids;
+
+    for (i = 0; i < nb_user; i++) {
+        per_user_init(&cache->per_users[i],
+                      &cache->next_id, cache->ceil);
+    }
+    cache->nb_user = nb_user;
+
+    ovs_spin_init(&cache->cache_lock);
+    ovs_list_init(&cache->free_slabs);
+
+    return cache;
+}
+
+void
+id_cache_destroy(struct id_cache *cache)
+{
+    struct id_slab *slab;
+    struct id_slab *next;
+    size_t i;
+
+    ovs_spin_lock(&cache->cache_lock);
+    LIST_FOR_EACH_SAFE (slab, next, node, &cache->free_slabs) {
+        free(slab);
+    }
+    ovs_list_poison(&cache->free_slabs);
+    ovs_spin_unlock(&cache->cache_lock);
+    ovs_spin_destroy(&cache->cache_lock);
+
+    for (i = 0; i < cache->nb_user; i++) {
+        per_user_destroy(&cache->per_users[i]);
+    }
+    free(cache);
+}
+
+bool
+id_cache_new_id(struct id_cache *cache, unsigned int uid, uint32_t *id)
+{
+    struct per_user *pu;
+    unsigned int uid2;
+    bool res = false;
+
+    ovs_assert(uid < cache->nb_user);
+    pu = &cache->per_users[uid];
+
+    per_user_lock(pu);
+
+    if (id_slab_remove(pu->slab, id)) {
+        res = true;
+        goto unlock_and_ret;
+    }
+    free(pu->slab);
+
+    ovs_spin_lock(&cache->cache_lock);
+    if (!ovs_list_is_empty(&cache->free_slabs)) {
+        pu->slab = CONTAINER_OF(ovs_list_pop_front(&cache->free_slabs),
+                                struct id_slab, node);
+    } else {
+        pu->slab = id_slab_create(&cache->next_id, cache->ceil);
+    }
+    ovs_spin_unlock(&cache->cache_lock);
+
+    if (pu->slab != NULL) {
+        res = id_slab_remove(pu->slab, id);
+        goto unlock_and_ret;
+    }
+
+    per_user_unlock(pu);
+
+    /* No ID available in local slab, no slab available in shared list.
+     * The shared counter is maxed out. Attempt to steal an ID from another
+     * user slab. */
+
+    for (uid2 = 0; uid2 < cache->nb_user; uid2++) {
+        struct per_user *pu2 = &cache->per_users[uid2];
+
+        if (uid == uid2) {
+            continue;
+        }
+        per_user_lock(pu2);
+        res = id_slab_remove(pu2->slab, id);
+        per_user_unlock(pu2);
+        if (res) {
+            break;
+        }
+    }
+
+    goto out;
+
+unlock_and_ret:
+    per_user_unlock(pu);
+out:
+    return res;
+}
+
+void
+id_cache_free_id(struct id_cache *cache, unsigned int uid, uint32_t id)
+{
+    struct per_user *pu;
+
+    if (id < cache->floor || id >= cache->ceil) {
+        return;
+    }
+
+    ovs_assert(uid < cache->nb_user);
+    pu = &cache->per_users[uid];
+
+    per_user_lock(pu);
+
+    if (pu->slab == NULL) {
+        /* Create local slab with a single ID. */
+        pu->slab = id_slab_create(&id, id + 1);
+        goto unlock;
+    }
+
+    if (id_slab_insert(pu->slab, id)) {
+        goto unlock;
+    }
+
+    ovs_spin_lock(&cache->cache_lock);
+    ovs_list_push_back(&cache->free_slabs, &pu->slab->node);
+    ovs_spin_unlock(&cache->cache_lock);
+
+    /* Create local slab with a single ID. */
+    pu->slab = id_slab_create(&id, id + 1);
+
+unlock:
+    per_user_unlock(pu);
+}
diff --git a/lib/id-cache.h b/lib/id-cache.h
new file mode 100644
index 000000000..4af419f47
--- /dev/null
+++ b/lib/id-cache.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021 NVIDIA Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ID_CACHE_H
+#define ID_CACHE_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct id_cache;
+struct id_cache *id_cache_create(unsigned int nb_user,
+                                 uint32_t base, uint32_t n_ids);
+void id_cache_destroy(struct id_cache *cache);
+bool id_cache_new_id(struct id_cache *cache, unsigned int uid, uint32_t *id);
+void id_cache_free_id(struct id_cache *cache, unsigned int uid, uint32_t id);
+
+#endif  /* ID_CACHE_H */
diff --git a/lib/id-queue.c b/lib/id-queue.c
new file mode 100644
index 000000000..a616c197a
--- /dev/null
+++ b/lib/id-queue.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2021 NVIDIA Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "openvswitch/list.h"
+#include "openvswitch/thread.h"
+#include "openvswitch/util.h"
+#include "ovs-thread.h"
+#include "id-queue.h"
+
+#ifdef HAVE_PTHREAD_SPIN_LOCK
+#define id_queue_lock_type ovs_spin
+#define id_queue_lock_init(l) do { ovs_spin_init(l); } while(0)
+#define id_queue_lock_destroy(l) do { ovs_spin_destroy(l); } while (0)
+#define id_queue_lock(l) do { ovs_spin_lock(l); } while(0)
+#define id_queue_unlock(l) do { ovs_spin_unlock(l); } while (0)
+#else
+#define id_queue_lock_type ovs_mutex
+#define id_queue_lock_init(l) do { ovs_mutex_init(l); } while(0)
+#define id_queue_lock_destroy(l) do { ovs_mutex_destroy(l); } while (0)
+#define id_queue_lock(l) do { ovs_mutex_lock(l); } while(0)
+#define id_queue_unlock(l) do { ovs_mutex_unlock(l); } while (0)
+#endif
+
+struct id_queue {
+    struct id_queue_lock_type lock;
+    uint32_t *array;
+    uint32_t base; /* First ID in the range. */
+    uint32_t n_ids; /* Number of IDs in [base, base + n_ids). */
+    size_t pos; /* Next allocated ID in array. */
+    size_t populated; /* Currently populated items in array. */
+    size_t allocated; /* Allocated size of array. */
+};
+
+#define ID_QUEUE_GROWTH_CONSTANT 10000
+
+static void
+id_queue_grow(struct id_queue *queue, size_t n)
+    OVS_REQUIRES(queue->lock)
+{
+    size_t size = queue->n_ids;
+    size_t i;
+
+    if (queue->populated == size || n == 0) {
+        return;
+    }
+
+    for (i = queue->populated; i < queue->allocated; i++) {
+        queue->array[queue->populated++] = queue->base + i;
+        if (n-- == 0) {
+            return;
+        }
+    }
+
+    queue->array = x2nrealloc(queue->array, &queue->allocated,
+                              sizeof *queue->array);
+    if (queue->allocated > size) {
+        queue->allocated = size;
+    }
+
+    /* Do not write the whole allocated array at once to benefit
+     * from memory overcommit. */
+    for (i = queue->populated; i < queue->allocated; i++) {
+        queue->array[queue->populated++] = queue->base + i;
+        if (n-- == 0) {
+            return;
+        }
+    }
+}
+
+struct id_queue *
+id_queue_create(uint32_t base, uint32_t n_ids, size_t pre_alloc)
+{
+    struct id_queue *queue;
+    size_t size;
+
+    ovs_assert(base <= UINT32_MAX - n_ids);
+
+    queue = xzalloc(sizeof *queue);
+    size = n_ids;
+    if (pre_alloc > size) {
+        pre_alloc = size;
+    } else if (pre_alloc == 0) {
+        pre_alloc = 1;
+    }
+
+    queue->base = base;
+    queue->n_ids = n_ids;
+    id_queue_lock_init(&queue->lock);
+
+    id_queue_lock(&queue->lock);
+    id_queue_grow(queue, pre_alloc);
+    queue->pos = 0;
+    id_queue_unlock(&queue->lock);
+
+    return queue;
+}
+
+void
+id_queue_destroy(struct id_queue *queue)
+{
+    id_queue_lock(&queue->lock);
+    free(queue->array);
+    queue->array = NULL;
+    id_queue_unlock(&queue->lock);
+    id_queue_lock_destroy(&queue->lock);
+
+    memset(queue, 0, sizeof *queue);
+    free(queue);
+}
+
+bool
+id_queue_alloc(struct id_queue *queue, uint32_t *id)
+{
+    bool res = false;
+
+    id_queue_lock(&queue->lock);
+    if (queue->pos == queue->populated) {
+        id_queue_grow(queue, ID_QUEUE_GROWTH_CONSTANT);
+    }
+    if (queue->pos < queue->populated) {
+        *id = queue->array[queue->pos++];
+        res = true;
+    }
+    id_queue_unlock(&queue->lock);
+
+    return res;
+}
+
+void
+id_queue_free(struct id_queue *queue, uint32_t id)
+{
+    if (id < queue->base || id >= queue->base + queue->n_ids) {
+        return;
+    }
+
+    id_queue_lock(&queue->lock);
+    /* If the current cursor is 0, the queue is full, nothing can be inserted.
+     * If a thread attempts to free an ID while the queue is already full,
+     * at least one ID was freed multiple times, so abort. */
+    ovs_assert(queue->pos > 0);
+    queue->array[--queue->pos] = id;
+    id_queue_unlock(&queue->lock);
+}
diff --git a/lib/id-queue.h b/lib/id-queue.h
new file mode 100644
index 000000000..2aed36a84
--- /dev/null
+++ b/lib/id-queue.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 NVIDIA Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ID_QUEUE_H
+#define ID_QUEUE_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct id_queue;
+struct id_queue *id_queue_create(uint32_t base, uint32_t n_ids, size_t pre_alloc);
+void id_queue_destroy(struct id_queue *queue);
+bool id_queue_alloc(struct id_queue *queue, uint32_t *id);
+void id_queue_free(struct id_queue *queue, uint32_t id);
+
+#endif  /* ID_QUEUE_H */
diff --git a/pool-stats.sh b/pool-stats.sh
new file mode 100755
index 000000000..9f91bb910
--- /dev/null
+++ b/pool-stats.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+N_ELEM=${1:-1000000}
+N_CORE=${2:-1}
+N_RUN=${3:-20}
+
+#BIN="taskset -c 4,5,6,7 ./tests/ovstest test-id-cache benchmark "
+BIN="./tests/ovstest test-id-cache benchmark "
+CMD="$BIN $N_ELEM $N_CORE"
+CMD_fast="$BIN 1 1"
+
+xc() { python -c "import math; print(float($*))"; }
+join_by() {( IFS="$1"; shift; echo "$*"; )}
+
+stdev() {(
+    m=$1; shift;
+    sum=0
+    for v in $*; do
+        sum=$(xc "$sum + pow($v - $m, 2)")
+    done
+    echo $(xc "math.sqrt($sum / $#)")
+)}
+mean() { echo $(xc "($(join_by + $*)) / $#"); }
+
+# $1: name field max width
+# $2: stat name
+# $3-: values
+print_stats() {(
+    len=$1; shift;
+    name=$1; shift;
+    values="$*"
+    m=$(mean $values)
+    sd=$(stdev $m $values)
+    printf "%*s: avg %6.1f | stdev %6.1f | max %5.0f | min %5.0f\n" \
+        "$len" "$name" "$m" "$sd" \
+        "$(xc "max($(join_by , $values))")" \
+        "$(xc "min($(join_by , $values))")"
+)}
+
+tmp=$(mktemp)
+$CMD_fast | grep -v 'Benchmarking\|thread' | \
+while read line; do
+    printf "%s\t" "$(echo $line | cut -d: -f1)"
+done >> $tmp
+printf "\n" >> $tmp
+
+echo "$N_RUN times '$CMD':"
+
+for i in $(seq 1 $N_RUN); do
+$CMD | grep -v 'Benchmarking\|thread' | \
+while read line; do
+    printf "%d\t" "$(echo $line | cut -d: -f2- | rev | cut -d' ' -f2 | rev)"
+done >> $tmp
+printf "\n" >> $tmp
+done
+
+#cat $tmp
+
+nb_col=$(awk -F$'\t' '{print NF-1}' $tmp | head -1)
+maxlen=0
+for i in $(seq 1 $nb_col); do
+    name=$(head -1 $tmp | cut -d$'\t' -f$i)
+    len=$(printf "%s" "$name" |wc -m)
+    [ "$maxlen" -lt "$len" ] && maxlen=$len
+done
+
+for i in $(seq 1 $nb_col); do
+    name=$(head -1 $tmp | cut -d$'\t' -f$i)
+    values=$(tail -n +2 $tmp | cut -d$'\t' -f$i | xargs)
+    print_stats $maxlen "$name" $values
+done
+
+rm $tmp
diff --git a/tests/automake.mk b/tests/automake.mk
index 66f0ed678..6a3432c25 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -461,6 +461,7 @@ tests_ovstest_SOURCES = \
 	tests/test-heap.c \
 	tests/test-hindex.c \
 	tests/test-hmap.c \
+	tests/test-id-cache.c \
 	tests/test-json.c \
 	tests/test-jsonrpc.c \
 	tests/test-list.c \
diff --git a/tests/test-id-cache.c b/tests/test-id-cache.c
new file mode 100644
index 000000000..dcd820380
--- /dev/null
+++ b/tests/test-id-cache.c
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2021 NVIDIA Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#undef NDEBUG
+#include <assert.h>
+#include <getopt.h>
+#include <string.h>
+
+#include <config.h>
+
+#include "command-line.h"
+#include "id-cache.h"
+#include "id-pool.h"
+#include "id-queue.h"
+#include "openvswitch/vlog.h"
+#include "openvswitch/util.h"
+#include "ovs-thread.h"
+#include "ovs-rcu.h"
+#include "ovs-numa.h"
+#include "ovstest.h"
+#include "random.h"
+#include "seq-pool.h"
+#include "timeval.h"
+#include "util.h"
+
+#define ID_CACHE_SLAB_SIZE 11
+
+static void
+test_id_cache_alloc(void)
+{
+    const uint32_t base = 0;
+    const uint32_t n_id = 10;
+    struct id_cache *ic = id_cache_create(1, base, n_id);
+    uint32_t ids[n_id];
+    size_t i;
+
+    /* Can do n_id allocs. */
+    for (i = 0; i < n_id; i++) {
+        ovs_assert(id_cache_new_id(ic, 0, &ids[i]));
+        ovs_assert(ids[i] >= base);
+        ovs_assert(ids[i] < base + n_id);
+    }
+    /* Only n_id successful allocations. */
+    ovs_assert(id_cache_new_id(ic, 0, NULL) == false);
+
+    /* Monotonic alloc. */
+    for (i = 0; i < n_id - 1; i++) {
+        ovs_assert(ids[i] < ids[i + 1]);
+    }
+
+    for (i = 0; i < n_id; i++) {
+        id_cache_free_id(ic, 0, ids[i]);
+    }
+
+    /* Can do n_id new allocs. */
+    for (i = 0; i < n_id; i++) {
+        ovs_assert(id_cache_new_id(ic, 0, &ids[i]));
+        ovs_assert(ids[i] >= base);
+        ovs_assert(ids[i] < base + n_id);
+    }
+    /* Only n_id successful allocations. */
+    ovs_assert(id_cache_new_id(ic, 0, NULL) == false);
+
+    for (i = 0; i < n_id; i++) {
+        id_cache_free_id(ic, 0, ids[i]);
+    }
+
+    id_cache_destroy(ic);
+}
+
+static void
+test_id_cache_alloc_range(void)
+{
+    const uint32_t n_id = 100;
+    struct id_cache *ic = id_cache_create(1, 0, n_id);
+    bool id_allocated[n_id];
+    size_t i;
+
+    memset(id_allocated, 0, sizeof id_allocated);
+
+    /* Allocate all IDs only once. */
+    for (i = 0; i < n_id; i++) {
+        uint32_t id;
+
+        ovs_assert(id_cache_new_id(ic, 0, &id));
+        ovs_assert(id >= 0);
+        ovs_assert(id < n_id);
+
+        ovs_assert(id_allocated[id] == false);
+        id_allocated[id] = true;
+    }
+    /* Only n_id successful allocations. */
+    ovs_assert(id_cache_new_id(ic, 0, NULL) == false);
+
+    for (i = 0; i < n_id; i++) {
+        ovs_assert(id_allocated[i]);
+        id_cache_free_id(ic, 0, i);
+        id_allocated[i] = false;
+    }
+
+    /* The full range is again fully available. */
+    for (i = 0; i < n_id; i++) {
+        uint32_t id;
+
+        ovs_assert(id_cache_new_id(ic, 0, &id));
+        ovs_assert(id >= 0);
+        ovs_assert(id < n_id);
+
+        ovs_assert(id_allocated[id] == false);
+        id_allocated[id] = true;
+    }
+
+    id_cache_destroy(ic);
+}
+
+static void
+test_id_cache_alloc_steal(void)
+{
+    /* N must be less than a slab size to force the second user
+     * to steal from the first.
+     */
+    const size_t N = ID_CACHE_SLAB_SIZE / 2;
+    bool ids[N];
+    struct id_cache *ic;
+    uint32_t id;
+    size_t i;
+
+    memset(ids, 0, sizeof ids);
+    ic = id_cache_create(2, 0, N);
+
+    /* Fill up user 0 cache. */
+    ovs_assert(id_cache_new_id(ic, 0, &id));
+    for (i = 0; i < N - 1; i++) {
+        /* Check that user 1 can still alloc from user 0 cache. */
+        ovs_assert(id_cache_new_id(ic, 1, &id));
+    }
+
+    id_cache_destroy(ic);
+}
+
+static void
+test_id_cache_alloc_under_limit(void)
+{
+    const size_t n_id = 100;
+    uint32_t ids[n_id];
+    unsigned int limit;
+    struct id_cache *ic;
+    size_t i;
+
+    memset(ids, 0, sizeof ids);
+    ic = id_cache_create(1, 0, n_id);
+
+    for (limit = 1; limit < n_id; limit++) {
+        /* Allocate until arbitrary limit then free allocated ids. */
+        for (i = 0; i < limit; i++) {
+            ovs_assert(id_cache_new_id(ic, 0, &ids[i]));
+        }
+        for (i = 0; i < limit; i++) {
+            id_cache_free_id(ic, 0, ids[i]);
+        }
+        /* Verify that the N='limit' next allocations are under limit. */
+        for (i = 0; i < limit; i++) {
+            ovs_assert(id_cache_new_id(ic, 0, &ids[i]));
+            ovs_assert(ids[i] < limit + ID_CACHE_SLAB_SIZE);
+        }
+        for (i = 0; i < limit; i++) {
+            id_cache_free_id(ic, 0, ids[i]);
+        }
+    }
+
+    id_cache_destroy(ic);
+}
+
+static void
+run_tests(struct ovs_cmdl_context *ctx OVS_UNUSED)
+{
+    test_id_cache_alloc();
+    test_id_cache_alloc_range();
+    test_id_cache_alloc_steal();
+    test_id_cache_alloc_under_limit();
+}
+
+static uint32_t *ids;
+static uint64_t *thread_working_ms; /* Measured work time. */
+
+static unsigned int n_threads;
+static unsigned int n_ids;
+
+static struct ovs_barrier barrier;
+
+#define TIMEOUT_MS (10 * 1000) /* 10 sec timeout */
+static int running_time_ms;
+static volatile bool stop = false;
+
+static int
+elapsed(int *start)
+{
+    return running_time_ms - *start;
+}
+
+static void
+swap_u32(uint32_t *a, uint32_t *b)
+{
+    uint32_t t;
+    t = *a;
+    *a = *b;
+    *b = t;
+}
+
+static void
+shuffle(uint32_t *p, size_t n)
+{
+    for (; n > 1; n--, p++) {
+        uint32_t *q = &p[random_range(n)];
+        swap_u32(p, q);
+    }
+}
+
+static void
+print_result(const char *prefix)
+{
+    uint64_t avg;
+    size_t i;
+
+    avg = 0;
+    for (i = 0; i < n_threads; i++) {
+        avg += thread_working_ms[i];
+    }
+    avg /= n_threads;
+    printf("%s: ", prefix);
+    for (i = 0; i < n_threads; i++) {
+        if (thread_working_ms[i] >= TIMEOUT_MS) {
+            printf("%6" PRIu64 "+", thread_working_ms[i]);
+        } else {
+            printf(" %6" PRIu64, thread_working_ms[i]);
+        }
+    }
+    if (avg >= TIMEOUT_MS) {
+        printf(" ****** ms\n");
+    } else {
+        printf(" %6" PRIu64 " ms\n", avg);
+    }
+}
+
+struct id_cache_aux {
+    struct id_cache *pool;
+    atomic_uint thread_id;
+};
+
+static void *
+id_cache_thread(void *aux_)
+{
+    unsigned int n_ids_per_thread;
+    struct id_cache_aux *aux = aux_;
+    uint32_t *th_ids;
+    unsigned int tid;
+    int start;
+    size_t i;
+
+    atomic_add(&aux->thread_id, 1u, &tid);
+    n_ids_per_thread = n_ids / n_threads;
+    th_ids = &ids[tid * n_ids_per_thread];
+
+    /* NEW / ALLOC */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ignore(id_cache_new_id(aux->pool, tid, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* DEL */
+
+    shuffle(th_ids, n_ids_per_thread);
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        id_cache_free_id(aux->pool, tid, th_ids[i]);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ignore(id_cache_new_id(aux->pool, tid, &th_ids[i]));
+        id_cache_free_id(aux->pool, tid, th_ids[i]);
+        ignore(id_cache_new_id(aux->pool, tid, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* Do not interfere with other threads still in 'MIX' phase. */
+    for (i = 0; i < n_ids_per_thread; i++) {
+        id_cache_free_id(aux->pool, tid, th_ids[i]);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX SHUFFLED */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        if (elapsed(&start) >= TIMEOUT_MS) {
+            break;
+        }
+        ignore(id_cache_new_id(aux->pool, tid, &th_ids[i]));
+        swap_u32(&th_ids[i], &th_ids[random_range(i + 1)]);
+        id_cache_free_id(aux->pool, tid, th_ids[i]);
+        ignore(id_cache_new_id(aux->pool, tid, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    return NULL;
+}
+
+static void
+benchmark_id_cache(void)
+{
+    pthread_t *threads;
+    struct id_cache_aux aux;
+    size_t i;
+
+    memset(ids, 0, n_ids * sizeof *ids);
+    memset(thread_working_ms, 0, n_threads * sizeof *thread_working_ms);
+
+    aux.pool = id_cache_create(n_threads, 0, n_ids);
+    atomic_store(&aux.thread_id, 0);
+
+    for (i = n_ids - (n_ids % n_threads); i < n_ids; i++) {
+        id_cache_new_id(aux.pool, 0, &ids[i]);
+    }
+
+    threads = xmalloc(n_threads * sizeof *threads);
+    ovs_barrier_init(&barrier, n_threads + 1);
+
+    for (i = 0; i < n_threads; i++) {
+        threads[i] = ovs_thread_create("id_cache_alloc",
+                                       id_cache_thread, &aux);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    print_result("id-cache new");
+
+    ovs_barrier_block(&barrier);
+
+    print_result("id-cache del");
+
+    ovs_barrier_block(&barrier);
+    /* Cleanup. */
+    ovs_barrier_block(&barrier);
+
+    print_result("id-cache mix");
+
+    for (i = 0; i < n_threads; i++) {
+        xpthread_join(threads[i], NULL);
+    }
+
+    print_result("id-cache rnd");
+
+    id_cache_destroy(aux.pool);
+    ovs_barrier_destroy(&barrier);
+    free(threads);
+}
+
+struct seq_pool_aux {
+    struct seq_pool *pool;
+    atomic_uint thread_id;
+};
+
+static void *
+seq_pool_thread(void *aux_)
+{
+    unsigned int n_ids_per_thread;
+    struct seq_pool_aux *aux = aux_;
+    uint32_t *th_ids;
+    unsigned int tid;
+    int start;
+    size_t i;
+
+    atomic_add(&aux->thread_id, 1u, &tid);
+    n_ids_per_thread = n_ids / n_threads;
+    th_ids = &ids[tid * n_ids_per_thread];
+
+    /* NEW / ALLOC */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* DEL */
+
+    shuffle(th_ids, n_ids_per_thread);
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        seq_pool_free_id(aux->pool, tid, th_ids[i]);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+        seq_pool_free_id(aux->pool, tid, th_ids[i]);
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* Do not interfere with other threads still in 'MIX' phase. */
+    for (i = 0; i < n_ids_per_thread; i++) {
+        seq_pool_free_id(aux->pool, tid, th_ids[i]);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX SHUFFLED */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        if (elapsed(&start) >= TIMEOUT_MS) {
+            break;
+        }
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+        swap_u32(&th_ids[i], &th_ids[random_range(i + 1)]);
+        seq_pool_free_id(aux->pool, tid, th_ids[i]);
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    return NULL;
+}
+
+OVS_UNUSED
+static void
+benchmark_seq_pool(void)
+{
+    pthread_t *threads;
+    struct seq_pool_aux aux;
+    size_t i;
+
+    memset(ids, 0, n_ids * sizeof *ids);
+    memset(thread_working_ms, 0, n_threads * sizeof *thread_working_ms);
+
+    aux.pool = seq_pool_create(n_threads, 0, n_ids);
+    atomic_store(&aux.thread_id, 0);
+
+    for (i = n_ids - (n_ids % n_threads); i < n_ids; i++) {
+        uint32_t id;
+
+        seq_pool_new_id(aux.pool, 0, &id);
+        ids[i] = id;
+    }
+
+    threads = xmalloc(n_threads * sizeof *threads);
+    ovs_barrier_init(&barrier, n_threads + 1);
+
+    for (i = 0; i < n_threads; i++) {
+        threads[i] = ovs_thread_create("seq_pool_alloc",
+                                       seq_pool_thread, &aux);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    print_result("seq-pool new");
+
+    ovs_barrier_block(&barrier);
+
+    print_result("seq-pool del");
+
+    ovs_barrier_block(&barrier);
+    /* Cleanup. */
+    ovs_barrier_block(&barrier);
+
+    print_result("seq-pool mix");
+
+    for (i = 0; i < n_threads; i++) {
+        xpthread_join(threads[i], NULL);
+    }
+
+    print_result("seq-pool rnd");
+
+    seq_pool_destroy(aux.pool);
+    ovs_barrier_destroy(&barrier);
+    free(threads);
+}
+
+struct id_queue_aux {
+    struct id_queue *queue;
+    atomic_uint thread_id;
+};
+
+static void *
+id_queue_thread(void *aux_)
+{
+    unsigned int n_ids_per_thread;
+    struct id_queue_aux *aux = aux_;
+    uint32_t *th_ids;
+    unsigned int tid;
+    int start;
+    size_t i;
+
+    atomic_add(&aux->thread_id, 1u, &tid);
+    n_ids_per_thread = n_ids / n_threads;
+    th_ids = &ids[tid * n_ids_per_thread];
+
+    /* NEW / ALLOC */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ignore(id_queue_alloc(aux->queue, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* DEL */
+
+    shuffle(th_ids, n_ids_per_thread);
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        id_queue_free(aux->queue, th_ids[i]);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ignore(id_queue_alloc(aux->queue, &th_ids[i]));
+        id_queue_free(aux->queue, th_ids[i]);
+        ignore(id_queue_alloc(aux->queue, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* Do not interfere with other threads still in 'MIX' phase. */
+    for (i = 0; i < n_ids_per_thread; i++) {
+        id_queue_free(aux->queue, th_ids[i]);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX SHUFFLED */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        if (elapsed(&start) >= TIMEOUT_MS) {
+            break;
+        }
+        ignore(id_queue_alloc(aux->queue, &th_ids[i]));
+        swap_u32(&th_ids[i], &th_ids[random_range(i + 1)]);
+        id_queue_free(aux->queue, th_ids[i]);
+        ignore(id_queue_alloc(aux->queue, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    return NULL;
+}
+
+OVS_UNUSED
+static void
+benchmark_id_queue(void)
+{
+    pthread_t *threads;
+    struct id_queue_aux aux;
+    size_t i;
+
+    memset(ids, 0, n_ids * sizeof *ids);
+    memset(thread_working_ms, 0, n_threads * sizeof *thread_working_ms);
+
+    aux.queue = id_queue_create(0, n_ids, 0);
+    atomic_store(&aux.thread_id, 0);
+
+    for (i = n_ids - (n_ids % n_threads); i < n_ids; i++) {
+        id_queue_alloc(aux.queue, &ids[i]);
+    }
+
+    threads = xmalloc(n_threads * sizeof *threads);
+    ovs_barrier_init(&barrier, n_threads + 1);
+
+    for (i = 0; i < n_threads; i++) {
+        threads[i] = ovs_thread_create("id_queue_alloc",
+                                       id_queue_thread, &aux);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    print_result("id-queue new");
+
+    ovs_barrier_block(&barrier);
+
+    print_result("id-queue del");
+
+    ovs_barrier_block(&barrier);
+    /* Cleanup. */
+    ovs_barrier_block(&barrier);
+
+    print_result("id-queue mix");
+
+    for (i = 0; i < n_threads; i++) {
+        xpthread_join(threads[i], NULL);
+    }
+
+    print_result("id-queue rnd");
+
+    id_queue_destroy(aux.queue);
+    ovs_barrier_destroy(&barrier);
+    free(threads);
+}
+
+struct id_pool_aux {
+    struct id_pool *pool;
+    struct ovs_mutex *lock;
+    atomic_uint thread_id;
+};
+
+static void *
+id_pool_thread(void *aux_)
+{
+    unsigned int n_ids_per_thread;
+    struct id_pool_aux *aux = aux_;
+    uint32_t *th_ids;
+    unsigned int tid;
+    int start;
+    size_t i;
+
+    atomic_add(&aux->thread_id, 1u, &tid);
+    n_ids_per_thread = n_ids / n_threads;
+    th_ids = &ids[tid * n_ids_per_thread];
+
+    /* NEW */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ovs_mutex_lock(aux->lock);
+        ovs_assert(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        ovs_mutex_unlock(aux->lock);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* DEL */
+
+    shuffle(th_ids, n_ids_per_thread);
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ovs_mutex_lock(aux->lock);
+        id_pool_free_id(aux->pool, th_ids[i]);
+        ovs_mutex_unlock(aux->lock);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ovs_mutex_lock(aux->lock);
+        ignore(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        id_pool_free_id(aux->pool, th_ids[i]);
+        ignore(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        ovs_mutex_unlock(aux->lock);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* Do not interfere with other threads still in 'MIX' phase. */
+    ovs_mutex_lock(aux->lock);
+    for (i = 0; i < n_ids_per_thread; i++) {
+        id_pool_free_id(aux->pool, th_ids[i]);
+    }
+    ovs_mutex_unlock(aux->lock);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX SHUFFLED */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        if (elapsed(&start) >= TIMEOUT_MS) {
+            break;
+        }
+        ovs_mutex_lock(aux->lock);
+        ignore(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        swap_u32(&th_ids[i], &th_ids[random_range(i + 1)]);
+        id_pool_free_id(aux->pool, th_ids[i]);
+        ignore(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        ovs_mutex_unlock(aux->lock);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    return NULL;
+}
+
+OVS_UNUSED
+static void
+benchmark_id_pool(void)
+{
+    pthread_t *threads;
+    struct id_pool_aux aux;
+    struct ovs_mutex lock;
+    size_t i;
+
+    memset(ids, 0, n_ids * sizeof *ids);
+    memset(thread_working_ms, 0, n_threads * sizeof *thread_working_ms);
+
+    aux.pool = id_pool_create(0, n_ids);
+    aux.lock = &lock;
+    ovs_mutex_init(&lock);
+    atomic_store(&aux.thread_id, 0);
+
+    for (i = n_ids - (n_ids % n_threads); i < n_ids; i++) {
+        id_pool_alloc_id(aux.pool, &ids[i]);
+    }
+
+    threads = xmalloc(n_threads * sizeof *threads);
+    ovs_barrier_init(&barrier, n_threads + 1);
+
+    for (i = 0; i < n_threads; i++) {
+        threads[i] = ovs_thread_create("id_pool_alloc", id_pool_thread, &aux);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    print_result(" id-pool new");
+
+    ovs_barrier_block(&barrier);
+
+    print_result(" id-pool del");
+
+    ovs_barrier_block(&barrier);
+    /* Cleanup. */
+    ovs_barrier_block(&barrier);
+
+    print_result(" id-pool mix");
+
+    for (i = 0; i < n_threads; i++) {
+        xpthread_join(threads[i], NULL);
+    }
+
+    print_result(" id-pool rnd");
+
+    id_pool_destroy(aux.pool);
+    ovs_barrier_destroy(&barrier);
+    free(threads);
+}
+
+static void *
+clock_main(void *arg OVS_UNUSED)
+{
+    struct timeval start;
+    struct timeval end;
+
+    xgettimeofday(&start);
+    while (!stop) {
+        xgettimeofday(&end);
+        running_time_ms = timeval_to_msec(&end) - timeval_to_msec(&start);
+        xnanosleep(1000);
+    }
+
+    return NULL;
+}
+
+static void
+run_benchmarks(struct ovs_cmdl_context *ctx)
+{
+    pthread_t clock;
+    long int l_threads;
+    long int l_ids;
+    size_t i;
+
+    l_ids = strtol(ctx->argv[1], NULL, 10);
+    l_threads = strtol(ctx->argv[2], NULL, 10);
+    ovs_assert(l_ids > 0 && l_threads > 0);
+
+    n_ids = l_ids;
+    n_threads = l_threads;
+
+    ids = xcalloc(n_ids, sizeof *ids);
+    thread_working_ms = xcalloc(n_threads, sizeof *thread_working_ms);
+
+    clock = ovs_thread_create("clock", clock_main, NULL);
+
+    printf("Benchmarking n=%u on %u thread%s.\n", n_ids, n_threads,
+           n_threads > 1 ? "s" : "");
+
+    printf(" type\\thread:  ");
+    for (i = 0; i < n_threads; i++) {
+        printf("   %3" PRIuSIZE " ", i + 1);
+    }
+    printf("   Avg\n");
+
+    ovsrcu_quiesce_start();
+
+    benchmark_id_cache();
+    benchmark_seq_pool();
+    benchmark_id_queue();
+    /* benchmark_id_pool(); */
+
+    stop = true;
+
+    free(thread_working_ms);
+    xpthread_join(clock, NULL);
+}
+
+static const struct ovs_cmdl_command commands[] = {
+    {"check", NULL, 0, 0, run_tests, OVS_RO},
+    {"benchmark", "<nb elem> <nb threads>", 2, 2, run_benchmarks, OVS_RO},
+    {NULL, NULL, 0, 0, NULL, OVS_RO},
+};
+
+static void
+id_queue_test_main(int argc, char *argv[])
+{
+    struct ovs_cmdl_context ctx = {
+        .argc = argc - optind,
+        .argv = argv + optind,
+    };
+
+    vlog_set_levels(NULL, VLF_ANY_DESTINATION, VLL_OFF);
+
+    set_program_name(argv[0]);
+    ovs_cmdl_run_command(&ctx, commands);
+}
+
+OVSTEST_REGISTER("test-id-cache", id_queue_test_main);
diff mbox series

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index f71781438..3ca43b57a 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -295,6 +295,8 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/sat-math.h \
 	lib/seq.c \
 	lib/seq.h \
+	lib/seq-pool.c \
+	lib/seq-pool.h \
 	lib/sha1.c \
 	lib/sha1.h \
 	lib/shash.c \
diff --git a/lib/seq-pool.c b/lib/seq-pool.c
new file mode 100644
index 000000000..4426d11d8
--- /dev/null
+++ b/lib/seq-pool.c
@@ -0,0 +1,198 @@ 
+/*
+ * Copyright (c) 2020 NVIDIA Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "openvswitch/list.h"
+#include "openvswitch/thread.h"
+#include "openvswitch/util.h"
+#include "ovs-atomic.h"
+#include "llring.h"
+#include "seq-pool.h"
+
+#define SEQPOOL_CACHE_SIZE 32
+BUILD_ASSERT_DECL(IS_POW2(SEQPOOL_CACHE_SIZE));
+
+struct seq_node {
+    struct ovs_list list_node;
+    uint32_t id;
+};
+
+struct seq_pool {
+    uint32_t next_id;
+    struct llring **cache; /* Per-user ID cache. */
+    size_t nb_user; /* Number of user threads. */
+    struct ovs_mutex lock; /* Protects free_ids access. */
+    struct ovs_list free_ids; /* Set of currently free IDs. */
+    uint32_t base; /* IDs in the range of [base, base + n_ids). */
+    uint32_t n_ids; /* Total number of ids in the pool. */
+};
+
+struct seq_pool *
+seq_pool_create(unsigned int nb_user, uint32_t base, uint32_t n_ids)
+{
+    struct seq_pool *pool;
+    size_t i;
+
+    ovs_assert(nb_user != 0);
+    ovs_assert(base <= UINT32_MAX - n_ids);
+
+    pool = xmalloc(sizeof *pool);
+
+    pool->cache = xcalloc(nb_user, sizeof *pool->cache);
+    for (i = 0; i < nb_user; i++) {
+        pool->cache[i] = llring_create(SEQPOOL_CACHE_SIZE);
+    }
+    pool->nb_user = nb_user;
+
+    pool->next_id = base;
+    pool->base = base;
+    pool->n_ids = n_ids;
+
+    ovs_mutex_init(&pool->lock);
+    ovs_list_init(&pool->free_ids);
+
+    return pool;
+}
+
+void
+seq_pool_destroy(struct seq_pool *pool)
+{
+    struct seq_node *node;
+    struct seq_node *next;
+    size_t i;
+
+    if (!pool) {
+        return;
+    }
+
+    ovs_mutex_lock(&pool->lock);
+    LIST_FOR_EACH_SAFE (node, next, list_node, &pool->free_ids) {
+        free(node);
+    }
+    ovs_list_poison(&pool->free_ids);
+    ovs_mutex_unlock(&pool->lock);
+    ovs_mutex_destroy(&pool->lock);
+
+    for (i = 0; i < pool->nb_user; i++) {
+        llring_destroy(pool->cache[i]);
+    }
+    free(pool->cache);
+
+    free(pool);
+}
+
+bool
+seq_pool_new_id(struct seq_pool *pool, unsigned int uid, uint32_t *id)
+{
+    struct llring *cache;
+    struct ovs_list *front;
+    struct seq_node *node;
+
+    uid %= pool->nb_user;
+    cache = pool->cache[uid];
+
+    if (llring_dequeue(cache, id)) {
+        return true;
+    }
+
+    ovs_mutex_lock(&pool->lock);
+
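+    /* Move IDs from the shared free list into this user's cache. */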
+    while (!ovs_list_is_empty(&pool->free_ids)) {
+        front = ovs_list_front(&pool->free_ids);
+        node = CONTAINER_OF(front, struct seq_node, list_node);
+        if (llring_enqueue(cache, node->id)) {
+            ovs_list_remove(front);
+            free(node);
+        } else {
+            break;
+        }
+    }
+
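+    /* Then take fresh IDs from the shared counter, until the cache is full
+     * or the pool range is exhausted. */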
+    while (pool->next_id < pool->base + pool->n_ids) {
+        if (llring_enqueue(cache, pool->next_id)) {
+            pool->next_id++;
+        } else {
+            break;
+        }
+    }
+
+    ovs_mutex_unlock(&pool->lock);
+
+    if (llring_dequeue(cache, id)) {
+        return true;
+    } else {
+        struct llring *c2;
+        size_t i;
+
+        /* If no ID was available from the local cache, the free list,
+         * or the shared counter, steal an ID from another user's
+         * cache.
+         */
+        for (i = 0; i < pool->nb_user; i++) {
+            if (i == uid) {
+                continue;
+            }
+            c2 = pool->cache[i];
+            if (llring_dequeue(c2, id)) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+void
+seq_pool_free_id(struct seq_pool *pool, unsigned int uid, uint32_t id)
+{
+    struct seq_node *nodes[SEQPOOL_CACHE_SIZE + 1];
+    struct llring *cache;
+    uint32_t node_id;
+    size_t i;
+
+    if (id < pool->base || id >= pool->base + pool->n_ids) {
+        return;
+    }
+
+    uid %= pool->nb_user;
+    cache = pool->cache[uid];
+
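+    /* Common case: the freed ID fits in this user's cache. */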
+    if (llring_enqueue(cache, id)) {
+        return;
+    }
+
+    /* Flush the cache. */
+    for (i = 0; llring_dequeue(cache, &node_id); i++) {
+        nodes[i] = xmalloc(sizeof *nodes[i]);
+        nodes[i]->id = node_id;
+    }
+
+    /* Finish with the last freed node. */
+    nodes[i] = xmalloc(sizeof **nodes);
+    nodes[i]->id = id;
+    i++;
+
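+    /* NULL-terminate 'nodes' if the array is not completely full. */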
+    if (i < ARRAY_SIZE(nodes)) {
+        nodes[i] = NULL;
+    }
+
+    ovs_mutex_lock(&pool->lock);
+    for (i = 0; i < ARRAY_SIZE(nodes) && nodes[i] != NULL; i++) {
+        ovs_list_push_back(&pool->free_ids, &nodes[i]->list_node);
+    }
+    ovs_mutex_unlock(&pool->lock);
+}
diff --git a/lib/seq-pool.h b/lib/seq-pool.h
new file mode 100644
index 000000000..c992a0988
--- /dev/null
+++ b/lib/seq-pool.h
@@ -0,0 +1,66 @@ 
+/*
+ * Copyright (c) 2020 NVIDIA Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef SEQ_POOL_H
+#define SEQ_POOL_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * Sequential ID pool.
+ * ===================
+ *
+ * Pool of unique 32-bit IDs.
+ *
+ * Multiple users are registered at initialization.  Each user has its own
+ * cache of IDs.  When each thread using the pool uses its own user ID, the
+ * pool scales reasonably well for concurrent allocation.
+ *
+ * New IDs are always in the range '[base, next_id]', where 'next_id' is
+ * at most 'last_allocated_id + nb_user * cache_size + 1'.
+ * This means that a new ID is not always the smallest available ID, but it is
+ * still from a limited range.
+ *
+ * Users must ensure that an ID is *never* freed twice.  Freeing an ID twice
+ * will cause it to be handed out to two different users afterward.
+ *
+ * Thread-safety
+ * =============
+ *
+ * The APIs are thread-safe.
+ *
+ * Multiple threads can share the same user ID if necessary, but it can hurt
+ * performance if threads are not otherwise synchronized.
+ */
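+
+/*
+ * Example (illustrative sketch; the user count and ID range are arbitrary):
+ *
+ *     struct seq_pool *pool = seq_pool_create(2, 0, 1000);
+ *     uint32_t id;
+ *
+ *     if (seq_pool_new_id(pool, 0, &id)) {
+ *         ...use 'id'...
+ *         seq_pool_free_id(pool, 0, id);
+ *     }
+ *
+ *     seq_pool_destroy(pool);
+ */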
+
+struct seq_pool;
+
+/* 'nb_user' is the expected number of pool users, i.e. execution threads. */
+struct seq_pool *seq_pool_create(unsigned int nb_user,
+                                 uint32_t base, uint32_t n_ids);
+void seq_pool_destroy(struct seq_pool *pool);
+
+/* 'uid' is the thread's user ID.  It must be within '[0, nb_user)'. */
+bool seq_pool_new_id(struct seq_pool *pool, unsigned int uid, uint32_t *id);
+
+/* 'uid' is the thread's user ID.  It must be within '[0, nb_user)'.
+ * An allocated ID must *never* be freed twice.
+ */
+void seq_pool_free_id(struct seq_pool *pool, unsigned int uid, uint32_t id);
+#endif  /* seq-pool.h */
diff --git a/tests/automake.mk b/tests/automake.mk
index 4588d5b49..66f0ed678 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -475,6 +475,7 @@  tests_ovstest_SOURCES = \
 	tests/test-rcu.c \
 	tests/test-reconnect.c \
 	tests/test-rstp.c \
+	tests/test-seq-pool.c \
 	tests/test-sflow.c \
 	tests/test-sha1.c \
 	tests/test-skiplist.c \
diff --git a/tests/library.at b/tests/library.at
index 0e47bc445..5ffe2fd0a 100644
--- a/tests/library.at
+++ b/tests/library.at
@@ -264,3 +264,8 @@  AT_SETUP([mpsc-queue module])
 AT_CHECK([ovstest test-mpsc-queue check], [0], [....
 ])
 AT_CLEANUP
+
+AT_SETUP([seq-pool module])
+AT_CHECK([ovstest test-seq-pool check], [0], [....
+])
+AT_CLEANUP
diff --git a/tests/test-seq-pool.c b/tests/test-seq-pool.c
new file mode 100644
index 000000000..a1e0d5500
--- /dev/null
+++ b/tests/test-seq-pool.c
@@ -0,0 +1,543 @@ 
+/*
+ * Copyright (c) 2020 NVIDIA Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#undef NDEBUG
+#include <assert.h>
+#include <getopt.h>
+#include <string.h>
+
+#include <config.h>
+
+#include "command-line.h"
+#include "id-pool.h"
+#include "openvswitch/util.h"
+#include "ovs-thread.h"
+#include "ovs-rcu.h"
+#include "ovstest.h"
+#include "random.h"
+#include "seq-pool.h"
+#include "timeval.h"
+#include "util.h"
+
+#define SEQ_POOL_CACHE_SIZE 32
+
+#define N_IDS 100
+
+static void
+test_seq_pool_alloc_full_range(void)
+{
+    bool ids[N_IDS];
+    struct seq_pool *pool;
+    size_t i;
+
+    memset(ids, 0, sizeof ids);
+    pool = seq_pool_create(1, 0, N_IDS);
+
+    for (i = 0; i < N_IDS; i++) {
+        uint32_t id;
+
+        ovs_assert(seq_pool_new_id(pool, 0, &id));
+        /* No double allocation. */
+        ovs_assert(ids[id] == false);
+        ids[id] = true;
+    }
+
+    for (i = 0; i < N_IDS; i++) {
+        ovs_assert(ids[i]);
+    }
+
+    seq_pool_destroy(pool);
+    printf(".");
+}
+
+static void
+test_seq_pool_alloc_steal(void)
+{
+    /* N must be smaller than the pool cache size to force the second user
+     * to steal from the first user's cache.
+     */
+#define N (SEQ_POOL_CACHE_SIZE / 4)
+    bool ids[N];
+    struct seq_pool *pool;
+    uint32_t id;
+    size_t i;
+
+    memset(ids, 0, sizeof ids);
+    pool = seq_pool_create(2, 0, N);
+
+    /* Fill up user 0 cache. */
+    ovs_assert(seq_pool_new_id(pool, 0, &id));
+    for (i = 0; i < N - 1; i++) {
+        /* Check that user 1 can still alloc from user 0 cache. */
+        ovs_assert(seq_pool_new_id(pool, 1, &id));
+    }
+
+    seq_pool_destroy(pool);
+    printf(".");
+#undef N
+}
+
+static void
+test_seq_pool_alloc_monotonic(void)
+{
+    uint32_t ids[N_IDS];
+    struct seq_pool *pool;
+    size_t i;
+
+    memset(ids, 0, sizeof ids);
+    pool = seq_pool_create(1, 0, N_IDS);
+
+    for (i = 0; i < N_IDS; i++) {
+        ovs_assert(seq_pool_new_id(pool, 0, &ids[i]));
+        ovs_assert(ids[i] == i);
+    }
+
+    seq_pool_destroy(pool);
+    printf(".");
+}
+
+static void
+test_seq_pool_alloc_under_limit(void)
+{
+    uint32_t ids[N_IDS];
+    unsigned int limit;
+    struct seq_pool *pool;
+    size_t i;
+
+    memset(ids, 0, sizeof ids);
+    pool = seq_pool_create(1, 0, N_IDS);
+
+    for (limit = 1; limit < N_IDS; limit++) {
+        /* Allocate until arbitrary limit then free allocated ids. */
+        for (i = 0; i < limit; i++) {
+            ovs_assert(seq_pool_new_id(pool, 0, &ids[i]));
+        }
+        for (i = 0; i < limit; i++) {
+            seq_pool_free_id(pool, 0, ids[i]);
+        }
+        /* Verify that the next 'limit' allocations stay below
+         * 'limit + cache size'. */
+        for (i = 0; i < limit; i++) {
+            ovs_assert(seq_pool_new_id(pool, 0, &ids[i]));
+            ovs_assert(ids[i] < limit + SEQ_POOL_CACHE_SIZE);
+        }
+        for (i = 0; i < limit; i++) {
+            seq_pool_free_id(pool, 0, ids[i]);
+        }
+    }
+
+    seq_pool_destroy(pool);
+    printf(".");
+}
+
+static void
+run_tests(struct ovs_cmdl_context *ctx OVS_UNUSED)
+{
+    /* Check that all ids can be allocated. */
+    test_seq_pool_alloc_full_range();
+    /* Check that all ids can be allocated with multiple users. */
+    test_seq_pool_alloc_steal();
+    /* Check that id allocation is always increasing. */
+    test_seq_pool_alloc_monotonic();
+    /* Check that id allocation stays under some limit. */
+    test_seq_pool_alloc_under_limit();
+    printf("\n");
+}
+
+static uint32_t *ids;
+static uint64_t *thread_working_ms; /* Measured work time. */
+
+static unsigned int n_threads;
+static unsigned int n_ids;
+
+static struct ovs_barrier barrier;
+
+#define TIMEOUT_MS (10 * 1000) /* 10 sec timeout */
+static int running_time_ms;
+static volatile bool stop = false;
+
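+/* Milliseconds elapsed since '*start', as measured by the clock thread. */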
+static int
+elapsed(int *start)
+{
+    return running_time_ms - *start;
+}
+
+static void
+swap_u32(uint32_t *a, uint32_t *b)
+{
+    uint32_t t;
+    t = *a;
+    *a = *b;
+    *b = t;
+}
+
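+/* Fisher-Yates shuffle of the first 'n' elements of 'p'. */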
+static void
+shuffle(uint32_t *p, size_t n)
+{
+    for (; n > 1; n--, p++) {
+        uint32_t *q = &p[random_range(n)];
+        swap_u32(p, q);
+    }
+}
+
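+/* Print each thread's working time and the average; timeouts are marked. */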
+static void
+print_result(const char *prefix)
+{
+    uint64_t avg;
+    size_t i;
+
+    avg = 0;
+    for (i = 0; i < n_threads; i++) {
+        avg += thread_working_ms[i];
+    }
+    avg /= n_threads;
+    printf("%s: ", prefix);
+    for (i = 0; i < n_threads; i++) {
+        if (thread_working_ms[i] >= TIMEOUT_MS) {
+            printf("%6" PRIu64 "+", thread_working_ms[i]);
+        } else {
+            printf(" %6" PRIu64, thread_working_ms[i]);
+        }
+    }
+    if (avg >= TIMEOUT_MS) {
+        printf(" ****** ms\n");
+    } else {
+        printf(" %6" PRIu64 " ms\n", avg);
+    }
+}
+
+struct seq_pool_aux {
+    struct seq_pool *pool;
+    atomic_uint thread_id;
+};
+
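+/* Benchmark thread: runs the 'new', 'del', 'mix' and 'rnd' rounds in turn,
+ * synchronizing with the main thread on 'barrier' between rounds. */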
+static void *
+seq_pool_thread(void *aux_)
+{
+    unsigned int n_ids_per_thread;
+    struct seq_pool_aux *aux = aux_;
+    uint32_t *th_ids;
+    unsigned int tid;
+    int start;
+    size_t i;
+
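+    /* Claim a unique thread index and its slice of the 'ids' array. */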
+    atomic_add(&aux->thread_id, 1u, &tid);
+    n_ids_per_thread = n_ids / n_threads;
+    th_ids = &ids[tid * n_ids_per_thread];
+
+    /* NEW / ALLOC */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* DEL */
+
+    shuffle(th_ids, n_ids_per_thread);
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        seq_pool_free_id(aux->pool, tid, th_ids[i]);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+        seq_pool_free_id(aux->pool, tid, th_ids[i]);
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX SHUFFLED */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        if (elapsed(&start) >= TIMEOUT_MS) {
+            break;
+        }
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+        swap_u32(&th_ids[i], &th_ids[random_range(i + 1)]);
+        seq_pool_free_id(aux->pool, tid, th_ids[i]);
+        ignore(seq_pool_new_id(aux->pool, tid, &th_ids[i]));
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    return NULL;
+}
+
+static void
+benchmark_seq_pool(void)
+{
+    pthread_t *threads;
+    struct seq_pool_aux aux;
+    size_t i;
+
+    memset(ids, 0, n_ids * sizeof *ids);
+    memset(thread_working_ms, 0, n_threads * sizeof *thread_working_ms);
+
+    aux.pool = seq_pool_create(n_threads, 0, n_ids);
+    atomic_store(&aux.thread_id, 0);
+
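+    /* Pre-allocate the IDs that do not divide evenly among threads, so that
+     * each thread then works on an equal share. */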
+    for (i = n_ids - (n_ids % n_threads); i < n_ids; i++) {
+        uint32_t id;
+
+        seq_pool_new_id(aux.pool, 0, &id);
+        ids[i] = id;
+    }
+
+    threads = xmalloc(n_threads * sizeof *threads);
+    ovs_barrier_init(&barrier, n_threads + 1);
+
+    for (i = 0; i < n_threads; i++) {
+        threads[i] = ovs_thread_create("seq_pool_alloc",
+                                       seq_pool_thread, &aux);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    print_result("seq-pool new");
+
+    ovs_barrier_block(&barrier);
+
+    print_result("seq-pool del");
+
+    ovs_barrier_block(&barrier);
+
+    print_result("seq-pool mix");
+
+    for (i = 0; i < n_threads; i++) {
+        xpthread_join(threads[i], NULL);
+    }
+
+    print_result("seq-pool rnd");
+
+    seq_pool_destroy(aux.pool);
+    ovs_barrier_destroy(&barrier);
+    free(threads);
+}
+
+struct id_pool_aux {
+    struct id_pool *pool;
+    struct ovs_mutex *lock;
+    atomic_uint thread_id;
+};
+
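+/* Same rounds as seq_pool_thread, exercising the id-pool under a mutex. */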
+static void *
+id_pool_thread(void *aux_)
+{
+    unsigned int n_ids_per_thread;
+    struct id_pool_aux *aux = aux_;
+    uint32_t *th_ids;
+    unsigned int tid;
+    int start;
+    size_t i;
+
+    atomic_add(&aux->thread_id, 1u, &tid);
+    n_ids_per_thread = n_ids / n_threads;
+    th_ids = &ids[tid * n_ids_per_thread];
+
+    /* NEW */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ovs_mutex_lock(aux->lock);
+        ovs_assert(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        ovs_mutex_unlock(aux->lock);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* DEL */
+
+    shuffle(th_ids, n_ids_per_thread);
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ovs_mutex_lock(aux->lock);
+        id_pool_free_id(aux->pool, th_ids[i]);
+        ovs_mutex_unlock(aux->lock);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        ovs_mutex_lock(aux->lock);
+        ignore(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        id_pool_free_id(aux->pool, th_ids[i]);
+        ignore(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        ovs_mutex_unlock(aux->lock);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    ovs_barrier_block(&barrier);
+
+    /* MIX SHUFFLED */
+
+    start = running_time_ms;
+    for (i = 0; i < n_ids_per_thread; i++) {
+        if (elapsed(&start) >= TIMEOUT_MS) {
+            break;
+        }
+        ovs_mutex_lock(aux->lock);
+        ignore(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        swap_u32(&th_ids[i], &th_ids[random_range(i + 1)]);
+        id_pool_free_id(aux->pool, th_ids[i]);
+        ignore(id_pool_alloc_id(aux->pool, &th_ids[i]));
+        ovs_mutex_unlock(aux->lock);
+    }
+    thread_working_ms[tid] = elapsed(&start);
+
+    return NULL;
+}
+
+static void
+benchmark_id_pool(void)
+{
+    pthread_t *threads;
+    struct id_pool_aux aux;
+    struct ovs_mutex lock;
+    size_t i;
+
+    memset(ids, 0, n_ids * sizeof *ids);
+    memset(thread_working_ms, 0, n_threads * sizeof *thread_working_ms);
+
+    aux.pool = id_pool_create(0, n_ids);
+    aux.lock = &lock;
+    ovs_mutex_init(&lock);
+    atomic_store(&aux.thread_id, 0);
+
+    for (i = n_ids - (n_ids % n_threads); i < n_ids; i++) {
+        id_pool_alloc_id(aux.pool, &ids[i]);
+    }
+
+    threads = xmalloc(n_threads * sizeof *threads);
+    ovs_barrier_init(&barrier, n_threads + 1);
+
+    for (i = 0; i < n_threads; i++) {
+        threads[i] = ovs_thread_create("id_pool_alloc", id_pool_thread, &aux);
+    }
+
+    ovs_barrier_block(&barrier);
+
+    print_result(" id-pool new");
+
+    ovs_barrier_block(&barrier);
+
+    print_result(" id-pool del");
+
+    ovs_barrier_block(&barrier);
+
+    print_result(" id-pool mix");
+
+    for (i = 0; i < n_threads; i++) {
+        xpthread_join(threads[i], NULL);
+    }
+
+    print_result(" id-pool rnd");
+
+    id_pool_destroy(aux.pool);
+    ovs_barrier_destroy(&barrier);
+    free(threads);
+}
+
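+/* Background thread that keeps 'running_time_ms' up to date. */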
+static void *
+clock_main(void *arg OVS_UNUSED)
+{
+    struct timeval start;
+    struct timeval end;
+
+    xgettimeofday(&start);
+    while (!stop) {
+        xgettimeofday(&end);
+        running_time_ms = timeval_to_msec(&end) - timeval_to_msec(&start);
+        xnanosleep(1000);
+    }
+
+    return NULL;
+}
+
+static void
+run_benchmarks(struct ovs_cmdl_context *ctx)
+{
+    pthread_t clock;
+    long int l_threads;
+    long int l_ids;
+    size_t i;
+
+    l_ids = strtol(ctx->argv[1], NULL, 10);
+    l_threads = strtol(ctx->argv[2], NULL, 10);
+    ovs_assert(l_ids > 0 && l_threads > 0);
+
+    n_ids = l_ids;
+    n_threads = l_threads;
+
+    ids = xcalloc(n_ids, sizeof *ids);
+    thread_working_ms = xcalloc(n_threads, sizeof *thread_working_ms);
+
+    clock = ovs_thread_create("clock", clock_main, NULL);
+
+    printf("Benchmarking n=%u on %u thread%s.\n", n_ids, n_threads,
+           n_threads > 1 ? "s" : "");
+
+    printf(" type\\thread:  ");
+    for (i = 0; i < n_threads; i++) {
+        printf("   %3" PRIuSIZE " ", i + 1);
+    }
+    printf("   Avg\n");
+
+    benchmark_seq_pool();
+    benchmark_id_pool();
+
+    stop = true;
+
+    free(thread_working_ms);
+    xpthread_join(clock, NULL);
+}
+
+static const struct ovs_cmdl_command commands[] = {
+    {"check", NULL, 0, 0, run_tests, OVS_RO},
+    {"benchmark", "<nb elem> <nb threads>", 2, 2, run_benchmarks, OVS_RO},
+    {NULL, NULL, 0, 0, NULL, OVS_RO},
+};
+
+static void
+test_seq_pool_main(int argc, char *argv[])
+{
+    struct ovs_cmdl_context ctx = {
+        .argc = argc - optind,
+        .argv = argv + optind,
+    };
+
+    set_program_name(argv[0]);
+    ovs_cmdl_run_command(&ctx, commands);
+}
+
+OVSTEST_REGISTER("test-seq-pool", test_seq_pool_main);