[RFC,3/3] qemu-coroutine: use a ring per thread for the pool

Message ID 54786CF5.2060705@kamp.de
State New

Commit Message

Peter Lieven Nov. 28, 2014, 12:39 p.m. UTC
Am 28.11.2014 um 13:26 schrieb Paolo Bonzini:
>
> On 28/11/2014 12:46, Peter Lieven wrote:
>>> I get:
>>> Run operation 40000000 iterations 9.883958 s, 4046K operations/s, 247ns per coroutine
>> Ok, understood, it "steals" the whole pool, right? Isn't that bad if we have more
>> than one thread in need of a lot of coroutines?
> Overall the algorithm is expected to adapt.  The N threads contribute to
> the global release pool, so the pool will fill up N times faster than if
> you had only one thread.  There can be some variance, which is why the
> maximum size of the pool is twice the threshold (and probably could be
> tuned better).
>
> Benchmarks are needed on real I/O too, of course, especially with high
> queue depth.

Yes, cool. The atomic operations are a bit tricky at first glance ;-)

Question:
 Why is the pool_size increment atomic, but the set to zero is not?
 
Idea:
 If the release_pool is full, why not put the coroutine in the thread's alloc_pool instead of throwing it away? :-)

Run operation 40000000 iterations 9.057805 s, 4416K operations/s, 226ns per coroutine



Bug?:
 The release_pool is not cleaned up on termination, I think.

Peter
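
To make the pool "stealing" discussed above concrete, here is a minimal C11 sketch of the lock-free pattern behind QSLIST_INSERT_HEAD_ATOMIC and QSLIST_MOVE_ATOMIC (illustrative types and names, not QEMU's actual macros): producers push onto a shared intrusive list with a CAS loop, and a consumer detaches the entire list with a single atomic exchange.

#include <stdatomic.h>
#include <stddef.h>

struct coroutine {
    struct coroutine *next;
};

/* Global release pool: atomic head of an intrusive singly-linked list. */
static _Atomic(struct coroutine *) release_pool_head;

/* Producers push released coroutines with a CAS loop (Treiber stack). */
static void pool_push(struct coroutine *co)
{
    co->next = atomic_load(&release_pool_head);
    while (!atomic_compare_exchange_weak(&release_pool_head,
                                         &co->next, co)) {
        /* On failure, co->next was reloaded with the current head; retry. */
    }
}

/* A consumer "steals" the whole pool: one exchange detaches every node
 * into the calling thread, leaving an empty list for the producers. */
static struct coroutine *pool_steal_all(void)
{
    return atomic_exchange(&release_pool_head, NULL);
}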

Comments

Paolo Bonzini Nov. 28, 2014, 12:45 p.m. UTC | #1
On 28/11/2014 13:39, Peter Lieven wrote:
> Am 28.11.2014 um 13:26 schrieb Paolo Bonzini:
>>
>> On 28/11/2014 12:46, Peter Lieven wrote:
>>>> I get:
>>>> Run operation 40000000 iterations 9.883958 s, 4046K operations/s, 247ns per coroutine
>>> Ok, understood, it "steals" the whole pool, right? Isn't that bad if we have more
>>> than one thread in need of a lot of coroutines?
>> Overall the algorithm is expected to adapt.  The N threads contribute to
>> the global release pool, so the pool will fill up N times faster than if
>> you had only one thread.  There can be some variance, which is why the
>> maximum size of the pool is twice the threshold (and probably could be
>> tuned better).
>>
>> Benchmarks are needed on real I/O too, of course, especially with high
>> queue depth.
> 
> Yes, cool. The atomic operations are a bit tricky at first glance ;-)
> 
> Question:
>  Why is the pool_size increment atomic, but the set to zero is not?

Because the set to zero is not a read-modify-write operation, it is
always atomic.  It's just not sequentially consistent (see
docs/atomics.txt for some info on what that means).
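
As an illustration of that distinction (a minimal C11 sketch, not QEMU's atomics API): a plain aligned store is indivisible on its own, whereas an increment is a load-add-store sequence that loses updates unless it is an atomic read-modify-write.

#include <stdatomic.h>

static _Atomic unsigned int pool_size;

/* A store of a single aligned word cannot be observed half-written, so
 * resetting to zero is atomic even with relaxed ordering; what it lacks
 * is sequential consistency with surrounding memory operations. */
static void pool_size_reset(void)
{
    atomic_store_explicit(&pool_size, 0, memory_order_relaxed);
}

/* An increment is load + add + store: two racing threads can both load
 * the old value and lose one update, so it needs an atomic RMW. */
static void pool_size_inc(void)
{
    atomic_fetch_add_explicit(&pool_size, 1, memory_order_seq_cst);
}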

> Idea:
>  If the release_pool is full, why not put the coroutine in the thread's alloc_pool instead of throwing it away? :-)

Because you can only waste 64 coroutines per thread.  But numbers cannot
be sneezed at, so it's worth doing it as a separate patch.

> Run operation 40000000 iterations 9.057805 s, 4416K operations/s, 226ns per coroutine
> 
> diff --git a/qemu-coroutine.c b/qemu-coroutine.c
> index 6bee354..edea162 100644
> --- a/qemu-coroutine.c
> +++ b/qemu-coroutine.c
> @@ -25,8 +25,9 @@ enum {
>  
>  /** Free list to speed up creation */
>  static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
> -static unsigned int pool_size;
> +static unsigned int release_pool_size;
>  static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
> +static __thread unsigned int alloc_pool_size;
>  
>  /* The GPrivate is only used to invoke coroutine_pool_cleanup.  */
>  static void coroutine_pool_cleanup(void *value);
> @@ -39,12 +40,12 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>      if (CONFIG_COROUTINE_POOL) {
>          co = QSLIST_FIRST(&alloc_pool);
>          if (!co) {
> -            if (pool_size > POOL_BATCH_SIZE) {
> -                /* This is not exact; there could be a little skew between pool_size
> +            if (release_pool_size > POOL_BATCH_SIZE) {
> +                /* This is not exact; there could be a little skew between release_pool_size
>                   * and the actual size of alloc_pool.  But it is just a heuristic,
>                   * it does not need to be perfect.
>                   */
> -                pool_size = 0;
> +                alloc_pool_size = atomic_fetch_and(&release_pool_size, 0);
>                  QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
>                  co = QSLIST_FIRST(&alloc_pool);
>  
> @@ -53,6 +54,8 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>                   */
>                  g_private_set(&dummy_key, &dummy_key);
>              }
> +        } else {
> +            alloc_pool_size--;
>          }
>          if (co) {
>              QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
> @@ -71,10 +74,15 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>  static void coroutine_delete(Coroutine *co)
>  {
>      if (CONFIG_COROUTINE_POOL) {
> -        if (pool_size < POOL_BATCH_SIZE * 2) {
> +        if (release_pool_size < POOL_BATCH_SIZE * 2) {
>              co->caller = NULL;
>              QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
> -            atomic_inc(&pool_size);
> +            atomic_inc(&release_pool_size);
> +            return;
> +        } else if (alloc_pool_size < POOL_BATCH_SIZE) {
> +            co->caller = NULL;
> +            QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
> +            alloc_pool_size++;
>              return;
>          }
>      }
> 
> 
> Bug?:
>  The release_pool is not cleaned up on termination, I think.

That's not necessary, it is global.

Paolo
Peter Lieven Nov. 28, 2014, 12:49 p.m. UTC | #2
Am 28.11.2014 um 13:45 schrieb Paolo Bonzini:
>
> On 28/11/2014 13:39, Peter Lieven wrote:
>> Am 28.11.2014 um 13:26 schrieb Paolo Bonzini:
>>> On 28/11/2014 12:46, Peter Lieven wrote:
>>>>> I get:
>>>>> Run operation 40000000 iterations 9.883958 s, 4046K operations/s, 247ns per coroutine
>>>> Ok, understood, it "steals" the whole pool, right? Isn't that bad if we have more
>>>> than one thread in need of a lot of coroutines?
>>> Overall the algorithm is expected to adapt.  The N threads contribute to
>>> the global release pool, so the pool will fill up N times faster than if
>>> you had only one thread.  There can be some variance, which is why the
>>> maximum size of the pool is twice the threshold (and probably could be
>>> tuned better).
>>>
>>> Benchmarks are needed on real I/O too, of course, especially with high
>>> queue depth.
>> Yes, cool. The atomic operations are a bit tricky at first glance ;-)
>>
>> Question:
>>  Why is the pool_size increment atomic, but the set to zero is not?
> Because the set to zero is not a read-modify-write operation, it is
> always atomic.  It's just not sequentially consistent (see
> docs/atomics.txt for some info on what that means).
>
>> Idea:
>>  If the release_pool is full, why not put the coroutine in the thread's alloc_pool instead of throwing it away? :-)
> Because you can only waste 64 coroutines per thread.  But numbers cannot
> be sneezed at, so it's worth doing it as a separate patch.

What do you mean by that? If I use dataplane, I will fill the global pool and never use it, but
then I use thread-local storage only. So I get the same numbers as with my thread-local-storage-only version.

Maybe it is an idea to tweak the POOL_BATCH_SIZE * 2 according to what is really attached. If we
have only dataplane or ioeventfd, it can be POOL_BATCH_SIZE * 0 and we won't even waste those
coroutines oxidizing in the global pool.

Peter
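
A sketch of the tweak Peter proposes (hypothetical names, not an existing QEMU interface): replace the hard-coded POOL_BATCH_SIZE * 2 bound in coroutine_delete() with a variable that drops to zero when every attached device uses ioeventfd/dataplane, since those threads create and release coroutines locally and never need the global pool.

#include <stdbool.h>

#define POOL_BATCH_SIZE 64  /* matches qemu-coroutine.c */

static unsigned int release_pool_max = POOL_BATCH_SIZE * 2;

/* Hypothetical knob: callers report whether only ioeventfd/dataplane
 * users are attached; coroutine_delete() would then test
 * release_pool_size < release_pool_max instead of the constant. */
static void coroutine_pool_set_global_limit(bool only_ioeventfd_users)
{
    release_pool_max = only_ioeventfd_users ? 0 : POOL_BATCH_SIZE * 2;
}
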
Paolo Bonzini Nov. 28, 2014, 12:56 p.m. UTC | #3
On 28/11/2014 13:49, Peter Lieven wrote:
>>> Idea:
>>> If the release_pool is full, why not put the coroutine in the thread's alloc_pool instead of throwing it away? :-)
>> Because you can only waste 64 coroutines per thread.  But numbers cannot

s/only//

>> be sneezed at, so it's worth doing it as a separate patch.
> What do you mean by that? If I use dataplane, I will fill the global pool and never use it, but
> then I use thread-local storage only. So I get the same numbers as with my thread-local-storage-only version.

Right.  I didn't want to waste the coroutines.  But it's not 64
coroutines per VCPU thread, it's just 64 coroutines for the global
iothread because all the dataplane threads are guaranteed to use
ioeventfd.  Let's do it. :)

Can I add your Signed-off-by to the patch?

Paolo

> Maybe it is an idea to tweak the POOL_BATCH_SIZE * 2 according to what is really attached. If we
> have only dataplane or ioeventfd, it can be POOL_BATCH_SIZE * 0 and we won't even waste those
> coroutines oxidizing in the global pool.
Peter Lieven Nov. 28, 2014, 1:13 p.m. UTC | #4
Am 28.11.2014 um 13:39 schrieb Peter Lieven:
> Am 28.11.2014 um 13:26 schrieb Paolo Bonzini:
>> On 28/11/2014 12:46, Peter Lieven wrote:
>>>> I get:
>>>> Run operation 40000000 iterations 9.883958 s, 4046K operations/s, 247ns per coroutine
>>> Ok, understood, it "steals" the whole pool, right? Isn't that bad if we have more
>>> than one thread in need of a lot of coroutines?
>> Overall the algorithm is expected to adapt.  The N threads contribute to
>> the global release pool, so the pool will fill up N times faster than if
>> you had only one thread.  There can be some variance, which is why the
>> maximum size of the pool is twice the threshold (and probably could be
>> tuned better).
>>
>> Benchmarks are needed on real I/O too, of course, especially with high
>> queue depth.
> Yes, cool. The atomic operations are a bit tricky at first glance ;-)
>
> Question:
>  Why is the pool_size increment atomic, but the set to zero is not?
>  
> Idea:
>  If the release_pool is full, why not put the coroutine in the thread's alloc_pool instead of throwing it away? :-)
>
> Run operation 40000000 iterations 9.057805 s, 4416K operations/s, 226ns per coroutine
>
> diff --git a/qemu-coroutine.c b/qemu-coroutine.c
> index 6bee354..edea162 100644
> --- a/qemu-coroutine.c
> +++ b/qemu-coroutine.c
> @@ -25,8 +25,9 @@ enum {
>  
>  /** Free list to speed up creation */
>  static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
> -static unsigned int pool_size;
> +static unsigned int release_pool_size;
>  static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
> +static __thread unsigned int alloc_pool_size;
>  
>  /* The GPrivate is only used to invoke coroutine_pool_cleanup.  */
>  static void coroutine_pool_cleanup(void *value);
> @@ -39,12 +40,12 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>      if (CONFIG_COROUTINE_POOL) {
>          co = QSLIST_FIRST(&alloc_pool);
>          if (!co) {
> -            if (pool_size > POOL_BATCH_SIZE) {
> -                /* This is not exact; there could be a little skew between pool_size
> +            if (release_pool_size > POOL_BATCH_SIZE) {
> +                /* This is not exact; there could be a little skew between release_pool_size
>                   * and the actual size of alloc_pool.  But it is just a heuristic,
>                   * it does not need to be perfect.
>                   */
> -                pool_size = 0;
> +                alloc_pool_size = atomic_fetch_and(&release_pool_size, 0);
>                  QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
>                  co = QSLIST_FIRST(&alloc_pool);
>  
> @@ -53,6 +54,8 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>                   */
>                  g_private_set(&dummy_key, &dummy_key);
>              }
> +        } else {
> +            alloc_pool_size--;
>          }
>          if (co) {
>              QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
> @@ -71,10 +74,15 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>  static void coroutine_delete(Coroutine *co)
>  {
>      if (CONFIG_COROUTINE_POOL) {
> -        if (pool_size < POOL_BATCH_SIZE * 2) {
> +        if (release_pool_size < POOL_BATCH_SIZE * 2) {
>              co->caller = NULL;
>              QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
> -            atomic_inc(&pool_size);
> +            atomic_inc(&release_pool_size);
> +            return;
> +        } else if (alloc_pool_size < POOL_BATCH_SIZE) {
> +            co->caller = NULL;
> +            QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
> +            alloc_pool_size++;
>              return;
>          }
>      }

Signed-off-by: Peter Lieven <pl@kamp.de>
Peter Lieven Nov. 28, 2014, 1:17 p.m. UTC | #5
Am 28.11.2014 um 13:45 schrieb Paolo Bonzini:
>
> On 28/11/2014 13:39, Peter Lieven wrote:
>> Am 28.11.2014 um 13:26 schrieb Paolo Bonzini:
>>> On 28/11/2014 12:46, Peter Lieven wrote:
>>>>> I get:
>>>>> Run operation 40000000 iterations 9.883958 s, 4046K operations/s, 247ns per coroutine
>>>> Ok, understood, it "steals" the whole pool, right? Isn't that bad if we have more
>>>> than one thread in need of a lot of coroutines?
>>> Overall the algorithm is expected to adapt.  The N threads contribute to
>>> the global release pool, so the pool will fill up N times faster than if
>>> you had only one thread.  There can be some variance, which is why the
>>> maximum size of the pool is twice the threshold (and probably could be
>>> tuned better).
>>>
>>> Benchmarks are needed on real I/O too, of course, especially with high
>>> queue depth.
>> Yes, cool. The atomic operations are a bit tricky at first glance ;-)
>>
>> Question:
>>  Why is the pool_size increment atomic, but the set to zero is not?
> Because the set to zero is not a read-modify-write operation, it is
> always atomic.  It's just not sequentially consistent (see
> docs/atomics.txt for some info on what that means).
>
>> Idea:
>>  If the release_pool is full, why not put the coroutine in the thread's alloc_pool instead of throwing it away? :-)
> Because you can only waste 64 coroutines per thread.  But numbers cannot
> be sneezed at, so it's worth doing it as a separate patch.
>
>> Run operation 40000000 iterations 9.057805 s, 4416K operations/s, 226ns per coroutine
>>
>> diff --git a/qemu-coroutine.c b/qemu-coroutine.c
>> index 6bee354..edea162 100644
>> --- a/qemu-coroutine.c
>> +++ b/qemu-coroutine.c
>> @@ -25,8 +25,9 @@ enum {
>>  
>>  /** Free list to speed up creation */
>>  static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
>> -static unsigned int pool_size;
>> +static unsigned int release_pool_size;
>>  static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
>> +static __thread unsigned int alloc_pool_size;
>>  
>>  /* The GPrivate is only used to invoke coroutine_pool_cleanup.  */
>>  static void coroutine_pool_cleanup(void *value);
>> @@ -39,12 +40,12 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>>      if (CONFIG_COROUTINE_POOL) {
>>          co = QSLIST_FIRST(&alloc_pool);
>>          if (!co) {
>> -            if (pool_size > POOL_BATCH_SIZE) {
>> -                /* This is not exact; there could be a little skew between pool_size
>> +            if (release_pool_size > POOL_BATCH_SIZE) {
>> +                /* This is not exact; there could be a little skew between release_pool_size
>>                   * and the actual size of alloc_pool.  But it is just a heuristic,
>>                   * it does not need to be perfect.
>>                   */
>> -                pool_size = 0;
>> +                alloc_pool_size = atomic_fetch_and(&release_pool_size, 0);
>>                  QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
>>                  co = QSLIST_FIRST(&alloc_pool);
>>  
>> @@ -53,6 +54,8 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>>                   */
>>                  g_private_set(&dummy_key, &dummy_key);
>>              }
>> +        } else {
>> +            alloc_pool_size--;
>>          }
>>          if (co) {
>>              QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
>> @@ -71,10 +74,15 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>>  static void coroutine_delete(Coroutine *co)
>>  {
>>      if (CONFIG_COROUTINE_POOL) {
>> -        if (pool_size < POOL_BATCH_SIZE * 2) {
>> +        if (release_pool_size < POOL_BATCH_SIZE * 2) {
>>              co->caller = NULL;
>>              QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
>> -            atomic_inc(&pool_size);
>> +            atomic_inc(&release_pool_size);
>> +            return;
>> +        } else if (alloc_pool_size < POOL_BATCH_SIZE) {
>> +            co->caller = NULL;
>> +            QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
>> +            alloc_pool_size++;
>>              return;
>>          }
>>      }
>>
>>
>> Bug?:
>>  The release_pool is not cleaned up on termination, I think.
> That's not necessary, it is global.

I don't see where you iterate over release_pool and destroy all coroutines?

Maybe just add back the old destructor with s/pool/release_pool/g

Peter
Paolo Bonzini Nov. 28, 2014, 2:17 p.m. UTC | #6
On 28/11/2014 14:17, Peter Lieven wrote:
>>> The release_pool is not cleaned up on termination, I think.
>> That's not necessary, it is global.
> I don't see where you iterate over release_pool and destroy all coroutines?

The OS does that for us when we exit.

Paolo
Peter Lieven Nov. 28, 2014, 8:11 p.m. UTC | #7
Am 28.11.2014 um 15:17 schrieb Paolo Bonzini:
>
> On 28/11/2014 14:17, Peter Lieven wrote:
>>>> The release_pool is not cleaned up on termination, I think.
>>> That's not necessary, it is global.
>> I don't see where you iterate over release_pool and destroy all coroutines?
> The OS does that for us when we exit.

Sure, but isn't that considered bad practice?
Before this patch, the destructor freed the coroutines.

Peter
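
For reference, re-adding the old destructor with s/pool/release_pool/, as Peter suggests, would look roughly like this (a sketch assuming the shape of the pre-patch coroutine_pool_cleanup(), QEMU's QSLIST macros, and qemu_coroutine_delete() from coroutine_int.h):

static void coroutine_release_pool_cleanup(void)
{
    Coroutine *co;
    Coroutine *tmp;

    /* Walk the global release pool and free every pooled coroutine so a
     * leak checker does not flag them at exit. */
    QSLIST_FOREACH_SAFE(co, &release_pool, pool_next, tmp) {
        QSLIST_REMOVE_HEAD(&release_pool, pool_next);
        qemu_coroutine_delete(co);
    }
}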

Patch

diff --git a/qemu-coroutine.c b/qemu-coroutine.c
index 6bee354..edea162 100644
--- a/qemu-coroutine.c
+++ b/qemu-coroutine.c
@@ -25,8 +25,9 @@  enum {
 
 /** Free list to speed up creation */
 static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
-static unsigned int pool_size;
+static unsigned int release_pool_size;
 static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
+static __thread unsigned int alloc_pool_size;
 
 /* The GPrivate is only used to invoke coroutine_pool_cleanup.  */
 static void coroutine_pool_cleanup(void *value);
@@ -39,12 +40,12 @@  Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
     if (CONFIG_COROUTINE_POOL) {
         co = QSLIST_FIRST(&alloc_pool);
         if (!co) {
-            if (pool_size > POOL_BATCH_SIZE) {
-                /* This is not exact; there could be a little skew between pool_size
+            if (release_pool_size > POOL_BATCH_SIZE) {
+                /* This is not exact; there could be a little skew between release_pool_size
                  * and the actual size of alloc_pool.  But it is just a heuristic,
                  * it does not need to be perfect.
                  */
-                pool_size = 0;
+                alloc_pool_size = atomic_fetch_and(&release_pool_size, 0);
                 QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
                 co = QSLIST_FIRST(&alloc_pool);
 
@@ -53,6 +54,8 @@  Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
                  */
                 g_private_set(&dummy_key, &dummy_key);
             }
+        } else {
+            alloc_pool_size--;
         }
         if (co) {
             QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
@@ -71,10 +74,15 @@  Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
 static void coroutine_delete(Coroutine *co)
 {
     if (CONFIG_COROUTINE_POOL) {
-        if (pool_size < POOL_BATCH_SIZE * 2) {
+        if (release_pool_size < POOL_BATCH_SIZE * 2) {
             co->caller = NULL;
             QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
-            atomic_inc(&pool_size);
+            atomic_inc(&release_pool_size);
+            return;
+        } else if (alloc_pool_size < POOL_BATCH_SIZE) {
+            co->caller = NULL;
+            QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
+            alloc_pool_size++;
             return;
         }
     }