
[RFC] qht: Align sequence lock to cache line

Message ID 87shrkckfv.fsf@gmail.com
State New

Commit Message

Pranith Kumar Oct. 25, 2016, 7:12 p.m. UTC
Paolo Bonzini writes:

> On 25/10/2016 17:49, Pranith Kumar wrote:
>> But we are taking the seqlock of only the head bucket, while the
>> readers are reading hashes/pointers of the chained buckets.
>
> No, we aren't.  See qht_lookup__slowpath.


I don't see it. The reader takes the head bucket's seqlock in
qht_lookup__slowpath() and then iterates over the chained buckets in
qht_do_lookup(). The writer does the same: it takes the head bucket's
seqlock in qht_insert__locked().
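
For reference, this is roughly what the current reader path looks like
(condensed and renamed here purely for illustration):

    static void *lookup_current(struct qht_bucket *head, qht_lookup_func_t func,
                                const void *userp, uint32_t hash)
    {
        unsigned int version;
        void *ret;

        do {
            /* Only the head bucket's sequence number is sampled... */
            version = seqlock_read_begin(&head->sequence);
            /* ...yet qht_do_lookup() walks head, head->next, ... and reads
             * the hashes[] and pointers[] of every chained bucket. */
            ret = qht_do_lookup(head, func, userp, hash);
        } while (seqlock_read_retry(&head->sequence, version));
        return ret;
    }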

I've written a patch (see below) to take the per-bucket sequence locks.

>
> This patch:
>
> throughput in MT/s/thread:
> %update    base   patch  %change
> 0          8.07   13.33  +65%
> 10         7.10   8.90   +25%
> 20         6.34   7.02   +10%
> 30         5.48   6.11   +9.6%
> 40         4.90   5.46   +11.42%
>
>
> Just doubling the cache-line size:
>
> throughput in MT/s/thread:
> %update    base   patch  %change
> 0          8.07   4.47   -45% ?!?
> 10         7.10   9.82   +38%
> 20         6.34   8.13   +28%
> 30         5.48   7.13   +30%
> 40         4.90   6.45   +30%
>
> It seems to me that your machine has 128-byte cachelines.
>

Nope. It is just the regular 64-byte cache line.

$ getconf LEVEL1_DCACHE_LINESIZE
64

(The machine model is Xeon CPU E5-2620).
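
For comparison, the idea in the alignment RFC is along these lines (an
illustrative sketch of the bucket layout only, not the exact diff;
QHT_BUCKET_ALIGN is assumed to equal the cache-line size):

    /* Sketch: give the sequence lock (bumped by writers on every update)
     * its own cache line, away from the hashes/pointers that readers scan. */
    struct qht_bucket {
        QemuSpin lock;
        uint32_t hashes[QHT_BUCKET_ENTRIES];
        void *pointers[QHT_BUCKET_ENTRIES];
        struct qht_bucket *next;
        QemuSeqLock sequence QEMU_ALIGNED(QHT_BUCKET_ALIGN);
    } QEMU_ALIGNED(QHT_BUCKET_ALIGN);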


Take the per-bucket sequence locks instead of only the head bucket's
sequence lock.

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 util/qht.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

Comments

Paolo Bonzini Oct. 25, 2016, 8:02 p.m. UTC | #1
On 25/10/2016 21:12, Pranith Kumar wrote:
> 
> Paolo Bonzini writes:
> 
>> On 25/10/2016 17:49, Pranith Kumar wrote:
>>> But we are taking the seqlock of only the head bucket, while the
>>> readers are reading hashes/pointers of the chained buckets.
>>
>> No, we aren't.  See qht_lookup__slowpath.
> 
> 
> I don't see it. The reader takes the head bucket's seqlock in
> qht_lookup__slowpath() and then iterates over the chained buckets in
> qht_do_lookup(). The writer does the same: it takes the head bucket's
> seqlock in qht_insert__locked().

Uh, you're right.  Sorry I wasn't reading it correctly.

> I've written a patch (see below) to take the per-bucket sequence locks.

What's the performance like?

Paolo

>>
>> This patch:
>>
>> throughput in MT/s/thread:
>> %update    base   patch  %change
>> 0          8.07   13.33  +65%
>> 10         7.10   8.90   +25%
>> 20         6.34   7.02   +10%
>> 30         5.48   6.11   +9.6%
>> 40         4.90   5.46   +11.42%
>>
>>
>> Just doubling the cache-line size:
>>
>> throughput in MT/s/thread:
>> %update    base   patch  %change
>> 0          8.07   4.47   -45% ?!?
>> 10         7.10   9.82   +38%
>> 20         6.34   8.13   +28%
>> 30         5.48   7.13   +30%
>> 40         4.90   6.45   +30%
>>
>> It seems to me that your machine has 128-byte cachelines.
>>
> 
> Nope. It is just the regular 64-byte cache line.
> 
> $ getconf LEVEL1_DCACHE_LINESIZE
> 64
> 
> (The machine model is Xeon CPU E5-2620).
> 
> 
> Take the per-bucket sequence locks instead of only the head bucket's
> sequence lock.
> 
> Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
> ---
>  util/qht.c | 36 ++++++++++++++++++------------------
>  1 file changed, 18 insertions(+), 18 deletions(-)
> 
> diff --git a/util/qht.c b/util/qht.c
> index 4d82609..cfce5fc 100644
> --- a/util/qht.c
> +++ b/util/qht.c
> @@ -374,19 +374,19 @@ static void qht_bucket_reset__locked(struct qht_bucket *head)
>      struct qht_bucket *b = head;
>      int i;
>  
> -    seqlock_write_begin(&head->sequence);
>      do {
> +        seqlock_write_begin(&b->sequence);
>          for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
>              if (b->pointers[i] == NULL) {
> -                goto done;
> +                seqlock_write_end(&b->sequence);
> +                return;
>              }
>              atomic_set(&b->hashes[i], 0);
>              atomic_set(&b->pointers[i], NULL);
>          }
> +        seqlock_write_end(&b->sequence);
>          b = b->next;
>      } while (b);
> - done:
> -    seqlock_write_end(&head->sequence);
>  }
>  
>  /* call with all bucket locks held */
> @@ -446,6 +446,8 @@ void *qht_do_lookup(struct qht_bucket *head, qht_lookup_func_t func,
>      int i;
>  
>      do {
> +        void *q = NULL;
> +        unsigned int version = seqlock_read_begin(&b->sequence);
>          for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
>              if (atomic_read(&b->hashes[i]) == hash) {
>                  /* The pointer is dereferenced before seqlock_read_retry,
> @@ -455,11 +457,16 @@ void *qht_do_lookup(struct qht_bucket *head, qht_lookup_func_t func,
>                  void *p = atomic_rcu_read(&b->pointers[i]);
>  
>                  if (likely(p) && likely(func(p, userp))) {
> -                    return p;
> +                    q = p;
> +                    break;
>                  }
>              }
>          }
> -        b = atomic_rcu_read(&b->next);
> +        if (!q) {
> +            b = atomic_rcu_read(&b->next);
> +        } else if (!seqlock_read_retry(&b->sequence, version)) {
> +            return q;
> +        }
>      } while (b);
>  
>      return NULL;
> @@ -469,14 +476,7 @@ static __attribute__((noinline))
>  void *qht_lookup__slowpath(struct qht_bucket *b, qht_lookup_func_t func,
>                             const void *userp, uint32_t hash)
>  {
> -    unsigned int version;
> -    void *ret;
> -
> -    do {
> -        version = seqlock_read_begin(&b->sequence);
> -        ret = qht_do_lookup(b, func, userp, hash);
> -    } while (seqlock_read_retry(&b->sequence, version));
> -    return ret;
> +    return qht_do_lookup(b, func, userp, hash);
>  }
>  
>  void *qht_lookup(struct qht *ht, qht_lookup_func_t func, const void *userp,
> @@ -537,14 +537,14 @@ static bool qht_insert__locked(struct qht *ht, struct qht_map *map,
>  
>   found:
>      /* found an empty key: acquire the seqlock and write */
> -    seqlock_write_begin(&head->sequence);
> +    seqlock_write_begin(&b->sequence);
>      if (new) {
>          atomic_rcu_set(&prev->next, b);
>      }
>      /* smp_wmb() implicit in seqlock_write_begin.  */
>      atomic_set(&b->hashes[i], hash);
>      atomic_set(&b->pointers[i], p);
> -    seqlock_write_end(&head->sequence);
> +    seqlock_write_end(&b->sequence);
>      return true;
>  }
>  
> @@ -665,9 +665,9 @@ bool qht_remove__locked(struct qht_map *map, struct qht_bucket *head,
>              }
>              if (q == p) {
>                  qht_debug_assert(b->hashes[i] == hash);
> -                seqlock_write_begin(&head->sequence);
> +                seqlock_write_begin(&b->sequence);
>                  qht_bucket_remove_entry(b, i);
> -                seqlock_write_end(&head->sequence);
> +                seqlock_write_end(&b->sequence);
>                  return true;
>              }
>          }
>
Pranith Kumar Oct. 25, 2016, 8:35 p.m. UTC | #2
On Tue, Oct 25, 2016 at 4:02 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>
>> I've written a patch (see below) to take the per-bucket sequence locks.
>
> What's the performance like?
>

With only this patch applied, the perf numbers are similar to those of
the 128-byte cache-line alignment you suggested:

%update    throughput (MT/s/thread)
0          4
10         9.70
20         8.09
30         7.13
40         6.49

I am not sure why only the 100% reader case is so low. Applying the
seqlock cache-line alignment patch brings it back up to 13
MT/s/thread.
Emilio Cota Oct. 25, 2016, 8:45 p.m. UTC | #3
On Tue, Oct 25, 2016 at 16:35:48 -0400, Pranith Kumar wrote:
> On Tue, Oct 25, 2016 at 4:02 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> >
> >> I've written a patch (see below) to take the per-bucket sequence locks.
> >
> > What's the performance like?
> >
> 
> With only this patch applied, the perf numbers are similar to those of
> the 128-byte cache-line alignment you suggested.

That makes sense. Having a single seqlock per bucket is simple and fast;
note that bucket chains should be very short (we use good hashing and
automatic resize for this purpose).

		E.
Paolo Bonzini Oct. 25, 2016, 8:58 p.m. UTC | #4
On 25/10/2016 22:45, Emilio G. Cota wrote:
> On Tue, Oct 25, 2016 at 16:35:48 -0400, Pranith Kumar wrote:
>> On Tue, Oct 25, 2016 at 4:02 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>>>
>>>> I've written a patch (see below) to take the per-bucket sequence locks.
>>>
>>> What's the performance like?
>>>
>>
>> Applying only this patch, the perf numbers are similar to the 128
>> cache line alignment you suggested.
> 
> That makes sense. Having a single seqlock per bucket is simple and fast;
> note that bucket chains should be very short (we use good hashing and
> automatic resize for this purpose).

But why do we get so much worse performance in the 100% reader case?  (And
even more puzzling, why does Pranith's original patch improve
performance instead of causing more cache misses?)

Thanks,

Paolo

Patch

diff --git a/util/qht.c b/util/qht.c
index 4d82609..cfce5fc 100644
--- a/util/qht.c
+++ b/util/qht.c
@@ -374,19 +374,19 @@  static void qht_bucket_reset__locked(struct qht_bucket *head)
     struct qht_bucket *b = head;
     int i;
 
-    seqlock_write_begin(&head->sequence);
     do {
+        seqlock_write_begin(&b->sequence);
         for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
             if (b->pointers[i] == NULL) {
-                goto done;
+                seqlock_write_end(&b->sequence);
+                return;
             }
             atomic_set(&b->hashes[i], 0);
             atomic_set(&b->pointers[i], NULL);
         }
+        seqlock_write_end(&b->sequence);
         b = b->next;
     } while (b);
- done:
-    seqlock_write_end(&head->sequence);
 }
 
 /* call with all bucket locks held */
@@ -446,6 +446,8 @@  void *qht_do_lookup(struct qht_bucket *head, qht_lookup_func_t func,
     int i;
 
     do {
+        void *q = NULL;
+        unsigned int version = seqlock_read_begin(&b->sequence);
         for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
             if (atomic_read(&b->hashes[i]) == hash) {
                 /* The pointer is dereferenced before seqlock_read_retry,
@@ -455,11 +457,16 @@  void *qht_do_lookup(struct qht_bucket *head, qht_lookup_func_t func,
                 void *p = atomic_rcu_read(&b->pointers[i]);
 
                 if (likely(p) && likely(func(p, userp))) {
-                    return p;
+                    q = p;
+                    break;
                 }
             }
         }
-        b = atomic_rcu_read(&b->next);
+        if (!q) {
+            b = atomic_rcu_read(&b->next);
+        } else if (!seqlock_read_retry(&b->sequence, version)) {
+            return q;
+        }
     } while (b);
 
     return NULL;
@@ -469,14 +476,7 @@  static __attribute__((noinline))
 void *qht_lookup__slowpath(struct qht_bucket *b, qht_lookup_func_t func,
                            const void *userp, uint32_t hash)
 {
-    unsigned int version;
-    void *ret;
-
-    do {
-        version = seqlock_read_begin(&b->sequence);
-        ret = qht_do_lookup(b, func, userp, hash);
-    } while (seqlock_read_retry(&b->sequence, version));
-    return ret;
+    return qht_do_lookup(b, func, userp, hash);
 }
 
 void *qht_lookup(struct qht *ht, qht_lookup_func_t func, const void *userp,
@@ -537,14 +537,14 @@  static bool qht_insert__locked(struct qht *ht, struct qht_map *map,
 
  found:
     /* found an empty key: acquire the seqlock and write */
-    seqlock_write_begin(&head->sequence);
+    seqlock_write_begin(&b->sequence);
     if (new) {
         atomic_rcu_set(&prev->next, b);
     }
     /* smp_wmb() implicit in seqlock_write_begin.  */
     atomic_set(&b->hashes[i], hash);
     atomic_set(&b->pointers[i], p);
-    seqlock_write_end(&head->sequence);
+    seqlock_write_end(&b->sequence);
     return true;
 }
 
@@ -665,9 +665,9 @@  bool qht_remove__locked(struct qht_map *map, struct qht_bucket *head,
             }
             if (q == p) {
                 qht_debug_assert(b->hashes[i] == hash);
-                seqlock_write_begin(&head->sequence);
+                seqlock_write_begin(&b->sequence);
                 qht_bucket_remove_entry(b, i);
-                seqlock_write_end(&head->sequence);
+                seqlock_write_end(&b->sequence);
                 return true;
             }
         }