diff mbox series

[ovs-dev] seq: Make read of the current value atomic

Message ID 167991626686.4046542.5458159147307773259.stgit@ebuild.local
State Changes Requested
Headers show
Series [ovs-dev] seq: Make read of the current value atomic | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Eelco Chaudron March 27, 2023, 11:25 a.m. UTC
Make the read of the current seq->value atomic, i.e., not needing to
acquire the global mutex when reading it. On 64-bit systems, this
incurs no overhead, and it will avoid the mutex and potentially
a system call.

For incrementing the value followed by waking up the threads, we are
still taking the mutex, so the current behavior is not changing. The
seq_read() behavior is already defined as, "Returns seq's current
sequence number (which could change immediately)". So the change
should not impact the current behavior.

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 lib/ovs-rcu.c |    2 +-
 lib/seq.c     |   37 ++++++++++++++++---------------------
 lib/seq.h     |    1 -
 3 files changed, 17 insertions(+), 23 deletions(-)
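
For context, a seq object is typically used for race-free wakeups roughly as follows (an illustrative sketch based on the usage notes in lib/seq.h; 'object', 'change_seq', 'last_seqno', and process_object() are placeholders, not code from the patch):

    uint64_t seqno = seq_read(object->change_seq);

    if (seqno != last_seqno) {
        last_seqno = seqno;
        process_object(object);                /* React to the change.     */
    } else {
        seq_wait(object->change_seq, seqno);   /* Wake when the seq moves. */
        poll_block();                          /* Sleep until woken.       */
    }

This is why the commit message can argue that the returned number "could change immediately": callers already have to cope with a value that may be stale by the time they act on it.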

Comments

Mike Pattrick March 28, 2023, 5:32 p.m. UTC | #1
On Mon, Mar 27, 2023 at 7:25 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>
> Make the read of the current seq->value atomic, i.e., not needing to
> acquire the global mutex when reading it. On 64-bit systems, this
> incurs no overhead, and it will avoid the mutex and potentially
> a system call.
>
> For incrementing the value followed by waking up the threads, we are
> still taking the mutex, so the current behavior is not changing. The
> seq_read() behavior is already defined as, "Returns seq's current
> sequence number (which could change immediately)". So the change
> should not impact the current behavior.
>
> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> ---
>  lib/ovs-rcu.c |    2 +-
>  lib/seq.c     |   37 ++++++++++++++++---------------------
>  lib/seq.h     |    1 -
>  3 files changed, 17 insertions(+), 23 deletions(-)
>
> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
> index 946aa04d1..9e07d9bab 100644
> --- a/lib/ovs-rcu.c
> +++ b/lib/ovs-rcu.c
> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>      ovs_assert(!single_threaded());
>      perthread = ovsrcu_perthread_get();
>      if (!seq_try_lock()) {
> -        perthread->seqno = seq_read_protected(global_seqno);
> +        perthread->seqno = seq_read(global_seqno);
>          if (perthread->cbset) {
>              ovsrcu_flush_cbset__(perthread, true);
>          }
> diff --git a/lib/seq.c b/lib/seq.c
> index 99e5bf8bd..22646f3f8 100644
> --- a/lib/seq.c
> +++ b/lib/seq.c
> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>
>  /* A sequence number object. */
>  struct seq {
> -    uint64_t value OVS_GUARDED;
> +    atomic_uint64_t value;
>      struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>  };
>
> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>  struct seq * OVS_EXCLUDED(seq_mutex)
>  seq_create(void)
>  {
> +    uint64_t seq_value;
>      struct seq *seq;
>
>      seq_init();
> @@ -81,7 +82,8 @@ seq_create(void)
>      COVERAGE_INC(seq_change);
>
>      ovs_mutex_lock(&seq_mutex);

I think it's also possible to get rid of the mutex here. Operations
like modifying a bridge or interface can result in calling seq_create
dozens of times, which could potentially lead to contention in cases
when a lot of interfaces are constantly added or removed. I'm mainly
thinking about Kubernetes here.

Other than that, I found this patch to reduce locking on seq_mutex
pretty significantly with a userspace workload:

Before: 468727.2/sec
After: 229265.8/sec

The rest of this patch looks good to me, but what do you think about:

diff --git a/lib/seq.c b/lib/seq.c
index 22646f3f8..09fdccce5 100644
--- a/lib/seq.c
+++ b/lib/seq.c
@@ -57,7 +57,7 @@ struct seq_thread {

 static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;

-static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
+static atomic_uint64_t seq_next = 1;

 static pthread_key_t seq_thread_key;

@@ -81,11 +81,9 @@ seq_create(void)

     COVERAGE_INC(seq_change);

-    ovs_mutex_lock(&seq_mutex);
-    seq_value = seq_next++;
+    seq_value = atomic_count_inc64(&seq_next);
     atomic_store_relaxed(&seq->value, seq_value);
     hmap_init(&seq->waiters);
-    ovs_mutex_unlock(&seq_mutex);

     return seq;
 }
@@ -128,7 +126,7 @@ void
 seq_change_protected(struct seq *seq)
     OVS_REQUIRES(seq_mutex)
 {
-    uint64_t seq_value = seq_next++;
+    uint64_t seq_value = atomic_count_inc64(&seq_next);

     COVERAGE_INC(seq_change);
Jon Kohler March 30, 2023, 3:57 p.m. UTC | #2
> On Mar 28, 2023, at 1:32 PM, Mike Pattrick <mkp@redhat.com> wrote:
> 
> On Mon, Mar 27, 2023 at 7:25 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>> 
>> Make the read of the current seq->value atomic, i.e., not needing to
>> acquire the global mutex when reading it. On 64-bit systems, this
>> incurs no overhead, and it will avoid the mutex and potentially
>> a system call.
>> 
>> For incrementing the value followed by waking up the threads, we are
>> still taking the mutex, so the current behavior is not changing. The
>> seq_read() behavior is already defined as, "Returns seq's current
>> sequence number (which could change immediately)". So the change
>> should not impact the current behavior.
>> 
>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>> ---
>> lib/ovs-rcu.c |    2 +-
>> lib/seq.c     |   37 ++++++++++++++++---------------------
>> lib/seq.h     |    1 -
>> 3 files changed, 17 insertions(+), 23 deletions(-)
>> 
>> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
>> index 946aa04d1..9e07d9bab 100644
>> --- a/lib/ovs-rcu.c
>> +++ b/lib/ovs-rcu.c
>> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>>     ovs_assert(!single_threaded());
>>     perthread = ovsrcu_perthread_get();
>>     if (!seq_try_lock()) {
>> -        perthread->seqno = seq_read_protected(global_seqno);
>> +        perthread->seqno = seq_read(global_seqno);
>>         if (perthread->cbset) {
>>             ovsrcu_flush_cbset__(perthread, true);
>>         }
>> diff --git a/lib/seq.c b/lib/seq.c
>> index 99e5bf8bd..22646f3f8 100644
>> --- a/lib/seq.c
>> +++ b/lib/seq.c
>> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>> 
>> /* A sequence number object. */
>> struct seq {
>> -    uint64_t value OVS_GUARDED;
>> +    atomic_uint64_t value;
>>     struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>> };
>> 
>> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>> struct seq * OVS_EXCLUDED(seq_mutex)
>> seq_create(void)
>> {
>> +    uint64_t seq_value;
>>     struct seq *seq;
>> 
>>     seq_init();
>> @@ -81,7 +82,8 @@ seq_create(void)
>>     COVERAGE_INC(seq_change);
>> 
>>     ovs_mutex_lock(&seq_mutex);
> 
> I think it's also possible to get rid of the mutex here. Operations
> like modifying a bridge or interface can result in calling seq_create
> dozens of times, which could potentially lead to contention in cases
> when a lot of interfaces are constantly added or removed. I'm mainly
> thinking about kubernetes here.
> 
> Other than that, I found this patch to reduce locking on seq_mutex
> pretty significantly with a userspace workload:
> 
> Before: 468727.2/sec
> After: 229265.8/sec
> 
> The rest of this patch looks good to me, but what do you think about:
> 
> diff --git a/lib/seq.c b/lib/seq.c
> index 22646f3f8..09fdccce5 100644
> --- a/lib/seq.c
> +++ b/lib/seq.c
> @@ -57,7 +57,7 @@ struct seq_thread {
> 
> static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
> 
> -static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
> +static atomic_uint64_t seq_next = 1;
> 
> static pthread_key_t seq_thread_key;
> 
> @@ -81,11 +81,9 @@ seq_create(void)
> 
>     COVERAGE_INC(seq_change);
> 
> -    ovs_mutex_lock(&seq_mutex);
> -    seq_value = seq_next++;
> +    seq_value = atomic_count_inc64(&seq_next);
>     atomic_store_relaxed(&seq->value, seq_value);
>     hmap_init(&seq->waiters);
> -    ovs_mutex_unlock(&seq_mutex);
> 
>     return seq;
> }
> @@ -128,7 +126,7 @@ void
> seq_change_protected(struct seq *seq)
>     OVS_REQUIRES(seq_mutex)
> {
> -    uint64_t seq_value = seq_next++;
> +    uint64_t seq_value = atomic_count_inc64(&seq_next);
> 
>     COVERAGE_INC(seq_change);
> 

Mike - Seems like a good idea at first blush?

Eelco, 
This patch has perfect timing, as we’re currently chasing down some
overhead on large systems, e.g. quad-socket machines or newer chips with
many dozens or hundreds of processors per system. On these systems, we see
a lot of overhead from the revalidator threads coming online and
constantly competing for the mutex in seq_read(), seq_woke(), and
seq_change(). This leads to ovs-vswitchd being on-CPU constantly at
somewhere between 20-80% of one core as per top, with that time spent
contending for the lock and then futex'ing in and out of sleep as a
result.

Along these same lines of thinking, would it be possible to get rid of
this mutex in seq_change() too? Perhaps we could have some atomic val
that indicated if there was a waiter, and if so, then take the mutex
to deal with the wakeup, but if not, don't take the mutex at all?
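
A very rough sketch of that idea (illustrative only: 'has_waiters' is a hypothetical new field, and the waiter registration path would have to set it under the mutex and re-check the value afterwards to avoid missing a wakeup):

    void
    seq_change(struct seq *seq)
        OVS_EXCLUDED(seq_mutex)
    {
        bool waiters;

        atomic_read_relaxed(&seq->has_waiters, &waiters);
        if (!waiters) {
            /* Fast path: nobody is sleeping on this seq, so just publish
             * a new value without touching the global mutex. */
            uint64_t value = atomic_count_inc64(&seq_next);

            COVERAGE_INC(seq_change);
            atomic_store_explicit(&seq->value, value, memory_order_release);
            return;
        }

        /* Slow path: waiters present, keep the existing locked wakeup. */
        ovs_mutex_lock(&seq_mutex);
        seq_change_protected(seq);
        ovs_mutex_unlock(&seq_mutex);
    }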

That's just trying to optimize further; even with this patch there are
gains from easing contention on the mutex from all of the read calls, so
great work!

While I'm on the topic: the next point of contention specifically in
the revalidator case that I'm tracking is under nl_dump_next(), where
we contend for dump->mutex just to do a short-lived update to
dump->status. Perhaps that could be a spinlock? I'll start a separate
thread after I try that out.

Jon

> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Jon Kohler March 30, 2023, 8:07 p.m. UTC | #3
> On Mar 30, 2023, at 11:57 AM, Jon Kohler <jon@nutanix.com> wrote:
> 
> 
> 
>> On Mar 28, 2023, at 1:32 PM, Mike Pattrick <mkp@redhat.com> wrote:
>> 
>> On Mon, Mar 27, 2023 at 7:25 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>>> 
>>> Make the read of the current seq->value atomic, i.e., not needing to
>>> acquire the global mutex when reading it. On 64-bit systems, this
>>> incurs no overhead, and it will avoid the mutex and potentially
>>> a system call.
>>> 
>>> For incrementing the value followed by waking up the threads, we are
>>> still taking the mutex, so the current behavior is not changing. The
>>> seq_read() behavior is already defined as, "Returns seq's current
>>> sequence number (which could change immediately)". So the change
>>> should not impact the current behavior.
>>> 
>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>> ---
>>> lib/ovs-rcu.c |    2 +-
>>> lib/seq.c     |   37 ++++++++++++++++---------------------
>>> lib/seq.h     |    1 -
>>> 3 files changed, 17 insertions(+), 23 deletions(-)
>>> 
>>> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
>>> index 946aa04d1..9e07d9bab 100644
>>> --- a/lib/ovs-rcu.c
>>> +++ b/lib/ovs-rcu.c
>>> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>>>    ovs_assert(!single_threaded());
>>>    perthread = ovsrcu_perthread_get();
>>>    if (!seq_try_lock()) {
>>> -        perthread->seqno = seq_read_protected(global_seqno);
>>> +        perthread->seqno = seq_read(global_seqno);
>>>        if (perthread->cbset) {
>>>            ovsrcu_flush_cbset__(perthread, true);
>>>        }
>>> diff --git a/lib/seq.c b/lib/seq.c
>>> index 99e5bf8bd..22646f3f8 100644
>>> --- a/lib/seq.c
>>> +++ b/lib/seq.c
>>> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>>> 
>>> /* A sequence number object. */
>>> struct seq {
>>> -    uint64_t value OVS_GUARDED;
>>> +    atomic_uint64_t value;
>>>    struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>>> };
>>> 
>>> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>>> struct seq * OVS_EXCLUDED(seq_mutex)
>>> seq_create(void)
>>> {
>>> +    uint64_t seq_value;
>>>    struct seq *seq;
>>> 
>>>    seq_init();
>>> @@ -81,7 +82,8 @@ seq_create(void)
>>>    COVERAGE_INC(seq_change);
>>> 
>>>    ovs_mutex_lock(&seq_mutex);
>> 
>> I think it's also possible to get rid of the mutex here. Operations
>> like modifying a bridge or interface can result in calling seq_create
>> dozens of times, which could potentially lead to contention in cases
>> when a lot of interfaces are constantly added or removed. I'm mainly
>> thinking about kubernetes here.
>> 
>> Other than that, I found this patch to reduce locking on seq_mutex
>> pretty significantly with a userspace workload:
>> 
>> Before: 468727.2/sec
>> After: 229265.8/sec
>> 
>> The rest of this patch looks good to me, but what do you think about:
>> 
>> diff --git a/lib/seq.c b/lib/seq.c
>> index 22646f3f8..09fdccce5 100644
>> --- a/lib/seq.c
>> +++ b/lib/seq.c
>> @@ -57,7 +57,7 @@ struct seq_thread {
>> 
>> static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
>> 
>> -static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
>> +static atomic_uint64_t seq_next = 1;
>> 
>> static pthread_key_t seq_thread_key;
>> 
>> @@ -81,11 +81,9 @@ seq_create(void)
>> 
>>    COVERAGE_INC(seq_change);
>> 
>> -    ovs_mutex_lock(&seq_mutex);
>> -    seq_value = seq_next++;
>> +    seq_value = atomic_count_inc64(&seq_next);
>>    atomic_store_relaxed(&seq->value, seq_value);
>>    hmap_init(&seq->waiters);
>> -    ovs_mutex_unlock(&seq_mutex);
>> 
>>    return seq;
>> }
>> @@ -128,7 +126,7 @@ void
>> seq_change_protected(struct seq *seq)
>>    OVS_REQUIRES(seq_mutex)
>> {
>> -    uint64_t seq_value = seq_next++;
>> +    uint64_t seq_value = atomic_count_inc64(&seq_next);
>> 
>>    COVERAGE_INC(seq_change);
>> 
> 
> Mike - Seems like a good idea at first blush?
> 
> Eelco, 
> This patch has perfect timing, as we’re currently chasing down some
> overhead on large systems, e.g. quad socket or newer chips with many
> dozens/hundreds of processors per system. In these systems, we see
> a lot of overhead from the revalidator threads coming online and
> constantly competing for the mutex in seq_read(), seq_woke(), and
> seq_change(). This leads to ovs-vswitchd being on-CPU constantly at
> somewhere between 20-80% of one core as per top. This time comes from
> spending a ton of time contending for the lock, then just futex'ing in
> and out of sleep as a result.
> 
> Along these same lines of thinking, would it be possible to get rid of
> this mutex in seq_change() too? Perhaps we could have some atomic val
> that indicated if there was a waiter, and if so, then take the mutex
> to deal with the wakeup, but if not, don't take the mutex at all?
> 
> Thats just trying to further optimize, even with this patch there is
> gains by easing contention on the mutex from all of the read calls, so
> great work!
> 
> While I'm on the thought: The next point of contention specifically in
> the revalidator case that I'm tracking is under nl_dump_next() where
> we contend for dump->mutex just to do a short lived update to 
> dump->status. Perhaps that can be a spinlock? I'll start a separate 
> thread after I try that out.
> 
> Jon

Your change inspired me to dig a bit deeper. The revalidator_sweep()
call that the revalidator threads make ends up calling ovsrcu_quiesce()
in a loop, and we see quite a bit of contention there, with a lot of
futex sleep/wake.

With your change, we’ll get rid of the potential futex under seq_read(),
but we will still acquire the mutex in seq_change() as well as
potentially in ovsrcu_flush_cbset(). We could reduce the potential
locking from 3x to 2x with your change, and then to 1x per
ovsrcu_quiesce() with the following change.

Thoughts?

Happy to send this out as a separate change on top of yours if you’d like.

--- a/lib/ovs-rcu.c
+++ b/lib/ovs-rcu.c
@@ -155,10 +155,12 @@ ovsrcu_quiesce(void)
 
     perthread = ovsrcu_perthread_get();
     perthread->seqno = seq_read(global_seqno);
+    seq_lock();
     if (perthread->cbset) {
-        ovsrcu_flush_cbset(perthread);
+        ovsrcu_flush_cbset__(perthread, true);
     }
-    seq_change(global_seqno);
+    seq_change_protected(global_seqno);
+    seq_unlock();
 
     ovsrcu_quiesced();
 }
Eelco Chaudron March 31, 2023, 1:37 p.m. UTC | #4
On 28 Mar 2023, at 19:32, Mike Pattrick wrote:

> On Mon, Mar 27, 2023 at 7:25 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>>
>> Make the read of the current seq->value atomic, i.e., not needing to
>> acquire the global mutex when reading it. On 64-bit systems, this
>> incurs no overhead, and it will avoid the mutex and potentially
>> a system call.
>>
>> For incrementing the value followed by waking up the threads, we are
>> still taking the mutex, so the current behavior is not changing. The
>> seq_read() behavior is already defined as, "Returns seq's current
>> sequence number (which could change immediately)". So the change
>> should not impact the current behavior.
>>
>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>> ---
>>  lib/ovs-rcu.c |    2 +-
>>  lib/seq.c     |   37 ++++++++++++++++---------------------
>>  lib/seq.h     |    1 -
>>  3 files changed, 17 insertions(+), 23 deletions(-)
>>
>> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
>> index 946aa04d1..9e07d9bab 100644
>> --- a/lib/ovs-rcu.c
>> +++ b/lib/ovs-rcu.c
>> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>>      ovs_assert(!single_threaded());
>>      perthread = ovsrcu_perthread_get();
>>      if (!seq_try_lock()) {
>> -        perthread->seqno = seq_read_protected(global_seqno);
>> +        perthread->seqno = seq_read(global_seqno);
>>          if (perthread->cbset) {
>>              ovsrcu_flush_cbset__(perthread, true);
>>          }
>> diff --git a/lib/seq.c b/lib/seq.c
>> index 99e5bf8bd..22646f3f8 100644
>> --- a/lib/seq.c
>> +++ b/lib/seq.c
>> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>>
>>  /* A sequence number object. */
>>  struct seq {
>> -    uint64_t value OVS_GUARDED;
>> +    atomic_uint64_t value;
>>      struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>>  };
>>
>> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>>  struct seq * OVS_EXCLUDED(seq_mutex)
>>  seq_create(void)
>>  {
>> +    uint64_t seq_value;
>>      struct seq *seq;
>>
>>      seq_init();
>> @@ -81,7 +82,8 @@ seq_create(void)
>>      COVERAGE_INC(seq_change);
>>
>>      ovs_mutex_lock(&seq_mutex);
>
> I think it's also possible to get rid of the mutex here. Operations
> like modifying a bridge or interface can result in calling seq_create
> dozens of times, which could potentially lead to contention in cases
> when a lot of interfaces are constantly added or removed. I'm mainly
> thinking about kubernetes here.
>
> Other than that, I found this patch to reduce locking on seq_mutex
> pretty significantly with a userspace workload:
>
> Before: 468727.2/sec
> After: 229265.8/sec

Yes, I just counted syscalls, and I noticed a huge drop in futex calls, especially if you have more PMD threads.

> The rest of this patch looks good to me, but what do you think about:

At first glance it looks good to me; however, I'm not sure seq_create() is called often enough to see any gains.
But removing a mutex from a function sounds good to me, so I would suggest submitting a patch.

> diff --git a/lib/seq.c b/lib/seq.c
> index 22646f3f8..09fdccce5 100644
> --- a/lib/seq.c
> +++ b/lib/seq.c
> @@ -57,7 +57,7 @@ struct seq_thread {
>
>  static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
>
> -static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
> +static atomic_uint64_t seq_next = 1;

I think we can keep this and use the OVS_EXCLUDED(seq_mutex) macro in the function below.

>  static pthread_key_t seq_thread_key;
>
> @@ -81,11 +81,9 @@ seq_create(void)
>
>      COVERAGE_INC(seq_change);
>
> -    ovs_mutex_lock(&seq_mutex);
> -    seq_value = seq_next++;
> +    seq_value = atomic_count_inc64(&seq_next);
>      atomic_store_relaxed(&seq->value, seq_value);
>      hmap_init(&seq->waiters);
> -    ovs_mutex_unlock(&seq_mutex);
>
>      return seq;
>  }
> @@ -128,7 +126,7 @@ void
>  seq_change_protected(struct seq *seq)
>      OVS_REQUIRES(seq_mutex)
>  {
> -    uint64_t seq_value = seq_next++;
> +    uint64_t seq_value = atomic_count_inc64(&seq_next);
>
>      COVERAGE_INC(seq_change);
Eelco Chaudron March 31, 2023, 2:45 p.m. UTC | #5
On 30 Mar 2023, at 17:57, Jon Kohler wrote:

>> On Mar 28, 2023, at 1:32 PM, Mike Pattrick <mkp@redhat.com> wrote:
>>
>> On Mon, Mar 27, 2023 at 7:25 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>>>
>>> Make the read of the current seq->value atomic, i.e., not needing to
>>> acquire the global mutex when reading it. On 64-bit systems, this
>>> incurs no overhead, and it will avoid the mutex and potentially
>>> a system call.
>>>
>>> For incrementing the value followed by waking up the threads, we are
>>> still taking the mutex, so the current behavior is not changing. The
>>> seq_read() behavior is already defined as, "Returns seq's current
>>> sequence number (which could change immediately)". So the change
>>> should not impact the current behavior.
>>>
>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>> ---
>>> lib/ovs-rcu.c |    2 +-
>>> lib/seq.c     |   37 ++++++++++++++++---------------------
>>> lib/seq.h     |    1 -
>>> 3 files changed, 17 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
>>> index 946aa04d1..9e07d9bab 100644
>>> --- a/lib/ovs-rcu.c
>>> +++ b/lib/ovs-rcu.c
>>> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>>>     ovs_assert(!single_threaded());
>>>     perthread = ovsrcu_perthread_get();
>>>     if (!seq_try_lock()) {
>>> -        perthread->seqno = seq_read_protected(global_seqno);
>>> +        perthread->seqno = seq_read(global_seqno);
>>>         if (perthread->cbset) {
>>>             ovsrcu_flush_cbset__(perthread, true);
>>>         }
>>> diff --git a/lib/seq.c b/lib/seq.c
>>> index 99e5bf8bd..22646f3f8 100644
>>> --- a/lib/seq.c
>>> +++ b/lib/seq.c
>>> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>>>
>>> /* A sequence number object. */
>>> struct seq {
>>> -    uint64_t value OVS_GUARDED;
>>> +    atomic_uint64_t value;
>>>     struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>>> };
>>>
>>> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>>> struct seq * OVS_EXCLUDED(seq_mutex)
>>> seq_create(void)
>>> {
>>> +    uint64_t seq_value;
>>>     struct seq *seq;
>>>
>>>     seq_init();
>>> @@ -81,7 +82,8 @@ seq_create(void)
>>>     COVERAGE_INC(seq_change);
>>>
>>>     ovs_mutex_lock(&seq_mutex);
>>
>> I think it's also possible to get rid of the mutex here. Operations
>> like modifying a bridge or interface can result in calling seq_create
>> dozens of times, which could potentially lead to contention in cases
>> when a lot of interfaces are constantly added or removed. I'm mainly
>> thinking about kubernetes here.
>>
>> Other than that, I found this patch to reduce locking on seq_mutex
>> pretty significantly with a userspace workload:
>>
>> Before: 468727.2/sec
>> After: 229265.8/sec
>>
>> The rest of this patch looks good to me, but what do you think about:
>>
>> diff --git a/lib/seq.c b/lib/seq.c
>> index 22646f3f8..09fdccce5 100644
>> --- a/lib/seq.c
>> +++ b/lib/seq.c
>> @@ -57,7 +57,7 @@ struct seq_thread {
>>
>> static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
>>
>> -static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
>> +static atomic_uint64_t seq_next = 1;
>>
>> static pthread_key_t seq_thread_key;
>>
>> @@ -81,11 +81,9 @@ seq_create(void)
>>
>>     COVERAGE_INC(seq_change);
>>
>> -    ovs_mutex_lock(&seq_mutex);
>> -    seq_value = seq_next++;
>> +    seq_value = atomic_count_inc64(&seq_next);
>>     atomic_store_relaxed(&seq->value, seq_value);
>>     hmap_init(&seq->waiters);
>> -    ovs_mutex_unlock(&seq_mutex);
>>
>>     return seq;
>> }
>> @@ -128,7 +126,7 @@ void
>> seq_change_protected(struct seq *seq)
>>     OVS_REQUIRES(seq_mutex)
>> {
>> -    uint64_t seq_value = seq_next++;
>> +    uint64_t seq_value = atomic_count_inc64(&seq_next);
>>
>>     COVERAGE_INC(seq_change);
>>
>
> Mike - Seems like a good idea at first blush?
>
> Eelco,
> This patch has perfect timing, as we’re currently chasing down some
> overhead on large systems, e.g. quad socket or newer chips with many
> dozens/hundreds of processors per system. In these systems, we see
> a lot of overhead from the revalidator threads coming online and
> constantly competing for the mutex in seq_read(), seq_woke(), and
> seq_change(). This leads to ovs-vswitchd being on-CPU constantly at
> somewhere between 20-80% of one core as per top. This time comes from
> spending a ton of time contending for the lock, then just futex'ing in
> and out of sleep as a result.
>
> Along these same lines of thinking, would it be possible to get rid of
> this mutex in seq_change() too? Perhaps we could have some atomic val
> that indicated if there was a waiter, and if so, then take the mutex
> to deal with the wakeup, but if not, don't take the mutex at all?

I’m not an expert on the seq code, but from a quick look, the mutex is there to protect seq->waiters. Just having a waiter count outside of the mutex will probably not work, as there is still a race condition when adding a new waiter.

Maybe you could work on a patch and see if there are any corner cases with this idea…

> Thats just trying to further optimize, even with this patch there is
> gains by easing contention on the mutex from all of the read calls, so
> great work!
>
> While I'm on the thought: The next point of contention specifically in
> the revalidator case that I'm tracking is under nl_dump_next() where
> we contend for dump->mutex just to do a short lived update to
> dump->status. Perhaps that can be a spinlock? I'll start a separate
> thread after I try that out.

I do not like spinlocks in userspace, as they could cause longer lock hold times when the thread is scheduled off the CPU.

From the code, it looks like we might only need dump->mutex to protect nl_dump_refill() and the other nl_* operations, not the actual dump->status updates. With some puzzling, we could probably update it atomically with the atomic_compare_exchange functions.
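
A rough sketch of what such an atomic status update might look like (purely illustrative: it assumes 'status' were turned into an atomic int, that keeping only the first non-zero status matches what nl_dump_next() actually needs, and nl_dump_record_status() is a hypothetical helper):

    /* Record 'error' in 'dump' unless a status has already been set. */
    static void
    nl_dump_record_status(struct nl_dump *dump, int error)
    {
        int expected = 0;

        /* Only the 0 -> 'error' transition succeeds; later calls become
         * no-ops, mirroring a "first status wins" policy. */
        atomic_compare_exchange_strong(&dump->status, &expected, error);
    }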

//Eelco
Mike Pattrick March 31, 2023, 3:36 p.m. UTC | #6
On Fri, Mar 31, 2023 at 10:45 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>
>
>
> On 30 Mar 2023, at 17:57, Jon Kohler wrote:
>
> >> On Mar 28, 2023, at 1:32 PM, Mike Pattrick <mkp@redhat.com> wrote:
> >>
> >> On Mon, Mar 27, 2023 at 7:25 AM Eelco Chaudron <echaudro@redhat.com> wrote:
> >>>
> >>> Make the read of the current seq->value atomic, i.e., not needing to
> >>> acquire the global mutex when reading it. On 64-bit systems, this
> >>> incurs no overhead, and it will avoid the mutex and potentially
> >>> a system call.
> >>>
> >>> For incrementing the value followed by waking up the threads, we are
> >>> still taking the mutex, so the current behavior is not changing. The
> >>> seq_read() behavior is already defined as, "Returns seq's current
> >>> sequence number (which could change immediately)". So the change
> >>> should not impact the current behavior.
> >>>
> >>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>

I think after much discussion we've identified some additional areas
that this work could be extended to, but there's nothing wrong with
this patch as is.

Acked-by: Mike Pattrick <mkp@redhat.com>
Ilya Maximets May 3, 2023, 10:55 p.m. UTC | #7
On 3/27/23 13:25, Eelco Chaudron wrote:
> Make the read of the current seq->value atomic, i.e., not needing to
> acquire the global mutex when reading it. On 64-bit systems, this
> incurs no overhead, and it will avoid the mutex and potentially
> a system call.
> 
> For incrementing the value followed by waking up the threads, we are
> still taking the mutex, so the current behavior is not changing. The
> seq_read() behavior is already defined as, "Returns seq's current
> sequence number (which could change immediately)". So the change
> should not impact the current behavior.
> 
> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> ---
>  lib/ovs-rcu.c |    2 +-
>  lib/seq.c     |   37 ++++++++++++++++---------------------
>  lib/seq.h     |    1 -
>  3 files changed, 17 insertions(+), 23 deletions(-)
> 
> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
> index 946aa04d1..9e07d9bab 100644
> --- a/lib/ovs-rcu.c
> +++ b/lib/ovs-rcu.c
> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>      ovs_assert(!single_threaded());
>      perthread = ovsrcu_perthread_get();
>      if (!seq_try_lock()) {
> -        perthread->seqno = seq_read_protected(global_seqno);
> +        perthread->seqno = seq_read(global_seqno);
>          if (perthread->cbset) {
>              ovsrcu_flush_cbset__(perthread, true);
>          }
> diff --git a/lib/seq.c b/lib/seq.c
> index 99e5bf8bd..22646f3f8 100644
> --- a/lib/seq.c
> +++ b/lib/seq.c
> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>  
>  /* A sequence number object. */
>  struct seq {
> -    uint64_t value OVS_GUARDED;
> +    atomic_uint64_t value;
>      struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>  };
>  
> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>  struct seq * OVS_EXCLUDED(seq_mutex)
>  seq_create(void)
>  {
> +    uint64_t seq_value;
>      struct seq *seq;
>  
>      seq_init();
> @@ -81,7 +82,8 @@ seq_create(void)
>      COVERAGE_INC(seq_change);
>  
>      ovs_mutex_lock(&seq_mutex);
> -    seq->value = seq_next++;
> +    seq_value = seq_next++;
> +    atomic_store_relaxed(&seq->value, seq_value);
>      hmap_init(&seq->waiters);
>      ovs_mutex_unlock(&seq_mutex);
>  
> @@ -126,9 +128,11 @@ void
>  seq_change_protected(struct seq *seq)
>      OVS_REQUIRES(seq_mutex)
>  {
> +    uint64_t seq_value = seq_next++;
> +
>      COVERAGE_INC(seq_change);
>  
> -    seq->value = seq_next++;
> +    atomic_store_relaxed(&seq->value, seq_value);
>      seq_wake_waiters(seq);
>  }
>  
> @@ -143,18 +147,6 @@ seq_change(struct seq *seq)
>      ovs_mutex_unlock(&seq_mutex);
>  }
>  
> -/* Returns 'seq''s current sequence number (which could change immediately).
> - *
> - * seq_read() and seq_wait() can be used together to yield a race-free wakeup
> - * when an object changes, even without an ability to lock the object.  See
> - * Usage in seq.h for details. */
> -uint64_t
> -seq_read_protected(const struct seq *seq)
> -    OVS_REQUIRES(seq_mutex)
> -{
> -    return seq->value;
> -}
> -
>  /* Returns 'seq''s current sequence number (which could change immediately).
>   *
>   * seq_read() and seq_wait() can be used together to yield a race-free wakeup
> @@ -162,14 +154,15 @@ seq_read_protected(const struct seq *seq)
>   * Usage in seq.h for details. */
>  uint64_t
>  seq_read(const struct seq *seq)
> -    OVS_EXCLUDED(seq_mutex)
>  {
>      uint64_t value;
>  
> -    ovs_mutex_lock(&seq_mutex);
> -    value = seq_read_protected(seq);
> -    ovs_mutex_unlock(&seq_mutex);
> -
> +    /* We use atomic_read(), memory_order_seq_cst, to simulate the full
> +     * memory barrier you would get with locking and unlocking the global
> +     * mutex.

Hi, Eelco.   I'm not sure this is actually correct.

We're performing memory_order_seq_cst operation on the value.

Sequential consistency means just acquire-release semantics in our case, since
we're pairing with non-seq-cst operations.  And acquire-release semantics works
between acquire loads and release stores on the same atomic.  But all the loads
on the value we have with this change are relaxed.  And the reader doesn't take
the mutex.  Meaning there is no acquire-release pair that can synchronize the
memory between threads.  memory_order_seq_cst read on its own can't really
guarantee any synchronization.

Also, the documentation for the sequence number library explicitly says that
"""
 seq_change() synchronizes with seq_read() and             
 seq_wait() on the same variable in release-acquire fashion.  That             
 is, all effects of the memory accesses performed by a thread prior            
 to seq_change() are visible to the threads returning from                     
 seq_read() or seq_wait() observing that change.
"""

And I don't think that's the case with this change anymore.
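
To make the concern concrete, a small illustration (not code from the patch; 'obj', 'state', and use() are made up):

    /* Writer thread. */
    obj->state = new_state;        /* Plain store.                           */
    seq_change(obj->change_seq);   /* With the patch, a relaxed value store. */

    /* Reader thread. */
    if (seq_read(obj->change_seq) != last_seqno) {
        use(obj->state);           /* Without a release store paired with an
                                    * acquire load, this may still observe
                                    * the old 'state'. */
    }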

Best regards, Ilya Maximets.
Eelco Chaudron May 15, 2023, 2:24 p.m. UTC | #8
On 4 May 2023, at 0:55, Ilya Maximets wrote:

> On 3/27/23 13:25, Eelco Chaudron wrote:
>> Make the read of the current seq->value atomic, i.e., not needing to
>> acquire the global mutex when reading it. On 64-bit systems, this
>> incurs no overhead, and it will avoid the mutex and potentially
>> a system call.
>>
>> For incrementing the value followed by waking up the threads, we are
>> still taking the mutex, so the current behavior is not changing. The
>> seq_read() behavior is already defined as, "Returns seq's current
>> sequence number (which could change immediately)". So the change
>> should not impact the current behavior.
>>
>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>> ---
>>  lib/ovs-rcu.c |    2 +-
>>  lib/seq.c     |   37 ++++++++++++++++---------------------
>>  lib/seq.h     |    1 -
>>  3 files changed, 17 insertions(+), 23 deletions(-)
>>
>> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
>> index 946aa04d1..9e07d9bab 100644
>> --- a/lib/ovs-rcu.c
>> +++ b/lib/ovs-rcu.c
>> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>>      ovs_assert(!single_threaded());
>>      perthread = ovsrcu_perthread_get();
>>      if (!seq_try_lock()) {
>> -        perthread->seqno = seq_read_protected(global_seqno);
>> +        perthread->seqno = seq_read(global_seqno);
>>          if (perthread->cbset) {
>>              ovsrcu_flush_cbset__(perthread, true);
>>          }
>> diff --git a/lib/seq.c b/lib/seq.c
>> index 99e5bf8bd..22646f3f8 100644
>> --- a/lib/seq.c
>> +++ b/lib/seq.c
>> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>>
>>  /* A sequence number object. */
>>  struct seq {
>> -    uint64_t value OVS_GUARDED;
>> +    atomic_uint64_t value;
>>      struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>>  };
>>
>> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>>  struct seq * OVS_EXCLUDED(seq_mutex)
>>  seq_create(void)
>>  {
>> +    uint64_t seq_value;
>>      struct seq *seq;
>>
>>      seq_init();
>> @@ -81,7 +82,8 @@ seq_create(void)
>>      COVERAGE_INC(seq_change);
>>
>>      ovs_mutex_lock(&seq_mutex);
>> -    seq->value = seq_next++;
>> +    seq_value = seq_next++;
>> +    atomic_store_relaxed(&seq->value, seq_value);
>>      hmap_init(&seq->waiters);
>>      ovs_mutex_unlock(&seq_mutex);
>>
>> @@ -126,9 +128,11 @@ void
>>  seq_change_protected(struct seq *seq)
>>      OVS_REQUIRES(seq_mutex)
>>  {
>> +    uint64_t seq_value = seq_next++;
>> +
>>      COVERAGE_INC(seq_change);
>>
>> -    seq->value = seq_next++;
>> +    atomic_store_relaxed(&seq->value, seq_value);
>>      seq_wake_waiters(seq);
>>  }
>>
>> @@ -143,18 +147,6 @@ seq_change(struct seq *seq)
>>      ovs_mutex_unlock(&seq_mutex);
>>  }
>>
>> -/* Returns 'seq''s current sequence number (which could change immediately).
>> - *
>> - * seq_read() and seq_wait() can be used together to yield a race-free wakeup
>> - * when an object changes, even without an ability to lock the object.  See
>> - * Usage in seq.h for details. */
>> -uint64_t
>> -seq_read_protected(const struct seq *seq)
>> -    OVS_REQUIRES(seq_mutex)
>> -{
>> -    return seq->value;
>> -}
>> -
>>  /* Returns 'seq''s current sequence number (which could change immediately).
>>   *
>>   * seq_read() and seq_wait() can be used together to yield a race-free wakeup
>> @@ -162,14 +154,15 @@ seq_read_protected(const struct seq *seq)
>>   * Usage in seq.h for details. */
>>  uint64_t
>>  seq_read(const struct seq *seq)
>> -    OVS_EXCLUDED(seq_mutex)
>>  {
>>      uint64_t value;
>>
>> -    ovs_mutex_lock(&seq_mutex);
>> -    value = seq_read_protected(seq);
>> -    ovs_mutex_unlock(&seq_mutex);
>> -
>> +    /* We use atomic_read(), memory_order_seq_cst, to simulate the full
>> +     * memory barrier you would get with locking and unlocking the global
>> +     * mutex.
>
> Hi, Eelco.   I'm not sure this is actually correct.
>
> We're performing memory_order_seq_cst operation on the value.
>
> Sequential consistency means just acquire-release semantics in our case, since
> we're pairing with non-seq-cst operations.  And acquire-release semantics works
> between acquire loads and release stores on the same atomic.  But all the loads
> on the value we have with this change are relaxed.  And the reader doesn't take
> the mutex.  Meaning there is no acquire-release pair that can synchronize the
> memory between threads.  memory_order_seq_cst read on its own can't really
> guarantee any synchronization.
>
> Also, the documentation for the sequence number library explicitly says that
> """
>  seq_change() synchronizes with seq_read() and
>  seq_wait() on the same variable in release-acquire fashion.  That
>  is, all effects of the memory accesses performed by a thread prior
>  to seq_change() are visible to the threads returning from
>  seq_read() or seq_wait() observing that change.
> """
>
> And I don't think that's the case with this change anymore.

It has been more than a year since I did this patch, so I can’t really remember my reasoning for doing it this way and thinking it was fine :)

I think it might have been the following:

1) Before increasing the value in seq_change(), there is a full memory barrier due to taking the lock.
2a) Now, if we read the atomic seq value in seq_read() and it is the new value incremented in seq_change() above, the memory should be in sync.
2b) If we read it before it is increased, it is the old value, and it should not matter whether the read has happened or not.

Maybe I missed something obvious; if so, let me know.


Thanks,

Eelco
Ilya Maximets May 15, 2023, 3:47 p.m. UTC | #9
On 5/15/23 16:24, Eelco Chaudron wrote:
> 
> 
> On 4 May 2023, at 0:55, Ilya Maximets wrote:
> 
>> On 3/27/23 13:25, Eelco Chaudron wrote:
>>> Make the read of the current seq->value atomic, i.e., not needing to
>>> acquire the global mutex when reading it. On 64-bit systems, this
>>> incurs no overhead, and it will avoid the mutex and potentially
>>> a system call.
>>>
>>> For incrementing the value followed by waking up the threads, we are
>>> still taking the mutex, so the current behavior is not changing. The
>>> seq_read() behavior is already defined as, "Returns seq's current
>>> sequence number (which could change immediately)". So the change
>>> should not impact the current behavior.
>>>
>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>> ---
>>>  lib/ovs-rcu.c |    2 +-
>>>  lib/seq.c     |   37 ++++++++++++++++---------------------
>>>  lib/seq.h     |    1 -
>>>  3 files changed, 17 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
>>> index 946aa04d1..9e07d9bab 100644
>>> --- a/lib/ovs-rcu.c
>>> +++ b/lib/ovs-rcu.c
>>> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>>>      ovs_assert(!single_threaded());
>>>      perthread = ovsrcu_perthread_get();
>>>      if (!seq_try_lock()) {
>>> -        perthread->seqno = seq_read_protected(global_seqno);
>>> +        perthread->seqno = seq_read(global_seqno);
>>>          if (perthread->cbset) {
>>>              ovsrcu_flush_cbset__(perthread, true);
>>>          }
>>> diff --git a/lib/seq.c b/lib/seq.c
>>> index 99e5bf8bd..22646f3f8 100644
>>> --- a/lib/seq.c
>>> +++ b/lib/seq.c
>>> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>>>
>>>  /* A sequence number object. */
>>>  struct seq {
>>> -    uint64_t value OVS_GUARDED;
>>> +    atomic_uint64_t value;
>>>      struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>>>  };
>>>
>>> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>>>  struct seq * OVS_EXCLUDED(seq_mutex)
>>>  seq_create(void)
>>>  {
>>> +    uint64_t seq_value;
>>>      struct seq *seq;
>>>
>>>      seq_init();
>>> @@ -81,7 +82,8 @@ seq_create(void)
>>>      COVERAGE_INC(seq_change);
>>>
>>>      ovs_mutex_lock(&seq_mutex);
>>> -    seq->value = seq_next++;
>>> +    seq_value = seq_next++;
>>> +    atomic_store_relaxed(&seq->value, seq_value);
>>>      hmap_init(&seq->waiters);
>>>      ovs_mutex_unlock(&seq_mutex);
>>>
>>> @@ -126,9 +128,11 @@ void
>>>  seq_change_protected(struct seq *seq)
>>>      OVS_REQUIRES(seq_mutex)
>>>  {
>>> +    uint64_t seq_value = seq_next++;
>>> +
>>>      COVERAGE_INC(seq_change);
>>>
>>> -    seq->value = seq_next++;
>>> +    atomic_store_relaxed(&seq->value, seq_value);
>>>      seq_wake_waiters(seq);
>>>  }
>>>
>>> @@ -143,18 +147,6 @@ seq_change(struct seq *seq)
>>>      ovs_mutex_unlock(&seq_mutex);
>>>  }
>>>
>>> -/* Returns 'seq''s current sequence number (which could change immediately).
>>> - *
>>> - * seq_read() and seq_wait() can be used together to yield a race-free wakeup
>>> - * when an object changes, even without an ability to lock the object.  See
>>> - * Usage in seq.h for details. */
>>> -uint64_t
>>> -seq_read_protected(const struct seq *seq)
>>> -    OVS_REQUIRES(seq_mutex)
>>> -{
>>> -    return seq->value;
>>> -}
>>> -
>>>  /* Returns 'seq''s current sequence number (which could change immediately).
>>>   *
>>>   * seq_read() and seq_wait() can be used together to yield a race-free wakeup
>>> @@ -162,14 +154,15 @@ seq_read_protected(const struct seq *seq)
>>>   * Usage in seq.h for details. */
>>>  uint64_t
>>>  seq_read(const struct seq *seq)
>>> -    OVS_EXCLUDED(seq_mutex)
>>>  {
>>>      uint64_t value;
>>>
>>> -    ovs_mutex_lock(&seq_mutex);
>>> -    value = seq_read_protected(seq);
>>> -    ovs_mutex_unlock(&seq_mutex);
>>> -
>>> +    /* We use atomic_read(), memory_order_seq_cst, to simulate the full
>>> +     * memory barrier you would get with locking and unlocking the global
>>> +     * mutex.
>>
>> Hi, Eelco.   I'm not sure this is actually correct.
>>
>> We're performing memory_order_seq_cst operation on the value.
>>
>> Sequential consistency means just acquire-release semantics in our case, since
>> we're pairing with non-seq-cst operations.  And acquire-release semantics works
>> between acquire loads and release stores on the same atomic.  But all the loads
>> on the value we have with this change are relaxed.  And the reader doesn't take
>> the mutex.  Meaning there is no acquire-release pair that can synchronize the
>> memory between threads.  memory_order_seq_cst read on its own can't really
>> guarantee any synchronization.
>>
>> Also, the documentation for the sequence number library explicitly says that
>> """
>>  seq_change() synchronizes with seq_read() and
>>  seq_wait() on the same variable in release-acquire fashion.  That
>>  is, all effects of the memory accesses performed by a thread prior
>>  to seq_change() are visible to the threads returning from
>>  seq_read() or seq_wait() observing that change.
>> """
>>
>> And I don't think that's the case with this change anymore.
> 
> It was more than a year ago I did this patch, so can’t really remember my reasoning for doing it this way, and thinking it’s fine :)
> 
> I think it might have been the following:
> 
> 1) Before increasing the value in seq_change() there is a full memory barrier due to taking the lock
> 2a) Now if we read the atomic seq in seq_read() and this new value is incremented in the seq_read above, the memory should be in sync
> 2b) If we read it before we increase it, it’s the old value and it should not matter if the read has happened or not.
> 
> Maybe I miss something obvious, if so let me know.

The problem here is not the value of the sequence number, but the
fact that the pair seq_change() + seq_read() is documented as an
acquire-release synchronization pair, meaning that these functions
have to provide memory synchronization on unrelated memory locations.
And we don't have an acquire-release pair of operations that would
ensure that, IIUC.  It might work with current implementations on
well-known architectures today, but I don't think any of these
operations are required to always perform full memory barriers
regardless of memory location.

Might be fixed by using release semantics for stores though, I guess.

Best regards, Ilya Maximets.
Eelco Chaudron May 16, 2023, 8:20 a.m. UTC | #10
On 15 May 2023, at 17:47, Ilya Maximets wrote:

> On 5/15/23 16:24, Eelco Chaudron wrote:
>>
>>
>> On 4 May 2023, at 0:55, Ilya Maximets wrote:
>>
>>> On 3/27/23 13:25, Eelco Chaudron wrote:
>>>> Make the read of the current seq->value atomic, i.e., not needing to
>>>> acquire the global mutex when reading it. On 64-bit systems, this
>>>> incurs no overhead, and it will avoid the mutex and potentially
>>>> a system call.
>>>>
>>>> For incrementing the value followed by waking up the threads, we are
>>>> still taking the mutex, so the current behavior is not changing. The
>>>> seq_read() behavior is already defined as, "Returns seq's current
>>>> sequence number (which could change immediately)". So the change
>>>> should not impact the current behavior.
>>>>
>>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>>> ---
>>>>  lib/ovs-rcu.c |    2 +-
>>>>  lib/seq.c     |   37 ++++++++++++++++---------------------
>>>>  lib/seq.h     |    1 -
>>>>  3 files changed, 17 insertions(+), 23 deletions(-)
>>>>
>>>> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
>>>> index 946aa04d1..9e07d9bab 100644
>>>> --- a/lib/ovs-rcu.c
>>>> +++ b/lib/ovs-rcu.c
>>>> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>>>>      ovs_assert(!single_threaded());
>>>>      perthread = ovsrcu_perthread_get();
>>>>      if (!seq_try_lock()) {
>>>> -        perthread->seqno = seq_read_protected(global_seqno);
>>>> +        perthread->seqno = seq_read(global_seqno);
>>>>          if (perthread->cbset) {
>>>>              ovsrcu_flush_cbset__(perthread, true);
>>>>          }
>>>> diff --git a/lib/seq.c b/lib/seq.c
>>>> index 99e5bf8bd..22646f3f8 100644
>>>> --- a/lib/seq.c
>>>> +++ b/lib/seq.c
>>>> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>>>>
>>>>  /* A sequence number object. */
>>>>  struct seq {
>>>> -    uint64_t value OVS_GUARDED;
>>>> +    atomic_uint64_t value;
>>>>      struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>>>>  };
>>>>
>>>> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>>>>  struct seq * OVS_EXCLUDED(seq_mutex)
>>>>  seq_create(void)
>>>>  {
>>>> +    uint64_t seq_value;
>>>>      struct seq *seq;
>>>>
>>>>      seq_init();
>>>> @@ -81,7 +82,8 @@ seq_create(void)
>>>>      COVERAGE_INC(seq_change);
>>>>
>>>>      ovs_mutex_lock(&seq_mutex);
>>>> -    seq->value = seq_next++;
>>>> +    seq_value = seq_next++;
>>>> +    atomic_store_relaxed(&seq->value, seq_value);
>>>>      hmap_init(&seq->waiters);
>>>>      ovs_mutex_unlock(&seq_mutex);
>>>>
>>>> @@ -126,9 +128,11 @@ void
>>>>  seq_change_protected(struct seq *seq)
>>>>      OVS_REQUIRES(seq_mutex)
>>>>  {
>>>> +    uint64_t seq_value = seq_next++;
>>>> +
>>>>      COVERAGE_INC(seq_change);
>>>>
>>>> -    seq->value = seq_next++;
>>>> +    atomic_store_relaxed(&seq->value, seq_value);
>>>>      seq_wake_waiters(seq);
>>>>  }
>>>>
>>>> @@ -143,18 +147,6 @@ seq_change(struct seq *seq)
>>>>      ovs_mutex_unlock(&seq_mutex);
>>>>  }
>>>>
>>>> -/* Returns 'seq''s current sequence number (which could change immediately).
>>>> - *
>>>> - * seq_read() and seq_wait() can be used together to yield a race-free wakeup
>>>> - * when an object changes, even without an ability to lock the object.  See
>>>> - * Usage in seq.h for details. */
>>>> -uint64_t
>>>> -seq_read_protected(const struct seq *seq)
>>>> -    OVS_REQUIRES(seq_mutex)
>>>> -{
>>>> -    return seq->value;
>>>> -}
>>>> -
>>>>  /* Returns 'seq''s current sequence number (which could change immediately).
>>>>   *
>>>>   * seq_read() and seq_wait() can be used together to yield a race-free wakeup
>>>> @@ -162,14 +154,15 @@ seq_read_protected(const struct seq *seq)
>>>>   * Usage in seq.h for details. */
>>>>  uint64_t
>>>>  seq_read(const struct seq *seq)
>>>> -    OVS_EXCLUDED(seq_mutex)
>>>>  {
>>>>      uint64_t value;
>>>>
>>>> -    ovs_mutex_lock(&seq_mutex);
>>>> -    value = seq_read_protected(seq);
>>>> -    ovs_mutex_unlock(&seq_mutex);
>>>> -
>>>> +    /* We use atomic_read(), memory_order_seq_cst, to simulate the full
>>>> +     * memory barrier you would get with locking and unlocking the global
>>>> +     * mutex.
>>>
>>> Hi, Eelco.   I'm not sure this is actually correct.
>>>
>>> We're performing memory_order_seq_cst operation on the value.
>>>
>>> Sequential consistency means just acquire-release semantics in our case, since
>>> we're pairing with non-seq-cst operations.  And acquire-release semantics works
>>> between acquire loads and release stores on the same atomic.  But all the loads
>>> on the value we have with this change are relaxed.  And the reader doesn't take
>>> the mutex.  Meaning there is no acquire-release pair that can synchronize the
>>> memory between threads.  memory_order_seq_cst read on its own can't really
>>> guarantee any synchronization.
>>>
>>> Also, the documentation for the sequence number library explicitly says that
>>> """
>>>  seq_change() synchronizes with seq_read() and
>>>  seq_wait() on the same variable in release-acquire fashion.  That
>>>  is, all effects of the memory accesses performed by a thread prior
>>>  to seq_change() are visible to the threads returning from
>>>  seq_read() or seq_wait() observing that change.
>>> """
>>>
>>> And I don't think that's the case with this change anymore.
>>
>> It was more than a year ago I did this patch, so can’t really remember my reasoning for doing it this way, and thinking it’s fine :)
>>
>> I think it might have been the following:
>>
>> 1) Before increasing the value in seq_change() there is a full memory barrier due to taking the lock
>> 2a) Now if we read the atomic seq in seq_read() and this new value is incremented in the seq_read above, the memory should be in sync
>> 2b) If we read it before we increase it, it’s the old value and it should not matter if the read has happened or not.
>>
>> Maybe I miss something obvious, if so let me know.
>
> The problem here is not the value of the sequence number, but the
> fact that pair seq_change() + seq_read() is documented as an
> acquire-release synchronization pair.  Meaning that these functions
> has to provide memory synchronization on unrelated memory locations.

Yes, that’s what I tried to explain, but it looks like I failed ;) I think it’s a bit less strict than that, as it only applies to the sequence number returned by seq_read() compared to the new sequence number generated by seq_change().

> And we don't have an acquire-release pair of operations that would
> ensure that, IIUC.  It might work with current implementations for
> well known architectures today, but I don't think it's required for
> any of these operations to always perform full memory barriers
> regardless of memory location.

Yes, you are right; I think the full memory barrier is not a requirement for mutex_lock(), I guess only for mutex_unlock().

> Might be fixed by using release semantics for stores though, I guess.

So something like this could solve it:

void
seq_change_protected(struct seq *seq)
    OVS_REQUIRES(seq_mutex)
{
    uint64_t seq_value = seq_next++;

    COVERAGE_INC(seq_change);

    atomic_thread_fence(memory_order_release);
    atomic_store_relaxed(&seq->value, seq_value);
    seq_wake_waiters(seq);
}

uint64_t
seq_read(const struct seq *seq)
{
    uint64_t value;

    /* Note that the odd CONST_CAST() is here to keep sparse happy. */
    atomic_read_relaxed(&CONST_CAST(struct seq *, seq)->value, &value);
    atomic_thread_fence(memory_order_acquire);
    return value;
}

What do you think?

> Best regards, Ilya Maximets.
Ilya Maximets May 16, 2023, 7:48 p.m. UTC | #11
On 5/16/23 10:20, Eelco Chaudron wrote:
> 
> 
> On 15 May 2023, at 17:47, Ilya Maximets wrote:
> 
>> On 5/15/23 16:24, Eelco Chaudron wrote:
>>>
>>>
>>> On 4 May 2023, at 0:55, Ilya Maximets wrote:
>>>
>>>> On 3/27/23 13:25, Eelco Chaudron wrote:
>>>>> Make the read of the current seq->value atomic, i.e., not needing to
>>>>> acquire the global mutex when reading it. On 64-bit systems, this
>>>>> incurs no overhead, and it will avoid the mutex and potentially
>>>>> a system call.
>>>>>
>>>>> For incrementing the value followed by waking up the threads, we are
>>>>> still taking the mutex, so the current behavior is not changing. The
>>>>> seq_read() behavior is already defined as, "Returns seq's current
>>>>> sequence number (which could change immediately)". So the change
>>>>> should not impact the current behavior.
>>>>>
>>>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>>>> ---
>>>>>  lib/ovs-rcu.c |    2 +-
>>>>>  lib/seq.c     |   37 ++++++++++++++++---------------------
>>>>>  lib/seq.h     |    1 -
>>>>>  3 files changed, 17 insertions(+), 23 deletions(-)
>>>>>
>>>>> diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
>>>>> index 946aa04d1..9e07d9bab 100644
>>>>> --- a/lib/ovs-rcu.c
>>>>> +++ b/lib/ovs-rcu.c
>>>>> @@ -170,7 +170,7 @@ ovsrcu_try_quiesce(void)
>>>>>      ovs_assert(!single_threaded());
>>>>>      perthread = ovsrcu_perthread_get();
>>>>>      if (!seq_try_lock()) {
>>>>> -        perthread->seqno = seq_read_protected(global_seqno);
>>>>> +        perthread->seqno = seq_read(global_seqno);
>>>>>          if (perthread->cbset) {
>>>>>              ovsrcu_flush_cbset__(perthread, true);
>>>>>          }
>>>>> diff --git a/lib/seq.c b/lib/seq.c
>>>>> index 99e5bf8bd..22646f3f8 100644
>>>>> --- a/lib/seq.c
>>>>> +++ b/lib/seq.c
>>>>> @@ -32,7 +32,7 @@ COVERAGE_DEFINE(seq_change);
>>>>>
>>>>>  /* A sequence number object. */
>>>>>  struct seq {
>>>>> -    uint64_t value OVS_GUARDED;
>>>>> +    atomic_uint64_t value;
>>>>>      struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
>>>>>  };
>>>>>
>>>>> @@ -72,6 +72,7 @@ static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>>>>>  struct seq * OVS_EXCLUDED(seq_mutex)
>>>>>  seq_create(void)
>>>>>  {
>>>>> +    uint64_t seq_value;
>>>>>      struct seq *seq;
>>>>>
>>>>>      seq_init();
>>>>> @@ -81,7 +82,8 @@ seq_create(void)
>>>>>      COVERAGE_INC(seq_change);
>>>>>
>>>>>      ovs_mutex_lock(&seq_mutex);
>>>>> -    seq->value = seq_next++;
>>>>> +    seq_value = seq_next++;
>>>>> +    atomic_store_relaxed(&seq->value, seq_value);
>>>>>      hmap_init(&seq->waiters);
>>>>>      ovs_mutex_unlock(&seq_mutex);
>>>>>
>>>>> @@ -126,9 +128,11 @@ void
>>>>>  seq_change_protected(struct seq *seq)
>>>>>      OVS_REQUIRES(seq_mutex)
>>>>>  {
>>>>> +    uint64_t seq_value = seq_next++;
>>>>> +
>>>>>      COVERAGE_INC(seq_change);
>>>>>
>>>>> -    seq->value = seq_next++;
>>>>> +    atomic_store_relaxed(&seq->value, seq_value);
>>>>>      seq_wake_waiters(seq);
>>>>>  }
>>>>>
>>>>> @@ -143,18 +147,6 @@ seq_change(struct seq *seq)
>>>>>      ovs_mutex_unlock(&seq_mutex);
>>>>>  }
>>>>>
>>>>> -/* Returns 'seq''s current sequence number (which could change immediately).
>>>>> - *
>>>>> - * seq_read() and seq_wait() can be used together to yield a race-free wakeup
>>>>> - * when an object changes, even without an ability to lock the object.  See
>>>>> - * Usage in seq.h for details. */
>>>>> -uint64_t
>>>>> -seq_read_protected(const struct seq *seq)
>>>>> -    OVS_REQUIRES(seq_mutex)
>>>>> -{
>>>>> -    return seq->value;
>>>>> -}
>>>>> -
>>>>>  /* Returns 'seq''s current sequence number (which could change immediately).
>>>>>   *
>>>>>   * seq_read() and seq_wait() can be used together to yield a race-free wakeup
>>>>> @@ -162,14 +154,15 @@ seq_read_protected(const struct seq *seq)
>>>>>   * Usage in seq.h for details. */
>>>>>  uint64_t
>>>>>  seq_read(const struct seq *seq)
>>>>> -    OVS_EXCLUDED(seq_mutex)
>>>>>  {
>>>>>      uint64_t value;
>>>>>
>>>>> -    ovs_mutex_lock(&seq_mutex);
>>>>> -    value = seq_read_protected(seq);
>>>>> -    ovs_mutex_unlock(&seq_mutex);
>>>>> -
>>>>> +    /* We use atomic_read(), memory_order_seq_cst, to simulate the full
>>>>> +     * memory barrier you would get with locking and unlocking the global
>>>>> +     * mutex.
>>>>
>>>> Hi, Eelco.   I'm not sure this is actually correct.
>>>>
>>>> We're performing memory_order_seq_cst operation on the value.
>>>>
>>>> Sequential consistency means just acquire-release semantics in our case, since
>>>> we're pairing with non-seq-cst operations.  And acquire-release semantics works
>>>> between acquire loads and release stores on the same atomic.  But all the loads
>>>> on the value we have with this change are relaxed.  And the reader doesn't take
>>>> the mutex.  Meaning there is no acquire-release pair that can synchronize the
>>>> memory between threads.  memory_order_seq_cst read on its own can't really
>>>> guarantee any synchronization.
>>>>
>>>> Also, the documentation for the sequence number library explicitly says that
>>>> """
>>>>  seq_change() synchronizes with seq_read() and
>>>>  seq_wait() on the same variable in release-acquire fashion.  That
>>>>  is, all effects of the memory accesses performed by a thread prior
>>>>  to seq_change() are visible to the threads returning from
>>>>  seq_read() or seq_wait() observing that change.
>>>> """
>>>>
>>>> And I don't think that's the case with this change anymore.
>>>
>>> It was more than a year ago that I did this patch, so I can’t really remember my reasoning for doing it this way and thinking it was fine :)
>>>
>>> I think it might have been the following:
>>>
>>> 1) Before increasing the value in seq_change() there is a full memory barrier due to taking the lock.
>>> 2a) Now, if the value we read in seq_read() is the new value incremented in the seq_change() above, the memory should be in sync.
>>> 2b) If we read it before it is increased, it’s the old value and it should not matter whether the read has happened or not.
>>>
>>> Maybe I miss something obvious, if so let me know.
>>
>> The problem here is not the value of the sequence number, but the
>> fact that the pair seq_change() + seq_read() is documented as an
>> acquire-release synchronization pair, meaning that these functions
>> have to provide memory synchronization on unrelated memory locations.
> 
> Yes, that’s what I tried to explain, but it looks like I failed ;) I think it’s a bit less restricted, as it only applies to the sequence number returned by seq_read() compared to the new sequence number generated by seq_change().
> 
>> And we don't have an acquire-release pair of operations that would
>> ensure that, IIUC.  It might work with current implementations for
>> well known architectures today, but I don't think it's required for
>> any of these operations to always perform full memory barriers
>> regardless of memory location.
> 
> Yes, you are right. I think the memory barrier is not a requirement for mutex_lock(), I guess only for mutex_unlock().
> 
>> Might be fixed by using release semantics for stores though, I guess.
> 
> So something like this could solve it:
> 
> void
> seq_change_protected(struct seq *seq)
>     OVS_REQUIRES(seq_mutex)
> {
>     uint64_t seq_value = seq_next++;
> 
>     COVERAGE_INC(seq_change);
> 
>     atomic_thread_fence(memory_order_release);
>     atomic_store_relaxed(&seq->value, seq_value);
>     seq_wake_waiters(seq);
> }
> 
> uint64_t
> seq_read(const struct seq *seq)
> {
>     uint64_t value;
> 
>     /* Note that the odd CONST_CAST() is here to keep sparse happy. */
>     atomic_read_relaxed(&CONST_CAST(struct seq *, seq)->value, &value);
>     atomic_thread_fence(memory_order_acquire);
>     return value;
> }
> 
> What do you think?

Why not just explicit read/store with acquire-release?  I'm not sure why
we need to use a fence.  Could you elaborate?


> 
>> Best regards, Ilya Maximets.
>
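For readers following the fence-versus-release question above, here is a minimal, self-contained C11 sketch of the two writer variants under discussion, using plain <stdatomic.h> and pthreads rather than the OVS atomic wrappers; the names ('payload', 'seqno', writer(), reader()) are illustrative stand-ins, not code from the patch. Both variants pair with the same acquire load on the reader side:

#include <inttypes.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t payload;        /* "Unrelated" memory being published.   */
static _Atomic uint64_t seqno;  /* Illustrative stand-in for seq->value. */

static void *
writer(void *arg)
{
    (void) arg;
    payload = 42;   /* Must become visible to a reader that sees seqno == 1. */

    /* Variant A, as in the fence-based proposal above:
     *
     *     atomic_thread_fence(memory_order_release);
     *     atomic_store_explicit(&seqno, 1, memory_order_relaxed);
     *
     * Variant B, release semantics on the store itself: */
    atomic_store_explicit(&seqno, 1, memory_order_release);
    return NULL;
}

static void *
reader(void *arg)
{
    (void) arg;
    /* The acquire load pairs with the release store (or with the release
     * fence followed by the relaxed store); once seqno == 1 is observed,
     * 'payload' is guaranteed to read as 42. */
    while (atomic_load_explicit(&seqno, memory_order_acquire) == 0) {
        continue;
    }
    printf("payload = %"PRIu64"\n", payload);
    return NULL;
}

int
main(void)
{
    pthread_t r, w;

    pthread_create(&r, NULL, reader, NULL);
    pthread_create(&w, NULL, writer, NULL);
    pthread_join(w, NULL);
    pthread_join(r, NULL);
    return 0;
}

On x86-64 both variants compile to an ordinary store, which is why the difference is easy to miss in testing; the distinction only shows up on weaker memory models such as ARM or POWER.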
Eelco Chaudron May 17, 2023, 10:18 a.m. UTC | #12
On 16 May 2023, at 21:48, Ilya Maximets wrote:

> On 5/16/23 10:20, Eelco Chaudron wrote:
>>
>>
>> On 15 May 2023, at 17:47, Ilya Maximets wrote:
>>
>>> On 5/15/23 16:24, Eelco Chaudron wrote:
>>>>
>>>>
>>>> On 4 May 2023, at 0:55, Ilya Maximets wrote:
>>>>
>>>>> On 3/27/23 13:25, Eelco Chaudron wrote:
>>>>>> [snip: v1 patch and earlier discussion quoted in full; see above and the 'Patch' section below]
>>
>> So something like this could solve it:
>>
>> void
>> seq_change_protected(struct seq *seq)
>>     OVS_REQUIRES(seq_mutex)
>> {
>>     uint64_t seq_value = seq_next++;
>>
>>     COVERAGE_INC(seq_change);
>>
>>     atomic_thread_fence(memory_order_release);
>>     atomic_store_relaxed(&seq->value, seq_value);
>>     seq_wake_waiters(seq);
>> }
>>
>> uint64_t
>> seq_read(const struct seq *seq)
>> {
>>     uint64_t value;
>>
>>     /* Note that the odd CONST_CAST() is here to keep sparse happy. */
>>     atomic_read_relaxed(&CONST_CAST(struct seq *, seq)->value, &value);
>>     atomic_thread_fence(memory_order_acquire);
>>     return value;
>> }
>>
>> What do you think?
>
> Why not just explicit read/store with acquire-release?  I'm not sure why
> we need to use a fence.  Could you elaborate?

I misread the spec and assumed the ordering applied only to the atomic variable itself. I found a nice blog post explaining this, https://preshing.com/20120913/acquire-and-release-semantics/; it also lists my solution as an option, so at least I was on the right track :)

So, taking their example, the following should be enough (as you suggested :)

void
seq_change_protected(struct seq *seq)
    OVS_REQUIRES(seq_mutex)
{
    uint64_t seq_value = seq_next++;

    COVERAGE_INC(seq_change);

    atomic_store_explicit(&seq->value, seq_value, memory_order_release);
    seq_wake_waiters(seq);
}

uint64_t
seq_read(const struct seq *seq)
{
    uint64_t value;

    /* Note that the odd CONST_CAST() is here to keep sparse happy. */
    atomic_read_explicit(&CONST_CAST(struct seq *, seq)->value, &value, memory_order_acquire);
    return value;
}
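To see why the acquire side matters to callers, here is a rough sketch of the usual observer pattern, loosely following the Usage comment in lib/seq.h. The surrounding names ('struct job', its 'done_seq' member, job_is_done()) are invented for the example; seq_read(), seq_wait(), and poll_block() are the usual OVS calls:

static void
wait_for_job(struct job *job)
{
    for (;;) {
        uint64_t seqno = seq_read(job->done_seq);   /* Acquire load. */

        if (job_is_done(job)) {
            /* Because seq_read() has acquire semantics, everything the
             * worker wrote before its seq_change(job->done_seq) is
             * visible here. */
            return;
        }
        seq_wait(job->done_seq, seqno);  /* Wake when the seqno changes. */
        poll_block();                    /* Usual OVS main-loop block.   */
    }
}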
Eelco Chaudron May 26, 2023, 1:42 p.m. UTC | #13
On 17 May 2023, at 12:18, Eelco Chaudron wrote:

> On 16 May 2023, at 21:48, Ilya Maximets wrote:
>
>> On 5/16/23 10:20, Eelco Chaudron wrote:
>>>
>>>
>>> On 15 May 2023, at 17:47, Ilya Maximets wrote:
>>>
>>>> On 5/15/23 16:24, Eelco Chaudron wrote:
>>>>>
>>>>>
>>>>> On 4 May 2023, at 0:55, Ilya Maximets wrote:
>>>>>
>>>>>> On 3/27/23 13:25, Eelco Chaudron wrote:
>>>>>>> [snip: v1 patch and earlier discussion quoted in full; see above and the 'Patch' section below]
>>
>> Why not just explicit read/store with acquire-release?  I'm not sure why
>> we need to use a fence.  Could you elaborate?
>
> I misread the spec and assumed the ordering applied only to the atomic variable itself. I found a nice blog post explaining this, https://preshing.com/20120913/acquire-and-release-semantics/; it also lists my solution as an option, so at least I was on the right track :)
>
> So, taking their example, the following should be enough (as you suggested :)
>
> void
> seq_change_protected(struct seq *seq)
>     OVS_REQUIRES(seq_mutex)
> {
>     uint64_t seq_value = seq_next++;
>
>     COVERAGE_INC(seq_change);
>
>     atomic_store_explicit(&seq->value, seq_value, memory_order_release);
>     seq_wake_waiters(seq);
> }
>
> uint64_t
> seq_read(const struct seq *seq)
> {
>     uint64_t value;
>
>     /* Note that the odd CONST_CAST() is here to keep sparse happy. */
>     atomic_read_explicit(&CONST_CAST(struct seq *, seq)->value, &value, memory_order_acquire);
>     return value;
> }

Any feedback on the above, so I can send a re-worked patch?

//Eelco
Ilya Maximets May 26, 2023, 1:50 p.m. UTC | #14
On 5/26/23 15:42, Eelco Chaudron wrote:
> 
> 
> On 17 May 2023, at 12:18, Eelco Chaudron wrote:
> 
>> On 16 May 2023, at 21:48, Ilya Maximets wrote:
>>
>>> On 5/16/23 10:20, Eelco Chaudron wrote:
>>>>
>>>>
>>>> On 15 May 2023, at 17:47, Ilya Maximets wrote:
>>>>
>>>>> On 5/15/23 16:24, Eelco Chaudron wrote:
>>>>>>
>>>>>>
>>>>>> On 4 May 2023, at 0:55, Ilya Maximets wrote:
>>>>>>
>>>>>>> On 3/27/23 13:25, Eelco Chaudron wrote:
>>>>>>>> [snip: v1 patch and earlier discussion quoted in full; see above and the 'Patch' section below]
>>>
>>> Why not just explicit read/store with acquire-release?  I'm not sure why
>>> we need to use a fence.  Could you elaborate?
>>
>> I misread the spec and assumed the ordering applied only to the atomic variable itself. I found a nice blog post explaining this, https://preshing.com/20120913/acquire-and-release-semantics/; it also lists my solution as an option, so at least I was on the right track :)
>>
>> So, taking their example, the following should be enough (as you suggested :)
>>
>> void
>> seq_change_protected(struct seq *seq)
>>     OVS_REQUIRES(seq_mutex)
>> {
>>     uint64_t seq_value = seq_next++;
>>
>>     COVERAGE_INC(seq_change);
>>
>>     atomic_store_explicit(&seq->value, seq_value, memory_order_release);
>>     seq_wake_waiters(seq);
>> }
>>
>> uint64_t
>> seq_read(const struct seq *seq)
>> {
>>     uint64_t value;
>>
>>     /* Note that the odd CONST_CAST() is here to keep sparse happy. */
>>     atomic_read_explicit(&CONST_CAST(struct seq *, seq)->value, &value, memory_order_acquire);
>>     return value;
>> }
> 
> Any feedback on the above, so I can send a re-worked patch?

Sorry, I forgot to reply.
Looks OK in general; feel free to send a new version.
BTW, do we need to use seq_read() in seq_wait_at()?

Best regards, Ilya Maximets.
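Purely as an illustration of the seq_wait_at() question above, one way it could reuse the acquire-based seq_read() instead of its own relaxed read might look like this. This is only a sketch of the idea, not the reworked patch; the trailing unlock is assumed from the surrounding code that the hunk below does not show:

void
seq_wait_at(const struct seq *seq_, uint64_t value, const char *where)
    OVS_EXCLUDED(seq_mutex)
{
    struct seq *seq = CONST_CAST(struct seq *, seq_);

    ovs_mutex_lock(&seq_mutex);
    if (value == seq_read(seq_)) {
        /* Still the value the caller saw: register for a wakeup. */
        seq_wait__(seq, value, where);
    } else {
        /* Already changed: wake the caller immediately. */
        poll_immediate_wake_at(where);
    }
    ovs_mutex_unlock(&seq_mutex);
}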
diff mbox series

Patch

diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c
index 946aa04d1..9e07d9bab 100644
--- a/lib/ovs-rcu.c
+++ b/lib/ovs-rcu.c
@@ -170,7 +170,7 @@  ovsrcu_try_quiesce(void)
     ovs_assert(!single_threaded());
     perthread = ovsrcu_perthread_get();
     if (!seq_try_lock()) {
-        perthread->seqno = seq_read_protected(global_seqno);
+        perthread->seqno = seq_read(global_seqno);
         if (perthread->cbset) {
             ovsrcu_flush_cbset__(perthread, true);
         }
diff --git a/lib/seq.c b/lib/seq.c
index 99e5bf8bd..22646f3f8 100644
--- a/lib/seq.c
+++ b/lib/seq.c
@@ -32,7 +32,7 @@  COVERAGE_DEFINE(seq_change);
 
 /* A sequence number object. */
 struct seq {
-    uint64_t value OVS_GUARDED;
+    atomic_uint64_t value;
     struct hmap waiters OVS_GUARDED; /* Contains 'struct seq_waiter's. */
 };
 
@@ -72,6 +72,7 @@  static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
 struct seq * OVS_EXCLUDED(seq_mutex)
 seq_create(void)
 {
+    uint64_t seq_value;
     struct seq *seq;
 
     seq_init();
@@ -81,7 +82,8 @@  seq_create(void)
     COVERAGE_INC(seq_change);
 
     ovs_mutex_lock(&seq_mutex);
-    seq->value = seq_next++;
+    seq_value = seq_next++;
+    atomic_store_relaxed(&seq->value, seq_value);
     hmap_init(&seq->waiters);
     ovs_mutex_unlock(&seq_mutex);
 
@@ -126,9 +128,11 @@  void
 seq_change_protected(struct seq *seq)
     OVS_REQUIRES(seq_mutex)
 {
+    uint64_t seq_value = seq_next++;
+
     COVERAGE_INC(seq_change);
 
-    seq->value = seq_next++;
+    atomic_store_relaxed(&seq->value, seq_value);
     seq_wake_waiters(seq);
 }
 
@@ -143,18 +147,6 @@  seq_change(struct seq *seq)
     ovs_mutex_unlock(&seq_mutex);
 }
 
-/* Returns 'seq''s current sequence number (which could change immediately).
- *
- * seq_read() and seq_wait() can be used together to yield a race-free wakeup
- * when an object changes, even without an ability to lock the object.  See
- * Usage in seq.h for details. */
-uint64_t
-seq_read_protected(const struct seq *seq)
-    OVS_REQUIRES(seq_mutex)
-{
-    return seq->value;
-}
-
 /* Returns 'seq''s current sequence number (which could change immediately).
  *
  * seq_read() and seq_wait() can be used together to yield a race-free wakeup
@@ -162,14 +154,15 @@  seq_read_protected(const struct seq *seq)
  * Usage in seq.h for details. */
 uint64_t
 seq_read(const struct seq *seq)
-    OVS_EXCLUDED(seq_mutex)
 {
     uint64_t value;
 
-    ovs_mutex_lock(&seq_mutex);
-    value = seq_read_protected(seq);
-    ovs_mutex_unlock(&seq_mutex);
-
+    /* We use atomic_read(), memory_order_seq_cst, to simulate the full
+     * memory barrier you would get with locking and unlocking the global
+     * mutex.
+     *
+     * Note that the odd CONST_CAST() is here to keep sparse happy. */
+    atomic_read(&CONST_CAST(struct seq *, seq)->value, &value);
     return value;
 }
 
@@ -224,9 +217,11 @@  seq_wait_at(const struct seq *seq_, uint64_t value, const char *where)
     OVS_EXCLUDED(seq_mutex)
 {
     struct seq *seq = CONST_CAST(struct seq *, seq_);
+    uint64_t seq_value;
 
     ovs_mutex_lock(&seq_mutex);
-    if (value == seq->value) {
+    atomic_read_relaxed(&seq->value, &seq_value);
+    if (value == seq_value) {
         seq_wait__(seq, value, where);
     } else {
         poll_immediate_wake_at(where);
diff --git a/lib/seq.h b/lib/seq.h
index c88b9d1c8..fcfa01037 100644
--- a/lib/seq.h
+++ b/lib/seq.h
@@ -128,7 +128,6 @@  void seq_unlock(void);
 
 /* For observers. */
 uint64_t seq_read(const struct seq *);
-uint64_t seq_read_protected(const struct seq *);
 
 void seq_wait_at(const struct seq *, uint64_t value, const char *where);
 #define seq_wait(seq, value) seq_wait_at(seq, value, OVS_SOURCE_LOCATOR)
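As a side note on why replacing the mutex-protected read of 'value' with an atomic_uint64_t access is cheap: on the usual 64-bit targets a 64-bit atomic is lock-free, so the relaxed store and the acquire load typically compile down to ordinary memory accesses, possibly with an ordering instruction on weaker architectures. A quick, illustrative check in plain C11 (not the OVS wrappers):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    _Atomic uint64_t value = 0;

    /* Prints "yes" on platforms where no hidden lock is needed. */
    printf("64-bit atomic is lock-free here: %s\n",
           atomic_is_lock_free(&value) ? "yes" : "no");
    return 0;
}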