
[ovs-dev,v2,14/28] dpif-netdev: Quiesce offload thread periodically

Message ID a4de1a1ad6c5e70fcc32db354fb8dbdd13b8cbd2.1618236421.git.grive@u256.net
State: Changes Requested
Series: dpif-netdev: Parallel offload processing

Commit Message

Gaetan Rivet April 12, 2021, 3:20 p.m. UTC
Similar to what was done for the PMD threads [1], reduce the performance
impact of quiescing too often in the offload thread.

After each processed offload, the offload thread currently quiesces
and syncs with RCU. This synchronization can be lengthy and makes the
thread unnecessarily slow.

Instead, attempt to quiesce no more than once every 10 ms. While the
queue is empty, the offload thread remains quiescent.

[1]: 81ac8b3b194c ("dpif-netdev: Do RCU synchronization at fixed interval
     in PMD main loop.")

Signed-off-by: Gaetan Rivet <grive@u256.net>
Reviewed-by: Eli Britstein <elibr@nvidia.com>
---
 lib/dpif-netdev.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

Comments

Maxime Coquelin April 21, 2021, 5:24 p.m. UTC | #1
On 4/12/21 5:20 PM, Gaetan Rivet wrote:
[...]
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index d458bcb12..4e403a9e5 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -2740,15 +2740,20 @@ err_free:
>      return -1;
>  }
>  
> +#define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (10 * 1000) /* 10 ms */
> +
>  static void *
>  dp_netdev_flow_offload_main(void *data OVS_UNUSED)
>  {
>      struct dp_offload_thread_item *offload;
>      struct ovs_list *list;
>      long long int latency_us;
> +    long long int next_rcu;
> +    long long int now;
>      const char *op;
>      int ret;
>  
> +    next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
>      for (;;) {
>          ovs_mutex_lock(&dp_offload_thread.mutex);
>          if (ovs_list_is_empty(&dp_offload_thread.list)) {
> @@ -2756,6 +2761,7 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED)
>              ovs_mutex_cond_wait(&dp_offload_thread.cond,
>                                  &dp_offload_thread.mutex);
>              ovsrcu_quiesce_end();
> +            next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
>          }
>          list = ovs_list_pop_front(&dp_offload_thread.list);
>          dp_offload_thread.enqueued_item--;
> @@ -2779,7 +2785,9 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED)
>              OVS_NOT_REACHED();
>          }
>  
> -        latency_us = time_usec() - offload->timestamp;
> +        now = time_usec();
> +
> +        latency_us = now - offload->timestamp;
>          mov_avg_cma_update(&dp_offload_thread.cma, latency_us);
>          mov_avg_ema_update(&dp_offload_thread.ema, latency_us);
>  
> @@ -2787,7 +2795,13 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED)
>                   ret == 0 ? "succeed" : "failed", op,
>                   UUID_ARGS((struct uuid *) &offload->flow->mega_ufid));
>          dp_netdev_free_flow_offload(offload);
> -        ovsrcu_quiesce();
> +
> +        /* Do RCU synchronization at fixed interval. */
> +        if (now > next_rcu) {
> +            if (!ovsrcu_try_quiesce()) {
> +                next_rcu += DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;

Commit 81ac8b3b194c does it a bit differently, using the current time
as the base for calculating the next deadline, while you use the
current deadline (which is in the past) to calculate the next one.

Under heavy load, won't it just end up doing the RCU synchronization
at every iteration, as next_rcu would always run behind?

> +            }
> +        }
>      }
>  
>      return NULL;
>
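
To make the concern concrete, below is a minimal standalone simulation
(illustrative only, not OVS code: the variable names, the 25 ms/1 ms
iteration times, and the assumption that every quiesce attempt
succeeds are all made up) comparing the two policies: advancing the
deadline from its old value versus rebasing it on the current time.

    /* Toy model: a slow phase (25 ms per iteration) followed by a fast
     * phase (1 ms per iteration), with a 10 ms quiesce interval. */
    #include <stdio.h>

    #define INTERVAL_US (10 * 1000)

    int main(void)
    {
        long long now = 0;
        long long next_from_deadline = INTERVAL_US; /* this patch */
        long long next_from_now = INTERVAL_US;      /* 81ac8b3b194c */
        int n_deadline = 0, n_now = 0;

        for (int i = 0; i < 12; i++) {
            now += (i < 4) ? 25 * 1000 : 1 * 1000;

            if (now > next_from_deadline) {
                n_deadline++;
                /* Advance from the stale deadline: lag accumulates, so
                 * this keeps firing on every fast iteration until the
                 * backlog of missed intervals is absorbed. */
                next_from_deadline += INTERVAL_US;
            }
            if (now > next_from_now) {
                n_now++;
                /* Rebase on the current time: the next quiesce is
                 * always a full interval away, whatever the lag was. */
                next_from_now = now + INTERVAL_US;
            }
        }
        /* Prints: deadline-based: 10 quiesces, now-based: 4 quiesces */
        printf("deadline-based: %d quiesces, now-based: %d quiesces\n",
               n_deadline, n_now);
        return 0;
    }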
Gaetan Rivet April 22, 2021, 11:51 a.m. UTC | #2
On Wed, Apr 21, 2021, at 19:24, Maxime Coquelin wrote:
> 
> 
> On 4/12/21 5:20 PM, Gaetan Rivet wrote:
[...]
> > -        ovsrcu_quiesce();
> > +
> > +        /* Do RCU synchronization at fixed interval. */
> > +        if (now > next_rcu) {
> > +            if (!ovsrcu_try_quiesce()) {
> > +                next_rcu += DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
> 
> Commit 81ac8b3b194c does it a bit differently, using the current time
> as the base for calculating the next deadline, while you use the
> current deadline (which is in the past) to calculate the next one.
> 
> Under heavy load, won't it just end up doing the RCU synchronization
> at every iteration, as next_rcu would always run behind?
> 

Hello Maxime,

I think there is an issue, but I'm not sure it is related to a specific load.
The try-lock here fails if taking the global seq lock fails. It would be extremely unlucky for multiple calls
to fail, and it would be unrelated to a specific load I believe. In the end, the global seq lock is used by so many
concurrency primitives that all threads touch it pretty often.

So even if we fail there, we will eventually succeed, and as the chances of success are always pretty high,
even if we were lagging 10 ms behind schedule, we would catch up with a limited number of quiesce calls.

So I've never had an issue with this, but still, I think it's badly written.
My initial thought was that I wanted to force another attempt quickly in case this one failed, as the RCU should take priority
over the offload thread.
However, unlike the PMD threads, the offload thread is meant to yield, and waiting a little on the seq lock is fine.
So instead of duct-taping a way to make sure we quiesce, I think I should just use ovsrcu_quiesce() and wait for
the call to return. Only the PMDs are meant to run to completion.

So I propose to switch to ovsrcu_quiesce(), and base next_rcu on 'now' instead of on itself.
But if anyone has another opinion, feel free to let me know.
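
A sketch of what that could look like on top of this patch (an
illustration of the proposal above, not the actual v3 hunk):

-        /* Do RCU synchronization at fixed interval. */
-        if (now > next_rcu) {
-            if (!ovsrcu_try_quiesce()) {
-                next_rcu += DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
-            }
-        }
+        /* Do RCU synchronization at fixed interval.  ovsrcu_quiesce()
+         * waits on the seq lock instead of try-locking, so it cannot
+         * fail and no retry bookkeeping is needed. */
+        if (now > next_rcu) {
+            ovsrcu_quiesce();
+            next_rcu = now + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
+        }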
Maxime Coquelin April 22, 2021, 12:50 p.m. UTC | #3
On 4/22/21 1:51 PM, Gaëtan Rivet wrote:
> On Wed, Apr 21, 2021, at 19:24, Maxime Coquelin wrote:
>>
>>
>> On 4/12/21 5:20 PM, Gaetan Rivet wrote:
[...]
>>> -        ovsrcu_quiesce();
>>> +
>>> +        /* Do RCU synchronization at fixed interval. */
>>> +        if (now > next_rcu) {
>>> +            if (!ovsrcu_try_quiesce()) {
>>> +                next_rcu += DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
>>
>> Commit 81ac8b3b194c does it a bit differently, using the current time
>> as the base for calculating the next deadline, while you use the
>> current deadline (which is in the past) to calculate the next one.
>>
>> Under heavy load, won't it just end up doing the RCU synchronization
>> at every iteration, as next_rcu would always run behind?
>>
> 
> Hello Maxime,
> 
> I think there is an issue, but I'm not sure it is related to a specific load.
> The try-lock here fails if taking the global seq lock fails. It would be extremely unlucky for multiple calls
> to fail, and it would be unrelated to a specific load I believe. In the end, the global seq lock is used by so many
> concurrency primitives that all threads touch it pretty often.
> 
> So even if we fail there, we will eventually succeed, and as the chances of success are always pretty high,
> even if we were lagging 10 ms behind schedule, we would catch up with a limited number of quiesce calls.
> 
> So I've never had an issue with this, but still, I think it's badly written.
> My initial thought was that I wanted to force another attempt quickly in case this one failed, as the RCU should take priority
> over the offload thread.
> However, unlike the PMD threads, the offload thread is meant to yield, and waiting a little on the seq lock is fine.
> So instead of duct-taping a way to make sure we quiesce, I think I should just use ovsrcu_quiesce() and wait for
> the call to return. Only the PMDs are meant to run to completion.

Thanks for the detailed analysis.

> So I propose to switch to ovsrcu_quiesce(), and base next_rcu on 'now' instead of on itself.
> But if anyone has another opinion, feel free to let me know.
> 

I think your proposal makes sense.

Maxime

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index d458bcb12..4e403a9e5 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2740,15 +2740,20 @@ err_free:
     return -1;
 }
 
+#define DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US (10 * 1000) /* 10 ms */
+
 static void *
 dp_netdev_flow_offload_main(void *data OVS_UNUSED)
 {
     struct dp_offload_thread_item *offload;
     struct ovs_list *list;
     long long int latency_us;
+    long long int next_rcu;
+    long long int now;
     const char *op;
     int ret;
 
+    next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
     for (;;) {
         ovs_mutex_lock(&dp_offload_thread.mutex);
         if (ovs_list_is_empty(&dp_offload_thread.list)) {
@@ -2756,6 +2761,7 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED)
             ovs_mutex_cond_wait(&dp_offload_thread.cond,
                                 &dp_offload_thread.mutex);
             ovsrcu_quiesce_end();
+            next_rcu = time_usec() + DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
         }
         list = ovs_list_pop_front(&dp_offload_thread.list);
         dp_offload_thread.enqueued_item--;
@@ -2779,7 +2785,9 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED)
             OVS_NOT_REACHED();
         }
 
-        latency_us = time_usec() - offload->timestamp;
+        now = time_usec();
+
+        latency_us = now - offload->timestamp;
         mov_avg_cma_update(&dp_offload_thread.cma, latency_us);
         mov_avg_ema_update(&dp_offload_thread.ema, latency_us);
 
@@ -2787,7 +2795,13 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED)
                  ret == 0 ? "succeed" : "failed", op,
                  UUID_ARGS((struct uuid *) &offload->flow->mega_ufid));
         dp_netdev_free_flow_offload(offload);
-        ovsrcu_quiesce();
+
+        /* Do RCU synchronization at fixed interval. */
+        if (now > next_rcu) {
+            if (!ovsrcu_try_quiesce()) {
+                next_rcu += DP_NETDEV_OFFLOAD_QUIESCE_INTERVAL_US;
+            }
+        }
     }
 
     return NULL;