diff mbox

[ovs-dev] dpif-netdev: Remove PMD latency on seq_mutex

Message ID 1458791679-15894-1-git-send-email-fbl@redhat.com
State Changes Requested
Headers show

Commit Message

Flavio Leitner March 24, 2016, 3:54 a.m. UTC
The PMD thread needs to keep processing RX queues in order
to achieve maximum throughput.  However, it also needs to run
other tasks after some time processing the RX queues, during
which a mutex can block the PMD thread.  That causes latency
spikes and affects the throughput.

Convert to recursive mutex so that the PMD thread can test first
and, if it gets the lock, continue as before; otherwise it tries
again another time.  There is additional logic to make
sure the PMD thread will try harder as the attempts to get
the mutex continue to fail.

Co-authored-by: Karl Rister <krister@redhat.com>
Signed-off-by: Flavio Leitner <fbl@redhat.com>
---
 include/openvswitch/thread.h |  3 +++
 lib/dpif-netdev.c            | 33 ++++++++++++++++++++++-----------
 lib/seq.c                    | 15 ++++++++++++++-
 lib/seq.h                    |  3 +++
 4 files changed, 42 insertions(+), 12 deletions(-)

Comments

Simon Jones March 24, 2016, 6:10 a.m. UTC | #1
So basically, you replace ovs_mutex_rwlock (or sth) into ovs_mutex_trylock
in the loop of "other tasks after some time processing the RX queues".
Is it ?

2016-03-24 11:54 GMT+08:00 Flavio Leitner <fbl@redhat.com>:

> The PMD thread needs to keep processing RX queues in order
> archive maximum throughput.  However, it also needs to run
> other tasks after some time processing the RX queues which
> a mutex can block the PMD thread.  That causes latency
> spikes and affects the throughput.
>
> Convert to recursive mutex so that PMD thread can test first
> and if it gets the lock, continue as before, otherwise try
> again another time.  There is an additional logic to make
> sure the PMD thread will try harder as the attempt to get
> the mutex continues to fail.
>
> Co-authored-by: Karl Rister <krister@redhat.com>
> Signed-off-by: Flavio Leitner <fbl@redhat.com>
> ---
>  include/openvswitch/thread.h |  3 +++
>  lib/dpif-netdev.c            | 33 ++++++++++++++++++++++-----------
>  lib/seq.c                    | 15 ++++++++++++++-
>  lib/seq.h                    |  3 +++
>  4 files changed, 42 insertions(+), 12 deletions(-)
>
> diff --git a/include/openvswitch/thread.h b/include/openvswitch/thread.h
> index af6f2bb..6d20720 100644
> --- a/include/openvswitch/thread.h
> +++ b/include/openvswitch/thread.h
> @@ -44,6 +44,9 @@ struct OVS_LOCKABLE ovs_mutex {
>  #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
>  #endif
>
> +#define OVS_RECURSIVE_MUTEX_INITIALIZER \
> +   { PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP, "<unlocked>" }
> +
>  /* ovs_mutex functions analogous to pthread_mutex_*() functions.
>   *
>   * Most of these functions abort the process with an error message on any
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 0f2385a..a10b2d1 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -2668,12 +2668,15 @@ static void *
>  pmd_thread_main(void *f_)
>  {
>      struct dp_netdev_pmd_thread *pmd = f_;
> -    unsigned int lc = 0;
> +    unsigned int lc_max = 1024;
> +    unsigned int lc_start;
> +    unsigned int lc;
>      struct rxq_poll *poll_list;
>      unsigned int port_seq = PMD_INITIAL_SEQ;
>      int poll_cnt;
>      int i;
>
> +    lc_start = 0;
>      poll_cnt = 0;
>      poll_list = NULL;
>
> @@ -2698,24 +2701,32 @@ reload:
>       * reloading the updated configuration. */
>      dp_netdev_pmd_reload_done(pmd);
>
> +    lc = lc_start;
>      for (;;) {
>          for (i = 0; i < poll_cnt; i++) {
>              dp_netdev_process_rxq_port(pmd, poll_list[i].port,
> poll_list[i].rx);
>          }
>
> -        if (lc++ > 1024) {
> -            unsigned int seq;
> +        if (lc++ > lc_max) {
> +            if (!seq_pmd_trylock()) {
> +                unsigned int seq;
> +                lc_start = 0;
> +                lc = 0;
>
> -            lc = 0;
> +                emc_cache_slow_sweep(&pmd->flow_cache);
> +                coverage_try_clear();
> +                ovsrcu_quiesce();
>
> -            emc_cache_slow_sweep(&pmd->flow_cache);
> -            coverage_try_clear();
> -            ovsrcu_quiesce();
> +                seq_pmd_unlock();
>
> -            atomic_read_relaxed(&pmd->change_seq, &seq);
> -            if (seq != port_seq) {
> -                port_seq = seq;
> -                break;
> +                atomic_read_relaxed(&pmd->change_seq, &seq);
> +                if (seq != port_seq) {
> +                    port_seq = seq;
> +                    break;
> +                }
> +            } else {
> +                lc_start += (lc_start + lc_max)/2;
> +                lc = lc_start;
>              }
>          }
>      }
> diff --git a/lib/seq.c b/lib/seq.c
> index 9c3257c..198b2ce 100644
> --- a/lib/seq.c
> +++ b/lib/seq.c
> @@ -55,7 +55,7 @@ struct seq_thread {
>      bool waiting OVS_GUARDED;        /* True if latch_wait() already
> called. */
>  };
>
> -static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
> +static struct ovs_mutex seq_mutex = OVS_RECURSIVE_MUTEX_INITIALIZER;
>
>  static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
>
> @@ -68,6 +68,19 @@ static void seq_thread_woke(struct seq_thread *)
> OVS_REQUIRES(seq_mutex);
>  static void seq_waiter_destroy(struct seq_waiter *)
> OVS_REQUIRES(seq_mutex);
>  static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>
> +
> +int seq_pmd_trylock(void)
> +     OVS_TRY_LOCK(0, seq_mutex)
> +{
> +  return ovs_mutex_trylock(&seq_mutex);
> +}
> +
> +void seq_pmd_unlock(void)
> +    OVS_RELEASES(seq_mutex)
> +{
> +  ovs_mutex_unlock(&seq_mutex);
> +}
> +
>  /* Creates and returns a new 'seq' object. */
>  struct seq * OVS_EXCLUDED(seq_mutex)
>  seq_create(void)
> diff --git a/lib/seq.h b/lib/seq.h
> index b0ec6bf..f6b6e1a 100644
> --- a/lib/seq.h
> +++ b/lib/seq.h
> @@ -117,6 +117,9 @@
>  #include <stdint.h>
>  #include "util.h"
>
> +int seq_pmd_trylock(void);
> +void seq_pmd_unlock(void);
> +
>  /* For implementation of an object with a sequence number attached. */
>  struct seq *seq_create(void);
>  void seq_destroy(struct seq *);
> --
> 2.5.0
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
>
Flavio Leitner March 24, 2016, 1:15 p.m. UTC | #2
On Thu, 24 Mar 2016 14:10:14 +0800
yewgang <batmanustc@gmail.com> wrote:

> So basically, you replace ovs_mutex_rwlock (or sth) into ovs_mutex_trylock
> in the loop of "other tasks after some time processing the RX queues".
> Is it ?

It isn't replacing since the original locking remains the same.  But yeah,
it tries to lock before it is actually needed, to make sure it will not
block later on.

Before:
-------

while () {
     for() {
         process_RX_queues()
     }

     other_tasks()  /* can block on seq_mutex */
}


After:
------

while () {
     for() {
         process_RX_queues()
     }

     if (ovs_mutex_trylock()) {
         other_tasks()   /* can't block on seq_mutex */
     }
}

fbl

> 
> 2016-03-24 11:54 GMT+08:00 Flavio Leitner <fbl@redhat.com>:
> 
> > The PMD thread needs to keep processing RX queues in order
> > archive maximum throughput.  However, it also needs to run
> > other tasks after some time processing the RX queues which
> > a mutex can block the PMD thread.  That causes latency
> > spikes and affects the throughput.
> >
> > Convert to recursive mutex so that PMD thread can test first
> > and if it gets the lock, continue as before, otherwise try
> > again another time.  There is an additional logic to make
> > sure the PMD thread will try harder as the attempt to get
> > the mutex continues to fail.
> >
> > Co-authored-by: Karl Rister <krister@redhat.com>
> > Signed-off-by: Flavio Leitner <fbl@redhat.com>
> > ---
> >  include/openvswitch/thread.h |  3 +++
> >  lib/dpif-netdev.c            | 33 ++++++++++++++++++++++-----------
> >  lib/seq.c                    | 15 ++++++++++++++-
> >  lib/seq.h                    |  3 +++
> >  4 files changed, 42 insertions(+), 12 deletions(-)
> >
> > diff --git a/include/openvswitch/thread.h b/include/openvswitch/thread.h
> > index af6f2bb..6d20720 100644
> > --- a/include/openvswitch/thread.h
> > +++ b/include/openvswitch/thread.h
> > @@ -44,6 +44,9 @@ struct OVS_LOCKABLE ovs_mutex {
> >  #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
> >  #endif
> >
> > +#define OVS_RECURSIVE_MUTEX_INITIALIZER \
> > +   { PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP, "<unlocked>" }
> > +
> >  /* ovs_mutex functions analogous to pthread_mutex_*() functions.
> >   *
> >   * Most of these functions abort the process with an error message on any
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index 0f2385a..a10b2d1 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -2668,12 +2668,15 @@ static void *
> >  pmd_thread_main(void *f_)
> >  {
> >      struct dp_netdev_pmd_thread *pmd = f_;
> > -    unsigned int lc = 0;
> > +    unsigned int lc_max = 1024;
> > +    unsigned int lc_start;
> > +    unsigned int lc;
> >      struct rxq_poll *poll_list;
> >      unsigned int port_seq = PMD_INITIAL_SEQ;
> >      int poll_cnt;
> >      int i;
> >
> > +    lc_start = 0;
> >      poll_cnt = 0;
> >      poll_list = NULL;
> >
> > @@ -2698,24 +2701,32 @@ reload:
> >       * reloading the updated configuration. */
> >      dp_netdev_pmd_reload_done(pmd);
> >
> > +    lc = lc_start;
> >      for (;;) {
> >          for (i = 0; i < poll_cnt; i++) {
> >              dp_netdev_process_rxq_port(pmd, poll_list[i].port,
> > poll_list[i].rx);
> >          }
> >
> > -        if (lc++ > 1024) {
> > -            unsigned int seq;
> > +        if (lc++ > lc_max) {
> > +            if (!seq_pmd_trylock()) {
> > +                unsigned int seq;
> > +                lc_start = 0;
> > +                lc = 0;
> >
> > -            lc = 0;
> > +                emc_cache_slow_sweep(&pmd->flow_cache);
> > +                coverage_try_clear();
> > +                ovsrcu_quiesce();
> >
> > -            emc_cache_slow_sweep(&pmd->flow_cache);
> > -            coverage_try_clear();
> > -            ovsrcu_quiesce();
> > +                seq_pmd_unlock();
> >
> > -            atomic_read_relaxed(&pmd->change_seq, &seq);
> > -            if (seq != port_seq) {
> > -                port_seq = seq;
> > -                break;
> > +                atomic_read_relaxed(&pmd->change_seq, &seq);
> > +                if (seq != port_seq) {
> > +                    port_seq = seq;
> > +                    break;
> > +                }
> > +            } else {
> > +                lc_start += (lc_start + lc_max)/2;
> > +                lc = lc_start;
> >              }
> >          }
> >      }
> > diff --git a/lib/seq.c b/lib/seq.c
> > index 9c3257c..198b2ce 100644
> > --- a/lib/seq.c
> > +++ b/lib/seq.c
> > @@ -55,7 +55,7 @@ struct seq_thread {
> >      bool waiting OVS_GUARDED;        /* True if latch_wait() already
> > called. */
> >  };
> >
> > -static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
> > +static struct ovs_mutex seq_mutex = OVS_RECURSIVE_MUTEX_INITIALIZER;
> >
> >  static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
> >
> > @@ -68,6 +68,19 @@ static void seq_thread_woke(struct seq_thread *)
> > OVS_REQUIRES(seq_mutex);
> >  static void seq_waiter_destroy(struct seq_waiter *)
> > OVS_REQUIRES(seq_mutex);
> >  static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
> >
> > +
> > +int seq_pmd_trylock(void)
> > +     OVS_TRY_LOCK(0, seq_mutex)
> > +{
> > +  return ovs_mutex_trylock(&seq_mutex);
> > +}
> > +
> > +void seq_pmd_unlock(void)
> > +    OVS_RELEASES(seq_mutex)
> > +{
> > +  ovs_mutex_unlock(&seq_mutex);
> > +}
> > +
> >  /* Creates and returns a new 'seq' object. */
> >  struct seq * OVS_EXCLUDED(seq_mutex)
> >  seq_create(void)
> > diff --git a/lib/seq.h b/lib/seq.h
> > index b0ec6bf..f6b6e1a 100644
> > --- a/lib/seq.h
> > +++ b/lib/seq.h
> > @@ -117,6 +117,9 @@
> >  #include <stdint.h>
> >  #include "util.h"
> >
> > +int seq_pmd_trylock(void);
> > +void seq_pmd_unlock(void);
> > +
> >  /* For implementation of an object with a sequence number attached. */
> >  struct seq *seq_create(void);
> >  void seq_destroy(struct seq *);
> > --
> > 2.5.0
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > http://openvswitch.org/mailman/listinfo/dev
> >  
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
Daniele Di Proietto March 29, 2016, 2:13 a.m. UTC | #3
Hi Flavio and Karl,

thanks for the patch! I have a couple of comments:

Can you point out a configuration where this is the bottleneck?
I'm interested in reproducing this.

I think the implementation would look simpler if we could
avoid explicitly taking the mutex in dpif-netdev and instead
having a ovsrcu_try_quiesce(). What do you think?

I think we can avoid the recursive mutex as well if we introduce
some explicit APIs in seq (seq_try_lock, seq_change_protected and
seq_unlock), but I'd like to understand the performance implication
of this commit first.

On 23/03/2016 20:54, "dev on behalf of Flavio Leitner"
<dev-bounces@openvswitch.org on behalf of fbl@redhat.com> wrote:

>The PMD thread needs to keep processing RX queues in order
>archive maximum throughput.  However, it also needs to run
>other tasks after some time processing the RX queues which
>a mutex can block the PMD thread.  That causes latency
>spikes and affects the throughput.
>
>Convert to recursive mutex so that PMD thread can test first
>and if it gets the lock, continue as before, otherwise try
>again another time.  There is an additional logic to make
>sure the PMD thread will try harder as the attempt to get
>the mutex continues to fail.
>
>Co-authored-by: Karl Rister <krister@redhat.com>
>Signed-off-by: Flavio Leitner <fbl@redhat.com>

Oh, we're going to need a signoff from Karl as well :-)

Thanks,

Daniele

>---
> include/openvswitch/thread.h |  3 +++
> lib/dpif-netdev.c            | 33 ++++++++++++++++++++++-----------
> lib/seq.c                    | 15 ++++++++++++++-
> lib/seq.h                    |  3 +++
> 4 files changed, 42 insertions(+), 12 deletions(-)
>
>diff --git a/include/openvswitch/thread.h b/include/openvswitch/thread.h
>index af6f2bb..6d20720 100644
>--- a/include/openvswitch/thread.h
>+++ b/include/openvswitch/thread.h
>@@ -44,6 +44,9 @@ struct OVS_LOCKABLE ovs_mutex {
> #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
> #endif
> 
>+#define OVS_RECURSIVE_MUTEX_INITIALIZER \
>+   { PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP, "<unlocked>" }
>+
> /* ovs_mutex functions analogous to pthread_mutex_*() functions.
>  *
>  * Most of these functions abort the process with an error message on any
>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>index 0f2385a..a10b2d1 100644
>--- a/lib/dpif-netdev.c
>+++ b/lib/dpif-netdev.c
>@@ -2668,12 +2668,15 @@ static void *
> pmd_thread_main(void *f_)
> {
>     struct dp_netdev_pmd_thread *pmd = f_;
>-    unsigned int lc = 0;
>+    unsigned int lc_max = 1024;
>+    unsigned int lc_start;
>+    unsigned int lc;
>     struct rxq_poll *poll_list;
>     unsigned int port_seq = PMD_INITIAL_SEQ;
>     int poll_cnt;
>     int i;
> 
>+    lc_start = 0;
>     poll_cnt = 0;
>     poll_list = NULL;
> 
>@@ -2698,24 +2701,32 @@ reload:
>      * reloading the updated configuration. */
>     dp_netdev_pmd_reload_done(pmd);
> 
>+    lc = lc_start;
>     for (;;) {
>         for (i = 0; i < poll_cnt; i++) {
>             dp_netdev_process_rxq_port(pmd, poll_list[i].port,
>poll_list[i].rx);
>         }
> 
>-        if (lc++ > 1024) {
>-            unsigned int seq;
>+        if (lc++ > lc_max) {
>+            if (!seq_pmd_trylock()) {
>+                unsigned int seq;
>+                lc_start = 0;
>+                lc = 0;
> 
>-            lc = 0;
>+                emc_cache_slow_sweep(&pmd->flow_cache);
>+                coverage_try_clear();
>+                ovsrcu_quiesce();
> 
>-            emc_cache_slow_sweep(&pmd->flow_cache);
>-            coverage_try_clear();
>-            ovsrcu_quiesce();
>+                seq_pmd_unlock();
> 
>-            atomic_read_relaxed(&pmd->change_seq, &seq);
>-            if (seq != port_seq) {
>-                port_seq = seq;
>-                break;
>+                atomic_read_relaxed(&pmd->change_seq, &seq);
>+                if (seq != port_seq) {
>+                    port_seq = seq;
>+                    break;
>+                }
>+            } else {
>+                lc_start += (lc_start + lc_max)/2;
>+                lc = lc_start;
>             }
>         }
>     }
>diff --git a/lib/seq.c b/lib/seq.c
>index 9c3257c..198b2ce 100644
>--- a/lib/seq.c
>+++ b/lib/seq.c
>@@ -55,7 +55,7 @@ struct seq_thread {
>     bool waiting OVS_GUARDED;        /* True if latch_wait() already
>called. */
> };
> 
>-static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
>+static struct ovs_mutex seq_mutex = OVS_RECURSIVE_MUTEX_INITIALIZER;
> 
> static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
> 
>@@ -68,6 +68,19 @@ static void seq_thread_woke(struct seq_thread *)
>OVS_REQUIRES(seq_mutex);
> static void seq_waiter_destroy(struct seq_waiter *)
>OVS_REQUIRES(seq_mutex);
> static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
> 
>+
>+int seq_pmd_trylock(void)
>+     OVS_TRY_LOCK(0, seq_mutex)
>+{
>+  return ovs_mutex_trylock(&seq_mutex);
>+}
>+
>+void seq_pmd_unlock(void)
>+    OVS_RELEASES(seq_mutex)
>+{
>+  ovs_mutex_unlock(&seq_mutex);
>+}
>+
> /* Creates and returns a new 'seq' object. */
> struct seq * OVS_EXCLUDED(seq_mutex)
> seq_create(void)
>diff --git a/lib/seq.h b/lib/seq.h
>index b0ec6bf..f6b6e1a 100644
>--- a/lib/seq.h
>+++ b/lib/seq.h
>@@ -117,6 +117,9 @@
> #include <stdint.h>
> #include "util.h"
> 
>+int seq_pmd_trylock(void);
>+void seq_pmd_unlock(void);
>+
> /* For implementation of an object with a sequence number attached. */
> struct seq *seq_create(void);
> void seq_destroy(struct seq *);
>-- 
>2.5.0
>
>_______________________________________________
>dev mailing list
>dev@openvswitch.org
>http://openvswitch.org/mailman/listinfo/dev
Flavio Leitner March 29, 2016, 1:08 p.m. UTC | #4
On Tue, Mar 29, 2016 at 02:13:18AM +0000, Daniele Di Proietto wrote:
> Hi Flavio and Karl,
> 
> thanks for the patch! I have a couple of comments:
> 
> Can you point out a configuration where this is the bottleneck?
> I'm interested in reproducing this.

Karl, since you did the tests, could you please provide more details?


> I think the implementation would look simpler if we could
> avoid explicitly taking the mutex in dpif-netdev and instead
> having a ovsrcu_try_quiesce(). What do you think?

My concern is that it is freeing one entry from EMC each round
and it should quiesce to allow the callbacks to run.  If, for
some reason, it fails to quiesce for a long period, then it might
backlog a significant number of entries.


> I think we can avoid the recursive mutex as well if we introduce
> some explicit APIs in seq (seq_try_lock, seq_change_protected and
> seq_unlock), but I'd like to understand the performance implication
> of this commit first.

The issue is the latency spike when the PMD thread blocks on the
busy mutex.

The goal with recursive locking is to make sure we can sweep
the EMC cache and quiesce without blocking.  Fixing seq API
would help to not block, but then we have no control to whether
we did both tasks in the same round.

fbl


> 
> On 23/03/2016 20:54, "dev on behalf of Flavio Leitner"
> <dev-bounces@openvswitch.org on behalf of fbl@redhat.com> wrote:
> 
> >The PMD thread needs to keep processing RX queues in order
> >archive maximum throughput.  However, it also needs to run
> >other tasks after some time processing the RX queues which
> >a mutex can block the PMD thread.  That causes latency
> >spikes and affects the throughput.
> >
> >Convert to recursive mutex so that PMD thread can test first
> >and if it gets the lock, continue as before, otherwise try
> >again another time.  There is an additional logic to make
> >sure the PMD thread will try harder as the attempt to get
> >the mutex continues to fail.
> >
> >Co-authored-by: Karl Rister <krister@redhat.com>
> >Signed-off-by: Flavio Leitner <fbl@redhat.com>
> 
> Oh, we're going to need a signoff from Karl as well :-)
> 
> Thanks,
> 
> Daniele
> 
> >---
> > include/openvswitch/thread.h |  3 +++
> > lib/dpif-netdev.c            | 33 ++++++++++++++++++++++-----------
> > lib/seq.c                    | 15 ++++++++++++++-
> > lib/seq.h                    |  3 +++
> > 4 files changed, 42 insertions(+), 12 deletions(-)
> >
> >diff --git a/include/openvswitch/thread.h b/include/openvswitch/thread.h
> >index af6f2bb..6d20720 100644
> >--- a/include/openvswitch/thread.h
> >+++ b/include/openvswitch/thread.h
> >@@ -44,6 +44,9 @@ struct OVS_LOCKABLE ovs_mutex {
> > #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
> > #endif
> > 
> >+#define OVS_RECURSIVE_MUTEX_INITIALIZER \
> >+   { PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP, "<unlocked>" }
> >+
> > /* ovs_mutex functions analogous to pthread_mutex_*() functions.
> >  *
> >  * Most of these functions abort the process with an error message on any
> >diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> >index 0f2385a..a10b2d1 100644
> >--- a/lib/dpif-netdev.c
> >+++ b/lib/dpif-netdev.c
> >@@ -2668,12 +2668,15 @@ static void *
> > pmd_thread_main(void *f_)
> > {
> >     struct dp_netdev_pmd_thread *pmd = f_;
> >-    unsigned int lc = 0;
> >+    unsigned int lc_max = 1024;
> >+    unsigned int lc_start;
> >+    unsigned int lc;
> >     struct rxq_poll *poll_list;
> >     unsigned int port_seq = PMD_INITIAL_SEQ;
> >     int poll_cnt;
> >     int i;
> > 
> >+    lc_start = 0;
> >     poll_cnt = 0;
> >     poll_list = NULL;
> > 
> >@@ -2698,24 +2701,32 @@ reload:
> >      * reloading the updated configuration. */
> >     dp_netdev_pmd_reload_done(pmd);
> > 
> >+    lc = lc_start;
> >     for (;;) {
> >         for (i = 0; i < poll_cnt; i++) {
> >             dp_netdev_process_rxq_port(pmd, poll_list[i].port,
> >poll_list[i].rx);
> >         }
> > 
> >-        if (lc++ > 1024) {
> >-            unsigned int seq;
> >+        if (lc++ > lc_max) {
> >+            if (!seq_pmd_trylock()) {
> >+                unsigned int seq;
> >+                lc_start = 0;
> >+                lc = 0;
> > 
> >-            lc = 0;
> >+                emc_cache_slow_sweep(&pmd->flow_cache);
> >+                coverage_try_clear();
> >+                ovsrcu_quiesce();
> > 
> >-            emc_cache_slow_sweep(&pmd->flow_cache);
> >-            coverage_try_clear();
> >-            ovsrcu_quiesce();
> >+                seq_pmd_unlock();
> > 
> >-            atomic_read_relaxed(&pmd->change_seq, &seq);
> >-            if (seq != port_seq) {
> >-                port_seq = seq;
> >-                break;
> >+                atomic_read_relaxed(&pmd->change_seq, &seq);
> >+                if (seq != port_seq) {
> >+                    port_seq = seq;
> >+                    break;
> >+                }
> >+            } else {
> >+                lc_start += (lc_start + lc_max)/2;
> >+                lc = lc_start;
> >             }
> >         }
> >     }
> >diff --git a/lib/seq.c b/lib/seq.c
> >index 9c3257c..198b2ce 100644
> >--- a/lib/seq.c
> >+++ b/lib/seq.c
> >@@ -55,7 +55,7 @@ struct seq_thread {
> >     bool waiting OVS_GUARDED;        /* True if latch_wait() already
> >called. */
> > };
> > 
> >-static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
> >+static struct ovs_mutex seq_mutex = OVS_RECURSIVE_MUTEX_INITIALIZER;
> > 
> > static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
> > 
> >@@ -68,6 +68,19 @@ static void seq_thread_woke(struct seq_thread *)
> >OVS_REQUIRES(seq_mutex);
> > static void seq_waiter_destroy(struct seq_waiter *)
> >OVS_REQUIRES(seq_mutex);
> > static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
> > 
> >+
> >+int seq_pmd_trylock(void)
> >+     OVS_TRY_LOCK(0, seq_mutex)
> >+{
> >+  return ovs_mutex_trylock(&seq_mutex);
> >+}
> >+
> >+void seq_pmd_unlock(void)
> >+    OVS_RELEASES(seq_mutex)
> >+{
> >+  ovs_mutex_unlock(&seq_mutex);
> >+}
> >+
> > /* Creates and returns a new 'seq' object. */
> > struct seq * OVS_EXCLUDED(seq_mutex)
> > seq_create(void)
> >diff --git a/lib/seq.h b/lib/seq.h
> >index b0ec6bf..f6b6e1a 100644
> >--- a/lib/seq.h
> >+++ b/lib/seq.h
> >@@ -117,6 +117,9 @@
> > #include <stdint.h>
> > #include "util.h"
> > 
> >+int seq_pmd_trylock(void);
> >+void seq_pmd_unlock(void);
> >+
> > /* For implementation of an object with a sequence number attached. */
> > struct seq *seq_create(void);
> > void seq_destroy(struct seq *);
> >-- 
> >2.5.0
> >
> >_______________________________________________
> >dev mailing list
> >dev@openvswitch.org
> >http://openvswitch.org/mailman/listinfo/dev
>
Karl Rister March 29, 2016, 1:44 p.m. UTC | #5
On 03/29/2016 08:08 AM, Flavio Leitner wrote:
> On Tue, Mar 29, 2016 at 02:13:18AM +0000, Daniele Di Proietto wrote:
>> Hi Flavio and Karl,
>>
>> thanks for the patch! I have a couple of comments:
>>
>> Can you point out a configuration where this is the bottleneck?
>> I'm interested in reproducing this.
> 
> Karl, since you did the tests, could you please provide more details?

When performing packet forwarding latency tests, I first noticed system
and idle time when looking at CPU statistics when I expected the PMD
threads to be 100% in userspace.  I used the kernel ftrace facility to
track down what was happening and saw that the PMD thread was being
context switched out and going idle.  The PMD thread was pinned to a CPU
core isolated with isolcpus, so there was no competing task that
could be scheduled to cause the context switch, and I would not expect
the polling thread to ever go idle.  Further analysis with ftrace and gdb
tracked the cause to seq_mutex blocking when another task held the mutex.

I would estimate that this change removed packet latency spikes of 35-45
usecs in our test scenario.

The test is forwarding packets through a KVM guest using OVS+DPDK in the
host and the DPDK testpmd application in the guest.

Flavio, I thought I remembered you also saying that you saw a throughput
improvement in a test you were running?

> 
> 
>> I think the implementation would look simpler if we could
>> avoid explicitly taking the mutex in dpif-netdev and instead
>> having a ovsrcu_try_quiesce(). What do you think?
> 
> My concern is that it is freeing one entry from EMC each round
> and it should quiesce to allow the callbacks to run.  If, for
> some reason, it fails to quiesce for a long period, then it might
> backlog a significant number of entries.

My initial approach, which Flavio's code is very similar to, was simply
trying to provide the simplest change to achieve what I was looking for.
 I could certainly see alternative solutions being more appropriate.

> 
> 
>> I think we can avoid the recursive mutex as well if we introduce
>> some explicit APIs in seq (seq_try_lock, seq_change_protected and
>> seq_unlock), but I'd like to understand the performance implication
>> of this commit first.

One other area of the sequence code that I thought was curious was a
single mutex that covered all sequences.  If updating the API is a
possibility I would think going to a mutex per sequence might be an
appropriate change as well.  That said, I don't have data that
specifically points this out as a problem.

> 
> The issue is the latency spike when the PMD thread blocks on the
> busy mutex.
> 
> The goal with recursive locking is to make sure we can sweep
> the EMC cache and quiesce without blocking.  Fixing seq API
> would help to not block, but then we have no control to whether
> we did both tasks in the same round.
> 
> fbl
> 
> 
>>
>> On 23/03/2016 20:54, "dev on behalf of Flavio Leitner"
>> <dev-bounces@openvswitch.org on behalf of fbl@redhat.com> wrote:
>>
>>> The PMD thread needs to keep processing RX queues in order
>>> archive maximum throughput.  However, it also needs to run
>>> other tasks after some time processing the RX queues which
>>> a mutex can block the PMD thread.  That causes latency
>>> spikes and affects the throughput.
>>>
>>> Convert to recursive mutex so that PMD thread can test first
>>> and if it gets the lock, continue as before, otherwise try
>>> again another time.  There is an additional logic to make
>>> sure the PMD thread will try harder as the attempt to get
>>> the mutex continues to fail.
>>>
>>> Co-authored-by: Karl Rister <krister@redhat.com>
>>> Signed-off-by: Flavio Leitner <fbl@redhat.com>
>>
>> Oh, we're going to need a signoff from Karl as well :-)

Signed-off-by: Karl Rister <krister@redhat.com>

Is this good enough?

>>
>> Thanks,
>>
>> Daniele
>>
>>> ---
>>> include/openvswitch/thread.h |  3 +++
>>> lib/dpif-netdev.c            | 33 ++++++++++++++++++++++-----------
>>> lib/seq.c                    | 15 ++++++++++++++-
>>> lib/seq.h                    |  3 +++
>>> 4 files changed, 42 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/include/openvswitch/thread.h b/include/openvswitch/thread.h
>>> index af6f2bb..6d20720 100644
>>> --- a/include/openvswitch/thread.h
>>> +++ b/include/openvswitch/thread.h
>>> @@ -44,6 +44,9 @@ struct OVS_LOCKABLE ovs_mutex {
>>> #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
>>> #endif
>>>
>>> +#define OVS_RECURSIVE_MUTEX_INITIALIZER \
>>> +   { PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP, "<unlocked>" }
>>> +
>>> /* ovs_mutex functions analogous to pthread_mutex_*() functions.
>>>  *
>>>  * Most of these functions abort the process with an error message on any
>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>> index 0f2385a..a10b2d1 100644
>>> --- a/lib/dpif-netdev.c
>>> +++ b/lib/dpif-netdev.c
>>> @@ -2668,12 +2668,15 @@ static void *
>>> pmd_thread_main(void *f_)
>>> {
>>>     struct dp_netdev_pmd_thread *pmd = f_;
>>> -    unsigned int lc = 0;
>>> +    unsigned int lc_max = 1024;
>>> +    unsigned int lc_start;
>>> +    unsigned int lc;
>>>     struct rxq_poll *poll_list;
>>>     unsigned int port_seq = PMD_INITIAL_SEQ;
>>>     int poll_cnt;
>>>     int i;
>>>
>>> +    lc_start = 0;
>>>     poll_cnt = 0;
>>>     poll_list = NULL;
>>>
>>> @@ -2698,24 +2701,32 @@ reload:
>>>      * reloading the updated configuration. */
>>>     dp_netdev_pmd_reload_done(pmd);
>>>
>>> +    lc = lc_start;
>>>     for (;;) {
>>>         for (i = 0; i < poll_cnt; i++) {
>>>             dp_netdev_process_rxq_port(pmd, poll_list[i].port,
>>> poll_list[i].rx);
>>>         }
>>>
>>> -        if (lc++ > 1024) {
>>> -            unsigned int seq;
>>> +        if (lc++ > lc_max) {
>>> +            if (!seq_pmd_trylock()) {
>>> +                unsigned int seq;
>>> +                lc_start = 0;
>>> +                lc = 0;
>>>
>>> -            lc = 0;
>>> +                emc_cache_slow_sweep(&pmd->flow_cache);
>>> +                coverage_try_clear();
>>> +                ovsrcu_quiesce();
>>>
>>> -            emc_cache_slow_sweep(&pmd->flow_cache);
>>> -            coverage_try_clear();
>>> -            ovsrcu_quiesce();
>>> +                seq_pmd_unlock();
>>>
>>> -            atomic_read_relaxed(&pmd->change_seq, &seq);
>>> -            if (seq != port_seq) {
>>> -                port_seq = seq;
>>> -                break;
>>> +                atomic_read_relaxed(&pmd->change_seq, &seq);
>>> +                if (seq != port_seq) {
>>> +                    port_seq = seq;
>>> +                    break;
>>> +                }
>>> +            } else {
>>> +                lc_start += (lc_start + lc_max)/2;
>>> +                lc = lc_start;
>>>             }
>>>         }
>>>     }
>>> diff --git a/lib/seq.c b/lib/seq.c
>>> index 9c3257c..198b2ce 100644
>>> --- a/lib/seq.c
>>> +++ b/lib/seq.c
>>> @@ -55,7 +55,7 @@ struct seq_thread {
>>>     bool waiting OVS_GUARDED;        /* True if latch_wait() already
>>> called. */
>>> };
>>>
>>> -static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
>>> +static struct ovs_mutex seq_mutex = OVS_RECURSIVE_MUTEX_INITIALIZER;
>>>
>>> static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
>>>
>>> @@ -68,6 +68,19 @@ static void seq_thread_woke(struct seq_thread *)
>>> OVS_REQUIRES(seq_mutex);
>>> static void seq_waiter_destroy(struct seq_waiter *)
>>> OVS_REQUIRES(seq_mutex);
>>> static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>>>
>>> +
>>> +int seq_pmd_trylock(void)
>>> +     OVS_TRY_LOCK(0, seq_mutex)
>>> +{
>>> +  return ovs_mutex_trylock(&seq_mutex);
>>> +}
>>> +
>>> +void seq_pmd_unlock(void)
>>> +    OVS_RELEASES(seq_mutex)
>>> +{
>>> +  ovs_mutex_unlock(&seq_mutex);
>>> +}
>>> +
>>> /* Creates and returns a new 'seq' object. */
>>> struct seq * OVS_EXCLUDED(seq_mutex)
>>> seq_create(void)
>>> diff --git a/lib/seq.h b/lib/seq.h
>>> index b0ec6bf..f6b6e1a 100644
>>> --- a/lib/seq.h
>>> +++ b/lib/seq.h
>>> @@ -117,6 +117,9 @@
>>> #include <stdint.h>
>>> #include "util.h"
>>>
>>> +int seq_pmd_trylock(void);
>>> +void seq_pmd_unlock(void);
>>> +
>>> /* For implementation of an object with a sequence number attached. */
>>> struct seq *seq_create(void);
>>> void seq_destroy(struct seq *);
>>> -- 
>>> 2.5.0
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> http://openvswitch.org/mailman/listinfo/dev
>>
>
Daniele Di Proietto March 30, 2016, 1:53 a.m. UTC | #6
On 29/03/2016 06:08, "Flavio Leitner" <fbl@redhat.com> wrote:

>On Tue, Mar 29, 2016 at 02:13:18AM +0000, Daniele Di Proietto wrote:
>> Hi Flavio and Karl,
>> 
>> thanks for the patch! I have a couple of comments:
>> 
>> Can you point out a configuration where this is the bottleneck?
>> I'm interested in reproducing this.
>
>Karl, since you did the tests, could you please provide more details?
>
>
>> I think the implementation would look simpler if we could
>> avoid explicitly taking the mutex in dpif-netdev and instead
>> having a ovsrcu_try_quiesce(). What do you think?
>
>My concern is that it is freeing one entry from EMC each round
>and it should quiesce to allow the callbacks to run.  If, for
>some reason, it fails to quiesce for a long period, then it might
>backlog a significant number of entries.
>
>
>> I think we can avoid the recursive mutex as well if we introduce
>> some explicit APIs in seq (seq_try_lock, seq_change_protected and
>> seq_unlock), but I'd like to understand the performance implication
>> of this commit first.
>
>The issue is the latency spike when the PMD thread blocks on the
>busy mutex.
>
>The goal with recursive locking is to make sure we can sweep
>the EMC cache and quiesce without blocking.  Fixing seq API
>would help to not block, but then we have no control to whether
>we did both tasks in the same round.
>
>fbl

If I understand your concerns correctly, I think we can have something
like:

if (ovsrcu_try_quiesce()) {
    ...
    emc_cache_slow_sweep();
    ...
}

Sure, the swept flows will need to wait another round to actually get freed,
but I think this is ok.
Daniele Di Proietto March 30, 2016, 3:20 a.m. UTC | #7
On 29/03/2016 06:44, "Karl Rister" <krister@redhat.com> wrote:

>On 03/29/2016 08:08 AM, Flavio Leitner wrote:
>> On Tue, Mar 29, 2016 at 02:13:18AM +0000, Daniele Di Proietto wrote:
>>> Hi Flavio and Karl,
>>>
>>> thanks for the patch! I have a couple of comments:
>>>
>>> Can you point out a configuration where this is the bottleneck?
>>> I'm interested in reproducing this.
>> 
>> Karl, since you did the tests, could you please provide more details?
>
>When performing packet forwarding latency tests, I first noticed system
>and idle time when looking at CPU statistics when I expected the PMD
>threads to be 100% in userspace.  I used the kernel ftrace facility to
>track down what was happening and saw that the PMD thread was being
>context switched out and going idle.  The PMD thread was pinned to CPU
>core thread isolated with isolcpus so there was no competing task that
>could be scheduled to cause the context switch and I would not expect
>the polling thread to ever go idle.  Further analysis with frace and gdb
>tracked the cause to seq_mutex blocking when another task held the mutex.
>
>I would estimate that this change removed packet latency spikes of 35-45
>usecs in our test scenario.
>
>The test is forwarding packets through a KVM guest using OVS+DPDK in the
>host and the DPDK testpmd application in the guest.

Thanks for the explanation

>
>Flavio, I thought I remembered you also saying that you saw a throughput
>improvement in a test you were running?
>
>> 
>> 
>>> I think the implementation would look simpler if we could
>>> avoid explicitly taking the mutex in dpif-netdev and instead
>>> having a ovsrcu_try_quiesce(). What do you think?
>> 
>> My concern is that it is freeing one entry from EMC each round
>> and it should quiesce to allow the callbacks to run.  If, for
>> some reason, it fails to quiesce for a long period, then it might
>> backlog a significant number of entries.
>
>My initial approach, which Flavio's code is very similar to, was simply
>trying to provide the simplest change to achieve what I was looking for.
> I could certainly see alternative solutions being more appropriate.
>
>> 
>> 
>>> I think we can avoid the recursive mutex as well if we introduce
>>> some explicit APIs in seq (seq_try_lock, seq_change_protected and
>>> seq_unlock), but I'd like to understand the performance implication
>>> of this commit first.
>
>One other area of the sequence code that I thought was curious was a
>single mutex that covered all sequences.  If updating the API is a
>possibility I would think going to a mutex per sequence might be an
>appropriate change as well.  That said, I don't have data that
>specifically points this out as a problem.

If we find this to be a bottleneck I think we can have finer-grained
locking.

>
>> 
>> The issue is the latency spike when the PMD thread blocks on the
>> busy mutex.
>> 
>> The goal with recursive locking is to make sure we can sweep
>> the EMC cache and quiesce without blocking.  Fixing seq API
>> would help to not block, but then we have no control to whether
>> we did both tasks in the same round.
>> 
>> fbl
>> 
>> 
>>>
>>> On 23/03/2016 20:54, "dev on behalf of Flavio Leitner"
>>> <dev-bounces@openvswitch.org on behalf of fbl@redhat.com> wrote:
>>>
>>>> The PMD thread needs to keep processing RX queues in order
>>>> archive maximum throughput.  However, it also needs to run
>>>> other tasks after some time processing the RX queues which
>>>> a mutex can block the PMD thread.  That causes latency
>>>> spikes and affects the throughput.
>>>>
>>>> Convert to recursive mutex so that PMD thread can test first
>>>> and if it gets the lock, continue as before, otherwise try
>>>> again another time.  There is an additional logic to make
>>>> sure the PMD thread will try harder as the attempt to get
>>>> the mutex continues to fail.
>>>>
>>>> Co-authored-by: Karl Rister <krister@redhat.com>
>>>> Signed-off-by: Flavio Leitner <fbl@redhat.com>
>>>
>>> Oh, we're going to need a signoff from Karl as well :-)
>
>Signed-off-by: Karl Rister <krister@redhat.com>
>
>Is this good enough?

Absolutely, thanks!

>
>>>
>>> Thanks,
>>>
>>> Daniele
>>>
>>>> ---
>>>> include/openvswitch/thread.h |  3 +++
>>>> lib/dpif-netdev.c            | 33 ++++++++++++++++++++++-----------
>>>> lib/seq.c                    | 15 ++++++++++++++-
>>>> lib/seq.h                    |  3 +++
>>>> 4 files changed, 42 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/include/openvswitch/thread.h
>>>>b/include/openvswitch/thread.h
>>>> index af6f2bb..6d20720 100644
>>>> --- a/include/openvswitch/thread.h
>>>> +++ b/include/openvswitch/thread.h
>>>> @@ -44,6 +44,9 @@ struct OVS_LOCKABLE ovs_mutex {
>>>> #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
>>>> #endif
>>>>
>>>> +#define OVS_RECURSIVE_MUTEX_INITIALIZER \
>>>> +   { PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP, "<unlocked>" }
>>>> +
>>>> /* ovs_mutex functions analogous to pthread_mutex_*() functions.
>>>>  *
>>>>  * Most of these functions abort the process with an error message on
>>>>any
>>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>>> index 0f2385a..a10b2d1 100644
>>>> --- a/lib/dpif-netdev.c
>>>> +++ b/lib/dpif-netdev.c
>>>> @@ -2668,12 +2668,15 @@ static void *
>>>> pmd_thread_main(void *f_)
>>>> {
>>>>     struct dp_netdev_pmd_thread *pmd = f_;
>>>> -    unsigned int lc = 0;
>>>> +    unsigned int lc_max = 1024;
>>>> +    unsigned int lc_start;
>>>> +    unsigned int lc;
>>>>     struct rxq_poll *poll_list;
>>>>     unsigned int port_seq = PMD_INITIAL_SEQ;
>>>>     int poll_cnt;
>>>>     int i;
>>>>
>>>> +    lc_start = 0;
>>>>     poll_cnt = 0;
>>>>     poll_list = NULL;
>>>>
>>>> @@ -2698,24 +2701,32 @@ reload:
>>>>      * reloading the updated configuration. */
>>>>     dp_netdev_pmd_reload_done(pmd);
>>>>
>>>> +    lc = lc_start;
>>>>     for (;;) {
>>>>         for (i = 0; i < poll_cnt; i++) {
>>>>             dp_netdev_process_rxq_port(pmd, poll_list[i].port,
>>>> poll_list[i].rx);
>>>>         }
>>>>
>>>> -        if (lc++ > 1024) {
>>>> -            unsigned int seq;
>>>> +        if (lc++ > lc_max) {
>>>> +            if (!seq_pmd_trylock()) {
>>>> +                unsigned int seq;
>>>> +                lc_start = 0;
>>>> +                lc = 0;
>>>>
>>>> -            lc = 0;
>>>> +                emc_cache_slow_sweep(&pmd->flow_cache);
>>>> +                coverage_try_clear();
>>>> +                ovsrcu_quiesce();
>>>>
>>>> -            emc_cache_slow_sweep(&pmd->flow_cache);
>>>> -            coverage_try_clear();
>>>> -            ovsrcu_quiesce();
>>>> +                seq_pmd_unlock();
>>>>
>>>> -            atomic_read_relaxed(&pmd->change_seq, &seq);
>>>> -            if (seq != port_seq) {
>>>> -                port_seq = seq;
>>>> -                break;
>>>> +                atomic_read_relaxed(&pmd->change_seq, &seq);
>>>> +                if (seq != port_seq) {
>>>> +                    port_seq = seq;
>>>> +                    break;
>>>> +                }
>>>> +            } else {
>>>> +                lc_start += (lc_start + lc_max)/2;
>>>> +                lc = lc_start;
>>>>             }
>>>>         }
>>>>     }
>>>> diff --git a/lib/seq.c b/lib/seq.c
>>>> index 9c3257c..198b2ce 100644
>>>> --- a/lib/seq.c
>>>> +++ b/lib/seq.c
>>>> @@ -55,7 +55,7 @@ struct seq_thread {
>>>>     bool waiting OVS_GUARDED;        /* True if latch_wait() already
>>>> called. */
>>>> };
>>>>
>>>> -static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
>>>> +static struct ovs_mutex seq_mutex = OVS_RECURSIVE_MUTEX_INITIALIZER;
>>>>
>>>> static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
>>>>
>>>> @@ -68,6 +68,19 @@ static void seq_thread_woke(struct seq_thread *)
>>>> OVS_REQUIRES(seq_mutex);
>>>> static void seq_waiter_destroy(struct seq_waiter *)
>>>> OVS_REQUIRES(seq_mutex);
>>>> static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
>>>>
>>>> +
>>>> +int seq_pmd_trylock(void)
>>>> +     OVS_TRY_LOCK(0, seq_mutex)
>>>> +{
>>>> +  return ovs_mutex_trylock(&seq_mutex);
>>>> +}
>>>> +
>>>> +void seq_pmd_unlock(void)
>>>> +    OVS_RELEASES(seq_mutex)
>>>> +{
>>>> +  ovs_mutex_unlock(&seq_mutex);
>>>> +}
>>>> +
>>>> /* Creates and returns a new 'seq' object. */
>>>> struct seq * OVS_EXCLUDED(seq_mutex)
>>>> seq_create(void)
>>>> diff --git a/lib/seq.h b/lib/seq.h
>>>> index b0ec6bf..f6b6e1a 100644
>>>> --- a/lib/seq.h
>>>> +++ b/lib/seq.h
>>>> @@ -117,6 +117,9 @@
>>>> #include <stdint.h>
>>>> #include "util.h"
>>>>
>>>> +int seq_pmd_trylock(void);
>>>> +void seq_pmd_unlock(void);
>>>> +
>>>> /* For implementation of an object with a sequence number attached. */
>>>> struct seq *seq_create(void);
>>>> void seq_destroy(struct seq *);
>>>> -- 
>>>> 2.5.0
>>>>
>>>> _______________________________________________
>>>> dev mailing list
>>>> dev@openvswitch.org
>>>> http://openvswitch.org/mailman/listinfo/dev
>>>
>> 
>
>
>-- 
>Karl Rister <krister@redhat.com>
Flavio Leitner March 30, 2016, 2:15 p.m. UTC | #8
On Wed, Mar 30, 2016 at 01:53:55AM +0000, Daniele Di Proietto wrote:
> 
> 
> On 29/03/2016 06:08, "Flavio Leitner" <fbl@redhat.com> wrote:
> 
> >On Tue, Mar 29, 2016 at 02:13:18AM +0000, Daniele Di Proietto wrote:
> >> Hi Flavio and Karl,
> >> 
> >> thanks for the patch! I have a couple of comments:
> >> 
> >> Can you point out a configuration where this is the bottleneck?
> >> I'm interested in reproducing this.
> >
> >Karl, since you did the tests, could you please provide more details?
> >
> >
> >> I think the implementation would look simpler if we could
> >> avoid explicitly taking the mutex in dpif-netdev and instead
> >> having a ovsrcu_try_quiesce(). What do you think?
> >
> >My concern is that it is freeing one entry from EMC each round
> >and it should quiesce to allow the callbacks to run.  If, for
> >some reason, it fails to quiesce for a long period, then it might
> >backlog a significant number of entries.
> >
> >
> >> I think we can avoid the recursive mutex as well if we introduce
> >> some explicit APIs in seq (seq_try_lock, seq_change_protected and
> >> seq_unlock), but I'd like to understand the performance implication
> >> of this commit first.
> >
> >The issue is the latency spike when the PMD thread blocks on the
> >busy mutex.
> >
> >The goal with recursive locking is to make sure we can sweep
> >the EMC cache and quiesce without blocking.  Fixing seq API
> >would help to not block, but then we have no control to whether
> >we did both tasks in the same round.
> >
> >fbl
> 
> If I understand your concerns correctly, I think we can have something
> like:
> 
> if (ovsrcu_try_quiesce()) {
>     ...
>     emc_cache_slow_sweep();
>     ...
> }
> 
> Sure, the swept flows will need to wait another round to actually get
> freed,
> but I think this is ok

I was trying to make sure both tasks were executed in a logical order,
so you first sweep and then you quiesce, but yeah, it would be fine to
leave an entry for the next round.  It seems a reasonable price to
avoid recursive locks.

I will try expanding seq API to see how it goes.

Thanks for the review Daniele!
Ben Pfaff March 30, 2016, 6:31 p.m. UTC | #9
On Wed, Mar 30, 2016 at 03:20:33AM +0000, Daniele Di Proietto wrote:
> On 29/03/2016 06:44, "Karl Rister" <krister@redhat.com> wrote:
> >One other area of the sequence code that I thought was curious was a
> >single mutex that covered all sequences.  If updating the API is a
> >possibility I would think going to a mutex per sequence might be an
> >appropriate change as well.  That said, I don't have data that
> >specifically points this out as a problem.
> 
> If we find this to be a bottleneck I think we can have a finer-grained
> locking.

I designed the seq code to be really simple and figured that we might
need to revisit it one day.  I'd be happy to see it improved.
Flavio Leitner May 4, 2016, 8:16 p.m. UTC | #10
On Wed, Mar 30, 2016 at 01:53:55AM +0000, Daniele Di Proietto wrote:
> 
> 
> On 29/03/2016 06:08, "Flavio Leitner" <fbl@redhat.com> wrote:
> 
> >On Tue, Mar 29, 2016 at 02:13:18AM +0000, Daniele Di Proietto wrote:
> >> Hi Flavio and Karl,
> >> 
> >> thanks for the patch! I have a couple of comments:
> >> 
> >> Can you point out a configuration where this is the bottleneck?
> >> I'm interested in reproducing this.
> >
> >Karl, since you did the tests, could you please provide more details?
> >
> >
> >> I think the implementation would look simpler if we could
> >> avoid explicitly taking the mutex in dpif-netdev and instead
> >> having a ovsrcu_try_quiesce(). What do you think?
> >
> >My concern is that it is freeing one entry from EMC each round
> >and it should quiesce to allow the callbacks to run.  If, for
> >some reason, it fails to quiesce for a long period, then it might
> >backlog a significant number of entries.
> >
> >
> >> I think we can avoid the recursive mutex as well if we introduce
> >> some explicit APIs in seq (seq_try_lock, seq_change_protected and
> >> seq_unlock), but I'd like to understand the performance implication
> >> of this commit first.
> >
> >The issue is the latency spike when the PMD thread blocks on the
> >busy mutex.
> >
> >The goal with recursive locking is to make sure we can sweep
> >the EMC cache and quiesce without blocking.  Fixing seq API
> >would help to not block, but then we have no control to whether
> >we did both tasks in the same round.
> >
> >fbl
> 
> If I understand your concerns correctly, I think we can have something
> like:
> 
> if (ovsrcu_try_quiesce()) {
>     ...
>     emc_cache_slow_sweep();
>     ...
> }
> 
> Sure, the swept flows will need to wait another round to actually get
> freed,
> but I think this is ok

I posted v2 extending seq API.
http://openvswitch.org/pipermail/dev/2016-May/070486.html
diff mbox

Patch

diff --git a/include/openvswitch/thread.h b/include/openvswitch/thread.h
index af6f2bb..6d20720 100644
--- a/include/openvswitch/thread.h
+++ b/include/openvswitch/thread.h
@@ -44,6 +44,9 @@  struct OVS_LOCKABLE ovs_mutex {
 #define OVS_ADAPTIVE_MUTEX_INITIALIZER OVS_MUTEX_INITIALIZER
 #endif
 
+#define OVS_RECURSIVE_MUTEX_INITIALIZER \
+   { PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP, "<unlocked>" }
+
 /* ovs_mutex functions analogous to pthread_mutex_*() functions.
  *
  * Most of these functions abort the process with an error message on any
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 0f2385a..a10b2d1 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2668,12 +2668,15 @@  static void *
 pmd_thread_main(void *f_)
 {
     struct dp_netdev_pmd_thread *pmd = f_;
-    unsigned int lc = 0;
+    unsigned int lc_max = 1024;
+    unsigned int lc_start;
+    unsigned int lc;
     struct rxq_poll *poll_list;
     unsigned int port_seq = PMD_INITIAL_SEQ;
     int poll_cnt;
     int i;
 
+    lc_start = 0;
     poll_cnt = 0;
     poll_list = NULL;
 
@@ -2698,24 +2701,32 @@  reload:
      * reloading the updated configuration. */
     dp_netdev_pmd_reload_done(pmd);
 
+    lc = lc_start;
     for (;;) {
         for (i = 0; i < poll_cnt; i++) {
             dp_netdev_process_rxq_port(pmd, poll_list[i].port, poll_list[i].rx);
         }
 
-        if (lc++ > 1024) {
-            unsigned int seq;
+        if (lc++ > lc_max) {
+            if (!seq_pmd_trylock()) {
+                unsigned int seq;
+                lc_start = 0;
+                lc = 0;
 
-            lc = 0;
+                emc_cache_slow_sweep(&pmd->flow_cache);
+                coverage_try_clear();
+                ovsrcu_quiesce();
 
-            emc_cache_slow_sweep(&pmd->flow_cache);
-            coverage_try_clear();
-            ovsrcu_quiesce();
+                seq_pmd_unlock();
 
-            atomic_read_relaxed(&pmd->change_seq, &seq);
-            if (seq != port_seq) {
-                port_seq = seq;
-                break;
+                atomic_read_relaxed(&pmd->change_seq, &seq);
+                if (seq != port_seq) {
+                    port_seq = seq;
+                    break;
+                }
+            } else {
+                lc_start += (lc_start + lc_max)/2;
+                lc = lc_start;
             }
         }
     }
diff --git a/lib/seq.c b/lib/seq.c
index 9c3257c..198b2ce 100644
--- a/lib/seq.c
+++ b/lib/seq.c
@@ -55,7 +55,7 @@  struct seq_thread {
     bool waiting OVS_GUARDED;        /* True if latch_wait() already called. */
 };
 
-static struct ovs_mutex seq_mutex = OVS_MUTEX_INITIALIZER;
+static struct ovs_mutex seq_mutex = OVS_RECURSIVE_MUTEX_INITIALIZER;
 
 static uint64_t seq_next OVS_GUARDED_BY(seq_mutex) = 1;
 
@@ -68,6 +68,19 @@  static void seq_thread_woke(struct seq_thread *) OVS_REQUIRES(seq_mutex);
 static void seq_waiter_destroy(struct seq_waiter *) OVS_REQUIRES(seq_mutex);
 static void seq_wake_waiters(struct seq *) OVS_REQUIRES(seq_mutex);
 
+
+int seq_pmd_trylock(void)
+     OVS_TRY_LOCK(0, seq_mutex)
+{
+  return ovs_mutex_trylock(&seq_mutex);
+}
+
+void seq_pmd_unlock(void)
+    OVS_RELEASES(seq_mutex)
+{
+  ovs_mutex_unlock(&seq_mutex);
+}
+
 /* Creates and returns a new 'seq' object. */
 struct seq * OVS_EXCLUDED(seq_mutex)
 seq_create(void)
diff --git a/lib/seq.h b/lib/seq.h
index b0ec6bf..f6b6e1a 100644
--- a/lib/seq.h
+++ b/lib/seq.h
@@ -117,6 +117,9 @@ 
 #include <stdint.h>
 #include "util.h"
 
+int seq_pmd_trylock(void);
+void seq_pmd_unlock(void);
+
 /* For implementation of an object with a sequence number attached. */
 struct seq *seq_create(void);
 void seq_destroy(struct seq *);