
[ovs-dev,00/17] DPDK/pmd reconfiguration refactor and bugfixes

Message ID 5d7e9b4a-274a-1fb1-92dc-f1220c230eca@samsung.com
State Not Applicable

Commit Message

Ilya Maximets Nov. 25, 2016, 2:19 p.m. UTC
Sorry, CC to list.

On 25.11.2016 17:15, Ilya Maximets wrote:
Hi, Daniele.

I've prepared some incremental changes for this patch-set (only very
simple tests performed), mostly for the largest patch.
I hope you will find some of these changes useful.
I'll continue to test and review this patch-set.

---
 lib/dpif-netdev.c | 65 ++++++++++++++++++++++++-------------------------------
 lib/ovs-numa.c    | 23 ++++++--------------
 lib/ovs-numa.h    |  1 +
 3 files changed, 36 insertions(+), 53 deletions(-)

--


Best regards, Ilya Maximets.

> On 16.11.2016 03:45, diproiettod at vmware.com (Daniele Di Proietto) wrote:
>> The first two commits of the series are trivial bugfixes for dpif-netdev.
>>
>> Then the series fixes a long-standing bug that caused a crash when the
>> admin link state of a port is changed while traffic is flowing.
>>
>> The next part makes use of reconfiguration for port add: this makes
>> the operation twice as fast and reduces some code duplication.  This part
>> conflicts with the port naming change, so I'm willing to postpone it, unless
>> we find it to be useful for the port naming change.
>>
>> The rest of the series refactors a lot of code in dpif-netdev:
>>
>> * We no longer start pmd threads on demand for each numa node.  This made
>>   the code very complicated and introduced a lot of bugs.
>> * The pmd threads state is now internal to dpif-netdev and it's not stored in
>>   ovs-numa.
>> * There's now a single function that handles pmd threads/ports changes: this
>>   reduces code duplication and makes port reconfiguration faster, as we don't
>>   have to bring down the whole datapath.
>>
>> Daniele Di Proietto (17):
>>   dpif-netdev: Fix memory leak.
>>   dpif-netdev: Take non_pmd_mutex to access tx cached ports.
>>   dpif-netdev: Don't try to output on a device without txqs.
>>   netdev-dpdk: Don't call rte_dev_stop() in update_flags().
>>   netdev-dpdk: Start the device only once on port-add.
>>   netdev-dpdk: Refactor construct and destruct.
>>   dpif-netdev: Use a boolean instead of pmd->port_seq.
>>   dpif-netdev: Block pmd threads if there are no ports.
>>   dpif-netdev: Create pmd threads for every numa node.
>>   dpif-netdev: Make 'static_tx_qid' const.
>>   dpctl: Avoid making assumptions on pmd threads.
>>   ovs-numa: New ovs_numa_dump_contains_core() function.
>>   ovs-numa: Add new dump types.
>>   dpif-netdev: core_id shouldn't be negative.
>>   dpif-netdev: Use hmap for poll_list in pmd threads.
>>   dpif-netdev: Centralized threads and queues handling code.
>>   ovs-numa: Remove unused functions.
>>
>>  lib/dpctl.c       |  107 +---
>>  lib/dpif-netdev.c | 1429 ++++++++++++++++++++++++++++-------------------------
>>  lib/dpif.c        |    6 +-
>>  lib/dpif.h        |   12 +-
>>  lib/netdev-dpdk.c |  173 +++----
>>  lib/netdev.c      |   21 +-
>>  lib/netdev.h      |    1 +
>>  lib/ovs-numa.c    |  227 +++------
>>  lib/ovs-numa.h    |   21 +-
>>  tests/pmd.at      |   49 +-
>>  vswitchd/bridge.c |    2 +
>>  11 files changed, 1030 insertions(+), 1018 deletions(-)
>>

Comments

Daniele Di Proietto Nov. 30, 2016, 3:02 a.m. UTC | #1
Thanks for the review and for the incremental!

I applied everything to the various commits, except one thing; see below.

I accumulated enough comments to send a v2, but I wanted to have a look first at a couple of patches that may conflict with this (mostly vhost pmd).

Daniele




On 25/11/2016 06:19, "Ilya Maximets" <i.maximets@samsung.com> wrote:

>Sorry, CC to list.
>
>On 25.11.2016 17:15, Ilya Maximets wrote:
>Hi, Daniele.
>
>I've prepared some incremental changes for this patch-set (only very
>simple tests performed), mostly for the largest patch.
>I hope you will find some of these changes useful.
>I'll continue to test and review this patch-set.
>
>---
> lib/dpif-netdev.c | 65 ++++++++++++++++++++++++-------------------------------
> lib/ovs-numa.c    | 23 ++++++--------------
> lib/ovs-numa.h    |  1 +
> 3 files changed, 36 insertions(+), 53 deletions(-)
>
>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>index 8fad4f3..31c5b5a 100644
>--- a/lib/dpif-netdev.c
>+++ b/lib/dpif-netdev.c
>@@ -3067,11 +3067,11 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>                 if (!pmd) {
>                     VLOG_WARN("There is no PMD thread on core %d. Queue "
>                               "%d on port \'%s\' will not be polled.",
>-                              port->rxqs[qid].core_id, qid,
>-                              netdev_get_name(port->netdev));
>+                              q->core_id, qid, netdev_get_name(port->netdev));
>                 } else {
>                     q->pmd = pmd;
>                     pmd->isolated = true;
>+                    dp_netdev_pmd_unref(pmd);
>                 }
>             } else if (!pinned && q->core_id == RXQ_CORE_UNPINNED) {
>                 if (!numa) {
>@@ -3107,31 +3107,31 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
>     }
> 
>-    /* Check for unwanted pmd threads */
>-    CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
>-        if (pmd->core_id != NON_PMD_CORE_ID &&
>-            !ovs_numa_dump_contains_core(pmd_cores,
>-                                         pmd->numa_id,
>-                                         pmd->core_id)) {
>-            changed = true;
>-        }
>-    }
>-
>-    /* Check for required new pmd threads */
>-    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
>-        pmd = dp_netdev_get_pmd(dp, core->core_id);
>-        if (pmd) {
>-            dp_netdev_pmd_unref(pmd);
>-            continue;
>-        }
>+    /* Check for changed configuration */
>+    if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
>         changed = true;
>+    } else {
>+        CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
>+            if (pmd->core_id != NON_PMD_CORE_ID
>+                && !ovs_numa_dump_contains_core(pmd_cores,
>+                                                pmd->numa_id,
>+                                                pmd->core_id)) {
>+                changed = true;
>+                break;
>+            }
>+        }

Fewer lines of code with the same effect: if the dump and the existing pmd
threads (excluding the non-pmd one) have the same count, and every existing
pmd core is present in the dump, the two sets are identical.  Looks good to me.
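
As a toy illustration of the pattern (hypothetical names, not the dpif-netdev
code): two sets of distinct core ids are equal exactly when they have the same
size and one is contained in the other, which is what the count comparison
plus the membership loop establishes.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Returns true if 'value' is one of the 'n' entries in 'set'. */
static bool
contains(const unsigned *set, size_t n, unsigned value)
{
    for (size_t i = 0; i < n; i++) {
        if (set[i] == value) {
            return true;
        }
    }
    return false;
}

/* Same shape as the refactored check: equal sizes plus every running
 * core being present in the requested set means nothing changed. */
static bool
pmd_cores_changed(const unsigned *requested, size_t n_requested,
                  const unsigned *running, size_t n_running)
{
    if (n_requested != n_running) {
        return true;
    }
    for (size_t i = 0; i < n_running; i++) {
        if (!contains(requested, n_requested, running[i])) {
            return true;
        }
    }
    return false;
}

int
main(void)
{
    unsigned requested[] = { 1, 3, 5 };
    unsigned running[] = { 5, 1, 3 };

    printf("changed: %s\n",
           pmd_cores_changed(requested, 3, running, 3) ? "yes" : "no");
    return 0;
}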

>     }
> 
>     /* Destroy the old and recreate the new pmd threads.  We don't perform an
>      * incremental update because we would have to adjust 'static_tx_qid'. */
>     if (changed) {
>-        struct ovs_numa_dump *all_numas;
>-        struct ovs_numa_info *numa;
>+        uint32_t *n_threads_on_numa;
>+        int i, n_numas = ovs_numa_get_n_numas();
>+
>+        ovs_assert(n_numas != OVS_NUMA_UNSPEC);
>+
>+        /* This is not expensive because n_numas <= MAX_NUMA_NODES */
>+        n_threads_on_numa = xzalloc(n_numas * sizeof *n_threads_on_numa);
> 
>         /* Do not destroy the non pmd thread. */
>         dp_netdev_destroy_all_pmds(dp, false);
>@@ -3141,26 +3141,17 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
> 
>             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
>+            n_threads_on_numa[core->numa_id]++;
>         }
> 
>         /* Log the number of pmd threads per numa node. */
>-        all_numas = ovs_numa_dump_n_cores_per_numa(1);
>-
>-        FOR_EACH_CORE_ON_DUMP(numa, all_numas) {
>-            int n = 0;
>-
>-            FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
>-                if (core->numa_id == numa->numa_id) {
>-                    n++;
>-                }
>-            }
>-
>-            if (n) {
>+        for (i = 0; i < n_numas; i++) {
>+            if (n_threads_on_numa[i]) {
>                 VLOG_INFO("Created %d pmd threads on numa node %d",
>-                          n, numa->numa_id);
>+                          n_threads_on_numa[i], i);
>             }
>         }
>-        ovs_numa_dump_destroy(all_numas);
>+        free(n_threads_on_numa);

It's probably not a big deal, but this code doesn't properly handle
non-consecutive numa nodes: the array is sized by the node count but indexed
by numa_id, which assumes the nodes are numbered 0..n_numas-1.  I decided to
add helpers in ovs-numa to simplify this, like you suggested before.
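
For reference, a minimal standalone sketch (hypothetical code, not the actual
ovs-numa helpers) of counting per-node threads keyed on the numa_id itself, so
sparse node numbering (e.g. nodes 0 and 2) doesn't overflow anything:

#include <stdio.h>

#define MAX_NUMA_NODES 128

struct numa_count {
    int numa_id;        /* Actual node ID, which may be sparse (e.g. 0, 2). */
    unsigned n_threads; /* pmd threads created on that node. */
};

/* Find or append the counter for 'numa_id'.  A linear search is fine here
 * because the number of nodes is small and bounded by MAX_NUMA_NODES
 * (the sketch omits the overflow check for brevity). */
static struct numa_count *
numa_count_get(struct numa_count *counts, int *n_counts, int numa_id)
{
    for (int i = 0; i < *n_counts; i++) {
        if (counts[i].numa_id == numa_id) {
            return &counts[i];
        }
    }
    counts[*n_counts] = (struct numa_count) { .numa_id = numa_id };
    return &counts[(*n_counts)++];
}

int
main(void)
{
    /* Cores pinned on nodes 0 and 2: a flat array sized by the node count (2)
     * and indexed by numa_id would overflow at index 2. */
    int core_numa_ids[] = { 0, 0, 2, 2, 2 };
    struct numa_count counts[MAX_NUMA_NODES];
    int n_counts = 0;

    for (size_t i = 0; i < sizeof core_numa_ids / sizeof *core_numa_ids; i++) {
        numa_count_get(counts, &n_counts, core_numa_ids[i])->n_threads++;
    }

    for (int i = 0; i < n_counts; i++) {
        printf("Created %u pmd threads on numa node %d\n",
               counts[i].n_threads, counts[i].numa_id);
    }
    return 0;
}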

>     }
> 
>     ovs_numa_dump_destroy(pmd_cores);
>@@ -3321,7 +3312,7 @@ reconfigure_datapath(struct dp_netdev *dp)
> 
>             if (q->pmd) {
>                 ovs_mutex_lock(&q->pmd->port_mutex);
>-                dp_netdev_add_rxq_to_pmd(q->pmd, &port->rxqs[qid]);
>+                dp_netdev_add_rxq_to_pmd(q->pmd, q);
>                 ovs_mutex_unlock(&q->pmd->port_mutex);
>             }
>         }
>@@ -3728,7 +3719,7 @@ dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
>     /* All flows (including their dpcls_rules) have been deleted already */
>     CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
>         dpcls_destroy(cls);
>-        free(cls);
>+        ovsrcu_postpone(free, cls);
>     }
>     cmap_destroy(&pmd->classifiers);
>     cmap_destroy(&pmd->flow_table);
>diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
>index 6599542..b7c305f 100644
>--- a/lib/ovs-numa.c
>+++ b/lib/ovs-numa.c
>@@ -406,21 +406,6 @@ ovs_numa_dump_cores_on_numa(int numa_id)
>     return dump;
> }
> 
>-static struct cpu_core *
>-get_cpu_core(int core_id)
>-{
>-    struct cpu_core *core;
>-
>-    HMAP_FOR_EACH_WITH_HASH(core, hmap_node, hash_int(core_id, 0),
>-                            &all_cpu_cores) {
>-        if (core->core_id == core_id) {
>-            return core;
>-        }
>-    }
>-
>-    return NULL;
>-}
>-
> struct ovs_numa_dump *
> ovs_numa_dump_cores_with_cmask(const char *cmask)
> {
>@@ -444,7 +429,7 @@ ovs_numa_dump_cores_with_cmask(const char *cmask)
> 
>         for (j = 0; j < 4; j++) {
>             if ((bin >> j) & 0x1) {
>-                struct cpu_core *core = get_cpu_core(core_id);
>+                struct cpu_core *core = get_core_by_core_id(core_id);

Thanks for spotting this; I would have carried the duplicated version forever.

> 
>                 if (core) {
>                     struct ovs_numa_info *info = xmalloc(sizeof *info);
>@@ -508,6 +493,12 @@ ovs_numa_dump_contains_core(const struct ovs_numa_dump *dump,
>     return false;
> }
> 
>+size_t
>+ovs_numa_dump_count(struct ovs_numa_dump *dump)
>+{
>+    return hmap_count(&dump->dump);
>+}
>+
> void
> ovs_numa_dump_destroy(struct ovs_numa_dump *dump)
> {
>diff --git a/lib/ovs-numa.h b/lib/ovs-numa.h
>index 89b6a30..9c6f08f 100644
>--- a/lib/ovs-numa.h
>+++ b/lib/ovs-numa.h
>@@ -49,6 +49,7 @@ int ovs_numa_get_n_cores_on_numa(int numa_id);
> struct ovs_numa_dump *ovs_numa_dump_cores_on_numa(int numa_id);
> struct ovs_numa_dump *ovs_numa_dump_cores_with_cmask(const char *cmask);
> struct ovs_numa_dump *ovs_numa_dump_n_cores_per_numa(int n);
>+size_t ovs_numa_dump_count(struct ovs_numa_dump *);
> bool ovs_numa_dump_contains_core(const struct ovs_numa_dump *,
>                                  int numa_id, unsigned core_id);
> void ovs_numa_dump_destroy(struct ovs_numa_dump *);
>--
>
>
>Best regards, Ilya Maximets.
>
>> On 16.11.2016 03:45, diproiettod at vmware.com (Daniele Di Proietto) wrote:
>>> The first two commits of the series are trivial bugfixes for dpif-netdev.
>>>
>>> Then the series fixes a long-standing bug that caused a crash when the
>>> admin link state of a port is changed while traffic is flowing.
>>>
>>> The next part makes use of reconfiguration for port add: this makes
>>> the operation twice as fast and reduces some code duplication.  This part
>>> conflicts with the port naming change, so I'm willing to postpone it, unless
>>> we find it to be useful for the port naming change.
>>>
>>> The rest of the series refactors a lot of code in dpif-netdev:
>>>
>>> * We no longer start pmd threads on demand for each numa node.  This made
>>>   the code very complicated and introduced a lot of bugs.
>>> * The pmd threads state is now internal to dpif-netdev and it's not stored in
>>>   ovs-numa.
>>> * There's now a single function that handles pmd threads/ports changes: this
>>>   reduces code duplication and makes port reconfiguration faster, as we don't
>>>   have to bring down the whole datapath.
>>>
>>> Daniele Di Proietto (17):
>>>   dpif-netdev: Fix memory leak.
>>>   dpif-netdev: Take non_pmd_mutex to access tx cached ports.
>>>   dpif-netdev: Don't try to output on a device without txqs.
>>>   netdev-dpdk: Don't call rte_dev_stop() in update_flags().
>>>   netdev-dpdk: Start the device only once on port-add.
>>>   netdev-dpdk: Refactor construct and destruct.
>>>   dpif-netdev: Use a boolean instead of pmd->port_seq.
>>>   dpif-netdev: Block pmd threads if there are no ports.
>>>   dpif-netdev: Create pmd threads for every numa node.
>>>   dpif-netdev: Make 'static_tx_qid' const.
>>>   dpctl: Avoid making assumptions on pmd threads.
>>>   ovs-numa: New ovs_numa_dump_contains_core() function.
>>>   ovs-numa: Add new dump types.
>>>   dpif-netdev: core_id shouldn't be negative.
>>>   dpif-netdev: Use hmap for poll_list in pmd threads.
>>>   dpif-netdev: Centralized threads and queues handling code.
>>>   ovs-numa: Remove unused functions.
>>>
>>>  lib/dpctl.c       |  107 +---
>>>  lib/dpif-netdev.c | 1429 ++++++++++++++++++++++++++++-------------------------
>>>  lib/dpif.c        |    6 +-
>>>  lib/dpif.h        |   12 +-
>>>  lib/netdev-dpdk.c |  173 +++----
>>>  lib/netdev.c      |   21 +-
>>>  lib/netdev.h      |    1 +
>>>  lib/ovs-numa.c    |  227 +++------
>>>  lib/ovs-numa.h    |   21 +-
>>>  tests/pmd.at      |   49 +-
>>>  vswitchd/bridge.c |    2 +
>>>  11 files changed, 1030 insertions(+), 1018 deletions(-)
>>>

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 8fad4f3..31c5b5a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3067,11 +3067,11 @@  rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
                 if (!pmd) {
                     VLOG_WARN("There is no PMD thread on core %d. Queue "
                               "%d on port \'%s\' will not be polled.",
-                              port->rxqs[qid].core_id, qid,
-                              netdev_get_name(port->netdev));
+                              q->core_id, qid, netdev_get_name(port->netdev));
                 } else {
                     q->pmd = pmd;
                     pmd->isolated = true;
+                    dp_netdev_pmd_unref(pmd);
                 }
             } else if (!pinned && q->core_id == RXQ_CORE_UNPINNED) {
                 if (!numa) {
@@ -3107,31 +3107,31 @@  reconfigure_pmd_threads(struct dp_netdev *dp)
         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
     }
 
-    /* Check for unwanted pmd threads */
-    CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
-        if (pmd->core_id != NON_PMD_CORE_ID &&
-            !ovs_numa_dump_contains_core(pmd_cores,
-                                         pmd->numa_id,
-                                         pmd->core_id)) {
-            changed = true;
-        }
-    }
-
-    /* Check for required new pmd threads */
-    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
-        pmd = dp_netdev_get_pmd(dp, core->core_id);
-        if (pmd) {
-            dp_netdev_pmd_unref(pmd);
-            continue;
-        }
+    /* Check for changed configuration */
+    if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
         changed = true;
+    } else {
+        CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
+            if (pmd->core_id != NON_PMD_CORE_ID
+                && !ovs_numa_dump_contains_core(pmd_cores,
+                                                pmd->numa_id,
+                                                pmd->core_id)) {
+                changed = true;
+                break;
+            }
+        }
     }
 
     /* Destroy the old and recreate the new pmd threads.  We don't perform an
      * incremental update because we would have to adjust 'static_tx_qid'. */
     if (changed) {
-        struct ovs_numa_dump *all_numas;
-        struct ovs_numa_info *numa;
+        uint32_t *n_threads_on_numa;
+        int i, n_numas = ovs_numa_get_n_numas();
+
+        ovs_assert(n_numas != OVS_NUMA_UNSPEC);
+
+        /* This is not expensive because n_numas <= MAX_NUMA_NODES */
+        n_threads_on_numa = xzalloc(n_numas * sizeof *n_threads_on_numa);
 
         /* Do not destroy the non pmd thread. */
         dp_netdev_destroy_all_pmds(dp, false);
@@ -3141,26 +3141,17 @@  reconfigure_pmd_threads(struct dp_netdev *dp)
             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
 
             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
+            n_threads_on_numa[core->numa_id]++;
         }
 
         /* Log the number of pmd threads per numa node. */
-        all_numas = ovs_numa_dump_n_cores_per_numa(1);
-
-        FOR_EACH_CORE_ON_DUMP(numa, all_numas) {
-            int n = 0;
-
-            FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
-                if (core->numa_id == numa->numa_id) {
-                    n++;
-                }
-            }
-
-            if (n) {
+        for (i = 0; i < n_numas; i++) {
+            if (n_threads_on_numa[i]) {
                 VLOG_INFO("Created %d pmd threads on numa node %d",
-                          n, numa->numa_id);
+                          n_threads_on_numa[i], i);
             }
         }
-        ovs_numa_dump_destroy(all_numas);
+        free(n_threads_on_numa);
     }
 
     ovs_numa_dump_destroy(pmd_cores);
@@ -3321,7 +3312,7 @@  reconfigure_datapath(struct dp_netdev *dp)
 
             if (q->pmd) {
                 ovs_mutex_lock(&q->pmd->port_mutex);
-                dp_netdev_add_rxq_to_pmd(q->pmd, &port->rxqs[qid]);
+                dp_netdev_add_rxq_to_pmd(q->pmd, q);
                 ovs_mutex_unlock(&q->pmd->port_mutex);
             }
         }
@@ -3728,7 +3719,7 @@  dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
     /* All flows (including their dpcls_rules) have been deleted already */
     CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
         dpcls_destroy(cls);
-        free(cls);
+        ovsrcu_postpone(free, cls);
     }
     cmap_destroy(&pmd->classifiers);
     cmap_destroy(&pmd->flow_table);
diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
index 6599542..b7c305f 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -406,21 +406,6 @@  ovs_numa_dump_cores_on_numa(int numa_id)
     return dump;
 }
 
-static struct cpu_core *
-get_cpu_core(int core_id)
-{
-    struct cpu_core *core;
-
-    HMAP_FOR_EACH_WITH_HASH(core, hmap_node, hash_int(core_id, 0),
-                            &all_cpu_cores) {
-        if (core->core_id == core_id) {
-            return core;
-        }
-    }
-
-    return NULL;
-}
-
 struct ovs_numa_dump *
 ovs_numa_dump_cores_with_cmask(const char *cmask)
 {
@@ -444,7 +429,7 @@  ovs_numa_dump_cores_with_cmask(const char *cmask)
 
         for (j = 0; j < 4; j++) {
             if ((bin >> j) & 0x1) {
-                struct cpu_core *core = get_cpu_core(core_id);
+                struct cpu_core *core = get_core_by_core_id(core_id);
 
                 if (core) {
                     struct ovs_numa_info *info = xmalloc(sizeof *info);
@@ -508,6 +493,12 @@  ovs_numa_dump_contains_core(const struct ovs_numa_dump *dump,
     return false;
 }
 
+size_t
+ovs_numa_dump_count(struct ovs_numa_dump *dump)
+{
+    return hmap_count(&dump->dump);
+}
+
 void
 ovs_numa_dump_destroy(struct ovs_numa_dump *dump)
 {
diff --git a/lib/ovs-numa.h b/lib/ovs-numa.h
index 89b6a30..9c6f08f 100644
--- a/lib/ovs-numa.h
+++ b/lib/ovs-numa.h
@@ -49,6 +49,7 @@  int ovs_numa_get_n_cores_on_numa(int numa_id);
 struct ovs_numa_dump *ovs_numa_dump_cores_on_numa(int numa_id);
 struct ovs_numa_dump *ovs_numa_dump_cores_with_cmask(const char *cmask);
 struct ovs_numa_dump *ovs_numa_dump_n_cores_per_numa(int n);
+size_t ovs_numa_dump_count(struct ovs_numa_dump *);
 bool ovs_numa_dump_contains_core(const struct ovs_numa_dump *,
                                  int numa_id, unsigned core_id);
 void ovs_numa_dump_destroy(struct ovs_numa_dump *);