[ovs-dev,v6,5/6] dpif-netdev: Change pmd selection order.
diff mbox

Message ID 1503617885-32501-6-git-send-email-ktraynor@redhat.com
State Accepted
Delegated to: Darrell Ball
Headers show

Commit Message

Kevin Traynor Aug. 24, 2017, 11:38 p.m. UTC
Up to this point rxqs are sorted by processing cycles they
consumed and assigned to pmds in a round robin manner.

Ian pointed out that on wrap around the most loaded pmd will be
the next one to be assigned an additional rxq and that it would be
better to reverse the pmd order when wraparound occurs.

In other words, change from assigning by rr to assigning in a forward
and reverse cycle through pmds.

Also, now that the algorithm has finalized, document an example.

Suggested-by: Ian Stokes <ian.stokes@intel.com>
Signed-off-by: Kevin Traynor <ktraynor@redhat.com>
---
 Documentation/howto/dpdk.rst | 16 ++++++++++++++++
 lib/dpif-netdev.c            | 27 ++++++++++++++++++++++++++-
 tests/pmd.at                 |  2 +-
 3 files changed, 43 insertions(+), 2 deletions(-)

Comments

Ilya Maximets Aug. 28, 2017, 1:31 p.m. UTC | #1
> Up to his point rxqs are sorted by processing cycles they
> consumed and assigned to pmds in a round robin manner.
> 
> Ian pointed out that on wrap around the most loaded pmd will be
> the next one to be assigned an additional rxq and that it would be
> better to reverse the pmd order when wraparound occurs.
> 
> In other words, change from assigning by rr to assigning in a forward
> and reverse cycle through pmds.

Enabling this feature by default will force users to trigger
'dpif-netdev/pmd-rxq-rebalance' periodically. Otherwise, they may
have a completely uneven distribution of queues.
For example:

	* OVS started with 4 PMD threads
	* Added port_1 with 2 queues (cycles accounting started for port_1)
	* Sending small traffic through port_1
	* Added port_2 with 4 queues
		At this point OVS thinks that port_1 is a very heavily loaded
		port and distributes its queues between PMD threads first:

		queue_1 of port_1 ---> PMD_1
		queue_2 of port_1 ---> PMD_2

		At this point port_2 has zero stats and its queues are distributed
		as follows:

		queue_1 of port_2 ---> PMD_3
		queue_2 of port_2 ---> PMD_4
		queue_3 of port_2 ---> PMD_3
		queue_4 of port_2 ---> PMD_4

	* Start heavy traffic on port_2.

		At this point PMD threads #3 and #4 will be overloaded and
		threads #1 and #2 will be underloaded.
		This situation will not be fixed until rebalancing is triggered
		by hand.

If the PMD threads are placed in round-robin manner we will have more
fairly distributed load.

Anyway,
Since traffic patterns may vary significantly over time, there
cannot be a good distribution unless we're rebalancing often enough.
But rebalancing too often will trigger a bunch of other issues.

I'm suggesting to disable cycles-based scheduling by default and enable
it on a specific appctl call, because it may significantly affect
users and has a few unsolved issues.

> Also, now that the algorithm has finalized, document an example.
> 
> Suggested-by: Ian Stokes <ian.stokes at intel.com>
> Signed-off-by: Kevin Traynor <ktraynor at redhat.com>
> ---
>  Documentation/howto/dpdk.rst | 16 ++++++++++++++++
>  lib/dpif-netdev.c            | 27 ++++++++++++++++++++++++++-
>  tests/pmd.at                 |  2 +-
>  3 files changed, 43 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
> index a67f3a1..bac51de 100644
> --- a/Documentation/howto/dpdk.rst
> +++ b/Documentation/howto/dpdk.rst
> @@ -124,4 +124,20 @@ will be used where known to assign rxqs to pmd based on a round robin of the
>  sorted rxqs.
>  
> +For example, in the case where here there are 5 rxqs and 3 cores (e.g. 3,7,8)
> +available, and the measured usage of core cycles per rxq over the last
> +interval is seen to be:
> +
> +- Queue #0: 30%
> +- Queue #1: 80%
> +- Queue #3: 60%
> +- Queue #4: 70%
> +- Queue #5: 10%
> +
> +The rxqs will be assigned to cores 3,7,8 in the following order:
> +
> +Core 3: Q1 (80%) |
> +Core 7: Q4 (70%) | Q5 (10%)
> +core 8: Q3 (60%) | Q0 (30%)
> +
>  Rxq to pmds assignment takes place whenever there are configuration changes.
>  
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 4b1cb85..0023f19 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -3288,4 +3288,5 @@ struct rr_numa {
>  
>      int cur_index;
> +    bool idx_inc;
>  };
>  
> @@ -3344,11 +3345,35 @@ rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
>          numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
>          numa->pmds[numa->n_pmds - 1] = pmd;
> +        /* At least one pmd so initialise curr_idx and idx_inc. */
> +        numa->cur_index = 0;
> +        numa->idx_inc = true;
>      }
>  }
>  
> +/* Returns the next pmd from the numa node in
> + * incrementing or decrementing order. */
>  static struct dp_netdev_pmd_thread *
>  rr_numa_get_pmd(struct rr_numa *numa)
>  {
> -    return numa->pmds[numa->cur_index++ % numa->n_pmds];
> +    int numa_idx = numa->cur_index;
> +
> +    if (numa->idx_inc == true) {
> +        /* Incrementing through list of pmds. */
> +        if (numa->cur_index == numa->n_pmds-1) {
> +            /* Reached the last pmd. */
> +            numa->idx_inc = false;
> +        } else {
> +            numa->cur_index++;
> +        }
> +    } else {
> +        /* Decrementing through list of pmds. */
> +        if (numa->cur_index == 0) {
> +            /* Reached the first pmd. */
> +            numa->idx_inc = true;
> +        } else {
> +            numa->cur_index--;
> +        }
> +    }
> +    return numa->pmds[numa_idx];
>  }
>  
> diff --git a/tests/pmd.at b/tests/pmd.at
> index b6732ea..e39a23a 100644
> --- a/tests/pmd.at
> +++ b/tests/pmd.at
> @@ -54,5 +54,5 @@ m4_define([CHECK_PMD_THREADS_CREATED], [
>  
>  m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/"])
> -m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/;s/\(queue-id: \)0 2 4 6/\1<cleared>/;s/\(queue-id: \)1 3 5 7/\1<cleared>/"])
> +m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/;s/\(queue-id: \)1 2 5 6/\1<cleared>/;s/\(queue-id: \)0 3 4 7/\1<cleared>/"])
>  m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"])
>  
> -- 
> 1.8.3.1
Kevin Traynor Aug. 30, 2017, 4:58 p.m. UTC | #2
On 08/28/2017 02:31 PM, Ilya Maximets wrote:
>> Up to his point rxqs are sorted by processing cycles they
>> consumed and assigned to pmds in a round robin manner.
>>
>> Ian pointed out that on wrap around the most loaded pmd will be
>> the next one to be assigned an additional rxq and that it would be
>> better to reverse the pmd order when wraparound occurs.
>>
>> In other words, change from assigning by rr to assigning in a forward
>> and reverse cycle through pmds.
> 
> Enabling this feature by default will force users to trigger
> 'dpif-netdev/pmd-rxq-rebalance' periodically. Otherwise, they will may
> have completely uneven distribution of queues.
> For example:
> 
> 	* OVS started with 4 PMD threads
> 	* Added port_1 with 2 queues (cycles accounting started for port_1)
> 	* Sending small traffic through port_1
> 	* Added port_2 with 4 queues
> 		At this point OVS thinks that port_1 is a very heavy loaded
> 		port and distributes it's queues between PMD threads first:
> 
> 		queue_1 of port_1 ---> PMD_1
> 		queue_2 of port_1 ---> PMD_2
> 
> 		At this point port_2 has zero stats and it's queues distributed
> 		as follows:
> 
> 		queue_1 of port_2 ---> PMD_3
> 		queue_2 of port_2 ---> PMD_4
> 		queue_3 of port_2 ---> PMD_3
> 		queue_4 of port_2 ---> PMD_4
> 
> 	* Start heavy traffic on port_2.
> 
> 		At this point PMD threads #3 and #4 will be overloaded and
> 		threads #1 and #2 will be underloaded.
> 		This situation will not be fixed until rebalancing triggered
> 		by hands.
> 
> If the PMD threads are placed in round-robin manner we will have more
> fairly distributed load.
> 

In this carefully hand-crafted example, yes. OTOH, in the examples given
by Ian, who suggested this, or in the docs below, it would be worse using
round-robin. As you rightly say, this situation would be fixed by a
rebalance triggered by hand (or during a reconfig), whereas an
equivalent type problem in the previous scheme would never be addressed.

> Anyway,
> As soon as traffic patterns may vary significantly in time, there
> can not be good distribution unless we're rebalancing often enough.
> But too often rebalancing will trigger a bunch of other issues.
> 

I think it's worth trying to optimize. Saying that traffic patterns may
change over time is actually more of an argument for having the ability
to rebalance IMHO. If you are concerned about variance over time, then
the history stored for each queue could be increased, or a user option
could be added to change the length.

> I'm suggesting to disable cycles based scheduling by default and enable
> on a specific appctl call because it may significantly affect
> users and has few unsolved issues.
> 

I think in general it will be helpful for users. I see the impacts of
not being able to rebalance when round-robin lands you with a horribly
balanced rxq-pmd assignment as worse than any argument against this.

>> Also, now that the algorithm has finalized, document an example.
>>
>> Suggested-by: Ian Stokes <ian.stokes at intel.com>
>> Signed-off-by: Kevin Traynor <ktraynor at redhat.com>
>> ---
>>  Documentation/howto/dpdk.rst | 16 ++++++++++++++++
>>  lib/dpif-netdev.c            | 27 ++++++++++++++++++++++++++-
>>  tests/pmd.at                 |  2 +-
>>  3 files changed, 43 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
>> index a67f3a1..bac51de 100644
>> --- a/Documentation/howto/dpdk.rst
>> +++ b/Documentation/howto/dpdk.rst
>> @@ -124,4 +124,20 @@ will be used where known to assign rxqs to pmd based on a round robin of the
>>  sorted rxqs.
>>  
>> +For example, in the case where here there are 5 rxqs and 3 cores (e.g. 3,7,8)
>> +available, and the measured usage of core cycles per rxq over the last
>> +interval is seen to be:
>> +
>> +- Queue #0: 30%
>> +- Queue #1: 80%
>> +- Queue #3: 60%
>> +- Queue #4: 70%
>> +- Queue #5: 10%
>> +
>> +The rxqs will be assigned to cores 3,7,8 in the following order:
>> +
>> +Core 3: Q1 (80%) |
>> +Core 7: Q4 (70%) | Q5 (10%)
>> +core 8: Q3 (60%) | Q0 (30%)
>> +
>>  Rxq to pmds assignment takes place whenever there are configuration changes.
>>  
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 4b1cb85..0023f19 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -3288,4 +3288,5 @@ struct rr_numa {
>>  
>>      int cur_index;
>> +    bool idx_inc;
>>  };
>>  
>> @@ -3344,11 +3345,35 @@ rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
>>          numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
>>          numa->pmds[numa->n_pmds - 1] = pmd;
>> +        /* At least one pmd so initialise curr_idx and idx_inc. */
>> +        numa->cur_index = 0;
>> +        numa->idx_inc = true;
>>      }
>>  }
>>  
>> +/* Returns the next pmd from the numa node in
>> + * incrementing or decrementing order. */
>>  static struct dp_netdev_pmd_thread *
>>  rr_numa_get_pmd(struct rr_numa *numa)
>>  {
>> -    return numa->pmds[numa->cur_index++ % numa->n_pmds];
>> +    int numa_idx = numa->cur_index;
>> +
>> +    if (numa->idx_inc == true) {
>> +        /* Incrementing through list of pmds. */
>> +        if (numa->cur_index == numa->n_pmds-1) {
>> +            /* Reached the last pmd. */
>> +            numa->idx_inc = false;
>> +        } else {
>> +            numa->cur_index++;
>> +        }
>> +    } else {
>> +        /* Decrementing through list of pmds. */
>> +        if (numa->cur_index == 0) {
>> +            /* Reached the first pmd. */
>> +            numa->idx_inc = true;
>> +        } else {
>> +            numa->cur_index--;
>> +        }
>> +    }
>> +    return numa->pmds[numa_idx];
>>  }
>>  
>> diff --git a/tests/pmd.at b/tests/pmd.at
>> index b6732ea..e39a23a 100644
>> --- a/tests/pmd.at
>> +++ b/tests/pmd.at
>> @@ -54,5 +54,5 @@ m4_define([CHECK_PMD_THREADS_CREATED], [
>>  
>>  m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/"])
>> -m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/;s/\(queue-id: \)0 2 4 6/\1<cleared>/;s/\(queue-id: \)1 3 5 7/\1<cleared>/"])
>> +m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/;s/\(queue-id: \)1 2 5 6/\1<cleared>/;s/\(queue-id: \)0 3 4 7/\1<cleared>/"])
>>  m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"])
>>  
>> -- 
>> 1.8.3.1

Patch
diff mbox

diff --git a/Documentation/howto/dpdk.rst b/Documentation/howto/dpdk.rst
index a67f3a1..bac51de 100644
--- a/Documentation/howto/dpdk.rst
+++ b/Documentation/howto/dpdk.rst
@@ -124,4 +124,20 @@  will be used where known to assign rxqs to pmd based on a round robin of the
 sorted rxqs.
 
+For example, in the case where here there are 5 rxqs and 3 cores (e.g. 3,7,8)
+available, and the measured usage of core cycles per rxq over the last
+interval is seen to be:
+
+- Queue #0: 30%
+- Queue #1: 80%
+- Queue #3: 60%
+- Queue #4: 70%
+- Queue #5: 10%
+
+The rxqs will be assigned to cores 3,7,8 in the following order:
+
+Core 3: Q1 (80%) |
+Core 7: Q4 (70%) | Q5 (10%)
+core 8: Q3 (60%) | Q0 (30%)
+
 Rxq to pmds assignment takes place whenever there are configuration changes.
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4b1cb85..0023f19 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3288,4 +3288,5 @@  struct rr_numa {
 
     int cur_index;
+    bool idx_inc;
 };
 
@@ -3344,11 +3345,35 @@  rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
         numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
         numa->pmds[numa->n_pmds - 1] = pmd;
+        /* At least one pmd so initialise curr_idx and idx_inc. */
+        numa->cur_index = 0;
+        numa->idx_inc = true;
     }
 }
 
+/* Returns the next pmd from the numa node in
+ * incrementing or decrementing order. */
 static struct dp_netdev_pmd_thread *
 rr_numa_get_pmd(struct rr_numa *numa)
 {
-    return numa->pmds[numa->cur_index++ % numa->n_pmds];
+    int numa_idx = numa->cur_index;
+
+    if (numa->idx_inc == true) {
+        /* Incrementing through list of pmds. */
+        if (numa->cur_index == numa->n_pmds-1) {
+            /* Reached the last pmd. */
+            numa->idx_inc = false;
+        } else {
+            numa->cur_index++;
+        }
+    } else {
+        /* Decrementing through list of pmds. */
+        if (numa->cur_index == 0) {
+            /* Reached the first pmd. */
+            numa->idx_inc = true;
+        } else {
+            numa->cur_index--;
+        }
+    }
+    return numa->pmds[numa_idx];
 }
 
diff --git a/tests/pmd.at b/tests/pmd.at
index b6732ea..e39a23a 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -54,5 +54,5 @@  m4_define([CHECK_PMD_THREADS_CREATED], [
 
 m4_define([SED_NUMA_CORE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/"])
-m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/;s/\(queue-id: \)0 2 4 6/\1<cleared>/;s/\(queue-id: \)1 3 5 7/\1<cleared>/"])
+m4_define([SED_NUMA_CORE_QUEUE_PATTERN], ["s/\(numa_id \)[[0-9]]*\( core_id \)[[0-9]]*:/\1<cleared>\2<cleared>:/;s/\(queue-id: \)1 2 5 6/\1<cleared>/;s/\(queue-id: \)0 3 4 7/\1<cleared>/"])
 m4_define([DUMMY_NUMA], [--dummy-numa="0,0,0,0"])