Patchwork [3/3] sched: Aggressive balance in domains whose groups share package resources

login
register
mail settings
Submitter Vaidyanathan Srinivasan
Date Oct. 21, 2013, 11:45 a.m.
Message ID <20131021114502.13291.60794.stgit@drishya>
Download mbox | patch
Permalink /patch/285176/
State Not Applicable
Headers show

Comments

Vaidyanathan Srinivasan - Oct. 21, 2013, 11:45 a.m.
From: Preeti U Murthy <preeti@linux.vnet.ibm.com>

The current logic in load balance is such that after picking the
busiest group, the load is attempted to be moved from the busiest cpu
in that group to the dst_cpu. If the load cannot be moved from the
busiest cpu to dst_cpu due to either tsk_cpus_allowed mask or cache
hot tasks, then the dst_cpu is changed to be another idle cpu within
the dst->grpmask. If even then, the load cannot be moved from the
busiest cpu, then the source group is changed. The next busiest group
is found and the above steps are repeated.

However if the cpus in the group share package resources, then when
a load movement from the busiest cpu in this group fails as above,
instead of finding the next busiest group to move load from, find the
next busiest cpu *within the same group* from which to move load away.
By doing so, a conscious effort is made during load balancing to keep
just one cpu busy as much as possible within domains that have
SD_SHARE_PKG_RESOURCES flag set unless under scenarios of high load.
Having multiple cpus busy within a domain which share package resources
could lead to a performance hit.

A similar scenario arises in active load balancing as well. When the
current task on the busiest cpu cannot be moved away due to task
pinning, currently no more attempts at load balancing are made. This
patch checks if the balancing is being done on a group whose cpus
share package resources. If so, then check if the load balancing can
be done for other cpus in the same group.

Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
---
 kernel/sched/fair.c |   18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
Peter Zijlstra - Oct. 22, 2013, 10:23 p.m.
On Mon, Oct 21, 2013 at 05:15:02PM +0530, Vaidyanathan Srinivasan wrote:
>  kernel/sched/fair.c |   18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 828ed97..bbcd96b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5165,6 +5165,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>  {
>  	int ld_moved, cur_ld_moved, active_balance = 0;
>  	struct sched_group *group;
> +	struct sched_domain *child;
> +	int share_pkg_res = 0;
>  	struct rq *busiest;
>  	unsigned long flags;
>  	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
> @@ -5190,6 +5192,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>  
>  	schedstat_inc(sd, lb_count[idle]);
>  
> +	child = sd->child;
> +	if (child && child->flags & SD_SHARE_PKG_RESOURCES)
> +		share_pkg_res = 1;
> +
>  redo:
>  	if (!should_we_balance(&env)) {
>  		*continue_balancing = 0;
> @@ -5202,6 +5208,7 @@ redo:
>  		goto out_balanced;
>  	}
>  
> +redo_grp:
>  	busiest = find_busiest_queue(&env, group);
>  	if (!busiest) {
>  		schedstat_inc(sd, lb_nobusyq[idle]);
> @@ -5292,6 +5299,11 @@ more_balance:
>  			if (!cpumask_empty(cpus)) {
>  				env.loop = 0;
>  				env.loop_break = sched_nr_migrate_break;
> +				if (share_pkg_res &&
> +					cpumask_intersects(cpus,
> +						to_cpumask(group->cpumask)))

sched_group_cpus()

> +					goto redo_grp;
> +
>  				goto redo;
>  			}
>  			goto out_balanced;
> @@ -5318,9 +5330,15 @@ more_balance:
>  			 */
>  			if (!cpumask_test_cpu(this_cpu,
>  					tsk_cpus_allowed(busiest->curr))) {
> +				cpumask_clear_cpu(cpu_of(busiest), cpus);
>  				raw_spin_unlock_irqrestore(&busiest->lock,
>  							    flags);
>  				env.flags |= LBF_ALL_PINNED;
> +				if (share_pkg_res &&
> +					cpumask_intersects(cpus,
> +						to_cpumask(group->cpumask)))
> +					goto redo_grp;
> +
>  				goto out_one_pinned;
>  			}

Man this retry logic is getting annoying.. isn't there anything saner we
can do?
Preeti U Murthy - Oct. 24, 2013, 4:04 a.m.
Hi Peter,

On 10/23/2013 03:53 AM, Peter Zijlstra wrote:
> On Mon, Oct 21, 2013 at 05:15:02PM +0530, Vaidyanathan Srinivasan wrote:
>>  kernel/sched/fair.c |   18 ++++++++++++++++++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 828ed97..bbcd96b 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5165,6 +5165,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>  {
>>  	int ld_moved, cur_ld_moved, active_balance = 0;
>>  	struct sched_group *group;
>> +	struct sched_domain *child;
>> +	int share_pkg_res = 0;
>>  	struct rq *busiest;
>>  	unsigned long flags;
>>  	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
>> @@ -5190,6 +5192,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>  
>>  	schedstat_inc(sd, lb_count[idle]);
>>  
>> +	child = sd->child;
>> +	if (child && child->flags & SD_SHARE_PKG_RESOURCES)
>> +		share_pkg_res = 1;
>> +
>>  redo:
>>  	if (!should_we_balance(&env)) {
>>  		*continue_balancing = 0;
>> @@ -5202,6 +5208,7 @@ redo:
>>  		goto out_balanced;
>>  	}
>>  
>> +redo_grp:
>>  	busiest = find_busiest_queue(&env, group);
>>  	if (!busiest) {
>>  		schedstat_inc(sd, lb_nobusyq[idle]);
>> @@ -5292,6 +5299,11 @@ more_balance:
>>  			if (!cpumask_empty(cpus)) {
>>  				env.loop = 0;
>>  				env.loop_break = sched_nr_migrate_break;
>> +				if (share_pkg_res &&
>> +					cpumask_intersects(cpus,
>> +						to_cpumask(group->cpumask)))
> 
> sched_group_cpus()
> 
>> +					goto redo_grp;
>> +
>>  				goto redo;
>>  			}
>>  			goto out_balanced;
>> @@ -5318,9 +5330,15 @@ more_balance:
>>  			 */
>>  			if (!cpumask_test_cpu(this_cpu,
>>  					tsk_cpus_allowed(busiest->curr))) {
>> +				cpumask_clear_cpu(cpu_of(busiest), cpus);
>>  				raw_spin_unlock_irqrestore(&busiest->lock,
>>  							    flags);
>>  				env.flags |= LBF_ALL_PINNED;
>> +				if (share_pkg_res &&
>> +					cpumask_intersects(cpus,
>> +						to_cpumask(group->cpumask)))
>> +					goto redo_grp;
>> +
>>  				goto out_one_pinned;
>>  			}
> 
> Man this retry logic is getting annoying.. isn't there anything saner we
> can do?

Let me give this a thought and get back.

Regards
Preeti U Murthy
>
Preeti U Murthy - Oct. 25, 2013, 1:19 p.m.
Hi Peter,

On 10/23/2013 03:53 AM, Peter Zijlstra wrote:
> On Mon, Oct 21, 2013 at 05:15:02PM +0530, Vaidyanathan Srinivasan wrote:
>>  kernel/sched/fair.c |   18 ++++++++++++++++++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 828ed97..bbcd96b 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5165,6 +5165,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>  {
>>  	int ld_moved, cur_ld_moved, active_balance = 0;
>>  	struct sched_group *group;
>> +	struct sched_domain *child;
>> +	int share_pkg_res = 0;
>>  	struct rq *busiest;
>>  	unsigned long flags;
>>  	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
>> @@ -5190,6 +5192,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
>>  
>>  	schedstat_inc(sd, lb_count[idle]);
>>  
>> +	child = sd->child;
>> +	if (child && child->flags & SD_SHARE_PKG_RESOURCES)
>> +		share_pkg_res = 1;
>> +
>>  redo:
>>  	if (!should_we_balance(&env)) {
>>  		*continue_balancing = 0;
>> @@ -5202,6 +5208,7 @@ redo:
>>  		goto out_balanced;
>>  	}
>>  
>> +redo_grp:
>>  	busiest = find_busiest_queue(&env, group);
>>  	if (!busiest) {
>>  		schedstat_inc(sd, lb_nobusyq[idle]);
>> @@ -5292,6 +5299,11 @@ more_balance:
>>  			if (!cpumask_empty(cpus)) {
>>  				env.loop = 0;
>>  				env.loop_break = sched_nr_migrate_break;
>> +				if (share_pkg_res &&
>> +					cpumask_intersects(cpus,
>> +						to_cpumask(group->cpumask)))
> 
> sched_group_cpus()
> 
>> +					goto redo_grp;
>> +
>>  				goto redo;
>>  			}
>>  			goto out_balanced;
>> @@ -5318,9 +5330,15 @@ more_balance:
>>  			 */
>>  			if (!cpumask_test_cpu(this_cpu,
>>  					tsk_cpus_allowed(busiest->curr))) {
>> +				cpumask_clear_cpu(cpu_of(busiest), cpus);
>>  				raw_spin_unlock_irqrestore(&busiest->lock,
>>  							    flags);
>>  				env.flags |= LBF_ALL_PINNED;
>> +				if (share_pkg_res &&
>> +					cpumask_intersects(cpus,
>> +						to_cpumask(group->cpumask)))
>> +					goto redo_grp;
>> +
>>  				goto out_one_pinned;
>>  			}
> 
> Man this retry logic is getting annoying.. isn't there anything saner we
> can do?

Maybe we can do this just at the SIBLINGS level? Having the hyper
threads busy due to the scenario described in the changelog is bad for
performance.

Regards
Preeti U Murthy
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
>
Peter Zijlstra - Oct. 28, 2013, 3:53 p.m.
On Mon, Oct 21, 2013 at 05:15:02PM +0530, Vaidyanathan Srinivasan wrote:
> From: Preeti U Murthy <preeti@linux.vnet.ibm.com>
> 
> The current logic in load balance is such that after picking the
> busiest group, the load is attempted to be moved from the busiest cpu
> in that group to the dst_cpu. If the load cannot be moved from the
> busiest cpu to dst_cpu due to either tsk_cpus_allowed mask or cache
> hot tasks, then the dst_cpu is changed to be another idle cpu within
> the dst->grpmask. If even then, the load cannot be moved from the
> busiest cpu, then the source group is changed. The next busiest group
> is found and the above steps are repeated.
> 
> However if the cpus in the group share package resources, then when
> a load movement from the busiest cpu in this group fails as above,
> instead of finding the next busiest group to move load from, find the
> next busiest cpu *within the same group* from which to move load away.
> By doing so, a conscious effort is made during load balancing to keep
> just one cpu busy as much as possible within domains that have
> SHARED_PKG_RESOURCES flag set unless under scenarios of high load.
> Having multiple cpus busy within a domain which share package resource
> could lead to a performance hit.
> 
> A similar scenario arises in active load balancing as well. When the
> current task on the busiest cpu cannot be moved away due to task
> pinning, currently no more attempts at load balancing is made.

> This
> patch checks if the balancing is being done on a group whose cpus
> share package resources. If so, then check if the load balancing can
> be done for other cpus in the same group.

So I absolutely hate this patch... Also I'm not convinced I actually
understand the explanation above.

Furthermore; there is nothing special about spreading tasks for
SHARED_PKG_RESOURCES and special casing that one case is just wrong.

If anything it should be keyed off of SD_PREFER_SIBLING and or
cpu_power.
Preeti U Murthy - Oct. 29, 2013, 5:35 a.m.
Hi Peter,

On 10/28/2013 09:23 PM, Peter Zijlstra wrote:
> On Mon, Oct 21, 2013 at 05:15:02PM +0530, Vaidyanathan Srinivasan wrote:
>> From: Preeti U Murthy <preeti@linux.vnet.ibm.com>
>>
>> The current logic in load balance is such that after picking the
>> busiest group, the load is attempted to be moved from the busiest cpu
>> in that group to the dst_cpu. If the load cannot be moved from the
>> busiest cpu to dst_cpu due to either tsk_cpus_allowed mask or cache
>> hot tasks, then the dst_cpu is changed to be another idle cpu within
>> the dst->grpmask. If even then, the load cannot be moved from the
>> busiest cpu, then the source group is changed. The next busiest group
>> is found and the above steps are repeated.
>>
>> However if the cpus in the group share package resources, then when
>> a load movement from the busiest cpu in this group fails as above,
>> instead of finding the next busiest group to move load from, find the
>> next busiest cpu *within the same group* from which to move load away.
>> By doing so, a conscious effort is made during load balancing to keep
>> just one cpu busy as much as possible within domains that have
>> SHARED_PKG_RESOURCES flag set unless under scenarios of high load.
>> Having multiple cpus busy within a domain which share package resource
>> could lead to a performance hit.
>>
>> A similar scenario arises in active load balancing as well. When the
>> current task on the busiest cpu cannot be moved away due to task
>> pinning, currently no more attempts at load balancing is made.
> 
>> This
>> patch checks if the balancing is being done on a group whose cpus
>> share package resources. If so, then check if the load balancing can
>> be done for other cpus in the same group.
> 
> So I absolutely hate this patch... Also I'm not convinced I actually
> understand the explanation above.
> 
> Furthermore; there is nothing special about spreading tasks for
> SHARED_PKG_RESOURCES and special casing that one case is just wrong.
> 
> If anything it should be keyed off of SD_PREFER_SIBLING and or
> cpu_power.

At a SIBLING level, which has SHARED_PKG_RESOURCES set, cpu_power in
fact takes care of ensuring that the scheduler mostly spreads the load
when there is more than one running task by nominating the group as
busy. But the issue that this patch is bringing to the front is a bit
different; it's not during the time of this nomination, it's at the time
of load balancing. It is explained below.

So metrics like cpu_power and flags like SD_PREFER_SIBLING ensure that
we spread the load by nominating such groups as busiest in
update_sg_lb_stats() and update_sd_lb_stats(). So "nominating a group"
as busiest by virtue of cpu_power or flags is taken care of.

However, in load_balance(), if the imbalance cannot be offset by moving
load from the busiest_cpu in the busiest_group, then today we do not try
the *next busiest cpu in the group*; instead we try the next busiest_group.

So whatever effort we put in by nominating this group as busiest, if the
grp_power and flags do not favour tasks on it, seems relevant only if
the busiest cpu in that group co-operates in offloading tasks. Should we
not be trying our best to move load from any other cpu in this group ?

This patch identifies one such situation, which led to too many tasks on
a core and got me to ponder over this question. I agree that the fix in
this patch is not right. But I thought this would open up discussion
around the above question. It's true that iterating over all the cpus in
a group during the actual load balance is too much of an overhead, but
isn't there a balance we can strike during load balance iterations for
such groups which have limited cpu power?
> 
Thanks

Regards
Preeti U Murthy

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 828ed97..bbcd96b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5165,6 +5165,8 @@  static int load_balance(int this_cpu, struct rq *this_rq,
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
 	struct sched_group *group;
+	struct sched_domain *child;
+	int share_pkg_res = 0;
 	struct rq *busiest;
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
@@ -5190,6 +5192,10 @@  static int load_balance(int this_cpu, struct rq *this_rq,
 
 	schedstat_inc(sd, lb_count[idle]);
 
+	child = sd->child;
+	if (child && child->flags & SD_SHARE_PKG_RESOURCES)
+		share_pkg_res = 1;
+
 redo:
 	if (!should_we_balance(&env)) {
 		*continue_balancing = 0;
@@ -5202,6 +5208,7 @@  redo:
 		goto out_balanced;
 	}
 
+redo_grp:
 	busiest = find_busiest_queue(&env, group);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
@@ -5292,6 +5299,11 @@  more_balance:
 			if (!cpumask_empty(cpus)) {
 				env.loop = 0;
 				env.loop_break = sched_nr_migrate_break;
+				if (share_pkg_res &&
+					cpumask_intersects(cpus,
+						to_cpumask(group->cpumask)))
+					goto redo_grp;
+
 				goto redo;
 			}
 			goto out_balanced;
@@ -5318,9 +5330,15 @@  more_balance:
 			 */
 			if (!cpumask_test_cpu(this_cpu,
 					tsk_cpus_allowed(busiest->curr))) {
+				cpumask_clear_cpu(cpu_of(busiest), cpus);
 				raw_spin_unlock_irqrestore(&busiest->lock,
 							    flags);
 				env.flags |= LBF_ALL_PINNED;
+				if (share_pkg_res &&
+					cpumask_intersects(cpus,
+						to_cpumask(group->cpumask)))
+					goto redo_grp;
+
 				goto out_one_pinned;
 			}