diff mbox

[1/3] sched: Fix nohz_kick_needed to consider the nr_busy of the parent domain's group

Message ID 5268D54A.9060604@linux.vnet.ibm.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Preeti U Murthy Oct. 24, 2013, 8:07 a.m. UTC
Hi Vincent,

I have addressed your comments and below is the fresh patch. This patch
applies on top of PATCH 2/3 posted in this thread.

Regards
Preeti U Murthy


sched: Remove unnecessary iterations over sched domains to update/query nr_busy_cpus

From: Preeti U Murthy <preeti@linux.vnet.ibm.com>

The nr_busy_cpus parameter is used by nohz_kick_needed() to find out the
number of busy cpus in a sched domain which has the SD_SHARE_PKG_RESOURCES
flag set. Updating nr_busy_cpus at every level of the sched domain hierarchy
is therefore unnecessary; we can update this parameter only at the parent
domain of the sd which has this flag set. Introduce a per-cpu parameter
sd_busy which represents this parent domain.

In nohz_kick_needed() we directly query the nr_busy_cpus parameter
associated with the groups of sd_busy.

By associating sd_busy with the parent of the highest domain which has the
SD_SHARE_PKG_RESOURCES flag set, we cover all lower level domains which could
have this flag set and trigger nohz_idle_balancing if any of the levels have
more than one busy cpu.

sd_busy is irrelevant for asymmetric load balancing.

While we are at it, we might as well change the nohz_idle parameter to be
updated at the sd_busy domain level alone and not the base domain level of a CPU.
This will unify the concept of busy cpus at just one level of sched domain
where it is currently used.

Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
---
 kernel/sched/core.c  |    5 +++++
 kernel/sched/fair.c  |   38 ++++++++++++++++++++------------------
 kernel/sched/sched.h |    1 +
 3 files changed, 26 insertions(+), 18 deletions(-)

Comments

Peter Zijlstra Oct. 28, 2013, 1:50 p.m. UTC | #1
On Thu, Oct 24, 2013 at 01:37:38PM +0530, Preeti U Murthy wrote:
>  kernel/sched/core.c  |    5 +++++
>  kernel/sched/fair.c  |   38 ++++++++++++++++++++------------------
>  kernel/sched/sched.h |    1 +
>  3 files changed, 26 insertions(+), 18 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c06b8d3..c540392 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5271,6 +5271,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
>  DEFINE_PER_CPU(int, sd_llc_size);
>  DEFINE_PER_CPU(int, sd_llc_id);
>  DEFINE_PER_CPU(struct sched_domain *, sd_numa);
> +DEFINE_PER_CPU(struct sched_domain *, sd_busy);
>  
>  static void update_top_cache_domain(int cpu)
>  {
> @@ -5290,6 +5291,10 @@ static void update_top_cache_domain(int cpu)
>  
>  	sd = lowest_flag_domain(cpu, SD_NUMA);
>  	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
> +
> +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> +	if (sd)
> +		rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
>  }
>  
>  /*
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e9c9549..f66cfd9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
>  static inline void set_cpu_sd_state_busy(void)
>  {
>  	struct sched_domain *sd;
> +	int cpu = smp_processor_id();
>  
>  	rcu_read_lock();
> +	sd = rcu_dereference(per_cpu(sd_busy, cpu));
>  
>  	if (!sd || !sd->nohz_idle)
>  		goto unlock;
>  	sd->nohz_idle = 0;
>  
> +	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
>  unlock:
>  	rcu_read_unlock();
>  }
> @@ -6532,16 +6532,16 @@ unlock:
>  void set_cpu_sd_state_idle(void)
>  {
>  	struct sched_domain *sd;
> +	int cpu = smp_processor_id();
>  
>  	rcu_read_lock();
> +	sd = rcu_dereference(per_cpu(sd_busy, cpu));
>  
>  	if (!sd || sd->nohz_idle)
>  		goto unlock;
>  	sd->nohz_idle = 1;
>  
> +	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
>  unlock:
>  	rcu_read_unlock();
>  }

Oh nice, that gets rid of the multiple atomics, and it nicely splits
this nohz logic into per topology groups -- now if only we could split
the rest too :-)

> @@ -6748,6 +6748,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
>  {
>  	unsigned long now = jiffies;
>  	struct sched_domain *sd;
> +	struct sched_group_power *sgp;
> +	int nr_busy;
>  
>  	if (unlikely(idle_cpu(cpu)))
>  		return 0;
> @@ -6773,22 +6775,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
>  		goto need_kick;
>  
>  	rcu_read_lock();
> +	sd = rcu_dereference(per_cpu(sd_busy, cpu));
>  
> +	if (sd) {
> +		sgp = sd->groups->sgp;
> +		nr_busy = atomic_read(&sgp->nr_busy_cpus);
>  
> +		if (nr_busy > 1)
>  			goto need_kick_unlock;
>  	}

OK, so far so good.

> +
> +	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
> +
> +	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
> +				  sched_domain_span(sd)) < cpu))
> +		goto need_kick_unlock;
> +
>  	rcu_read_unlock();
>  	return 0;

This again is a bit sad; most archs will not have SD_ASYM_PACKING set at
all; this means that they all will do a complete (and pointless) sched
domain tree walk here.

It would be much better to also introduce sd_asym and do the analogous
thing to the new sd_busy.
Preeti U Murthy Oct. 29, 2013, 3:30 a.m. UTC | #2
Hi Peter,

On 10/28/2013 07:20 PM, Peter Zijlstra wrote:
> On Thu, Oct 24, 2013 at 01:37:38PM +0530, Preeti U Murthy wrote:
>>  kernel/sched/core.c  |    5 +++++
>>  kernel/sched/fair.c  |   38 ++++++++++++++++++++------------------
>>  kernel/sched/sched.h |    1 +
>>  3 files changed, 26 insertions(+), 18 deletions(-)
>>
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index c06b8d3..c540392 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -5271,6 +5271,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
>>  DEFINE_PER_CPU(int, sd_llc_size);
>>  DEFINE_PER_CPU(int, sd_llc_id);
>>  DEFINE_PER_CPU(struct sched_domain *, sd_numa);
>> +DEFINE_PER_CPU(struct sched_domain *, sd_busy);
>>  
>>  static void update_top_cache_domain(int cpu)
>>  {
>> @@ -5290,6 +5291,10 @@ static void update_top_cache_domain(int cpu)
>>  
>>  	sd = lowest_flag_domain(cpu, SD_NUMA);
>>  	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
>> +
>> +	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>> +	if (sd)
>> +		rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
>>  }
>>  
>>  /*
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index e9c9549..f66cfd9 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -6515,16 +6515,16 @@ static inline void nohz_balance_exit_idle(int cpu)
>>  static inline void set_cpu_sd_state_busy(void)
>>  {
>>  	struct sched_domain *sd;
>> +	int cpu = smp_processor_id();
>>  
>>  	rcu_read_lock();
>> +	sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>  
>>  	if (!sd || !sd->nohz_idle)
>>  		goto unlock;
>>  	sd->nohz_idle = 0;
>>  
>> +	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
>>  unlock:
>>  	rcu_read_unlock();
>>  }
>> @@ -6532,16 +6532,16 @@ unlock:
>>  void set_cpu_sd_state_idle(void)
>>  {
>>  	struct sched_domain *sd;
>> +	int cpu = smp_processor_id();
>>  
>>  	rcu_read_lock();
>> +	sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>  
>>  	if (!sd || sd->nohz_idle)
>>  		goto unlock;
>>  	sd->nohz_idle = 1;
>>  
>> +	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
>>  unlock:
>>  	rcu_read_unlock();
>>  }
> 
> Oh nice, that gets rid of the multiple atomics, and it nicely splits
> this nohz logic into per topology groups -- now if only we could split
> the rest too :-)

I am sorry, I don't get you here. By the 'rest', do you refer to
nohz_kick_needed() as below? Or am I missing something?

> 
>> @@ -6748,6 +6748,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
>>  {
>>  	unsigned long now = jiffies;
>>  	struct sched_domain *sd;
>> +	struct sched_group_power *sgp;
>> +	int nr_busy;
>>  
>>  	if (unlikely(idle_cpu(cpu)))
>>  		return 0;
>> @@ -6773,22 +6775,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
>>  		goto need_kick;
>>  
>>  	rcu_read_lock();
>> +	sd = rcu_dereference(per_cpu(sd_busy, cpu));
>>  
>> +	if (sd) {
>> +		sgp = sd->groups->sgp;
>> +		nr_busy = atomic_read(&sgp->nr_busy_cpus);
>>  
>> +		if (nr_busy > 1)
>>  			goto need_kick_unlock;
>>  	}
> 
> OK, so far so good.
> 
>> +
>> +	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
>> +
>> +	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>> +				  sched_domain_span(sd)) < cpu))
>> +		goto need_kick_unlock;
>> +
>>  	rcu_read_unlock();
>>  	return 0;
> 
> This again is a bit sad; most archs will not have SD_ASYM_PACKING set at
> all; this means that they all will do a complete (and pointless) sched
> domain tree walk here.

There will not be a 'complete' sched domain tree walk, right? The
iteration will break at the first level of the sched domain for those
archs which do not have SD_ASYM_PACKING set at all.

But it is true that doing a sched domain tree walk regularly is a bad
idea; we might as well update the domain with SD_ASYM_PACKING flag set once
and query this domain when required.

I will send out the patch with sd_asym domain introduced rather than the
above.

Thanks

Regards
Preeti U Murthy

> 
> It would be much better to also introduce sd_asym and do the analogous
> thing to the new sd_busy.
>
Peter Zijlstra Oct. 29, 2013, 1:26 p.m. UTC | #3
On Tue, Oct 29, 2013 at 09:00:52AM +0530, Preeti U Murthy wrote:
> > Oh nice, that gets rid of the multiple atomics, and it nicely splits
> > this nohz logic into per topology groups -- now if only we could split
> > the rest too :-)
> 
> I am sorry, I don't get you here. By the 'rest', do you refer to
> nohz_kick_needed() as below? Or am I missing something?

Nah, the rest of the NOHZ infrastructure. Currently it's global state;
there were some patches a few years ago that attempted to make that
per-node state, but that work stalled due to people switching jobs.


> >> +	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
> >> +
> >> +	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
> >> +				  sched_domain_span(sd)) < cpu))
> >> +		goto need_kick_unlock;
> >> +
> >>  	rcu_read_unlock();
> >>  	return 0;
> > 
> > This again is a bit sad; most archs will not have SD_ASYM_PACKING set at
> > all; this means that they all will do a complete (and pointless) sched
> > domain tree walk here.
> 
> There will not be a 'complete' sched domain tree walk right? The
> iteration will break at the first level of the sched domain for those
> archs which do not have SD_ASYM_PACKING set at all.

Ah indeed; I think I got confused due to me modifying
highest_flag_domain() earlier to assume a flag is carried from the
lowest domain upwards.

> But it is true that doing a sched domain tree walk regularly is a bad
> idea, might as well update the domain with SD_ASYM_PACKING flag set once
> and query this domain when required.
> 
> I will send out the patch with sd_asym domain introduced rather than the
> above.

Thanks
diff mbox

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c06b8d3..c540392 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5271,6 +5271,7 @@  DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain *, sd_busy);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5290,6 +5291,10 @@  static void update_top_cache_domain(int cpu)
 
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+
+	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+	if (sd)
+		rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9c9549..f66cfd9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6515,16 +6515,16 @@  static inline void nohz_balance_exit_idle(int cpu)
 static inline void set_cpu_sd_state_busy(void)
 {
 	struct sched_domain *sd;
+	int cpu = smp_processor_id();
 
 	rcu_read_lock();
-	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+	sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
 	if (!sd || !sd->nohz_idle)
 		goto unlock;
 	sd->nohz_idle = 0;
 
-	for (; sd; sd = sd->parent)
-		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
 unlock:
 	rcu_read_unlock();
 }
@@ -6532,16 +6532,16 @@  unlock:
 void set_cpu_sd_state_idle(void)
 {
 	struct sched_domain *sd;
+	int cpu = smp_processor_id();
 
 	rcu_read_lock();
-	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
+	sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
 	if (!sd || sd->nohz_idle)
 		goto unlock;
 	sd->nohz_idle = 1;
 
-	for (; sd; sd = sd->parent)
-		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
 unlock:
 	rcu_read_unlock();
 }
@@ -6748,6 +6748,8 @@  static inline int nohz_kick_needed(struct rq *rq, int cpu)
 {
 	unsigned long now = jiffies;
 	struct sched_domain *sd;
+	struct sched_group_power *sgp;
+	int nr_busy;
 
 	if (unlikely(idle_cpu(cpu)))
 		return 0;
@@ -6773,22 +6775,22 @@  static inline int nohz_kick_needed(struct rq *rq, int cpu)
 		goto need_kick;
 
 	rcu_read_lock();
-	for_each_domain(cpu, sd) {
-		struct sched_group *sg = sd->groups;
-		struct sched_group_power *sgp = sg->sgp;
-		int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+	sd = rcu_dereference(per_cpu(sd_busy, cpu));
 
-		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
-			goto need_kick_unlock;
+	if (sd) {
+		sgp = sd->groups->sgp;
+		nr_busy = atomic_read(&sgp->nr_busy_cpus);
 
-		if (sd->flags & SD_ASYM_PACKING
-		    && (cpumask_first_and(nohz.idle_cpus_mask,
-					  sched_domain_span(sd)) < cpu))
+		if (nr_busy > 1)
 			goto need_kick_unlock;
-
-		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
-			break;
 	}
+
+	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+
+	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
+				  sched_domain_span(sd)) < cpu))
+		goto need_kick_unlock;
+
 	rcu_read_unlock();
 	return 0;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ffc7087..0f1253f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -623,6 +623,7 @@  DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain *, sd_busy);
 
 struct sched_group_power {
 	atomic_t ref;