diff mbox

[2/2] sched: Try tp catch cpu_power being set to 0

Message ID 1295364863-9028-3-git-send-email-stefan.bader@canonical.com
State Accepted
Delegated to: Stefan Bader
Headers show

Commit Message

Stefan Bader Jan. 18, 2011, 3:34 p.m. UTC
This is an optional change to try catching the culprit which changes
cpu_power to 0 (should never happen) and causes divide by zero crashes
later on in the scheduler code.

BugLink: http://bugs.launchpad.net/bugs/614853

Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
---
 kernel/sched.c |   12 +++++++++++-
 1 files changed, 11 insertions(+), 1 deletions(-)

Comments

Andy Whitcroft Jan. 19, 2011, 2:20 p.m. UTC | #1
On Tue, Jan 18, 2011 at 04:34:23PM +0100, Stefan Bader wrote:
> This is an optional change to try catching the culprit which changes
> cpu_power to 0 (should never happen) and causes divide by zero crashes
> later on in the scheduler code.
> 
> BugLink: http://bugs.launchpad.net/bugs/614853
> 
> Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
> ---
>  kernel/sched.c |   12 +++++++++++-
>  1 files changed, 11 insertions(+), 1 deletions(-)
> 
> diff --git a/kernel/sched.c b/kernel/sched.c
> index d4a4b14..7ef70c0 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -3713,6 +3713,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
>  	unsigned long weight = sd->span_weight;
>  	unsigned long power = SCHED_LOAD_SCALE;
>  	struct sched_group *sdg = sd->groups;
> +	unsigned long scale_rt;
>  
>  	if (sched_feat(ARCH_POWER))
>  		power *= arch_scale_freq_power(sd, cpu);
> @@ -3730,12 +3731,18 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
>  		power >>= SCHED_LOAD_SHIFT;
>  	}
>  
> -	power *= scale_rt_power(cpu);
> +	scale_rt = scale_rt_power(cpu);
> +	power *= scale_rt;
> +
>  	power >>= SCHED_LOAD_SHIFT;
>  
>  	if (!power)
>  		power = 1;
>  
> +	if (WARN_ON((long) power <= 0))
> +		printk(KERN_ERR "cpu_power = %ld; scale_rt = %ld\n",
> +			power, scale_rt);
> +

This is meant to catch when power == 0, but just above power is zapped
to 1 if it is 0.  Did we want to catch the 0 ie should this be above the
power zapping ?  I guess this is more checking >2^32 ?

>  	sdg->cpu_power = power;
>  }
>  
> @@ -3759,6 +3766,9 @@ static void update_group_power(struct sched_domain *sd, int cpu)
>  	} while (group != child->groups);
>  
>  	sdg->cpu_power = power;
> +
> +	if (WARN_ON((long) power <= 0))
> +		printk(KERN_ERR "cpu_power = %ld\n", power);

This is a little odd, I assume its really checking that power is
not >2^32 and != 0.

Otherwise I suspect the intent is reasonable.  If this has been boot
tested I guess it is ok.  Cirtainly 1/2 is a major paper over the bug
type fix so something needs to be added to try and catch it.

Acked-by: Andy Whitcroft <apw@canonical.com>

-apw
Henrik Rydberg Jan. 19, 2011, 2:50 p.m. UTC | #2
> > diff --git a/kernel/sched.c b/kernel/sched.c
> > index d4a4b14..7ef70c0 100644
> > --- a/kernel/sched.c
> > +++ b/kernel/sched.c
> > @@ -3713,6 +3713,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
> >  	unsigned long weight = sd->span_weight;
> >  	unsigned long power = SCHED_LOAD_SCALE;
> >  	struct sched_group *sdg = sd->groups;
> > +	unsigned long scale_rt;
> >  
> >  	if (sched_feat(ARCH_POWER))
> >  		power *= arch_scale_freq_power(sd, cpu);
> > @@ -3730,12 +3731,18 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
> >  		power >>= SCHED_LOAD_SHIFT;
> >  	}
> >  
> > -	power *= scale_rt_power(cpu);
> > +	scale_rt = scale_rt_power(cpu);
> > +	power *= scale_rt;
> > +
> >  	power >>= SCHED_LOAD_SHIFT;
> >  
> >  	if (!power)
> >  		power = 1;
> >  
> > +	if (WARN_ON((long) power <= 0))
> > +		printk(KERN_ERR "cpu_power = %ld; scale_rt = %ld\n",
> > +			power, scale_rt);
> > +
> 
> This is meant to catch when power == 0, but just above power is zapped
> to 1 if it is 0.  Did we want to catch the 0 ie should this be above the
> power zapping ?  I guess this is more checking >2^32 ?

Or should it perhaps be placed below the following assignment,
checking the actual register? Is the problem really on 64 bit? Since
power is unsigned long locally, but unsigned int in sched_group, on
32-bit machines...

> 
> >  	sdg->cpu_power = power;
> >  }
> >  

Henrik
Stefan Bader Jan. 20, 2011, 4:11 p.m. UTC | #3
On 01/19/2011 03:20 PM, Andy Whitcroft wrote:
> On Tue, Jan 18, 2011 at 04:34:23PM +0100, Stefan Bader wrote:
>> This is an optional change to try catching the culprit which changes
>> cpu_power to 0 (should never happen) and causes divide by zero crashes
>> later on in the scheduler code.
>>
>> BugLink: http://bugs.launchpad.net/bugs/614853
>>
>> Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
>> ---
>>  kernel/sched.c |   12 +++++++++++-
>>  1 files changed, 11 insertions(+), 1 deletions(-)
>>
>> diff --git a/kernel/sched.c b/kernel/sched.c
>> index d4a4b14..7ef70c0 100644
>> --- a/kernel/sched.c
>> +++ b/kernel/sched.c
>> @@ -3713,6 +3713,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
>>  	unsigned long weight = sd->span_weight;
>>  	unsigned long power = SCHED_LOAD_SCALE;
>>  	struct sched_group *sdg = sd->groups;
>> +	unsigned long scale_rt;
>>  
>>  	if (sched_feat(ARCH_POWER))
>>  		power *= arch_scale_freq_power(sd, cpu);
>> @@ -3730,12 +3731,18 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
>>  		power >>= SCHED_LOAD_SHIFT;
>>  	}
>>  
>> -	power *= scale_rt_power(cpu);
>> +	scale_rt = scale_rt_power(cpu);
>> +	power *= scale_rt;
>> +
>>  	power >>= SCHED_LOAD_SHIFT;
>>  
>>  	if (!power)
>>  		power = 1;
>>  
>> +	if (WARN_ON((long) power <= 0))
>> +		printk(KERN_ERR "cpu_power = %ld; scale_rt = %ld\n",
>> +			power, scale_rt);
>> +
> 
> This is meant to catch when power == 0, but just above power is zapped
> to 1 if it is 0.  Did we want to catch the 0 ie should this be above the
> power zapping ?  I guess this is more checking >2^32 ?
> 
>>  	sdg->cpu_power = power;
>>  }
>>  
>> @@ -3759,6 +3766,9 @@ static void update_group_power(struct sched_domain *sd, int cpu)
>>  	} while (group != child->groups);
>>  
>>  	sdg->cpu_power = power;
>> +
>> +	if (WARN_ON((long) power <= 0))
>> +		printk(KERN_ERR "cpu_power = %ld\n", power);
> 
> This is a little odd, I assume its really checking that power is
> not >2^32 and != 0.
> 
> Otherwise I suspect the intent is reasonable.  If this has been boot
> tested I guess it is ok.  Cirtainly 1/2 is a major paper over the bug
> type fix so something needs to be added to try and catch it.
> 
> Acked-by: Andy Whitcroft <apw@canonical.com>
> 
> -apw

Ok, so without changing those I just tried a Lucid ec2 kernel for 32 and 64 bit
with all the 3 recently posted patches applied and it still boots without any
complaints. So it seems (even with the questionable sense) it does not cause
pain to add them.

-Stefan
Tim Gardner Jan. 20, 2011, 6:14 p.m. UTC | #4
On 01/20/2011 09:11 AM, Stefan Bader wrote:
> On 01/19/2011 03:20 PM, Andy Whitcroft wrote:
>> On Tue, Jan 18, 2011 at 04:34:23PM +0100, Stefan Bader wrote:
>>> This is an optional change to try catching the culprit which changes
>>> cpu_power to 0 (should never happen) and causes divide by zero crashes
>>> later on in the scheduler code.
>>>
>>> BugLink: http://bugs.launchpad.net/bugs/614853
>>>
>>> Signed-off-by: Stefan Bader<stefan.bader@canonical.com>
>>> ---
>>>   kernel/sched.c |   12 +++++++++++-
>>>   1 files changed, 11 insertions(+), 1 deletions(-)
>>>
>>> diff --git a/kernel/sched.c b/kernel/sched.c
>>> index d4a4b14..7ef70c0 100644
>>> --- a/kernel/sched.c
>>> +++ b/kernel/sched.c
>>> @@ -3713,6 +3713,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
>>>   	unsigned long weight = sd->span_weight;
>>>   	unsigned long power = SCHED_LOAD_SCALE;
>>>   	struct sched_group *sdg = sd->groups;
>>> +	unsigned long scale_rt;
>>>
>>>   	if (sched_feat(ARCH_POWER))
>>>   		power *= arch_scale_freq_power(sd, cpu);
>>> @@ -3730,12 +3731,18 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
>>>   		power>>= SCHED_LOAD_SHIFT;
>>>   	}
>>>
>>> -	power *= scale_rt_power(cpu);
>>> +	scale_rt = scale_rt_power(cpu);
>>> +	power *= scale_rt;
>>> +
>>>   	power>>= SCHED_LOAD_SHIFT;
>>>
>>>   	if (!power)
>>>   		power = 1;
>>>
>>> +	if (WARN_ON((long) power<= 0))
>>> +		printk(KERN_ERR "cpu_power = %ld; scale_rt = %ld\n",
>>> +			power, scale_rt);
>>> +
>>
>> This is meant to catch when power == 0, but just above power is zapped
>> to 1 if it is 0.  Did we want to catch the 0 ie should this be above the
>> power zapping ?  I guess this is more checking>2^32 ?
>>
>>>   	sdg->cpu_power = power;
>>>   }
>>>
>>> @@ -3759,6 +3766,9 @@ static void update_group_power(struct sched_domain *sd, int cpu)
>>>   	} while (group != child->groups);
>>>
>>>   	sdg->cpu_power = power;
>>> +
>>> +	if (WARN_ON((long) power<= 0))
>>> +		printk(KERN_ERR "cpu_power = %ld\n", power);
>>
>> This is a little odd, I assume its really checking that power is
>> not>2^32 and != 0.
>>
>> Otherwise I suspect the intent is reasonable.  If this has been boot
>> tested I guess it is ok.  Cirtainly 1/2 is a major paper over the bug
>> type fix so something needs to be added to try and catch it.
>>
>> Acked-by: Andy Whitcroft<apw@canonical.com>
>>
>> -apw
>
> Ok, so without changing those I just tried a Lucid ec2 kernel for 32 and 64 bit
> with all the 3 recently posted patches applied and it still boots without any
> complaints. So it seems (even with the questionable sense) it does not cause
> pain to add them.
>
> -Stefan
>

Since this is quite testable:

Acked-by: Tim Gardner <tim.gardner@canonical.com>
diff mbox

Patch

diff --git a/kernel/sched.c b/kernel/sched.c
index d4a4b14..7ef70c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3713,6 +3713,7 @@  static void update_cpu_power(struct sched_domain *sd, int cpu)
 	unsigned long weight = sd->span_weight;
 	unsigned long power = SCHED_LOAD_SCALE;
 	struct sched_group *sdg = sd->groups;
+	unsigned long scale_rt;
 
 	if (sched_feat(ARCH_POWER))
 		power *= arch_scale_freq_power(sd, cpu);
@@ -3730,12 +3731,18 @@  static void update_cpu_power(struct sched_domain *sd, int cpu)
 		power >>= SCHED_LOAD_SHIFT;
 	}
 
-	power *= scale_rt_power(cpu);
+	scale_rt = scale_rt_power(cpu);
+	power *= scale_rt;
+
 	power >>= SCHED_LOAD_SHIFT;
 
 	if (!power)
 		power = 1;
 
+	if (WARN_ON((long) power <= 0))
+		printk(KERN_ERR "cpu_power = %ld; scale_rt = %ld\n",
+			power, scale_rt);
+
 	sdg->cpu_power = power;
 }
 
@@ -3759,6 +3766,9 @@  static void update_group_power(struct sched_domain *sd, int cpu)
 	} while (group != child->groups);
 
 	sdg->cpu_power = power;
+
+	if (WARN_ON((long) power <= 0))
+		printk(KERN_ERR "cpu_power = %ld\n", power);
 }
 
 /**