diff mbox

[RFC,v2,1/4] cpuidle: (powerpc) Add cpu_idle_wait() to allow switching of idle routines

Message ID 20111117112830.9191.1951.stgit@localhost6.localdomain6 (mailing list archive)
State Changes Requested
Delegated to: Benjamin Herrenschmidt
Headers show

Commit Message

Deepthi Dharwar Nov. 17, 2011, 11:28 a.m. UTC
This patch provides cpu_idle_wait() routine for the powerpc
platform which is required by the cpuidle subsystem. This
routine is required to change the idle handler on SMP systems.
The equivalent routine for x86 is in arch/x86/kernel/process.c
but the powerpc implementation is different.

Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Trinabh Gupta <g.trinabh@gmail.com>
Signed-off-by: Arun R Bharadwaj <arun.r.bharadwaj@gmail.com>
---
 arch/powerpc/Kconfig                 |    4 ++++
 arch/powerpc/include/asm/processor.h |    2 ++
 arch/powerpc/include/asm/system.h    |    1 +
 arch/powerpc/kernel/idle.c           |   26 ++++++++++++++++++++++++++
 4 files changed, 33 insertions(+), 0 deletions(-)

Comments

Benjamin Herrenschmidt Nov. 27, 2011, 10:48 p.m. UTC | #1
On Thu, 2011-11-17 at 16:58 +0530, Deepthi Dharwar wrote:
> This patch provides cpu_idle_wait() routine for the powerpc
> platform which is required by the cpuidle subsystem. This
> routine is requied to change the idle handler on SMP systems.
> The equivalent routine for x86 is in arch/x86/kernel/process.c
> but the powerpc implementation is different.
> 
> Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
> Signed-off-by: Trinabh Gupta <g.trinabh@gmail.com>
> Signed-off-by: Arun R Bharadwaj <arun.r.bharadwaj@gmail.com>
> ---

No, that patch also adds this idle boot override thing (can you pick a
shorter name for boot_option_idle_override btw ?) which seems unrelated
and without any explanation as to what it's supposed to be about.

Additionally, I'm a bit worried (but maybe we already discussed that a
while back, I don't know) but cpu_idle_wait() has "wait" in the name,
which makes me think it might need to actually -wait- for all cpus to
have come out of the function.

Now your implementation doesn't provide that guarantee. It might be
fine, I don't know, but if it is, you'd better document it well in the
comments surrounding the code, because as it is, all you do is shoot an
interrupt which will cause the target CPU to eventually come out of idle
some time in the future.

Cheers,
Ben.

>  arch/powerpc/Kconfig                 |    4 ++++
>  arch/powerpc/include/asm/processor.h |    2 ++
>  arch/powerpc/include/asm/system.h    |    1 +
>  arch/powerpc/kernel/idle.c           |   26 ++++++++++++++++++++++++++
>  4 files changed, 33 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index b177caa..87f8604 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -87,6 +87,10 @@ config ARCH_HAS_ILOG2_U64
>  	bool
>  	default y if 64BIT
>  
> +config ARCH_HAS_CPU_IDLE_WAIT
> +	bool
> +	default y
> +
>  config GENERIC_HWEIGHT
>  	bool
>  	default y
> diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
> index eb11a44..811b7e7 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -382,6 +382,8 @@ static inline unsigned long get_clean_sp(struct pt_regs *regs, int is_32)
>  }
>  #endif
>  
> +enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
> +
>  #endif /* __KERNEL__ */
>  #endif /* __ASSEMBLY__ */
>  #endif /* _ASM_POWERPC_PROCESSOR_H */
> diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
> index e30a13d..ff66680 100644
> --- a/arch/powerpc/include/asm/system.h
> +++ b/arch/powerpc/include/asm/system.h
> @@ -221,6 +221,7 @@ extern unsigned long klimit;
>  extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
>  
>  extern int powersave_nap;	/* set if nap mode can be used in idle loop */
> +void cpu_idle_wait(void);
>  
>  /*
>   * Atomic exchange
> diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
> index 39a2baa..b478c72 100644
> --- a/arch/powerpc/kernel/idle.c
> +++ b/arch/powerpc/kernel/idle.c
> @@ -39,9 +39,13 @@
>  #define cpu_should_die()	0
>  #endif
>  
> +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
> +EXPORT_SYMBOL(boot_option_idle_override);
> +
>  static int __init powersave_off(char *arg)
>  {
>  	ppc_md.power_save = NULL;
> +	boot_option_idle_override = IDLE_POWERSAVE_OFF;
>  	return 0;
>  }
>  __setup("powersave=off", powersave_off);
> @@ -102,6 +106,28 @@ void cpu_idle(void)
>  	}
>  }
>  
> +
> +/*
> + * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
> + * idle loop and start using the new idle loop.
> + * Required while changing idle handler on SMP systems.
> + * Caller must have changed idle handler to the new value before the call.
> + */
> +void cpu_idle_wait(void)
> +{
> +	int cpu;
> +	smp_mb();
> +
> +	/* kick all the CPUs so that they exit out of old idle routine */
> +	get_online_cpus();
> +	for_each_online_cpu(cpu) {
> +		if (cpu != smp_processor_id())
> +			smp_send_reschedule(cpu);
> +	}
> +	put_online_cpus();
> +}
> +EXPORT_SYMBOL_GPL(cpu_idle_wait);
> +
>  int powersave_nap;
>  
>  #ifdef CONFIG_SYSCTL
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
Deepthi Dharwar Nov. 28, 2011, 11:02 a.m. UTC | #2
Hi Ben,

Thanks a lot for the review.

On 11/28/2011 04:18 AM, Benjamin Herrenschmidt wrote:

> On Thu, 2011-11-17 at 16:58 +0530, Deepthi Dharwar wrote:
>> This patch provides cpu_idle_wait() routine for the powerpc
>> platform which is required by the cpuidle subsystem. This
>> routine is requied to change the idle handler on SMP systems.
>> The equivalent routine for x86 is in arch/x86/kernel/process.c
>> but the powerpc implementation is different.
>>
>> Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
>> Signed-off-by: Trinabh Gupta <g.trinabh@gmail.com>
>> Signed-off-by: Arun R Bharadwaj <arun.r.bharadwaj@gmail.com>
>> ---
> 
> No, that patch also adds this idle boot override thing (can you pick a
> shorter name for boot_option_idle_override btw ?) which seems unrelated
> and without any explanation as to what it's supposed to be about.


 Yes, we can pick a better and shorter name for this variable.
 This variable is used to determine whether the cpuidle framework
 needs to be enabled and the pseries driver loaded or not.
 We disable the cpuidle framework only when the powersave=off option
 is set, or when it is not enabled by the user.

> 
> Additionally, I'm a bit worried (but maybe we already discussed that a
> while back, I don't know) but cpu_idle_wait() has "wait" in the name,
> which makes me think it might need to actually -wait- for all cpus to
> have come out of the function.


cpu_idle_wait is used to ensure that all the CPUs discard old idle
handler and update to new one.  Required while changing idle
handler on SMP systems.

> Now your implementation doesn't provide that guarantee. It might be
> fine, I don't know, but if it is, you'd better document it well in the
> comments surrounding the code, because as it is, all you do is shoot an
> interrupt which will cause the target CPU to eventually come out of idle
> some time in the future.


I was hoping that sending an explicit reschedule to the cpus would
do the trick but sure we can add some documentation around the code.

> 
> Cheers,
> Ben.
> 
>>  arch/powerpc/Kconfig                 |    4 ++++
>>  arch/powerpc/include/asm/processor.h |    2 ++
>>  arch/powerpc/include/asm/system.h    |    1 +
>>  arch/powerpc/kernel/idle.c           |   26 ++++++++++++++++++++++++++
>>  4 files changed, 33 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index b177caa..87f8604 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -87,6 +87,10 @@ config ARCH_HAS_ILOG2_U64
>>  	bool
>>  	default y if 64BIT
>>  
>> +config ARCH_HAS_CPU_IDLE_WAIT
>> +	bool
>> +	default y
>> +
>>  config GENERIC_HWEIGHT
>>  	bool
>>  	default y
>> diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
>> index eb11a44..811b7e7 100644
>> --- a/arch/powerpc/include/asm/processor.h
>> +++ b/arch/powerpc/include/asm/processor.h
>> @@ -382,6 +382,8 @@ static inline unsigned long get_clean_sp(struct pt_regs *regs, int is_32)
>>  }
>>  #endif
>>  
>> +enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
>> +
>>  #endif /* __KERNEL__ */
>>  #endif /* __ASSEMBLY__ */
>>  #endif /* _ASM_POWERPC_PROCESSOR_H */
>> diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
>> index e30a13d..ff66680 100644
>> --- a/arch/powerpc/include/asm/system.h
>> +++ b/arch/powerpc/include/asm/system.h
>> @@ -221,6 +221,7 @@ extern unsigned long klimit;
>>  extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
>>  
>>  extern int powersave_nap;	/* set if nap mode can be used in idle loop */
>> +void cpu_idle_wait(void);
>>  
>>  /*
>>   * Atomic exchange
>> diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
>> index 39a2baa..b478c72 100644
>> --- a/arch/powerpc/kernel/idle.c
>> +++ b/arch/powerpc/kernel/idle.c
>> @@ -39,9 +39,13 @@
>>  #define cpu_should_die()	0
>>  #endif
>>  
>> +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
>> +EXPORT_SYMBOL(boot_option_idle_override);
>> +
>>  static int __init powersave_off(char *arg)
>>  {
>>  	ppc_md.power_save = NULL;
>> +	boot_option_idle_override = IDLE_POWERSAVE_OFF;
>>  	return 0;
>>  }
>>  __setup("powersave=off", powersave_off);
>> @@ -102,6 +106,28 @@ void cpu_idle(void)
>>  	}
>>  }
>>  
>> +
>> +/*
>> + * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
>> + * idle loop and start using the new idle loop.
>> + * Required while changing idle handler on SMP systems.
>> + * Caller must have changed idle handler to the new value before the call.
>> + */
>> +void cpu_idle_wait(void)
>> +{
>> +	int cpu;
>> +	smp_mb();
>> +
>> +	/* kick all the CPUs so that they exit out of old idle routine */
>> +	get_online_cpus();
>> +	for_each_online_cpu(cpu) {
>> +		if (cpu != smp_processor_id())
>> +			smp_send_reschedule(cpu);
>> +	}
>> +	put_online_cpus();
>> +}
>> +EXPORT_SYMBOL_GPL(cpu_idle_wait);
>> +
>>  int powersave_nap;
>>  
>>  #ifdef CONFIG_SYSCTL
>>
>> _______________________________________________
>> Linuxppc-dev mailing list
>> Linuxppc-dev@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/linuxppc-dev
> 
> 


Regards,
Deepthi
Benjamin Herrenschmidt Nov. 28, 2011, 8:35 p.m. UTC | #3
On Mon, 2011-11-28 at 16:32 +0530, Deepthi Dharwar wrote:

> > Additionally, I'm a bit worried (but maybe we already discussed that a
> > while back, I don't know) but cpu_idle_wait() has "wait" in the name,
> > which makes me think it might need to actually -wait- for all cpus to
> > have come out of the function.
> 
> cpu_idle_wait is used to ensure that all the CPUs discard old idle
> handler and update to new one.  Required while changing idle
> handler on SMP systems.
>
> > Now your implementation doesn't provide that guarantee. It might be
> > fine, I don't know, but if it is, you'd better document it well in the
> > comments surrounding the code, because as it is, all you do is shoot an
> > interrupt which will cause the target CPU to eventually come out of idle
> > some time in the future.
> 
> 
> I was hoping that sending an explicit reschedule to the cpus would
> do the trick but sure we can add some documentation around the code.

Well, the question is what guarantee do you expect. Sending a reschedule
IPI will take the other CPUs out of the actual sleep mode, but it will
be some time from there back to getting out of the handler function
(first back out of hypervisor etc...).

The code as you implemented it doesn't wait for that to happen. It might
be fine ... or not. I don't know what semantics you are after precisely.

Cheers,
Ben.
Deepthi Dharwar Nov. 29, 2011, 6:42 a.m. UTC | #4
On 11/29/2011 02:05 AM, Benjamin Herrenschmidt wrote:

> On Mon, 2011-11-28 at 16:32 +0530, Deepthi Dharwar wrote:
> 
>>> Additionally, I'm a bit worried (but maybe we already discussed that a
>>> while back, I don't know) but cpu_idle_wait() has "wait" in the name,
>>> which makes me think it might need to actually -wait- for all cpus to
>>> have come out of the function.
>>
>> cpu_idle_wait is used to ensure that all the CPUs discard old idle
>> handler and update to new one.  Required while changing idle
>> handler on SMP systems.
>>
>>> Now your implementation doesn't provide that guarantee. It might be
>>> fine, I don't know, but if it is, you'd better document it well in the
>>> comments surrounding the code, because as it is, all you do is shoot an
>>> interrupt which will cause the target CPU to eventually come out of idle
>>> some time in the future.
>>
>>
>> I was hoping that sending an explicit reschedule to the cpus would
>> do the trick but sure we can add some documentation around the code.
> 
> Well, the question is what guarantee do you expect. Sending a reschedule
> IPI will take the other CPUs out of the actual sleep mode, but it will
> be some time from there back to getting out of the handler function
> (first back out of hypervisor etc...).
> 
> The code as you implemented it doesn't wait for that to happen. It might
> be fine ... or not. I don't know what semantics you are after precisely.
> 
> Cheers,
> Ben.
> 
> 


Yes, this could be problematic as there is a small window for the
race condition to occur. Otherwise we need to manually schedule
it by running a kernel thread, but this would definitely have an
overhead and would be overkill.

Regards,
Deepthi
Benjamin Herrenschmidt Nov. 29, 2011, 7:01 a.m. UTC | #5
On Tue, 2011-11-29 at 12:12 +0530, Deepthi Dharwar wrote:
> 
> Yes, this could be problematic as there is small window for the
> race condition to occur . Otherwise we need to manually schedule
> it by running a kernel thread but this would definitely have a
> overhead and would be an overkill. 

Depends what this "window" is. IE. What are you trying to protect
yourself against ? What's the risk ?

If it's just module unload, then stop_machine is probably your
friend :-)

Cheers,
Ben.
Deepthi Dharwar Nov. 29, 2011, 7:15 a.m. UTC | #6
On 11/29/2011 12:31 PM, Benjamin Herrenschmidt wrote:

> On Tue, 2011-11-29 at 12:12 +0530, Deepthi Dharwar wrote:
>>
>> Yes, this could be problematic as there is small window for the
>> race condition to occur . Otherwise we need to manually schedule
>> it by running a kernel thread but this would definitely have a
>> overhead and would be an overkill. 
> 
> Depends what this "window" is. IE. What are you trying to protect
> yourself against ? What's the risk ?
> 
> If it's just module unload, then stop_machine is probably your
> friend :-)
> 
> Cheers,
> Ben.
> 
> 


Yup, it is the module unload that I am worried about. Otherwise
manually doing it using a kernel thread would be overkill :-(

Regards,
Deepthi
diff mbox

Patch

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b177caa..87f8604 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -87,6 +87,10 @@  config ARCH_HAS_ILOG2_U64
 	bool
 	default y if 64BIT
 
+config ARCH_HAS_CPU_IDLE_WAIT
+	bool
+	default y
+
 config GENERIC_HWEIGHT
 	bool
 	default y
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index eb11a44..811b7e7 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -382,6 +382,8 @@  static inline unsigned long get_clean_sp(struct pt_regs *regs, int is_32)
 }
 #endif
 
+enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
+
 #endif /* __KERNEL__ */
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_PROCESSOR_H */
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
index e30a13d..ff66680 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -221,6 +221,7 @@  extern unsigned long klimit;
 extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
 
 extern int powersave_nap;	/* set if nap mode can be used in idle loop */
+void cpu_idle_wait(void);
 
 /*
  * Atomic exchange
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 39a2baa..b478c72 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -39,9 +39,13 @@ 
 #define cpu_should_die()	0
 #endif
 
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
+EXPORT_SYMBOL(boot_option_idle_override);
+
 static int __init powersave_off(char *arg)
 {
 	ppc_md.power_save = NULL;
+	boot_option_idle_override = IDLE_POWERSAVE_OFF;
 	return 0;
 }
 __setup("powersave=off", powersave_off);
@@ -102,6 +106,28 @@  void cpu_idle(void)
 	}
 }
 
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
+ * idle loop and start using the new idle loop.
+ * Required while changing idle handler on SMP systems.
+ * Caller must have changed idle handler to the new value before the call.
+ */
+void cpu_idle_wait(void)
+{
+	int cpu;
+	smp_mb();
+
+	/* kick all the CPUs so that they exit out of old idle routine */
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		if (cpu != smp_processor_id())
+			smp_send_reschedule(cpu);
+	}
+	put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
 int powersave_nap;
 
 #ifdef CONFIG_SYSCTL