diff mbox

[v2] powerpc/kexec: Fix orphaned offline CPUs across kexec

Message ID 4C511216.30109@ozlabs.org (mailing list archive)
State Changes Requested
Headers show

Commit Message

Matt Evans July 29, 2010, 5:31 a.m. UTC
When CPU hotplug is used, some CPUs may be offline at the time a kexec is
performed.  The subsequent kernel may expect these CPUs to be already running,
and will declare them stuck.  On pseries, there's also a soft-offline (cede)
state that CPUs may be in; this can also cause problems as the kexeced kernel
may ask RTAS if they're online -- and RTAS would say they are.  Again, stuck.

This patch kicks each present offline CPU awake before the kexec, so that
none are lost to these assumptions in the subsequent kernel.

Signed-off-by: Matt Evans <matt@ozlabs.org>
---
v2:	Added FIXME comment noting a possible problem with incorrectly
	started secondary CPUs, following feedback from Milton.

 arch/powerpc/kernel/machine_kexec_64.c |   55 ++++++++++++++++++++++++++++---
 1 files changed, 49 insertions(+), 6 deletions(-)

Comments

Michael Neuling July 30, 2010, 12:08 a.m. UTC | #1
In message <4C511216.30109@ozlabs.org> you wrote:
> When CPU hotplug is used, some CPUs may be offline at the time a kexec is
> performed.  The subsequent kernel may expect these CPUs to be already running
,
> and will declare them stuck.  On pseries, there's also a soft-offline (cede)
> state that CPUs may be in; this can also cause problems as the kexeced kernel
> may ask RTAS if they're online -- and RTAS would say they are.  Again, stuck.
> 
> This patch kicks each present offline CPU awake before the kexec, so that
> none are lost to these assumptions in the subsequent kernel.

There are a lot of cleanups in this patch.  The change you are making
would be a lot clearer without all the additional cleanups in there.  I
think I'd like to see this as two patches.  One for cleanups and one for
the addition of wake_offline_cpus().

Other than that, I'm not completely convinced this is the functionality
we want.  Do we really want to online these cpus?  Why where they
offlined in the first place?  I understand the stuck problem, but is the
solution to online them, or to change the device tree so that the second
kernel doesn't detect them as stuck?  

Mikey

> 
> Signed-off-by: Matt Evans <matt@ozlabs.org>
> ---
> v2:	Added FIXME comment noting a possible problem with incorrectly
> 	started secondary CPUs, following feedback from Milton.
> 
>  arch/powerpc/kernel/machine_kexec_64.c |   55 ++++++++++++++++++++++++++++--
-
>  1 files changed, 49 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/mac
hine_kexec_64.c
> index 4fbb3be..37f805e 100644
> --- a/arch/powerpc/kernel/machine_kexec_64.c
> +++ b/arch/powerpc/kernel/machine_kexec_64.c
> @@ -15,6 +15,8 @@
>  #include <linux/thread_info.h>
>  #include <linux/init_task.h>
>  #include <linux/errno.h>
> +#include <linux/kernel.h>
> +#include <linux/cpu.h>
>  
>  #include <asm/page.h>
>  #include <asm/current.h>
> @@ -181,7 +183,20 @@ static void kexec_prepare_cpus_wait(int wait_state)
>  	int my_cpu, i, notified=-1;
>  
>  	my_cpu = get_cpu();
> -	/* Make sure each CPU has atleast made it to the state we need */
> +	/* Make sure each CPU has at least made it to the state we need.
> +	 *
> +	 * FIXME: There is a (slim) chance of a problem if not all of the CPUs
> +	 * are correctly onlined.  If somehow we start a CPU on boot with RTAS
> +	 * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in
> +	 * time, the boot CPU will timeout.  If it does eventually execute
> +	 * stuff, the secondary will start up (paca[].cpu_start was written) an
d
> +	 * get into a peculiar state.  If the platform supports
> +	 * smp_ops->take_timebase(), the secondary CPU will probably be spinnin
g
> +	 * in there.  If not (i.e. pseries), the secondary will continue on and
> +	 * try to online itself/idle/etc. If it survives that, we need to find
> +	 * these possible-but-not-online-but-should-be CPUs and chaperone them
> +	 * into kexec_smp_wait().
> +	 */
>  	for_each_online_cpu(i) {
>  		if (i == my_cpu)
>  			continue;
> @@ -189,9 +204,9 @@ static void kexec_prepare_cpus_wait(int wait_state)
>  		while (paca[i].kexec_state < wait_state) {
>  			barrier();
>  			if (i != notified) {
> -				printk( "kexec: waiting for cpu %d (physical"
> -						" %d) to enter %i state\n",
> -					i, paca[i].hw_cpu_id, wait_state);
> +				printk(KERN_INFO "kexec: waiting for cpu %d "
> +				       "(physical %d) to enter %i state\n",
> +				       i, paca[i].hw_cpu_id, wait_state);
>  				notified = i;
>  			}
>  		}
> @@ -199,9 +214,32 @@ static void kexec_prepare_cpus_wait(int wait_state)
>  	mb();
>  }
>  
> -static void kexec_prepare_cpus(void)
> +/*
> + * We need to make sure each present CPU is online.  The next kernel will sc
an
> + * the device tree and assume primary threads are online and query secondary
> + * threads via RTAS to online them if required.  If we don't online primary
> + * threads, they will be stuck.  However, we also online secondary threads a
s we
> + * may be using 'cede offline'.  In this case RTAS doesn't see the secondary
> + * threads as offline -- and again, these CPUs will be stuck.
> + *
> + * So, we online all CPUs that should be running, including secondary thread
s.
> + */
> +static void wake_offline_cpus(void)
>  {
> +	int cpu = 0;
>  
> +	for_each_present_cpu(cpu) {
> +		if (!cpu_online(cpu)) {
> +			printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
> +			       cpu);
> +			cpu_up(cpu);
> +		}
> +	}
> +}
> +
> +static void kexec_prepare_cpus(void)
> +{
> +	wake_offline_cpus();
>  	smp_call_function(kexec_smp_down, NULL, /* wait */0);
>  	local_irq_disable();
>  	mb(); /* make sure IRQs are disabled before we say they are */
> @@ -215,7 +253,10 @@ static void kexec_prepare_cpus(void)
>  	if (ppc_md.kexec_cpu_down)
>  		ppc_md.kexec_cpu_down(0, 0);
>  
> -	/* Before removing MMU mapings make sure all CPUs have entered real mod
e */
> +	/*
> +	 * Before removing MMU mappings make sure all CPUs have entered real
> +	 * mode:
> +	 */
>  	kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE);
>  
>  	put_cpu();
> @@ -284,6 +325,8 @@ void default_machine_kexec(struct kimage *image)
>  	if (crashing_cpu == -1)
>  		kexec_prepare_cpus();
>  
> +	pr_debug("kexec: Starting switchover sequence.\n");
> +
>  	/* switch to a staticly allocated stack.  Based on irq stack code.
>  	 * XXX: the task struct will likely be invalid once we do the copy!
>  	 */
> -- 
> 1.6.3.3
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
>
Matt Evans July 30, 2010, 12:41 a.m. UTC | #2
Michael Neuling wrote:
> In message <4C511216.30109@ozlabs.org> you wrote:
>> When CPU hotplug is used, some CPUs may be offline at the time a kexec is
>> performed.  The subsequent kernel may expect these CPUs to be already running
> ,
>> and will declare them stuck.  On pseries, there's also a soft-offline (cede)
>> state that CPUs may be in; this can also cause problems as the kexeced kernel
>> may ask RTAS if they're online -- and RTAS would say they are.  Again, stuck.
>>
>> This patch kicks each present offline CPU awake before the kexec, so that
>> none are lost to these assumptions in the subsequent kernel.
> 
> There are a lot of cleanups in this patch.  The change you are making
> would be a lot clearer without all the additional cleanups in there.  I
> think I'd like to see this as two patches.  One for cleanups and one for
> the addition of wake_offline_cpus().

Okay, I can split this.  Typofixy-add-debug in one, wake_offline_cpus in another.

> Other than that, I'm not completely convinced this is the functionality
> we want.  Do we really want to online these cpus?  Why where they
> offlined in the first place?  I understand the stuck problem, but is the
> solution to online them, or to change the device tree so that the second
> kernel doesn't detect them as stuck?

Well... There are two cases.  If a CPU is soft-offlined on pseries, it must be woken from that cede loop (in platforms/pseries/hotplug-cpu.c) as we're replacing code under its feet.  We could either special-case the wakeup from this cede loop to get that CPU to RTAS "stop-self" itself properly.  (Kind of like a "wake to die".)

So that leaves hard-offline CPUs (perhaps including the above): I don't know why they might have been offlined.  If it's something serious, like fire, they'd be removed from the present set too (and thus not be considered in this restarting case).  We could add a mask to the CPU node to show which of the threads (if any) are running, and alter the startup code to start everything if this mask doesn't exist (non-kexec) or only online currently-running threads if the mask is present.  That feels a little weird.

My reasoning for restarting everything was:  The first time you boot, all of your present CPUs are started up.  When you reboot, any CPUs you offlined for fun are restarted.  Kexec is (in this non-crash sense) a user-initiated 'quick reboot', so I reasoned that it should look the same as a 'hard reboot' and your new invocation would have all available CPUs running as is usual.


Cheers,


Matt
Michael Neuling July 30, 2010, 3:15 a.m. UTC | #3
(adding kexec list to CC)

In message <4C521FD2.4050301@ozlabs.org> you wrote:
> Michael Neuling wrote:
> > In message <4C511216.30109@ozlabs.org> you wrote:
> >> When CPU hotplug is used, some CPUs may be offline at the time a kexec is
> >> performed.  The subsequent kernel may expect these CPUs to be already runn
ing
> > ,
> >> and will declare them stuck.  On pseries, there's also a soft-offline (ced
e)
> >> state that CPUs may be in; this can also cause problems as the kexeced ker
nel
> >> may ask RTAS if they're online -- and RTAS would say they are.  Again, stu
ck.
> >>
> >> This patch kicks each present offline CPU awake before the kexec, so that
> >> none are lost to these assumptions in the subsequent kernel.
> > 
> > There are a lot of cleanups in this patch.  The change you are making
> > would be a lot clearer without all the additional cleanups in there.  I
> > think I'd like to see this as two patches.  One for cleanups and one for
> > the addition of wake_offline_cpus().
> 
> Okay, I can split this.  Typofixy-add-debug in one, wake_offline_cpus
> in another. 

Thanks.

> 
> > Other than that, I'm not completely convinced this is the functionality
> > we want.  Do we really want to online these cpus?  Why where they
> > offlined in the first place?  I understand the stuck problem, but is the
> > solution to online them, or to change the device tree so that the second
> > kernel doesn't detect them as stuck?
> 
> Well... There are two cases.  If a CPU is soft-offlined on pseries, it
> must b e woken from that cede loop (in
> platforms/pseries/hotplug-cpu.c) as we're repla cing code under its
> feet.  We could either special-case the wakeup from this ce de loop to
> get that CPU to RTAS "stop-self" itself properly.  (Kind of like a "
> wake to die".)

Makes sense.  

> So that leaves hard-offline CPUs (perhaps including the above): I
> don't know why they might have been offlined.  If it's something
> serious, like fire, they'd be removed from the present set too (and
> thus not be considered in this restarting case).  We could add a mask
> to the CPU node to show which of the threads (if any) are running, and
> alter the startup code to start everything if this mask doesn't exist
> (non-kexec) or only online currently-running threads if the mask is
> present.  That feels a little weird.
> 
> My reasoning for restarting everything was: The first time you boot,
> all of your present CPUs are started up.  When you reboot, any CPUs
> you offlined for fun are restarted.  Kexec is (in this non-crash
> sense) a user-initiated 'quick reboot', so I reasoned that it should
> look the same as a 'hard reboot' and your new invocation would have
> all available CPUs running as is usual.

OK, I like this justification.  Would be good to include it in the
checkin comment since we're changing functionality somewhat.

Mikey
Simon Horman July 30, 2010, 3:25 a.m. UTC | #4
On Fri, Jul 30, 2010 at 01:15:14PM +1000, Michael Neuling wrote:
> (adding kexec list to CC)
> 
> In message <4C521FD2.4050301@ozlabs.org> you wrote:
> > Michael Neuling wrote:
> > > In message <4C511216.30109@ozlabs.org> you wrote:
> > >> When CPU hotplug is used, some CPUs may be offline at the time a kexec is
> > >> performed.  The subsequent kernel may expect these CPUs to be already runn
> ing
> > > ,
> > >> and will declare them stuck.  On pseries, there's also a soft-offline (ced
> e)
> > >> state that CPUs may be in; this can also cause problems as the kexeced ker
> nel
> > >> may ask RTAS if they're online -- and RTAS would say they are.  Again, stu
> ck.
> > >>
> > >> This patch kicks each present offline CPU awake before the kexec, so that
> > >> none are lost to these assumptions in the subsequent kernel.
> > > 
> > > There are a lot of cleanups in this patch.  The change you are making
> > > would be a lot clearer without all the additional cleanups in there.  I
> > > think I'd like to see this as two patches.  One for cleanups and one for
> > > the addition of wake_offline_cpus().
> > 
> > Okay, I can split this.  Typofixy-add-debug in one, wake_offline_cpus
> > in another. 
> 
> Thanks.
> 
> > 
> > > Other than that, I'm not completely convinced this is the functionality
> > > we want.  Do we really want to online these cpus?  Why where they
> > > offlined in the first place?  I understand the stuck problem, but is the
> > > solution to online them, or to change the device tree so that the second
> > > kernel doesn't detect them as stuck?
> > 
> > Well... There are two cases.  If a CPU is soft-offlined on pseries, it
> > must b e woken from that cede loop (in
> > platforms/pseries/hotplug-cpu.c) as we're repla cing code under its
> > feet.  We could either special-case the wakeup from this ce de loop to
> > get that CPU to RTAS "stop-self" itself properly.  (Kind of like a "
> > wake to die".)
> 
> Makes sense.  
> 
> > So that leaves hard-offline CPUs (perhaps including the above): I
> > don't know why they might have been offlined.  If it's something
> > serious, like fire, they'd be removed from the present set too (and
> > thus not be considered in this restarting case).  We could add a mask
> > to the CPU node to show which of the threads (if any) are running, and
> > alter the startup code to start everything if this mask doesn't exist
> > (non-kexec) or only online currently-running threads if the mask is
> > present.  That feels a little weird.
> > 
> > My reasoning for restarting everything was: The first time you boot,
> > all of your present CPUs are started up.  When you reboot, any CPUs
> > you offlined for fun are restarted.  Kexec is (in this non-crash
> > sense) a user-initiated 'quick reboot', so I reasoned that it should
> > look the same as a 'hard reboot' and your new invocation would have
> > all available CPUs running as is usual.
> 
> OK, I like this justification.  Would be good to include it in the
> checkin comment since we're changing functionality somewhat.

FWIW, I do too. Personally I like to think of kexec as soft-reboot.
Where soft means, in software, not soft-touch.
diff mbox

Patch

diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 4fbb3be..37f805e 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -15,6 +15,8 @@ 
 #include <linux/thread_info.h>
 #include <linux/init_task.h>
 #include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/cpu.h>
 
 #include <asm/page.h>
 #include <asm/current.h>
@@ -181,7 +183,20 @@  static void kexec_prepare_cpus_wait(int wait_state)
 	int my_cpu, i, notified=-1;
 
 	my_cpu = get_cpu();
-	/* Make sure each CPU has atleast made it to the state we need */
+	/* Make sure each CPU has at least made it to the state we need.
+	 *
+	 * FIXME: There is a (slim) chance of a problem if not all of the CPUs
+	 * are correctly onlined.  If somehow we start a CPU on boot with RTAS
+	 * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in
+	 * time, the boot CPU will timeout.  If it does eventually execute
+	 * stuff, the secondary will start up (paca[].cpu_start was written) and
+	 * get into a peculiar state.  If the platform supports
+	 * smp_ops->take_timebase(), the secondary CPU will probably be spinning
+	 * in there.  If not (i.e. pseries), the secondary will continue on and
+	 * try to online itself/idle/etc. If it survives that, we need to find
+	 * these possible-but-not-online-but-should-be CPUs and chaperone them
+	 * into kexec_smp_wait().
+	 */
 	for_each_online_cpu(i) {
 		if (i == my_cpu)
 			continue;
@@ -189,9 +204,9 @@  static void kexec_prepare_cpus_wait(int wait_state)
 		while (paca[i].kexec_state < wait_state) {
 			barrier();
 			if (i != notified) {
-				printk( "kexec: waiting for cpu %d (physical"
-						" %d) to enter %i state\n",
-					i, paca[i].hw_cpu_id, wait_state);
+				printk(KERN_INFO "kexec: waiting for cpu %d "
+				       "(physical %d) to enter %i state\n",
+				       i, paca[i].hw_cpu_id, wait_state);
 				notified = i;
 			}
 		}
@@ -199,9 +214,32 @@  static void kexec_prepare_cpus_wait(int wait_state)
 	mb();
 }
 
-static void kexec_prepare_cpus(void)
+/*
+ * We need to make sure each present CPU is online.  The next kernel will scan
+ * the device tree and assume primary threads are online and query secondary
+ * threads via RTAS to online them if required.  If we don't online primary
+ * threads, they will be stuck.  However, we also online secondary threads as we
+ * may be using 'cede offline'.  In this case RTAS doesn't see the secondary
+ * threads as offline -- and again, these CPUs will be stuck.
+ *
+ * So, we online all CPUs that should be running, including secondary threads.
+ */
+static void wake_offline_cpus(void)
 {
+	int cpu = 0;
 
+	for_each_present_cpu(cpu) {
+		if (!cpu_online(cpu)) {
+			printk(KERN_INFO "kexec: Waking offline cpu %d.\n",
+			       cpu);
+			cpu_up(cpu);
+		}
+	}
+}
+
+static void kexec_prepare_cpus(void)
+{
+	wake_offline_cpus();
 	smp_call_function(kexec_smp_down, NULL, /* wait */0);
 	local_irq_disable();
 	mb(); /* make sure IRQs are disabled before we say they are */
@@ -215,7 +253,10 @@  static void kexec_prepare_cpus(void)
 	if (ppc_md.kexec_cpu_down)
 		ppc_md.kexec_cpu_down(0, 0);
 
-	/* Before removing MMU mapings make sure all CPUs have entered real mode */
+	/*
+	 * Before removing MMU mappings make sure all CPUs have entered real
+	 * mode:
+	 */
 	kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE);
 
 	put_cpu();
@@ -284,6 +325,8 @@  void default_machine_kexec(struct kimage *image)
 	if (crashing_cpu == -1)
 		kexec_prepare_cpus();
 
+	pr_debug("kexec: Starting switchover sequence.\n");
+
 	/* switch to a staticly allocated stack.  Based on irq stack code.
 	 * XXX: the task struct will likely be invalid once we do the copy!
 	 */