Patchwork [3/3] powerpc/ppc64/kdump: better flag for running relocatable

login
register
mail settings
Submitter Milton Miller
Date Oct. 22, 2008, 8:39 p.m.
Message ID <kexec-kern-2@bga.com>
Download mbox | patch
Permalink /patch/5380/
State Superseded
Headers show

Comments

Milton Miller - Oct. 22, 2008, 8:39 p.m.
The __kdump_flag ABI is overly constraining for future development.  

As of 2.6.27, the kernel entry point has 4 constraints:  Offset 0 is
the starting point for the master (boot) cpu (entered with r3 pointing
to the device tree structure), offset 0x60 is code for the slave cpus
(entered with r3 set to their device tree physical id), offset 0x20 is
used by the iseries hypervisor, and secondary cpus must be well behaved
when the first 256 bytes are copied to address 0.

Placing the __kdump_flag at 0x18 is bad because:

- It was taking the last 8 bytes before the iseries hypervisor data.  
- It was 8 bytes for a boolean flag
- It had no way of identifying that the flag was present
- It does leave any room for the master to add any additional code
  before branching, which hurts debug.
- It will be unnecessarily hard for 32 bit code to be common (8 bytes)

Now that we have eliminated the use of __kdump_flag in favor of
the standard is_kdump_kernel(), this flag only controls run without
relocating the kernel to PHYSICAL_START (0), so rename it __run_at_load.

Move the flag to 0x5c, 1 word before the secondary cpu entry point at
0x60.  Use the copy at address 0 not the one in the base kernel image to
make it easier on kexec-tools.  Initialize it with "run0" to say it will
run at 0 unless it is set to 1.  It only exists if we are relocatable.

Signed-off-by: Milton Miller <miltonm@bga.com>
---
I left it global so it appears that way in System.map, but it would
not need to be.

I kept the guards with CONFIG_CRASH_DUMP for now.  They could be relaxed
to just CONFIG_RELOCATABLE.

Tested with normal kexec (kernel moved to 0) and a custom boot-loader
(kernel stayed at loaded 16MB start).
Michael Neuling - Oct. 23, 2008, 3:23 a.m.
In message <kexec-kern-2@bga.com> you wrote:
> The __kdump_flag ABI is overly constraining for future development.  
> 
> As of 2.6.27, the kernel entry point has 4 constraints:  Offset 0 is
> the starting point for the master (boot) cpu (entered with r3 pointing
> to the device tree structure), offset 0x60 is code for the slave cpus
> (entered with r3 set to their device tree physical id), offset 0x20 is
> used by the iseries hypervisor, and secondary cpus must be well behaved
> when the first 256 bytes are copied to address 0.
> 
> Placing the __kdump_flag at 0x18 is bad because:
> 
> - It was taking the last 8 bytes before the iseries hypervisor data.  
> - It was 8 bytes for a boolean flag
> - It had no way of identifying that the flag was present
> - It does leave any room for the master to add any additional code
>   before branching, which hurts debug.
> - It will be unnecessarily hard for 32 bit code to be common (8 bytes)
> 
> Now that we have eliminated the use of __kdump_flag in favor of
> the standard is_kdump_kernel(), this flag only controls run without
> relocating the kernel to PHYSICAL_START (0), so rename it __run_at_load.
> 
> Move the flag to 0x5c, 1 word before the secondary cpu entry point at
> 0x60.  Use the copy at address 0 not the one in the base kernel image to
> make it easier on kexec-tools.  Initialize it with "run0" to say it will
> run at 0 unless it is set to 1.  It only exists if we are relocatable.
> 
> Signed-off-by: Milton Miller <miltonm@bga.com>
> ---
> I left it global so it appears that way in System.map, but it would
> not need to be.
> 
> I kept the guards with CONFIG_CRASH_DUMP for now.  They could be relaxed
> to just CONFIG_RELOCATABLE.
> 
> Tested with normal kexec (kernel moved to 0) and a custom boot-loader
> (kernel stayed at loaded 16MB start).
> 
> Index: next.git/arch/powerpc/kernel/head_64.S
> ===================================================================
> --- next.git.orig/arch/powerpc/kernel/head_64.S	2008-10-22 04:30:08.000
000000 -0500
> +++ next.git/arch/powerpc/kernel/head_64.S	2008-10-22 04:59:55.000000000 -
0500
> @@ -97,12 +97,6 @@ __secondary_hold_spinloop:
>  __secondary_hold_acknowledge:
>  	.llong	0x0
>  
> -	/* This flag is set by purgatory if we should be a kdump kernel. */
> -	/* Do not move this variable as purgatory knows about it. */
> -	.globl	__kdump_flag
> -__kdump_flag:
> -	.llong	0x0
> -
>  #ifdef CONFIG_PPC_ISERIES
>  	/*
>  	 * At offset 0x20, there is a pointer to iSeries LPAR data.
> @@ -112,6 +106,20 @@ __kdump_flag:
>  	.llong hvReleaseData-KERNELBASE
>  #endif /* CONFIG_PPC_ISERIES */
>  
> +#ifdef CONFIG_CRASH_DUMP
> +	/* This flag is set to 1 by a loader if the kernel should run
> +	 * at the loaded address instead of the linked address.  This
> +	 * is used by kexec-tools to keep the the kdump kernel in the
> +	 * crash_kernel region.  The loader is responsible for
> +	 * observing the alignment requirement.
> +	 */
> +	/* Do not move this variable as kexec-tools knows about it. */
> +	. = 0x5c
> +	.globl	__run_at_load
> +__run_at_load:
> +	.long	0x72756e30	/* "run0" -- relocate to 0 by default */
> +#endif
> +
>  	. = 0x60
>  /*
>   * The following code is used to hold secondary processors
> @@ -1391,8 +1399,8 @@ _STATIC(__after_prom_start)
>  	lis	r25,PAGE_OFFSET@highest	/* compute virtual base of kernel */
>  	sldi	r25,r25,32
>  #ifdef CONFIG_CRASH_DUMP
> -	ld	r7,__kdump_flag-_stext(r26)
> -	cmpldi	cr0,r7,1	/* kdump kernel ? - stay where we are */
> +	lwz	r7,__run_at_load-_stext(0)
> +	cmplwi	cr0,r7,1	/* kdump kernel ? - stay where we are */

Do we really want the flag to always be at 0x5c not 0x5c + kernel offset?

Also, comment "kdump kernel" needs to be updated to reflect the new
name.  

Other than that, the patch series works for me.

Mikey

>  	bne	1f
>  	add	r25,r25,r26
>  #endif
> @@ -1416,11 +1424,11 @@ _STATIC(__after_prom_start)
>  #ifdef CONFIG_CRASH_DUMP
>  /*
>   * Check if the kernel has to be running as relocatable kernel based on the
> - * variable __kdump_flag, if it is set the kernel is treated as relocatable
> + * variable __run_at_load, if it is set the kernel is treated as relocatable
>   * kernel, otherwise it will be moved to PHYSICAL_START
>   */
> -	ld	r7,__kdump_flag-_stext(r26)
> -	cmpldi	cr0,r7,1
> +	lwz	r7,__run_at_load-_stext(0)
> +	cmplwi	cr0,r7,1
>  	bne	3f
>  
>  	li	r5,__end_interrupts - _stext	/* just copy interrupts */
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@ozlabs.org
> https://ozlabs.org/mailman/listinfo/linuxppc-dev
>
Paul Mackerras - Oct. 23, 2008, 3:32 a.m.
Milton Miller writes:

> Move the flag to 0x5c, 1 word before the secondary cpu entry point at
> 0x60.  Use the copy at address 0 not the one in the base kernel image to
> make it easier on kexec-tools.

Why is it easier on kexec-tools?  Doesn't kexec-tools know where it
put the kernel?

I'd much rather keep the flag inside the kdump kernel image, rather
than having kexec/kdump start using random fixed locations outside the
new kernel image.

Paul.
Paul Mackerras - Oct. 23, 2008, 3:43 a.m.
Paul Mackerras writes:
> Milton Miller writes:
> 
> > Move the flag to 0x5c, 1 word before the secondary cpu entry point at
> > 0x60.  Use the copy at address 0 not the one in the base kernel image to
> > make it easier on kexec-tools.
> 
> Why is it easier on kexec-tools?  Doesn't kexec-tools know where it
> put the kernel?
> 
> I'd much rather keep the flag inside the kdump kernel image, rather
> than having kexec/kdump start using random fixed locations outside the
> new kernel image.

In fact the cliching argument is that when the kernel is loaded by OF
or yaboot, we have no way to tell what will be at location 0x5c,
whereas we know that the word at offset 0x5c in the kernel image will
have been initialized to 0.  So we had better put the flag inside the
kernel image.

Paul.
Mohan Kumar M - Oct. 23, 2008, 3:15 p.m.
Hi Milton,

My suggestions:

Milton Miller wrote:
> The __kdump_flag ABI is overly constraining for future development.  
> 
> As of 2.6.27, the kernel entry point has 4 constraints:  Offset 0 is
> the starting point for the master (boot) cpu (entered with r3 pointing
> to the device tree structure), offset 0x60 is code for the slave cpus
> (entered with r3 set to their device tree physical id), offset 0x20 is
> used by the iseries hypervisor, and secondary cpus must be well behaved
> when the first 256 bytes are copied to address 0.
> 
> Placing the __kdump_flag at 0x18 is bad because:
> 
> - It was taking the last 8 bytes before the iseries hypervisor data.  
> - It was 8 bytes for a boolean flag
> - It had no way of identifying that the flag was present
> - It does leave any room for the master to add any additional code
>   before branching, which hurts debug.
> - It will be unnecessarily hard for 32 bit code to be common (8 bytes)
> 
> Now that we have eliminated the use of __kdump_flag in favor of
> the standard is_kdump_kernel(), this flag only controls run without
> relocating the kernel to PHYSICAL_START (0), so rename it __run_at_load.
>
We could try both of our approaches. Instead of passing the information 
that next kernel should be relocatable from kexec_sequence to purgatory 
code, we will do it from kexec-tools path (following your approach). But 
instead of setting the __run_at_load value in the purgatory code (ie at 
physical address 0x5c), we will set the variable __run_at_load at kernel 
  image itself.

i.e.,
[code snip 1]
	lwz	r7,__run_at_load-_stext(r26)
	cmplwi	cr0,r7,1	/* kdump kernel ? - stay where we are */
  	bne	1f
  	add	r25,r25,r26

	lwz	r7,__run_at_load-_stext(r26)
	cmplwi	cr0,r7,1
  	bne	3f

kexec-tools
[code snip 2]
	LOADADDR(6,run_at_load)
	ld	18,0(6)
	cmpd	18,1
	bne	skip
	li	7,1
	stw	7,92(4)		# mark __run_at_load flag at kernel
skip:
	lwz	7,0(4)		# get the first instruction that we stole
	stw	7,0(0)		# and put it in the slave loop at 0
  				# skip cache flush, do we care?

[code snip 3]
	if (info->kexec_flags & KEXEC_ON_CRASH) {
		....
		elf_rel_set_symbol(&info->rhdr, "run_at_load",
                                 &my_run_at_load, 							 
sizeof(my_run_at_load));
	}

Using this approach we are not breaking the kexec_sequence ABI and we 
directly modifying the flag in kernel image.

Regards,
Mohan.
Milton Miller - Nov. 7, 2008, 1:52 p.m.
On Oct 22, 2008, at 10:43 PM, Paul Mackerras wrote:
> Paul Mackerras writes:
>> Milton Miller writes:
>>> Move the flag to 0x5c, 1 word before the secondary cpu entry point at
>>> 0x60.  Use the copy at address 0 not the one in the base kernel 
>>> image to
>>> make it easier on kexec-tools.
>>
>> Why is it easier on kexec-tools?  Doesn't kexec-tools know where it
>> put the kernel?

The archictecture code calls cross-platform code to identify what is
loaded.   It isn't specified if this is a shared mmap or a read into
a buffer.

>>
>> I'd much rather keep the flag inside the kdump kernel image, rather
>> than having kexec/kdump start using random fixed locations outside the
>> new kernel image.
>
> In fact the cliching argument is that when the kernel is loaded by OF
> or yaboot, we have no way to tell what will be at location 0x5c,
> whereas we know that the word at offset 0x5c in the kernel image will
> have been initialized to 0.  So we had better put the flag inside the
> kernel image.

Well, prom_init will copy the 256 bytes to 0 before the code checks
that location.

However, there is an arguement for using the same code from an epapr
or book-e relocatable, and that would need it at 0.   And today
the kexec tool does not do a shared mmap.  Since the change has
been made, I will make a new patch for kexec-tools.

milton
Milton Miller - Nov. 7, 2008, 1:59 p.m.
On Oct 23, 2008, at 10:15 AM, Mohan Kumar M wrote:
> Hi Milton,
> My suggestions:
> Milton Miller wrote:
>> The __kdump_flag ABI is overly constraining for future development.
...
>> Now that we have eliminated the use of __kdump_flag in favor of
>> the standard is_kdump_kernel(), this flag only controls run without
>> relocating the kernel to PHYSICAL_START (0), so rename it 
>> __run_at_load.
>>
> We could try both of our approaches. Instead of passing the 
> information that next kernel should be relocatable from kexec_sequence 
> to purgatory code, we will do it from kexec-tools path (following your 
> approach). But instead of setting the __run_at_load value in the 
> purgatory code (ie at physical address 0x5c), we will set the variable 
> __run_at_load at kernel  image itself.
>
> i.e.,
> [code snip 1]
> 	lwz	r7,__run_at_load-_stext(r26)
> 	cmplwi	cr0,r7,1	/* kdump kernel ? - stay where we are */
>  	bne	1f
>  	add	r25,r25,r26
>
> 	lwz	r7,__run_at_load-_stext(r26)
> 	cmplwi	cr0,r7,1
>  	bne	3f
>
> kexec-tools
> [code snip 2]
> 	LOADADDR(6,run_at_load)
> 	ld	18,0(6)
> 	cmpd	18,1
> 	bne	skip
> 	li	7,1
> 	stw	7,92(4)		# mark __run_at_load flag at kernel
> skip:
> 	lwz	7,0(4)		# get the first instruction that we stole
> 	stw	7,0(0)		# and put it in the slave loop at 0
>  				# skip cache flush, do we care?
>
> [code snip 3]
> 	if (info->kexec_flags & KEXEC_ON_CRASH) {
> 		....
> 		elf_rel_set_symbol(&info->rhdr, "run_at_load",
>                                 &my_run_at_load, 							 
> sizeof(my_run_at_load));
> 	}


This elf_rel_set_symbol sets the copy in purgatory,
after we have copied the code from the kernel.  It
is this copy that gets copied to address 0.

However this information is not in the code that
is at the start of the kernel.  We don't have any
symbols for the kernel itself, it might be stripped.
So we can't use the elf_set_symbol api.  (The kernel
may not be relocatable either).

> Using this approach we are not breaking the kexec_sequence
> ABI and we directly modifying the flag in kernel image.
>
> Regards,
> Mohan.

I'll prepare a patch, but it might be a few days
while I catch up from my 2 week vacation.

milton
Mohan Kumar M - Nov. 10, 2008, 3:22 p.m.
Milton Miller wrote:
> On Oct 23, 2008, at 10:15 AM, Mohan Kumar M wrote:
>> Hi Milton,
>> My suggestions:
>> Milton Miller wrote:
>>
>> i.e.,
>> [code snip 1]
>> 	lwz	r7,__run_at_load-_stext(r26)
>> 	cmplwi	cr0,r7,1	/* kdump kernel ? - stay where we are */
>>  	bne	1f
>>  	add	r25,r25,r26
>>
>> 	lwz	r7,__run_at_load-_stext(r26)
>> 	cmplwi	cr0,r7,1
>>  	bne	3f
>>
>> kexec-tools
>> [code snip 2]
>> 	LOADADDR(6,run_at_load)
>> 	ld	18,0(6)
>> 	cmpd	18,1
>> 	bne	skip
>> 	li	7,1
>> 	stw	7,92(4)		# mark __run_at_load flag at kernel
>> skip:
>> 	lwz	7,0(4)		# get the first instruction that we stole
>> 	stw	7,0(0)		# and put it in the slave loop at 0
>>  				# skip cache flush, do we care?
>>
>> [code snip 3]
>> 	if (info->kexec_flags & KEXEC_ON_CRASH) {
>> 		....
>> 		elf_rel_set_symbol(&info->rhdr, "run_at_load",
>>                                 &my_run_at_load, 							 
>> sizeof(my_run_at_load));
>> 	}
> 
> 
> This elf_rel_set_symbol sets the copy in purgatory,
> after we have copied the code from the kernel.  It
> is this copy that gets copied to address 0.
> 

Yes, elf_ret_symbol sets the copy in purgatory. But the following code 
in purgatory (to be introduced)

  	LOADADDR(6,run_at_load)
  	ld	18,0(6)
  	cmpd	18,1
  	bne	skip
  	li	7,1
  	stw	7,92(4)		# mark __run_at_load flag at kernel

will set the __run_at_load in the kernel image (ie where ever kernel is 
loaded + 0x5c(92). Or am I missing some thing?

> However this information is not in the code that
> is at the start of the kernel.  We don't have any
> symbols for the kernel itself, it might be stripped.
> So we can't use the elf_set_symbol api.  (The kernel
> may not be relocatable either).

Regards,
Mohan.
Milton Miller - Nov. 11, 2008, 4:06 p.m.
On Nov 10, 2008, at 9:22 AM, Mohan Kumar M wrote:
> Yes, elf_ret_symbol sets the copy in purgatory. But the following code 
> in purgatory (to be introduced)
>
>  	LOADADDR(6,run_at_load)
>  	ld	18,0(6)
>  	cmpd	18,1
>  	bne	skip
>  	li	7,1
>  	stw	7,92(4)		# mark __run_at_load flag at kernel
>
> will set the __run_at_load in the kernel image (ie where ever kernel 
> is loaded + 0x5c(92). Or am I missing some thing?

That would work, but I prefer to keep the change in the userspace side. 
  Partly because I don't want to link setting the relocatable flag to 
purgatory starting a dump kernel, and partly because I think 
kexec-tools should be verifying that the loaded kernel will run where 
it expects, either by it finding the relcatable flag, inspecting the 
elf header for the linked address, or some other method (like elf type 
is dynamic for some platforms).  Oh, and its more readable in C.

If someone adds mmap instead of read files to the common code, then we 
will just have to make sure they use MMAP_PRIVATE instead of 
MMAP_SHARED.  Today its not an issue.

milton

Patch

Index: next.git/arch/powerpc/kernel/head_64.S
===================================================================
--- next.git.orig/arch/powerpc/kernel/head_64.S	2008-10-22 04:30:08.000000000 -0500
+++ next.git/arch/powerpc/kernel/head_64.S	2008-10-22 04:59:55.000000000 -0500
@@ -97,12 +97,6 @@  __secondary_hold_spinloop:
 __secondary_hold_acknowledge:
 	.llong	0x0
 
-	/* This flag is set by purgatory if we should be a kdump kernel. */
-	/* Do not move this variable as purgatory knows about it. */
-	.globl	__kdump_flag
-__kdump_flag:
-	.llong	0x0
-
 #ifdef CONFIG_PPC_ISERIES
 	/*
 	 * At offset 0x20, there is a pointer to iSeries LPAR data.
@@ -112,6 +106,20 @@  __kdump_flag:
 	.llong hvReleaseData-KERNELBASE
 #endif /* CONFIG_PPC_ISERIES */
 
+#ifdef CONFIG_CRASH_DUMP
+	/* This flag is set to 1 by a loader if the kernel should run
+	 * at the loaded address instead of the linked address.  This
+	 * is used by kexec-tools to keep the the kdump kernel in the
+	 * crash_kernel region.  The loader is responsible for
+	 * observing the alignment requirement.
+	 */
+	/* Do not move this variable as kexec-tools knows about it. */
+	. = 0x5c
+	.globl	__run_at_load
+__run_at_load:
+	.long	0x72756e30	/* "run0" -- relocate to 0 by default */
+#endif
+
 	. = 0x60
 /*
  * The following code is used to hold secondary processors
@@ -1391,8 +1399,8 @@  _STATIC(__after_prom_start)
 	lis	r25,PAGE_OFFSET@highest	/* compute virtual base of kernel */
 	sldi	r25,r25,32
 #ifdef CONFIG_CRASH_DUMP
-	ld	r7,__kdump_flag-_stext(r26)
-	cmpldi	cr0,r7,1	/* kdump kernel ? - stay where we are */
+	lwz	r7,__run_at_load-_stext(0)
+	cmplwi	cr0,r7,1	/* kdump kernel ? - stay where we are */
 	bne	1f
 	add	r25,r25,r26
 #endif
@@ -1416,11 +1424,11 @@  _STATIC(__after_prom_start)
 #ifdef CONFIG_CRASH_DUMP
 /*
  * Check if the kernel has to be running as relocatable kernel based on the
- * variable __kdump_flag, if it is set the kernel is treated as relocatable
+ * variable __run_at_load, if it is set the kernel is treated as relocatable
  * kernel, otherwise it will be moved to PHYSICAL_START
  */
-	ld	r7,__kdump_flag-_stext(r26)
-	cmpldi	cr0,r7,1
+	lwz	r7,__run_at_load-_stext(0)
+	cmplwi	cr0,r7,1
 	bne	3f
 
 	li	r5,__end_interrupts - _stext	/* just copy interrupts */