Patchwork [2/7] powerpc/85xx: add HOTPLUG_CPU support

login
register
mail settings
Submitter chenhui zhao
Date Nov. 4, 2011, 12:31 p.m.
Message ID <1320409889-14408-1-git-send-email-chenhui.zhao@freescale.com>
Download mbox | patch
Permalink /patch/123616/
State Superseded
Headers show

Comments

chenhui zhao - Nov. 4, 2011, 12:31 p.m.
From: Li Yang <leoli@freescale.com>

Add support to disable and re-enable individual cores at runtime
on MPC85xx/QorIQ SMP machines. Currently support e500 core.

MPC85xx machines use ePAPR spin-table in boot page for CPU kick-off.
This patch uses the boot page from bootloader to boot core at runtime.
It supports 32-bit and 36-bit physical address.

Add a delay in generic_cpu_die() to wait for the core to reset.

Signed-off-by: Li Yang <leoli@freescale.com>
Signed-off-by: Jin Qing <b24347@freescale.com>
Signed-off-by: Zhao Chenhui <chenhui.zhao@freescale.com>
---
 arch/powerpc/Kconfig                 |    5 +-
 arch/powerpc/kernel/head_fsl_booke.S |   28 +++++
 arch/powerpc/kernel/smp.c            |    8 +-
 arch/powerpc/platforms/85xx/smp.c    |  220 +++++++++++++++++++++++++++++-----
 4 files changed, 226 insertions(+), 35 deletions(-)
Scott Wood - Nov. 4, 2011, 6:35 p.m.
On 11/04/2011 07:31 AM, Zhao Chenhui wrote:
> From: Li Yang <leoli@freescale.com>
> 
> Add support to disable and re-enable individual cores at runtime
> on MPC85xx/QorIQ SMP machines. Currently support e500 core.
> 
> MPC85xx machines use ePAPR spin-table in boot page for CPU kick-off.
> This patch uses the boot page from bootloader to boot core at runtime.
> It supports 32-bit and 36-bit physical address.

Note that there is no guarantee that the bootloader can handle you
resetting a core.  In ePAPR the spin table is a one-time release
mechanism, not a core reset mechanism.  If this has a U-Boot dependency,
document that.

>  #ifdef CONFIG_SMP
>  /* When we get here, r24 needs to hold the CPU # */
>  	.globl __secondary_start
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 7bf2187..12a54f0 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -381,8 +381,14 @@ void generic_cpu_die(unsigned int cpu)
>  
>  	for (i = 0; i < 100; i++) {
>  		smp_rmb();
> -		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
> +		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
> +			/*
> +			 * After another core sets cpu_state to CPU_DEAD,
> +			 * it needs some time to die.
> +			 */
> +			msleep(10);
>  			return;
> +		}
>  		msleep(100);

It would be better to do this as a call into platform-specific code that
can check registers to determine whether the core has checked out (in
our case, whether it has entered nap) -- or to do a suitable delay for
that platform if this isn't possible.

> diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
> index 9b0de9c..5a54fc1 100644
> --- a/arch/powerpc/platforms/85xx/smp.c
> +++ b/arch/powerpc/platforms/85xx/smp.c
> @@ -17,6 +17,7 @@
>  #include <linux/of.h>
>  #include <linux/kexec.h>
>  #include <linux/highmem.h>
> +#include <linux/cpu.h>
>  
>  #include <asm/machdep.h>
>  #include <asm/pgtable.h>
> @@ -30,26 +31,141 @@
>  
>  extern void __early_start(void);
>  
> -#define BOOT_ENTRY_ADDR_UPPER	0
> -#define BOOT_ENTRY_ADDR_LOWER	1
> -#define BOOT_ENTRY_R3_UPPER	2
> -#define BOOT_ENTRY_R3_LOWER	3
> -#define BOOT_ENTRY_RESV		4
> -#define BOOT_ENTRY_PIR		5
> -#define BOOT_ENTRY_R6_UPPER	6
> -#define BOOT_ENTRY_R6_LOWER	7
> -#define NUM_BOOT_ENTRY		8
> -#define SIZE_BOOT_ENTRY		(NUM_BOOT_ENTRY * sizeof(u32))
> -
> -static int __init
> -smp_85xx_kick_cpu(int nr)
> +#define MPC85xx_BPTR_OFF		0x00020
> +#define MPC85xx_BPTR_EN			0x80000000
> +#define MPC85xx_BPTR_BOOT_PAGE_MASK	0x00ffffff
> +#define MPC85xx_BRR_OFF			0xe0e4
> +#define MPC85xx_ECM_EEBPCR_OFF		0x01010
> +#define MPC85xx_PIC_PIR_OFF		0x41090
> +
> +struct epapr_entry {

ePAPR is more than just the spin table.  Call it something like
epapr_spin_table.

> +	u32	addr_h;
> +	u32	addr_l;
> +	u32	r3_h;
> +	u32	r3_l;
> +	u32	reserved;
> +	u32	pir;
> +	u32	r6_h;
> +	u32	r6_l;
> +};

Get rid of r6, it is not part of the ePAPR spin table.

> +static int is_corenet;
> +static void __cpuinit smp_85xx_setup_cpu(int cpu_nr);
> +
> +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PPC32)

Why PPC32?

> +extern void flush_disable_L1(void);

If this isn't already in a header file, put it in one.

> +static void __cpuinit smp_85xx_mach_cpu_die(void)
> +{
> +	unsigned int cpu = smp_processor_id();
> +	register u32 tmp;
> +
> +	local_irq_disable();
> +	idle_task_exit();
> +	generic_set_cpu_dead(cpu);
> +	smp_wmb();
> +
> +	mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
> +	mtspr(SPRN_TCR, 0);

If clearing TSR matters at all (I'm not sure that it does), first clear
TCR, then TSR.

> +	flush_disable_L1();

You'll also need to take down L2 on e500mc.

> +	tmp = 0;
> +	if (cpu_has_feature(CPU_FTR_CAN_NAP))
> +		tmp = HID0_NAP;
> +	else if (cpu_has_feature(CPU_FTR_CAN_DOZE))
> +		tmp = HID0_DOZE;

Those FTR bits are for what we can do in idle, and can be cleared if the
user sets CONFIG_BDI_SWITCH.

On 85xx we always want to nap here, and at least on e500mc it seems to
be mandatory.  From the p5020 RM description of PIR:

> For proper system operation, a core should be reset in this way only if the core is already in nap or sleep
> state. Because a core in either state cannot perform the necessary write to cause a hard reset, a core cannot
> put itself into hard reset.

Note that on e500mc we don't use HID0/MSR_WE to enter nap, we need to
hit the CCSR register.  And unless you can somehow guarantee that only
one core at a time is doing this, we'll need some other core to actually
place us in nap (since once we enter nap we're not running so can't
release a lock).

> +	if (tmp) {
> +		tmp |= mfspr(SPRN_HID0) & ~(HID0_DOZE|HID0_NAP|HID0_SLEEP);
> +
> +		smp_mb();

smp_mb()?  This is always SMP...  It looks like you meant some specific
sync instruction as part of an architected sequence, so just use that.

> +		isync();
> +		mtspr(SPRN_HID0, tmp);
> +		isync();
> +
> +		tmp = mfmsr();
> +		tmp |= MSR_WE;
> +		smp_mb();
> +		mtmsr(tmp);
> +		isync();
> +	}
> +
> +	for (;;);
> +}
> +
> +static void __cpuinit smp_85xx_reset_core(int nr)
> +{
> +	__iomem u32 *vaddr, *pir_vaddr;
> +	u32 val, cpu_mask;
> +
> +	/* If CoreNet platform, use BRR as release register. */
> +	if (is_corenet) {
> +		cpu_mask = 1 << nr;
> +		vaddr = ioremap(get_immrbase() + MPC85xx_BRR_OFF, 4);
> +	} else {
> +		cpu_mask = 1 << (24 + nr);
> +		vaddr = ioremap(get_immrbase() + MPC85xx_ECM_EEBPCR_OFF, 4);
> +	}

Please use the device tree node, not get_immrbase().

> +	val = in_be32(vaddr);
> +	if (!(val & cpu_mask)) {
> +		out_be32(vaddr, val | cpu_mask);
> +	} else {
> +		/* reset core */
> +		pir_vaddr = ioremap(get_immrbase() + MPC85xx_PIC_PIR_OFF, 4);
> +		val = in_be32(pir_vaddr);
> +		/* reset assert */
> +		val |= (1 << nr);
> +		out_be32(pir_vaddr, val);

Use setbits32().

> +		val = in_be32(pir_vaddr);
> +		val &= ~(1 << nr);
> +		/* reset negate */
> +		out_be32(pir_vaddr, val);

clrbits32().

Is there any amount of time we need to keep the reset pin asserted?

> +		iounmap(pir_vaddr);
> +	}
> +	iounmap(vaddr);
> +}
> +
> +static int __cpuinit smp_85xx_map_bootpg(u32 page)
> +{
> +	__iomem u32 *bootpg_ptr;
> +	u32 bptr;
> +
> +	/* Get the BPTR */
> +	bootpg_ptr = ioremap(get_immrbase() + MPC85xx_BPTR_OFF, 4);
> +
> +	/* Set the BPTR to the secondary boot page */
> +	bptr = MPC85xx_BPTR_EN | (page & MPC85xx_BPTR_BOOT_PAGE_MASK);
> +	out_be32(bootpg_ptr, bptr);
> +
> +	iounmap(bootpg_ptr);
> +	return 0;
> +}

Shouldn't the boot page already be set by U-Boot?

> +static int __cpuinit smp_85xx_kick_cpu(int nr)
>  {
>  	unsigned long flags;
>  	const u64 *cpu_rel_addr;
> -	__iomem u32 *bptr_vaddr;
> +	__iomem struct epapr_entry *epapr;
>  	struct device_node *np;
> -	int n = 0;
> +	int n = 0, hw_cpu = get_hard_smp_processor_id(nr);
>  	int ioremappable;
> +	int ret = 0;
>  
>  	WARN_ON (nr < 0 || nr >= NR_CPUS);
>  
> @@ -73,46 +189,79 @@ smp_85xx_kick_cpu(int nr)
>  
>  	/* Map the spin table */
>  	if (ioremappable)
> -		bptr_vaddr = ioremap(*cpu_rel_addr, SIZE_BOOT_ENTRY);
> +		epapr = ioremap(*cpu_rel_addr, sizeof(struct epapr_entry));
>  	else
> -		bptr_vaddr = phys_to_virt(*cpu_rel_addr);
> +		epapr = phys_to_virt(*cpu_rel_addr);
>  
>  	local_irq_save(flags);
>  
> -	out_be32(bptr_vaddr + BOOT_ENTRY_PIR, nr);
> +	out_be32(&epapr->pir, hw_cpu);
>  #ifdef CONFIG_PPC32
> -	out_be32(bptr_vaddr + BOOT_ENTRY_ADDR_LOWER, __pa(__early_start));
> +#ifdef CONFIG_HOTPLUG_CPU
> +	if (system_state == SYSTEM_RUNNING) {
> +		out_be32(&epapr->addr_l, 0);
> +		smp_85xx_map_bootpg((u32)(*cpu_rel_addr >> PAGE_SHIFT));

Why is this inside PPC32?

> +		smp_85xx_reset_core(hw_cpu);
> +
> +		/* wait until core is ready... */
> +		n = 0;
> +		while ((in_be32(&epapr->addr_l) != 1) && (++n < 1000))
> +			udelay(100);
> +		if (n > 1000) {

if (n == 1000)

or

if (in_be32(&epapr->addr_l) != 1)

> +			pr_err("timeout waiting for core%d to reset\n",	nr);
> +			ret = -ENOENT;
> +			goto out;
> +		}
> +		/*  clear the acknowledge status */
> +		__secondary_hold_acknowledge = -1;
> +
> +		smp_85xx_unmap_bootpg();
> +	}
> +#endif
> +	out_be32(&epapr->addr_l, __pa(__early_start));
>  
>  	if (!ioremappable)
> -		flush_dcache_range((ulong)bptr_vaddr,
> -				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
> +		flush_dcache_range((ulong)epapr,
> +				(ulong)epapr + sizeof(struct epapr_entry));
>  
>  	/* Wait a bit for the CPU to ack. */
> -	while ((__secondary_hold_acknowledge != nr) && (++n < 1000))
> +	n = 0;
> +	while ((__secondary_hold_acknowledge != hw_cpu) && (++n < 1000))
>  		mdelay(1);
> +	if (n > 1000) {

if (n == 1000)

or

if (__secondary_hold_acknowledge != hw_cpu)

> +		pr_err("timeout waiting for core%d to ack\n", nr);

pr_err("%s: timeout waiting for core %d to ack\n", __func__, nr);

Likewise elsewhere.  Maybe also/instead mention hw_cpu.

> +		ret = -ENOENT;
> +		goto out;
> +	}
> +out:
>  #else
>  	smp_generic_kick_cpu(nr);
>  
> -	out_be64((u64 *)(bptr_vaddr + BOOT_ENTRY_ADDR_UPPER),
> +	out_be64((u64 *)(&epapr->addr_h),
>  		__pa((u64)*((unsigned long long *) generic_secondary_smp_init)));
>  
>  	if (!ioremappable)
> -		flush_dcache_range((ulong)bptr_vaddr,
> -				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
> +		flush_dcache_range((ulong)epapr,
> +				(ulong)epapr + sizeof(struct epapr_entry));

We don't wait for the core to come up on 64-bit?

> @@ -228,14 +376,18 @@ void __init mpc85xx_smp_init(void)
>  {
>  	struct device_node *np;
>  
> -	smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu;
> -
>  	np = of_find_node_by_type(NULL, "open-pic");
>  	if (np) {
>  		smp_85xx_ops.probe = smp_mpic_probe;
>  		smp_85xx_ops.message_pass = smp_mpic_message_pass;
>  	}
>  
> +	/* Check if the chip is based on CoreNet platform. */
> +	is_corenet = 0;
> +	np = of_find_compatible_node(NULL, NULL, "fsl,qoriq-device-config-1.0");
> +	if (np)
> +		is_corenet = 1;

Please also check for the non-corenet guts node.  If you don't find
either, disable the mechanism -- you're probably running under a hypervisor.

-Scott
Li Yang-R58472 - Nov. 8, 2011, 10:05 a.m.
>Subject: Re: [PATCH 2/7] powerpc/85xx: add HOTPLUG_CPU support
>
>On 11/04/2011 07:31 AM, Zhao Chenhui wrote:
>> From: Li Yang <leoli@freescale.com>
>>
>> Add support to disable and re-enable individual cores at runtime
>> on MPC85xx/QorIQ SMP machines. Currently support e500 core.
>>
>> MPC85xx machines use ePAPR spin-table in boot page for CPU kick-off.
>> This patch uses the boot page from bootloader to boot core at runtime.
>> It supports 32-bit and 36-bit physical address.
>
>Note that there is no guarantee that the bootloader can handle you
>resetting a core.  In ePAPR the spin table is a one-time release
>mechanism, not a core reset mechanism.  If this has a U-Boot dependency,
>document that.

Actually we suggested adding a spin table in the kernel so that we won't have the dependency on u-boot.  But there seem to be some opposing opinions in the internal discussion.  I personally prefer to remove the u-boot dependency and add the mechanism in the kernel.

>
>>  #ifdef CONFIG_SMP
>>  /* When we get here, r24 needs to hold the CPU # */
>>  	.globl __secondary_start
>> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
>> index 7bf2187..12a54f0 100644
>> --- a/arch/powerpc/kernel/smp.c
>> +++ b/arch/powerpc/kernel/smp.c
>> @@ -381,8 +381,14 @@ void generic_cpu_die(unsigned int cpu)
>>
>>  	for (i = 0; i < 100; i++) {
>>  		smp_rmb();
>> -		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
>> +		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
>> +			/*
>> +			 * After another core sets cpu_state to CPU_DEAD,
>> +			 * it needs some time to die.
>> +			 */
>> +			msleep(10);
>>  			return;
>> +		}
>>  		msleep(100);
>
>It would be better to do this as a call into platform-specific code that
>can check registers to determine whether the core has checked out (in
>our case, whether it has entered nap) -- or to do a suitable delay for
>that platform if this isn't possible.

Would it be easier to add a platform-specific cpu_die instead of using the generic one?

>
>> diff --git a/arch/powerpc/platforms/85xx/smp.c
>b/arch/powerpc/platforms/85xx/smp.c
>> index 9b0de9c..5a54fc1 100644
>> --- a/arch/powerpc/platforms/85xx/smp.c
>> +++ b/arch/powerpc/platforms/85xx/smp.c
>> @@ -17,6 +17,7 @@
>>  #include <linux/of.h>
>>  #include <linux/kexec.h>
>>  #include <linux/highmem.h>
>> +#include <linux/cpu.h>
>>
>>  #include <asm/machdep.h>
>>  #include <asm/pgtable.h>
>> @@ -30,26 +31,141 @@
>>
>>  extern void __early_start(void);
>>
>> -#define BOOT_ENTRY_ADDR_UPPER	0
>> -#define BOOT_ENTRY_ADDR_LOWER	1
>> -#define BOOT_ENTRY_R3_UPPER	2
>> -#define BOOT_ENTRY_R3_LOWER	3
>> -#define BOOT_ENTRY_RESV		4
>> -#define BOOT_ENTRY_PIR		5
>> -#define BOOT_ENTRY_R6_UPPER	6
>> -#define BOOT_ENTRY_R6_LOWER	7
>> -#define NUM_BOOT_ENTRY		8
>> -#define SIZE_BOOT_ENTRY		(NUM_BOOT_ENTRY * sizeof(u32))
>> -
>> -static int __init
>> -smp_85xx_kick_cpu(int nr)
>> +#define MPC85xx_BPTR_OFF		0x00020
>> +#define MPC85xx_BPTR_EN			0x80000000
>> +#define MPC85xx_BPTR_BOOT_PAGE_MASK	0x00ffffff
>> +#define MPC85xx_BRR_OFF			0xe0e4
>> +#define MPC85xx_ECM_EEBPCR_OFF		0x01010
>> +#define MPC85xx_PIC_PIR_OFF		0x41090
>> +
>> +struct epapr_entry {
>
>ePAPR is more than just the spin table.  Call it something like
>epapr_spin_table.
>
>> +	u32	addr_h;
>> +	u32	addr_l;
>> +	u32	r3_h;
>> +	u32	r3_l;
>> +	u32	reserved;
>> +	u32	pir;
>> +	u32	r6_h;
>> +	u32	r6_l;
>> +};
>
>Get rid of r6, it is not part of the ePAPR spin table.

Agree.

>
>> +static int is_corenet;
>> +static void __cpuinit smp_85xx_setup_cpu(int cpu_nr);
>> +
>> +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PPC32)
>
>Why PPC32?

Not sure if it is the same for e5500.  E5500 support will be verified later.

>
>> +extern void flush_disable_L1(void);
>
>If this isn't already in a header file, put it in one.
>
>> +static void __cpuinit smp_85xx_mach_cpu_die(void)
>> +{
>> +	unsigned int cpu = smp_processor_id();
>> +	register u32 tmp;
>> +
>> +	local_irq_disable();
>> +	idle_task_exit();
>> +	generic_set_cpu_dead(cpu);
>> +	smp_wmb();
>> +
>> +	mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
>> +	mtspr(SPRN_TCR, 0);
>
>If clearing TSR matters at all (I'm not sure that it does), first clear
>TCR, then TSR.
>
>> +	flush_disable_L1();
>
>You'll also need to take down L2 on e500mc.

Right.  E500mc support is beyond this patch series.  We will work on it after the e500v2 support is finalized.

>
>> +	tmp = 0;
>> +	if (cpu_has_feature(CPU_FTR_CAN_NAP))
>> +		tmp = HID0_NAP;
>> +	else if (cpu_has_feature(CPU_FTR_CAN_DOZE))
>> +		tmp = HID0_DOZE;
>
>Those FTR bits are for what we can do in idle, and can be cleared if the
>user sets CONFIG_BDI_SWITCH.

It is the powersave_nap variable that shows what we can do in idle.  I think it's correct to use the CPU_FTR_CAN_* macros as CPU capabilities.

>
>On 85xx we always want to nap here, and at least on e500mc it seems to
>be mandatory.  From the p5020 RM description of PIR:
>
>> For proper system operation, a core should be reset in this way only if
>the core is already in nap or sleep
>> state. Because a core in either state cannot perform the necessary write
>to cause a hard reset, a core cannot
>> put itself into hard reset.
>
>Note that on e500mc we don't use HID0/MSR_WE to enter nap, we need to
>hit the CCSR register.  And unless you can somehow guarantee that only
>one core at a time is doing this, we'll need some other core to actually
>place us in nap (since once we enter nap we're not running so can't
>release a lock).
>
>> +	if (tmp) {
>> +		tmp |= mfspr(SPRN_HID0) & ~(HID0_DOZE|HID0_NAP|HID0_SLEEP);
>> +
>> +		smp_mb();
>
>smp_mb()?  This is always SMP...  It looks like you meant some specific
>sync instruction as part of an architected sequence, so just use that.
>
>> +		isync();
>> +		mtspr(SPRN_HID0, tmp);
>> +		isync();
>> +
>> +		tmp = mfmsr();
>> +		tmp |= MSR_WE;
>> +		smp_mb();
>> +		mtmsr(tmp);
>> +		isync();
>> +	}
>> +
>> +	for (;;);
>> +}
>> +
>> +static void __cpuinit smp_85xx_reset_core(int nr)
>> +{
>> +	__iomem u32 *vaddr, *pir_vaddr;
>> +	u32 val, cpu_mask;
>> +
>> +	/* If CoreNet platform, use BRR as release register. */
>> +	if (is_corenet) {
>> +		cpu_mask = 1 << nr;
>> +		vaddr = ioremap(get_immrbase() + MPC85xx_BRR_OFF, 4);
>> +	} else {
>> +		cpu_mask = 1 << (24 + nr);
>> +		vaddr = ioremap(get_immrbase() + MPC85xx_ECM_EEBPCR_OFF, 4);
>> +	}
>
>Please use the device tree node, not get_immrbase().

The get_immrbase() implementation uses the device tree node.

>
>> +	val = in_be32(vaddr);
>> +	if (!(val & cpu_mask)) {
>> +		out_be32(vaddr, val | cpu_mask);
>> +	} else {
>> +		/* reset core */
>> +		pir_vaddr = ioremap(get_immrbase() + MPC85xx_PIC_PIR_OFF, 4);
>> +		val = in_be32(pir_vaddr);
>> +		/* reset assert */
>> +		val |= (1 << nr);
>> +		out_be32(pir_vaddr, val);
>
>Use setbits32().
>
>> +		val = in_be32(pir_vaddr);
>> +		val &= ~(1 << nr);
>> +		/* reset negate */
>> +		out_be32(pir_vaddr, val);
>
>clrbits32().
>
>Is there any amount of time we need to keep the reset pin asserted?

We don't know, but it's already working without any wait.

>
>> +		iounmap(pir_vaddr);
>> +	}
>> +	iounmap(vaddr);
>> +}
>> +
>> +static int __cpuinit smp_85xx_map_bootpg(u32 page)
>> +{
>> +	__iomem u32 *bootpg_ptr;
>> +	u32 bptr;
>> +
>> +	/* Get the BPTR */
>> +	bootpg_ptr = ioremap(get_immrbase() + MPC85xx_BPTR_OFF, 4);
>> +
>> +	/* Set the BPTR to the secondary boot page */
>> +	bptr = MPC85xx_BPTR_EN | (page & MPC85xx_BPTR_BOOT_PAGE_MASK);
>> +	out_be32(bootpg_ptr, bptr);
>> +
>> +	iounmap(bootpg_ptr);
>> +	return 0;
>> +}
>
>Shouldn't the boot page already be set by U-Boot?

The register should be lost after a deep sleep.   Better to do it again.

>
>> +static int __cpuinit smp_85xx_kick_cpu(int nr)
>>  {
>>  	unsigned long flags;
>>  	const u64 *cpu_rel_addr;
>> -	__iomem u32 *bptr_vaddr;
>> +	__iomem struct epapr_entry *epapr;
>>  	struct device_node *np;
>> -	int n = 0;
>> +	int n = 0, hw_cpu = get_hard_smp_processor_id(nr);
>>  	int ioremappable;
>> +	int ret = 0;
>>
>>  	WARN_ON (nr < 0 || nr >= NR_CPUS);
>>
>> @@ -73,46 +189,79 @@ smp_85xx_kick_cpu(int nr)
>>
>>  	/* Map the spin table */
>>  	if (ioremappable)
>> -		bptr_vaddr = ioremap(*cpu_rel_addr, SIZE_BOOT_ENTRY);
>> +		epapr = ioremap(*cpu_rel_addr, sizeof(struct epapr_entry));
>>  	else
>> -		bptr_vaddr = phys_to_virt(*cpu_rel_addr);
>> +		epapr = phys_to_virt(*cpu_rel_addr);
>>
>>  	local_irq_save(flags);
>>
>> -	out_be32(bptr_vaddr + BOOT_ENTRY_PIR, nr);
>> +	out_be32(&epapr->pir, hw_cpu);
>>  #ifdef CONFIG_PPC32
>> -	out_be32(bptr_vaddr + BOOT_ENTRY_ADDR_LOWER, __pa(__early_start));
>> +#ifdef CONFIG_HOTPLUG_CPU
>> +	if (system_state == SYSTEM_RUNNING) {
>> +		out_be32(&epapr->addr_l, 0);
>> +		smp_85xx_map_bootpg((u32)(*cpu_rel_addr >> PAGE_SHIFT));
>
>Why is this inside PPC32?

Not tested on 64-bit yet.  Might require a different implementation on PPC64.

>
>> +		smp_85xx_reset_core(hw_cpu);
>> +
>> +		/* wait until core is ready... */
>> +		n = 0;
>> +		while ((in_be32(&epapr->addr_l) != 1) && (++n < 1000))
>> +			udelay(100);
>> +		if (n > 1000) {
>
>if (n == 1000)
>
>or
>
>if (in_be32(&epapr->addr_l) != 1)
>
>> +			pr_err("timeout waiting for core%d to reset\n",	nr);
>> +			ret = -ENOENT;
>> +			goto out;
>> +		}
>> +		/*  clear the acknowledge status */
>> +		__secondary_hold_acknowledge = -1;
>> +
>> +		smp_85xx_unmap_bootpg();
>> +	}
>> +#endif
>> +	out_be32(&epapr->addr_l, __pa(__early_start));
>>
>>  	if (!ioremappable)
>> -		flush_dcache_range((ulong)bptr_vaddr,
>> -				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
>> +		flush_dcache_range((ulong)epapr,
>> +				(ulong)epapr + sizeof(struct epapr_entry));
>>
>>  	/* Wait a bit for the CPU to ack. */
>> -	while ((__secondary_hold_acknowledge != nr) && (++n < 1000))
>> +	n = 0;
>> +	while ((__secondary_hold_acknowledge != hw_cpu) && (++n < 1000))
>>  		mdelay(1);
>> +	if (n > 1000) {
>
>if (n == 1000)
>
>or
>
>if (__secondary_hold_acknowledge != hw_cpu)
>
>> +		pr_err("timeout waiting for core%d to ack\n", nr);
>
>pr_err("%s: timeout waiting for core %d to ack\n", __func__, nr);
>
>Likewise elsewhere.  Maybe also/instead mention hw_cpu.
>
>> +		ret = -ENOENT;
>> +		goto out;
>> +	}
>> +out:
>>  #else
>>  	smp_generic_kick_cpu(nr);
>>
>> -	out_be64((u64 *)(bptr_vaddr + BOOT_ENTRY_ADDR_UPPER),
>> +	out_be64((u64 *)(&epapr->addr_h),
>>  		__pa((u64)*((unsigned long long *)
>generic_secondary_smp_init)));
>>
>>  	if (!ioremappable)
>> -		flush_dcache_range((ulong)bptr_vaddr,
>> -				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
>> +		flush_dcache_range((ulong)epapr,
>> +				(ulong)epapr + sizeof(struct epapr_entry));
>
>We don't wait for the core to come up on 64-bit?

Not sure about it.  But at least the normal boot up should be tested on P5020, right?

>
>> @@ -228,14 +376,18 @@ void __init mpc85xx_smp_init(void)
>>  {
>>  	struct device_node *np;
>>
>> -	smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu;
>> -
>>  	np = of_find_node_by_type(NULL, "open-pic");
>>  	if (np) {
>>  		smp_85xx_ops.probe = smp_mpic_probe;
>>  		smp_85xx_ops.message_pass = smp_mpic_message_pass;
>>  	}
>>
>> +	/* Check if the chip is based on CoreNet platform. */
>> +	is_corenet = 0;
>> +	np = of_find_compatible_node(NULL, NULL, "fsl,qoriq-device-config-
>1.0");
>> +	if (np)
>> +		is_corenet = 1;
>
>Please also check for the non-corenet guts node.  If you don't find
>either, disable the mechanism -- you're probably running under a
>hypervisor.

Ok.

- Leo
Scott Wood - Nov. 8, 2011, 8:57 p.m.
On 11/08/2011 04:05 AM, Li Yang-R58472 wrote:
>> Subject: Re: [PATCH 2/7] powerpc/85xx: add HOTPLUG_CPU support
>>
>> On 11/04/2011 07:31 AM, Zhao Chenhui wrote:
>>> +static int is_corenet;
>>> +static void __cpuinit smp_85xx_setup_cpu(int cpu_nr);
>>> +
>>> +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PPC32)
>>
>> Why PPC32?
> 
> Not sure if it is the same for e5500.  E5500 support will be verified later.

It's better to have 64-bit silently do nothing here?

>>> +	flush_disable_L1();
>>
>> You'll also need to take down L2 on e500mc.
> 
> Right.  E500mc support is beyond this patch series.  We will work on it after the e500v2 support is finalized.

I saw some support with "is_corenet"...  If we don't support e500mc,
make sure the code doesn't try to run on e500mc.

This isn't an e500v2-only BSP you're putting the code into. :-)

>>> +	tmp = 0;
>>> +	if (cpu_has_feature(CPU_FTR_CAN_NAP))
>>> +		tmp = HID0_NAP;
>>> +	else if (cpu_has_feature(CPU_FTR_CAN_DOZE))
>>> +		tmp = HID0_DOZE;
>>
>> Those FTR bits are for what we can do in idle, and can be cleared if the
>> user sets CONFIG_BDI_SWITCH.
> 
> It is the powersave_nap variable that shows what we can do in idle.

No, that shows what the user wants to do in idle.

> I think it's correct to use the CPU_FTR_CAN_* macro as CPU capability.

This is 85xx-specific code.  We always can, and want to, nap here
(though the way we enter nap will be different on e500mc and up) -- even
if it's broken to use it for idle (such as on p4080, which would miss
doorbells as wake events).

>>> +static void __cpuinit smp_85xx_reset_core(int nr)
>>> +{
>>> +	__iomem u32 *vaddr, *pir_vaddr;
>>> +	u32 val, cpu_mask;
>>> +
>>> +	/* If CoreNet platform, use BRR as release register. */
>>> +	if (is_corenet) {
>>> +		cpu_mask = 1 << nr;
>>> +		vaddr = ioremap(get_immrbase() + MPC85xx_BRR_OFF, 4);
>>> +	} else {
>>> +		cpu_mask = 1 << (24 + nr);
>>> +		vaddr = ioremap(get_immrbase() + MPC85xx_ECM_EEBPCR_OFF, 4);
>>> +	}
>>
>> Please use the device tree node, not get_immrbase().
> 
> The get_immrbase() implementation uses the device tree node.

I mean the guts node.  get_immrbase() should be avoided where possible.

Also, some people might care about latency to enter/exit deep sleep.
Searching through the device tree at this point (rather than on init)
slows that down.

>>> +static int __cpuinit smp_85xx_map_bootpg(u32 page)
>>> +{
>>> +	__iomem u32 *bootpg_ptr;
>>> +	u32 bptr;
>>> +
>>> +	/* Get the BPTR */
>>> +	bootpg_ptr = ioremap(get_immrbase() + MPC85xx_BPTR_OFF, 4);
>>> +
>>> +	/* Set the BPTR to the secondary boot page */
>>> +	bptr = MPC85xx_BPTR_EN | (page & MPC85xx_BPTR_BOOT_PAGE_MASK);
>>> +	out_be32(bootpg_ptr, bptr);
>>> +
>>> +	iounmap(bootpg_ptr);
>>> +	return 0;
>>> +}
>>
>> Shouldn't the boot page already be set by U-Boot?
> 
> The register should be lost after a deep sleep.   Better to do it again.

How can it be lost after a deep sleep if we're relying on it to hold our
wakeup code?

It's not "better to do it again" if we're making a bad assumption about
the code and the table being in the same page.

>>>  	local_irq_save(flags);
>>>
>>> -	out_be32(bptr_vaddr + BOOT_ENTRY_PIR, nr);
>>> +	out_be32(&epapr->pir, hw_cpu);
>>>  #ifdef CONFIG_PPC32
>>> -	out_be32(bptr_vaddr + BOOT_ENTRY_ADDR_LOWER, __pa(__early_start));
>>> +#ifdef CONFIG_HOTPLUG_CPU
>>> +	if (system_state == SYSTEM_RUNNING) {
>>> +		out_be32(&epapr->addr_l, 0);
>>> +		smp_85xx_map_bootpg((u32)(*cpu_rel_addr >> PAGE_SHIFT));
>>
>> Why is this inside PPC32?
> 
> Not tested on 64-bit yet.  Might require a different implementation on PPC64.

Please make a reasonable effort to do things in a way that works on
both.  It shouldn't be a big deal to test that it doesn't break booting
on p5020.

>>>  	if (!ioremappable)
>>> -		flush_dcache_range((ulong)bptr_vaddr,
>>> -				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
>>> +		flush_dcache_range((ulong)epapr,
>>> +				(ulong)epapr + sizeof(struct epapr_entry));
>>
>> We don't wait for the core to come up on 64-bit?
> 
> Not sure about it.  But at least the normal boot up should be tested on P5020, right?

Well, that's a special case in that it only has one secondary. :-)

Or we could be getting lucky with timing.  It's not a new issue with
this patch, it just looks odd.

-Scott
Li Yang-R58472 - Nov. 9, 2011, 8:31 a.m.
>-----Original Message-----
>From: Wood Scott-B07421
>Sent: Wednesday, November 09, 2011 4:58 AM
>To: Li Yang-R58472
>Cc: Wood Scott-B07421; Zhao Chenhui-B35336; linuxppc-dev@lists.ozlabs.org
>Subject: Re: [PATCH 2/7] powerpc/85xx: add HOTPLUG_CPU support
>
>On 11/08/2011 04:05 AM, Li Yang-R58472 wrote:
>>> Subject: Re: [PATCH 2/7] powerpc/85xx: add HOTPLUG_CPU support
>>>
>>> On 11/04/2011 07:31 AM, Zhao Chenhui wrote:
>>>> +static int is_corenet;
>>>> +static void __cpuinit smp_85xx_setup_cpu(int cpu_nr);
>>>> +
>>>> +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PPC32)
>>>
>>> Why PPC32?
>>
>> Not sure if it is the same for e5500.  E5500 support will be verified
>later.
>
>It's better to have 64-bit silently do nothing here?
>
>>>> +	flush_disable_L1();
>>>
>>> You'll also need to take down L2 on e500mc.
>>
>> Right.  E500mc support is beyond this patch series.  We will work on it
>after the e500v2 support is finalized.
>
>I saw some support with "is_corenet"...  If we don't support e500mc, make
>sure the code doesn't try to run on e500mc.

The is_corenet code is just a start of the e500mc support and is far from complete.

>
>This isn't an e500v2-only BSP you're putting the code into. :-)

Yes, I know.  But it will take quite some time to perfect the support for different types of cores.  I'd like to split the effort into stages.  We want to push the e500v2 support in first and add support for newer cores later so that we don't need to re-spin the patches again and again.  And the upstream kernel can get the PM functionality at least for e500v2 earlier.  Anyway, we need to update the title of the patch to be more specific about e500v2.

>
>>>> +	tmp = 0;
>>>> +	if (cpu_has_feature(CPU_FTR_CAN_NAP))
>>>> +		tmp = HID0_NAP;
>>>> +	else if (cpu_has_feature(CPU_FTR_CAN_DOZE))
>>>> +		tmp = HID0_DOZE;
>>>
>>> Those FTR bits are for what we can do in idle, and can be cleared if
>>> the user sets CONFIG_BDI_SWITCH.
>>
>> It is powersave_nap variable shows what we can do in idle.
>
>No, that shows what the user wants to do in idle.
>
>> I think it's correct to use the CPU_FTR_CAN_* macro as CPU capability.
>
>This is 85xx-specific code.  We always can, and want to, nap here (though
>the way we enter nap will be different on e500mc and up) -- even if it's
>broken to use it for idle (such as on p4080, which would miss doorbells as
>wake events).

Ok.  Will stick to Nap.

>
>>>> +static void __cpuinit smp_85xx_reset_core(int nr) {
>>>> +	__iomem u32 *vaddr, *pir_vaddr;
>>>> +	u32 val, cpu_mask;
>>>> +
>>>> +	/* If CoreNet platform, use BRR as release register. */
>>>> +	if (is_corenet) {
>>>> +		cpu_mask = 1 << nr;
>>>> +		vaddr = ioremap(get_immrbase() + MPC85xx_BRR_OFF, 4);
>>>> +	} else {
>>>> +		cpu_mask = 1 << (24 + nr);
>>>> +		vaddr = ioremap(get_immrbase() + MPC85xx_ECM_EEBPCR_OFF, 4);
>>>> +	}
>>>
>>> Please use the device tree node, not get_immrbase().
>>
>> The get_immrbase() implementation uses the device tree node.
>
>I mean the guts node.  get_immrbase() should be avoided where possible.

Ok.  I got your point to use offset from guts.

>
>Also, some people might care about latency to enter/exit deep sleep.
>Searching through the device tree at this point (rather than on init)
>slows that down.

Actually the get_immrbase() is smarter than you thought. :) It only searches the device tree on the first call and returns the saved value on subsequent calls.

>
>>>> +static int __cpuinit smp_85xx_map_bootpg(u32 page) {
>>>> +	__iomem u32 *bootpg_ptr;
>>>> +	u32 bptr;
>>>> +
>>>> +	/* Get the BPTR */
>>>> +	bootpg_ptr = ioremap(get_immrbase() + MPC85xx_BPTR_OFF, 4);
>>>> +
>>>> +	/* Set the BPTR to the secondary boot page */
>>>> +	bptr = MPC85xx_BPTR_EN | (page & MPC85xx_BPTR_BOOT_PAGE_MASK);
>>>> +	out_be32(bootpg_ptr, bptr);
>>>> +
>>>> +	iounmap(bootpg_ptr);
>>>> +	return 0;
>>>> +}
>>>
>>> Shouldn't the boot page already be set by U-Boot?
>>
>> The register should be lost after a deep sleep.   Better to do it again.
>
>How can it be lost after a deep sleep if we're relying on it to hold our
>wakeup code?

In order to wake up from deep sleep, the boot page has been set to the deep sleep restoration function.  We need to set it back to the bootpage from u-boot.

>
>It's not "better to do it again" if we're making a bad assumption about
>the code and the table being in the same page.

Currently we are reusing the whole boot page including the spin-table from u-boot.  Are you suggesting to move just the boot page to kernel?

>
>>>>  	local_irq_save(flags);
>>>>
>>>> -	out_be32(bptr_vaddr + BOOT_ENTRY_PIR, nr);
>>>> +	out_be32(&epapr->pir, hw_cpu);
>>>>  #ifdef CONFIG_PPC32
>>>> -	out_be32(bptr_vaddr + BOOT_ENTRY_ADDR_LOWER, __pa(__early_start));
>>>> +#ifdef CONFIG_HOTPLUG_CPU
>>>> +	if (system_state == SYSTEM_RUNNING) {
>>>> +		out_be32(&epapr->addr_l, 0);
>>>> +		smp_85xx_map_bootpg((u32)(*cpu_rel_addr >> PAGE_SHIFT));
>>>
>>> Why is this inside PPC32?
>>
>> Not tested on 64-bit yet.  Might require a different implementation on
>PPC64.
>
>Please make a reasonable effort to do things in a way that works on both.
>It shouldn't be a big deal to test that it doesn't break booting on p5020.

Will do.  But in stages.

>
>>>>  	if (!ioremappable)
>>>> -		flush_dcache_range((ulong)bptr_vaddr,
>>>> -				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
>>>> +		flush_dcache_range((ulong)epapr,
>>>> +				(ulong)epapr + sizeof(struct epapr_entry));
>>>
>>> We don't wait for the core to come up on 64-bit?
>>
>> Not sure about it.  But at least the normal boot up should be tested on
>P5020, right?
>
>Well, that's a special case in that it only has one secondary. :-)
>
>Or we could be getting lucky with timing.  It's not a new issue with this
>patch, it just looks odd.

We will look into it more when doing the e5500 support.

- Leo
Scott Wood - Nov. 9, 2011, 4:12 p.m.
On Wed, Nov 09, 2011 at 02:31:23AM -0600, Li Yang-R58472 wrote:
> >-----Original Message-----
> >From: Wood Scott-B07421
> >Sent: Wednesday, November 09, 2011 4:58 AM
> >To: Li Yang-R58472
> >Cc: Wood Scott-B07421; Zhao Chenhui-B35336; linuxppc-dev@lists.ozlabs.org
> >Subject: Re: [PATCH 2/7] powerpc/85xx: add HOTPLUG_CPU support
> >
> >On 11/08/2011 04:05 AM, Li Yang-R58472 wrote:
> >>> Subject: Re: [PATCH 2/7] powerpc/85xx: add HOTPLUG_CPU support
> >>>
> >>>> +	flush_disable_L1();
> >>>
> >>> You'll also need to take down L2 on e500mc.
> >>
> >> Right.  E500mc support is beyond this patch series.  We will work on it
> >after the e500v2 support is finalized.
> >
> >I saw some support with "is_corenet"...  If we don't support e500mc, make
> >sure the code doesn't try to run on e500mc.
> 
> The is_corenet code is just a start of the e500mc support and is far from complete.
> 
> >
> >This isn't an e500v2-only BSP you're putting the code into. :-)
> 
> Yes, I know.  But it will take quite some time to perfect the support
> for different type of cores. 

That's fine, I'm just asking for it to be clearer that e500mc/corenet is TODO,
and ideally for the code to reject hotplug if is_corenet is set, until it's
supported.

> >>>> +static int __cpuinit smp_85xx_map_bootpg(u32 page) {
> >>>> +	__iomem u32 *bootpg_ptr;
> >>>> +	u32 bptr;
> >>>> +
> >>>> +	/* Get the BPTR */
> >>>> +	bootpg_ptr = ioremap(get_immrbase() + MPC85xx_BPTR_OFF, 4);
> >>>> +
> >>>> +	/* Set the BPTR to the secondary boot page */
> >>>> +	bptr = MPC85xx_BPTR_EN | (page & MPC85xx_BPTR_BOOT_PAGE_MASK);
> >>>> +	out_be32(bootpg_ptr, bptr);
> >>>> +
> >>>> +	iounmap(bootpg_ptr);
> >>>> +	return 0;
> >>>> +}
> >>>
> >>> Shouldn't the boot page already be set by U-Boot?
> >>
> >> The register should be lost after a deep sleep.   Better to do it again.
> >
> >How can it be lost after a deep sleep if we're relying on it to hold our
> >wakeup code?
> 
> In order to wake up from deep sleep, the boot page has been set to the
> deep sleep restoration function.  We need to set it back to the
> bootpage from u-boot.

Ah.  That deserves a code comment.

> >It's not "better to do it again" if we're making a bad assumption about
> >the code and the table being in the same page.
> 
> Currently we are reusing the whole boot page including the spin-table
> from u-boot.  Are you suggesting to move just the boot page to kernel?

Just that it might be better to make sure that we put the same value in
BPTR that was in it when Linux booted, rather than assume that the page
that holds the spin table itself is the right one.

-Scott
Benjamin Herrenschmidt - Nov. 11, 2011, 4:22 a.m.
On Fri, 2011-11-04 at 20:31 +0800, Zhao Chenhui wrote:

>  #ifdef CONFIG_SMP
>  /* When we get here, r24 needs to hold the CPU # */
>  	.globl __secondary_start
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 7bf2187..12a54f0 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -381,8 +381,14 @@ void generic_cpu_die(unsigned int cpu)
>  
>  	for (i = 0; i < 100; i++) {
>  		smp_rmb();
> -		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
> +		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
> +			/*
> +			 * After another core sets cpu_state to CPU_DEAD,
> +			 * it needs some time to die.
> +			 */
> +			msleep(10);
>  			return;
> +		}
>  		msleep(100);
>  	}
>  	printk(KERN_ERR "CPU%d didn't die...\n", cpu);

I don't really see why you need to wait. This loop is about waiting for
the CPU to mark itself DEAD, not necessarily to be physically powered
off.

> +struct epapr_entry {
> +	u32	addr_h;
> +	u32	addr_l;
> +	u32	r3_h;
> +	u32	r3_l;
> +	u32	reserved;
> +	u32	pir;
> +	u32	r6_h;
> +	u32	r6_l;
> +};

This should probably be in a generic location.

> +static int is_corenet;

This global is a bit gross...

 ...

> +
> +static void __cpuinit smp_85xx_reset_core(int nr)
> +{
> +	__iomem u32 *vaddr, *pir_vaddr;
> +	u32 val, cpu_mask;
> +
> +	/* If CoreNet platform, use BRR as release register. */
> +	if (is_corenet) {
> +		cpu_mask = 1 << nr;
> +		vaddr = ioremap(get_immrbase() + MPC85xx_BRR_OFF, 4);
> +	} else {
> +		cpu_mask = 1 << (24 + nr);
> +		vaddr = ioremap(get_immrbase() + MPC85xx_ECM_EEBPCR_OFF, 4);
> +	}

Instead, can't you instead have two functions using a common helper and
pick/hook the right one ?

Also in what context is that reset_core() called ? Doing ioremap isn't
something you can do at any random time...

Cheers,
Ben.
Zhao Chenhui - Nov. 11, 2011, 12:26 p.m.
> -----Original Message-----
> From: Benjamin Herrenschmidt [mailto:benh@kernel.crashing.org]
> Sent: Friday, November 11, 2011 12:23 PM
> To: Zhao Chenhui-B35336
> Cc: linuxppc-dev@lists.ozlabs.org
> Subject: Re: [PATCH 2/7] powerpc/85xx: add HOTPLUG_CPU support
> 
> On Fri, 2011-11-04 at 20:31 +0800, Zhao Chenhui wrote:
> 
> >  #ifdef CONFIG_SMP
> >  /* When we get here, r24 needs to hold the CPU # */
> >  	.globl __secondary_start
> > diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> > index 7bf2187..12a54f0 100644
> > --- a/arch/powerpc/kernel/smp.c
> > +++ b/arch/powerpc/kernel/smp.c
> > @@ -381,8 +381,14 @@ void generic_cpu_die(unsigned int cpu)
> >
> >  	for (i = 0; i < 100; i++) {
> >  		smp_rmb();
> > -		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
> > +		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
> > +			/*
> > +			 * After another core sets cpu_state to CPU_DEAD,
> > +			 * it needs some time to die.
> > +			 */
> > +			msleep(10);
> >  			return;
> > +		}
> >  		msleep(100);
> >  	}
> >  	printk(KERN_ERR "CPU%d didn't die...\n", cpu);
> 
> I don't really see why you need to wait. This loop is about waiting for
> the CPU to mark itself DEAD, not necessarily to be physically powered
> off.
> 

If I don't add this delay, the kernel hangs sometimes when hotplugging a core.
I will move this function to 85xx/smp.c in the next revision.

> > +struct epapr_entry {
> > +	u32	addr_h;
> > +	u32	addr_l;
> > +	u32	r3_h;
> > +	u32	r3_l;
> > +	u32	reserved;
> > +	u32	pir;
> > +	u32	r6_h;
> > +	u32	r6_l;
> > +};
> 
> This should probably be in a generic location.
> 
> > +static int is_corenet;
> 
> This global is a bit gross...
> 
>  ...

Agree. I will remove it.

> 
> > +
> > +static void __cpuinit smp_85xx_reset_core(int nr)
> > +{
> > +	__iomem u32 *vaddr, *pir_vaddr;
> > +	u32 val, cpu_mask;
> > +
> > +	/* If CoreNet platform, use BRR as release register. */
> > +	if (is_corenet) {
> > +		cpu_mask = 1 << nr;
> > +		vaddr = ioremap(get_immrbase() + MPC85xx_BRR_OFF, 4);
> > +	} else {
> > +		cpu_mask = 1 << (24 + nr);
> > +		vaddr = ioremap(get_immrbase() + MPC85xx_ECM_EEBPCR_OFF, 4);
> > +	}
> 
> Instead, can't you instead have two functions using a common helper and
> pick/hook the right one ?
> 
> Also in what context is that reset_core() called ? Doing ioremap isn't
> something you can do at any random time...
> 
> Cheers,
> Ben.
> 
> 

Thanks. I will fix them.

-chenhui

Patch

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 47682b6..dc7feba 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -212,7 +212,7 @@  config ARCH_HIBERNATION_POSSIBLE
 config ARCH_SUSPEND_POSSIBLE
 	def_bool y
 	depends on ADB_PMU || PPC_EFIKA || PPC_LITE5200 || PPC_83xx || \
-		   (PPC_85xx && !SMP) || PPC_86xx || PPC_PSERIES || 44x || 40x
+		   PPC_85xx || PPC_86xx || PPC_PSERIES || 44x || 40x
 
 config PPC_DCR_NATIVE
 	bool
@@ -323,7 +323,8 @@  config SWIOTLB
 
 config HOTPLUG_CPU
 	bool "Support for enabling/disabling CPUs"
-	depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC)
+	depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || \
+		PPC_PMAC || E500)
 	---help---
 	  Say Y here to be able to disable and re-enable individual
 	  CPUs at runtime on SMP machines.
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 5084592..d13ae54 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -899,6 +899,34 @@  _GLOBAL(flush_dcache_L1)
 
 	blr
 
+/* Flush L1 d-cache, invalidate and disable d-cache and i-cache */
+_GLOBAL(flush_disable_L1)
+	mflr	r10
+	bl	flush_dcache_L1	/* Flush L1 d-cache */
+	mtlr	r10
+
+	mfspr	r4, SPRN_L1CSR0	/* Invalidate and disable d-cache */
+	li	r5, 2
+	rlwimi	r4, r5, 0, 3
+
+	msync
+	isync
+	mtspr	SPRN_L1CSR0, r4
+	isync
+
+1:	mfspr	r4, SPRN_L1CSR0	/* Wait for the invalidate to finish */
+	andi.	r4, r4, 2
+	bne	1b
+
+	mfspr	r4, SPRN_L1CSR1	/* Invalidate and disable i-cache */
+	li	r5, 2
+	rlwimi	r4, r5, 0, 3
+
+	mtspr	SPRN_L1CSR1, r4
+	isync
+
+	blr
+
 #ifdef CONFIG_SMP
 /* When we get here, r24 needs to hold the CPU # */
 	.globl __secondary_start
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 7bf2187..12a54f0 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -381,8 +381,14 @@  void generic_cpu_die(unsigned int cpu)
 
 	for (i = 0; i < 100; i++) {
 		smp_rmb();
-		if (per_cpu(cpu_state, cpu) == CPU_DEAD)
+		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+			/*
+			 * After another core sets cpu_state to CPU_DEAD,
+			 * it needs some time to die.
+			 */
+			msleep(10);
 			return;
+		}
 		msleep(100);
 	}
 	printk(KERN_ERR "CPU%d didn't die...\n", cpu);
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 9b0de9c..5a54fc1 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -17,6 +17,7 @@ 
 #include <linux/of.h>
 #include <linux/kexec.h>
 #include <linux/highmem.h>
+#include <linux/cpu.h>
 
 #include <asm/machdep.h>
 #include <asm/pgtable.h>
@@ -30,26 +31,141 @@ 
 
 extern void __early_start(void);
 
-#define BOOT_ENTRY_ADDR_UPPER	0
-#define BOOT_ENTRY_ADDR_LOWER	1
-#define BOOT_ENTRY_R3_UPPER	2
-#define BOOT_ENTRY_R3_LOWER	3
-#define BOOT_ENTRY_RESV		4
-#define BOOT_ENTRY_PIR		5
-#define BOOT_ENTRY_R6_UPPER	6
-#define BOOT_ENTRY_R6_LOWER	7
-#define NUM_BOOT_ENTRY		8
-#define SIZE_BOOT_ENTRY		(NUM_BOOT_ENTRY * sizeof(u32))
-
-static int __init
-smp_85xx_kick_cpu(int nr)
+#define MPC85xx_BPTR_OFF		0x00020
+#define MPC85xx_BPTR_EN			0x80000000
+#define MPC85xx_BPTR_BOOT_PAGE_MASK	0x00ffffff
+#define MPC85xx_BRR_OFF			0xe0e4
+#define MPC85xx_ECM_EEBPCR_OFF		0x01010
+#define MPC85xx_PIC_PIR_OFF		0x41090
+
+struct epapr_entry {
+	u32	addr_h;
+	u32	addr_l;
+	u32	r3_h;
+	u32	r3_l;
+	u32	reserved;
+	u32	pir;
+	u32	r6_h;
+	u32	r6_l;
+};
+
+static int is_corenet;
+static void __cpuinit smp_85xx_setup_cpu(int cpu_nr);
+
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PPC32)
+extern void flush_disable_L1(void);
+
+static void __cpuinit smp_85xx_mach_cpu_die(void)
+{
+	unsigned int cpu = smp_processor_id();
+	register u32 tmp;
+
+	local_irq_disable();
+	idle_task_exit();
+	generic_set_cpu_dead(cpu);
+	smp_wmb();
+
+	mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
+	mtspr(SPRN_TCR, 0);
+
+	flush_disable_L1();
+
+	tmp = 0;
+	if (cpu_has_feature(CPU_FTR_CAN_NAP))
+		tmp = HID0_NAP;
+	else if (cpu_has_feature(CPU_FTR_CAN_DOZE))
+		tmp = HID0_DOZE;
+	if (tmp) {
+		tmp |= mfspr(SPRN_HID0) & ~(HID0_DOZE|HID0_NAP|HID0_SLEEP);
+
+		smp_mb();
+		isync();
+		mtspr(SPRN_HID0, tmp);
+		isync();
+
+		tmp = mfmsr();
+		tmp |= MSR_WE;
+		smp_mb();
+		mtmsr(tmp);
+		isync();
+	}
+
+	for (;;);
+}
+
+static void __cpuinit smp_85xx_reset_core(int nr)
+{
+	__iomem u32 *vaddr, *pir_vaddr;
+	u32 val, cpu_mask;
+
+	/* If CoreNet platform, use BRR as release register. */
+	if (is_corenet) {
+		cpu_mask = 1 << nr;
+		vaddr = ioremap(get_immrbase() + MPC85xx_BRR_OFF, 4);
+	} else {
+		cpu_mask = 1 << (24 + nr);
+		vaddr = ioremap(get_immrbase() + MPC85xx_ECM_EEBPCR_OFF, 4);
+	}
+	val = in_be32(vaddr);
+	if (!(val & cpu_mask)) {
+		out_be32(vaddr, val | cpu_mask);
+	} else {
+		/* reset core */
+		pir_vaddr = ioremap(get_immrbase() + MPC85xx_PIC_PIR_OFF, 4);
+		val = in_be32(pir_vaddr);
+		/* reset assert */
+		val |= (1 << nr);
+		out_be32(pir_vaddr, val);
+		val = in_be32(pir_vaddr);
+		val &= ~(1 << nr);
+		/* reset negate */
+		out_be32(pir_vaddr, val);
+		iounmap(pir_vaddr);
+	}
+	iounmap(vaddr);
+}
+
+static int __cpuinit smp_85xx_map_bootpg(u32 page)
+{
+	__iomem u32 *bootpg_ptr;
+	u32 bptr;
+
+	/* Get the BPTR */
+	bootpg_ptr = ioremap(get_immrbase() + MPC85xx_BPTR_OFF, 4);
+
+	/* Set the BPTR to the secondary boot page */
+	bptr = MPC85xx_BPTR_EN | (page & MPC85xx_BPTR_BOOT_PAGE_MASK);
+	out_be32(bootpg_ptr, bptr);
+
+	iounmap(bootpg_ptr);
+	return 0;
+}
+
+static int __cpuinit smp_85xx_unmap_bootpg(void)
+{
+	__iomem u32 *bootpg_ptr;
+
+	/* Get the BPTR */
+	bootpg_ptr = ioremap(get_immrbase() + MPC85xx_BPTR_OFF, 4);
+
+	/* Restore the BPTR */
+	if (in_be32(bootpg_ptr) & MPC85xx_BPTR_EN)
+		out_be32(bootpg_ptr, 0);
+
+	iounmap(bootpg_ptr);
+	return 0;
+}
+#endif
+
+static int __cpuinit smp_85xx_kick_cpu(int nr)
 {
 	unsigned long flags;
 	const u64 *cpu_rel_addr;
-	__iomem u32 *bptr_vaddr;
+	__iomem struct epapr_entry *epapr;
 	struct device_node *np;
-	int n = 0;
+	int n = 0, hw_cpu = get_hard_smp_processor_id(nr);
 	int ioremappable;
+	int ret = 0;
 
 	WARN_ON (nr < 0 || nr >= NR_CPUS);
 
@@ -73,46 +189,79 @@  smp_85xx_kick_cpu(int nr)
 
 	/* Map the spin table */
 	if (ioremappable)
-		bptr_vaddr = ioremap(*cpu_rel_addr, SIZE_BOOT_ENTRY);
+		epapr = ioremap(*cpu_rel_addr, sizeof(struct epapr_entry));
 	else
-		bptr_vaddr = phys_to_virt(*cpu_rel_addr);
+		epapr = phys_to_virt(*cpu_rel_addr);
 
 	local_irq_save(flags);
 
-	out_be32(bptr_vaddr + BOOT_ENTRY_PIR, nr);
+	out_be32(&epapr->pir, hw_cpu);
 #ifdef CONFIG_PPC32
-	out_be32(bptr_vaddr + BOOT_ENTRY_ADDR_LOWER, __pa(__early_start));
+#ifdef CONFIG_HOTPLUG_CPU
+	if (system_state == SYSTEM_RUNNING) {
+		out_be32(&epapr->addr_l, 0);
+		smp_85xx_map_bootpg((u32)(*cpu_rel_addr >> PAGE_SHIFT));
+
+		smp_85xx_reset_core(hw_cpu);
+
+		/* wait until core is ready... */
+		n = 0;
+		while ((in_be32(&epapr->addr_l) != 1) && (++n < 1000))
+			udelay(100);
+		if (n > 1000) {
+			pr_err("timeout waiting for core%d to reset\n",	nr);
+			ret = -ENOENT;
+			goto out;
+		}
+		/*  clear the acknowledge status */
+		__secondary_hold_acknowledge = -1;
+
+		smp_85xx_unmap_bootpg();
+	}
+#endif
+	out_be32(&epapr->addr_l, __pa(__early_start));
 
 	if (!ioremappable)
-		flush_dcache_range((ulong)bptr_vaddr,
-				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
+		flush_dcache_range((ulong)epapr,
+				(ulong)epapr + sizeof(struct epapr_entry));
 
 	/* Wait a bit for the CPU to ack. */
-	while ((__secondary_hold_acknowledge != nr) && (++n < 1000))
+	n = 0;
+	while ((__secondary_hold_acknowledge != hw_cpu) && (++n < 1000))
 		mdelay(1);
+	if (n > 1000) {
+		pr_err("timeout waiting for core%d to ack\n", nr);
+		ret = -ENOENT;
+		goto out;
+	}
+out:
 #else
 	smp_generic_kick_cpu(nr);
 
-	out_be64((u64 *)(bptr_vaddr + BOOT_ENTRY_ADDR_UPPER),
+	out_be64((u64 *)(&epapr->addr_h),
 		__pa((u64)*((unsigned long long *) generic_secondary_smp_init)));
 
 	if (!ioremappable)
-		flush_dcache_range((ulong)bptr_vaddr,
-				(ulong)(bptr_vaddr + SIZE_BOOT_ENTRY));
+		flush_dcache_range((ulong)epapr,
+				(ulong)epapr + sizeof(struct epapr_entry));
 #endif
-
 	local_irq_restore(flags);
 
 	if (ioremappable)
-		iounmap(bptr_vaddr);
+		iounmap(epapr);
 
 	pr_debug("waited %d msecs for CPU #%d.\n", n, nr);
 
-	return 0;
+	return ret;
 }
 
 struct smp_ops_t smp_85xx_ops = {
 	.kick_cpu = smp_85xx_kick_cpu,
+	.setup_cpu	= smp_85xx_setup_cpu,
+#ifdef CONFIG_HOTPLUG_CPU
+	.cpu_disable	= generic_cpu_disable,
+	.cpu_die	= generic_cpu_die,
+#endif
 	.give_timebase	= smp_generic_give_timebase,
 	.take_timebase	= smp_generic_take_timebase,
 };
@@ -214,8 +363,7 @@  static void mpc85xx_smp_machine_kexec(struct kimage *image)
 }
 #endif /* CONFIG_KEXEC */
 
-static void __init
-smp_85xx_setup_cpu(int cpu_nr)
+static void __cpuinit smp_85xx_setup_cpu(int cpu_nr)
 {
 	if (smp_85xx_ops.probe == smp_mpic_probe)
 		mpic_setup_this_cpu();
@@ -228,14 +376,18 @@  void __init mpc85xx_smp_init(void)
 {
 	struct device_node *np;
 
-	smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu;
-
 	np = of_find_node_by_type(NULL, "open-pic");
 	if (np) {
 		smp_85xx_ops.probe = smp_mpic_probe;
 		smp_85xx_ops.message_pass = smp_mpic_message_pass;
 	}
 
+	/* Check if the chip is based on CoreNet platform. */
+	is_corenet = 0;
+	np = of_find_compatible_node(NULL, NULL, "fsl,qoriq-device-config-1.0");
+	if (np)
+		is_corenet = 1;
+
 	if (cpu_has_feature(CPU_FTR_DBELL)) {
 		/*
 		 * If left NULL, .message_pass defaults to
@@ -244,6 +396,10 @@  void __init mpc85xx_smp_init(void)
 		smp_85xx_ops.cause_ipi = doorbell_cause_ipi;
 	}
 
+#ifdef CONFIG_HOTPLUG_CPU
+	ppc_md.cpu_die		= smp_85xx_mach_cpu_die;
+#endif
+
 	smp_ops = &smp_85xx_ops;
 
 #ifdef CONFIG_KEXEC