diff mbox series

[2/2] sunxi: enable dual rank memory on R40

Message ID 20210225161325.94019-3-icenowy@aosc.io
State Accepted
Delegated to: Andre Przywara
Headers show
Series A64/R40 DRAM controller dual-rank-related changes | expand

Commit Message

Icenowy Zheng Feb. 25, 2021, 4:13 p.m. UTC
Previously we do not have proper dual rank memory detection on R40
(because we omitted PIR_QSGATE, which does not work on R40 with our
configuration), and dual rank memory is just simply disabled as early
R40 boards available (Banana Pi M2 Ultra and Berry) have single rank
memory.

As a board with dual rank memory (Forlinx OKA40i-C) is now known to us,
we need to have a way to do memory rank detection to support that board.

Add some routine to detect memory rank by trying to access the memory
in rank 1 and check for error status of the memory controller, and then
enable dual rank memory on R40.

Similar routine can be used to detect half DQ width (which is also
detected by PIR_QSGATE on other SoCs), but it's left unimplemented
because there's no known R40 board with half DQ width now.

Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
---
 arch/arm/mach-sunxi/dram_sunxi_dw.c | 55 +++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 6 deletions(-)

Comments

Ivan Uvarov Feb. 26, 2021, 2:04 p.m. UTC | #1
I have tested this patchset on both OKA40i-C and our custom carrier
board which uses the same SoM (Forlinx FETA40i-C), and can confirm that:
1) It does enable functioning of U-Boot on the FETA40i SoM, and
2) I have not experienced any immediately apparent problems with DRAM in
either U-Boot itself or the loaded OS during my testing.


Tested-by: Ivan Uvarov <i.uvarov@cognitivepilot.com>


On 2/25/21 7:13 PM, Icenowy Zheng wrote:
> Previously we do not have proper dual rank memory detection on R40
> (because we omitted PIR_QSGATE, which does not work on R40 with our
> configuration), and dual rank memory is just simply disabled as early
> R40 boards available (Banana Pi M2 Ultra and Berry) have single rank
> memory.
> 
> As a board with dual rank memory (Forlinx OKA40i-C) is now known to us,
> we need to have a way to do memory rank detection to support that board.
> 
> Add some routine to detect memory rank by trying to access the memory
> in rank 1 and check for error status of the memory controller, and then
> enable dual rank memory on R40.
> 
> Similar routine can be used to detect half DQ width (which is also
> detected by PIR_QSGATE on other SoCs), but it's left unimplemented
> because there's no known R40 board with half DQ width now.
> 
> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
> ---
>  arch/arm/mach-sunxi/dram_sunxi_dw.c | 55 +++++++++++++++++++++++++----
>  1 file changed, 49 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c b/arch/arm/mach-sunxi/dram_sunxi_dw.c
> index 2b9d631d49..b86ae7cdf3 100644
> --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c
> +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c
> @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct dram_para *para)
>  	}
>  
>  	if (socid == SOCID_R40) {
> -		if (para->dual_rank)
> -			panic("Dual rank memory not supported\n");
> -
>  		/* Mux pin to A15 address line for single rank memory. */
> -		setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
> +		if (!para->dual_rank)
> +			setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
>  	}
>  }
>  
> @@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct rank_para *rank)
>  	return (1UL << (rank->row_bits + rank->bank_bits)) * rank->page_size;
>  }
>  
> +/*
> + * Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which leads
> + * to failure), it's needed to detect the rank count of R40 in another way.
> + *
> + * The code here is modelled after time_out_detect() in BSP, which tries to
> + * access the memory and check for error code.
> + *
> + * TODO: auto detect half DQ width here
> + */
> +static void mctl_r40_detect_rank_count(struct dram_para *para)
> +{
> +	ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
> +			   mctl_calc_rank_size(&para->ranks[0]);
> +	struct sunxi_mctl_ctl_reg * const mctl_ctl =
> +			(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
> +
> +	/* Enable read time out */
> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> +
> +	(void) readl((void *) rank1_base);
> +	udelay(10);
> +
> +	if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
> +		clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
> +		para->dual_rank = 0;
> +	}
> +
> +	/* Reset PHY FIFO to clear it */
> +	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> +	udelay(100);
> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> +
> +	/* Clear error status */
> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
> +
> +	/* Clear time out flag */
> +	clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
> +
> +	/* Disable read time out */
> +	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> +}
> +
>  static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para)
>  {
> +	if (socid == SOCID_R40) {
> +		mctl_r40_detect_rank_count(para);
> +		mctl_set_cr(socid, para);
> +	}
> +
>  	mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE, &para->ranks[0]);
>  
>  	if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank) {
> @@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void)
>  	uint16_t socid = SOCID_H3;
>  #elif defined(CONFIG_MACH_SUN8I_R40)
>  	uint16_t socid = SOCID_R40;
> -	/* Currently we cannot support R40 with dual rank memory */
> -	para.dual_rank = 0;
>  #elif defined(CONFIG_MACH_SUN8I_V3S)
>  	uint16_t socid = SOCID_V3S;
>  #elif defined(CONFIG_MACH_SUN50I)
>
Andre Przywara March 2, 2021, 1:40 p.m. UTC | #2
On Fri, 26 Feb 2021 00:13:25 +0800
Icenowy Zheng <icenowy@aosc.io> wrote:

> Previously we do not have proper dual rank memory detection on R40
> (because we omitted PIR_QSGATE, which does not work on R40 with our
> configuration), and dual rank memory is just simply disabled as early
> R40 boards available (Banana Pi M2 Ultra and Berry) have single rank
> memory.
> 
> As a board with dual rank memory (Forlinx OKA40i-C) is now known to us,
> we need to have a way to do memory rank detection to support that board.
> 
> Add some routine to detect memory rank by trying to access the memory
> in rank 1 and check for error status of the memory controller, and then
> enable dual rank memory on R40.
> 
> Similar routine can be used to detect half DQ width (which is also
> detected by PIR_QSGATE on other SoCs), but it's left unimplemented
> because there's no known R40 board with half DQ width now.

So when looking at the SPL size I realised that this patch breaks the
socid constant parameter propagation, especially for mctl_set_cr(). I
don't see immediately why, this whole code here should be compiled out
on any A64 builds, for instance. Instead GCC turns
"<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes 0x1689
around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding const
everywhere, to amplify the constant nature of this value. Patch
1/2 added to the code size, but kept the const propagation (only one
instance of 0x1689 in the disassembly). This patch here should be for
free on A64, but adds 104 bytes.

Does anyone have a clue why this is happening? Is this a compiler issue?

Cheers,
Andre
 
> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
> ---
>  arch/arm/mach-sunxi/dram_sunxi_dw.c | 55 +++++++++++++++++++++++++----
>  1 file changed, 49 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c b/arch/arm/mach-sunxi/dram_sunxi_dw.c
> index 2b9d631d49..b86ae7cdf3 100644
> --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c
> +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c
> @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct dram_para *para)
>  	}
>  
>  	if (socid == SOCID_R40) {
> -		if (para->dual_rank)
> -			panic("Dual rank memory not supported\n");
> -
>  		/* Mux pin to A15 address line for single rank memory. */
> -		setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
> +		if (!para->dual_rank)
> +			setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
>  	}
>  }
>  
> @@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct rank_para *rank)
>  	return (1UL << (rank->row_bits + rank->bank_bits)) * rank->page_size;
>  }
>  
> +/*
> + * Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which leads
> + * to failure), it's needed to detect the rank count of R40 in another way.
> + *
> + * The code here is modelled after time_out_detect() in BSP, which tries to
> + * access the memory and check for error code.
> + *
> + * TODO: auto detect half DQ width here
> + */
> +static void mctl_r40_detect_rank_count(struct dram_para *para)
> +{
> +	ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
> +			   mctl_calc_rank_size(&para->ranks[0]);
> +	struct sunxi_mctl_ctl_reg * const mctl_ctl =
> +			(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
> +
> +	/* Enable read time out */
> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> +
> +	(void) readl((void *) rank1_base);
> +	udelay(10);
> +
> +	if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
> +		clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
> +		para->dual_rank = 0;
> +	}
> +
> +	/* Reset PHY FIFO to clear it */
> +	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> +	udelay(100);
> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> +
> +	/* Clear error status */
> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
> +
> +	/* Clear time out flag */
> +	clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
> +
> +	/* Disable read time out */
> +	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> +}
> +
>  static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para)
>  {
> +	if (socid == SOCID_R40) {
> +		mctl_r40_detect_rank_count(para);
> +		mctl_set_cr(socid, para);
> +	}
> +
>  	mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE, &para->ranks[0]);
>  
>  	if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank) {
> @@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void)
>  	uint16_t socid = SOCID_H3;
>  #elif defined(CONFIG_MACH_SUN8I_R40)
>  	uint16_t socid = SOCID_R40;
> -	/* Currently we cannot support R40 with dual rank memory */
> -	para.dual_rank = 0;
>  #elif defined(CONFIG_MACH_SUN8I_V3S)
>  	uint16_t socid = SOCID_V3S;
>  #elif defined(CONFIG_MACH_SUN50I)
Icenowy Zheng March 2, 2021, 1:50 p.m. UTC | #3
于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara <andre.przywara@arm.com> 写到:
>On Fri, 26 Feb 2021 00:13:25 +0800
>Icenowy Zheng <icenowy@aosc.io> wrote:
>
>> Previously we do not have proper dual rank memory detection on R40
>> (because we omitted PIR_QSGATE, which does not work on R40 with our
>> configuration), and dual rank memory is just simply disabled as early
>> R40 boards available (Banana Pi M2 Ultra and Berry) have single rank
>> memory.
>> 
>> As a board with dual rank memory (Forlinx OKA40i-C) is now known to
>us,
>> we need to have a way to do memory rank detection to support that
>board.
>> 
>> Add some routine to detect memory rank by trying to access the memory
>> in rank 1 and check for error status of the memory controller, and
>then
>> enable dual rank memory on R40.
>> 
>> Similar routine can be used to detect half DQ width (which is also
>> detected by PIR_QSGATE on other SoCs), but it's left unimplemented
>> because there's no known R40 board with half DQ width now.
>
>So when looking at the SPL size I realised that this patch breaks the
>socid constant parameter propagation, especially for mctl_set_cr(). I
>don't see immediately why, this whole code here should be compiled out
>on any A64 builds, for instance. Instead GCC turns
>"<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes 0x1689
>around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding const
>everywhere, to amplify the constant nature of this value. Patch
>1/2 added to the code size, but kept the const propagation (only one
>instance of 0x1689 in the disassembly). This patch here should be for
>free on A64, but adds 104 bytes.
>
>Does anyone have a clue why this is happening? Is this a compiler
>issue?

Maybe the issue is that I assume this codepath is only for R40 and
I didn't add socid to it?

Could you try to add a socid parameter to mctl_r40_detect_rank_count()?

Or maybe it makes mctl_calc_rank_size() not inlined?

(Well I think the code looks noop on non-R40)

BTW, original rank/width detection part won't get called on R40. But
R40 is not where we are tight on code size, so I didn't bother to ignore
it on R40.

Or should we switch back to #if's and do not rely on compiler behavior any longer?

>
>Cheers,
>Andre
> 
>> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
>> ---
>>  arch/arm/mach-sunxi/dram_sunxi_dw.c | 55
>+++++++++++++++++++++++++----
>>  1 file changed, 49 insertions(+), 6 deletions(-)
>> 
>> diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c
>b/arch/arm/mach-sunxi/dram_sunxi_dw.c
>> index 2b9d631d49..b86ae7cdf3 100644
>> --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c
>> +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c
>> @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct
>dram_para *para)
>>  	}
>>  
>>  	if (socid == SOCID_R40) {
>> -		if (para->dual_rank)
>> -			panic("Dual rank memory not supported\n");
>> -
>>  		/* Mux pin to A15 address line for single rank memory. */
>> -		setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
>> +		if (!para->dual_rank)
>> +			setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
>>  	}
>>  }
>>  
>> @@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct
>rank_para *rank)
>>  	return (1UL << (rank->row_bits + rank->bank_bits)) *
>rank->page_size;
>>  }
>>  
>> +/*
>> + * Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which
>leads
>> + * to failure), it's needed to detect the rank count of R40 in
>another way.
>> + *
>> + * The code here is modelled after time_out_detect() in BSP, which
>tries to
>> + * access the memory and check for error code.
>> + *
>> + * TODO: auto detect half DQ width here
>> + */
>> +static void mctl_r40_detect_rank_count(struct dram_para *para)
>> +{
>> +	ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
>> +			   mctl_calc_rank_size(&para->ranks[0]);
>> +	struct sunxi_mctl_ctl_reg * const mctl_ctl =
>> +			(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
>> +
>> +	/* Enable read time out */
>> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
>> +
>> +	(void) readl((void *) rank1_base);
>> +	udelay(10);
>> +
>> +	if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
>> +		clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
>> +		para->dual_rank = 0;
>> +	}
>> +
>> +	/* Reset PHY FIFO to clear it */
>> +	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
>> +	udelay(100);
>> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
>> +
>> +	/* Clear error status */
>> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
>> +
>> +	/* Clear time out flag */
>> +	clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
>> +
>> +	/* Disable read time out */
>> +	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
>> +}
>> +
>>  static void mctl_auto_detect_dram_size(uint16_t socid, struct
>dram_para *para)
>>  {
>> +	if (socid == SOCID_R40) {
>> +		mctl_r40_detect_rank_count(para);
>> +		mctl_set_cr(socid, para);
>> +	}
>> +
>>  	mctl_auto_detect_dram_size_rank(socid, para,
>(ulong)CONFIG_SYS_SDRAM_BASE, &para->ranks[0]);
>>  
>>  	if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank)
>{
>> @@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void)
>>  	uint16_t socid = SOCID_H3;
>>  #elif defined(CONFIG_MACH_SUN8I_R40)
>>  	uint16_t socid = SOCID_R40;
>> -	/* Currently we cannot support R40 with dual rank memory */
>> -	para.dual_rank = 0;
>>  #elif defined(CONFIG_MACH_SUN8I_V3S)
>>  	uint16_t socid = SOCID_V3S;
>>  #elif defined(CONFIG_MACH_SUN50I)
Andre Przywara March 2, 2021, 3:19 p.m. UTC | #4
On Tue, 02 Mar 2021 21:50:49 +0800
Icenowy Zheng <icenowy@aosc.io> wrote:

Hi Icenowy,

> 于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara <andre.przywara@arm.com> 写到:
> >On Fri, 26 Feb 2021 00:13:25 +0800
> >Icenowy Zheng <icenowy@aosc.io> wrote:
> >  
> >> Previously we do not have proper dual rank memory detection on R40
> >> (because we omitted PIR_QSGATE, which does not work on R40 with our
> >> configuration), and dual rank memory is just simply disabled as early
> >> R40 boards available (Banana Pi M2 Ultra and Berry) have single rank
> >> memory.
> >> 
> >> As a board with dual rank memory (Forlinx OKA40i-C) is now known to  
> >us,  
> >> we need to have a way to do memory rank detection to support that  
> >board.  
> >> 
> >> Add some routine to detect memory rank by trying to access the memory
> >> in rank 1 and check for error status of the memory controller, and  
> >then  
> >> enable dual rank memory on R40.
> >> 
> >> Similar routine can be used to detect half DQ width (which is also
> >> detected by PIR_QSGATE on other SoCs), but it's left unimplemented
> >> because there's no known R40 board with half DQ width now.  
> >
> >So when looking at the SPL size I realised that this patch breaks the
> >socid constant parameter propagation, especially for mctl_set_cr(). I
> >don't see immediately why, this whole code here should be compiled out
> >on any A64 builds, for instance. Instead GCC turns
> >"<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes 0x1689
> >around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding const
> >everywhere, to amplify the constant nature of this value. Patch
> >1/2 added to the code size, but kept the const propagation (only one
> >instance of 0x1689 in the disassembly). This patch here should be for
> >free on A64, but adds 104 bytes.
> >
> >Does anyone have a clue why this is happening? Is this a compiler
> >issue?  
> 
> Maybe the issue is that I assume this codepath is only for R40 and
> I didn't add socid to it?

But that's clearly visible by this function only being called inside "if
(socid == SOCID_R40)". And that works very well for the H3 ZQ
calibration quirk, for instance.

> Could you try to add a socid parameter to mctl_r40_detect_rank_count()?

I just tried that, and apart from looking weird it didn't do anything.

To be clear: Your code is absolutely fine, exactly the way it should be
written. It's just that the compiler is playing stupid suddenly. I was
thinking that your dummy readl might have upset it, but even with that
commented out it's still happening.

> Or maybe it makes mctl_calc_rank_size() not inlined?

So the assembly looks like everything apart from mctl_set_cr() and
mctl_auto_detect_dram_size_rank() is inlined. Those are extra because
they are called multiple times and we are using -Os.
 
> (Well I think the code looks noop on non-R40)

Exactly that was my thinking: when compiling for the A64, it should be
totally compiled out, and the object file should be the same before and
after.
 
> BTW, original rank/width detection part won't get called on R40. But
> R40 is not where we are tight on code size, so I didn't bother to ignore
> it on R40.

I see. Yeah, I am not concerned about R40 either, but I want to avoid
the A64 bloating up. 

> Or should we switch back to #if's and do not rely on compiler behavior any longer?

I'd rather not do that. We are doing the right thing, and it works
marvellously so far.

To be clear: it's not a show stopper, I was just curious why this
happens.
The code size is not really critical on the A64 at the moment, so I'd
merge it as it, even if we don't find a solution. Maybe just leave a
hint about it in the code should people need to come back to this.

I also asked some compiler buffs about it, but it's not exactly the
simple reproducer case they like to deal with ;-)

Cheers,
Andre



> 
> >
> >Cheers,
> >Andre
> >   
> >> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
> >> ---
> >>  arch/arm/mach-sunxi/dram_sunxi_dw.c | 55  
> >+++++++++++++++++++++++++----  
> >>  1 file changed, 49 insertions(+), 6 deletions(-)
> >> 
> >> diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c  
> >b/arch/arm/mach-sunxi/dram_sunxi_dw.c  
> >> index 2b9d631d49..b86ae7cdf3 100644
> >> --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c
> >> +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c
> >> @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid, struct  
> >dram_para *para)  
> >>  	}
> >>  
> >>  	if (socid == SOCID_R40) {
> >> -		if (para->dual_rank)
> >> -			panic("Dual rank memory not supported\n");
> >> -
> >>  		/* Mux pin to A15 address line for single rank memory. */
> >> -		setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
> >> +		if (!para->dual_rank)
> >> +			setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
> >>  	}
> >>  }
> >>  
> >> @@ -702,8 +700,55 @@ static unsigned long mctl_calc_rank_size(struct  
> >rank_para *rank)  
> >>  	return (1UL << (rank->row_bits + rank->bank_bits)) *  
> >rank->page_size;  
> >>  }
> >>  
> >> +/*
> >> + * Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which  
> >leads  
> >> + * to failure), it's needed to detect the rank count of R40 in  
> >another way.  
> >> + *
> >> + * The code here is modelled after time_out_detect() in BSP, which  
> >tries to  
> >> + * access the memory and check for error code.
> >> + *
> >> + * TODO: auto detect half DQ width here
> >> + */
> >> +static void mctl_r40_detect_rank_count(struct dram_para *para)
> >> +{
> >> +	ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
> >> +			   mctl_calc_rank_size(&para->ranks[0]);
> >> +	struct sunxi_mctl_ctl_reg * const mctl_ctl =
> >> +			(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
> >> +
> >> +	/* Enable read time out */
> >> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> >> +
> >> +	(void) readl((void *) rank1_base);
> >> +	udelay(10);
> >> +
> >> +	if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
> >> +		clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
> >> +		para->dual_rank = 0;
> >> +	}
> >> +
> >> +	/* Reset PHY FIFO to clear it */
> >> +	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> >> +	udelay(100);
> >> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> >> +
> >> +	/* Clear error status */
> >> +	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
> >> +
> >> +	/* Clear time out flag */
> >> +	clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
> >> +
> >> +	/* Disable read time out */
> >> +	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> >> +}
> >> +
> >>  static void mctl_auto_detect_dram_size(uint16_t socid, struct  
> >dram_para *para)  
> >>  {
> >> +	if (socid == SOCID_R40) {
> >> +		mctl_r40_detect_rank_count(para);
> >> +		mctl_set_cr(socid, para);
> >> +	}
> >> +
> >>  	mctl_auto_detect_dram_size_rank(socid, para,  
> >(ulong)CONFIG_SYS_SDRAM_BASE, &para->ranks[0]);  
> >>  
> >>  	if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank)  
> >{  
> >> @@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void)
> >>  	uint16_t socid = SOCID_H3;
> >>  #elif defined(CONFIG_MACH_SUN8I_R40)
> >>  	uint16_t socid = SOCID_R40;
> >> -	/* Currently we cannot support R40 with dual rank memory */
> >> -	para.dual_rank = 0;
> >>  #elif defined(CONFIG_MACH_SUN8I_V3S)
> >>  	uint16_t socid = SOCID_V3S;
> >>  #elif defined(CONFIG_MACH_SUN50I)
Icenowy Zheng March 2, 2021, 5:49 p.m. UTC | #5
在 2021-03-02星期二的 15:19 +0000,Andre Przywara写道:
> On Tue, 02 Mar 2021 21:50:49 +0800
> Icenowy Zheng <icenowy@aosc.io> wrote:
> 
> Hi Icenowy,
> 
> > 于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara <
> > andre.przywara@arm.com> 写到:
> > > On Fri, 26 Feb 2021 00:13:25 +0800
> > > Icenowy Zheng <icenowy@aosc.io> wrote:
> > >  
> > > > Previously we do not have proper dual rank memory detection on
> > > > R40
> > > > (because we omitted PIR_QSGATE, which does not work on R40 with
> > > > our
> > > > configuration), and dual rank memory is just simply disabled as
> > > > early
> > > > R40 boards available (Banana Pi M2 Ultra and Berry) have single
> > > > rank
> > > > memory.
> > > > 
> > > > As a board with dual rank memory (Forlinx OKA40i-C) is now
> > > > known to  
> > > us,  
> > > > we need to have a way to do memory rank detection to support
> > > > that  
> > > board.  
> > > > 
> > > > Add some routine to detect memory rank by trying to access the
> > > > memory
> > > > in rank 1 and check for error status of the memory controller,
> > > > and  
> > > then  
> > > > enable dual rank memory on R40.
> > > > 
> > > > Similar routine can be used to detect half DQ width (which is
> > > > also
> > > > detected by PIR_QSGATE on other SoCs), but it's left
> > > > unimplemented
> > > > because there's no known R40 board with half DQ width now.  
> > > 
> > > So when looking at the SPL size I realised that this patch breaks
> > > the
> > > socid constant parameter propagation, especially for
> > > mctl_set_cr(). I
> > > don't see immediately why, this whole code here should be
> > > compiled out
> > > on any A64 builds, for instance. Instead GCC turns
> > > "<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes
> > > 0x1689
> > > around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding
> > > const
> > > everywhere, to amplify the constant nature of this value. Patch
> > > 1/2 added to the code size, but kept the const propagation (only
> > > one
> > > instance of 0x1689 in the disassembly). This patch here should be
> > > for
> > > free on A64, but adds 104 bytes.
> > > 
> > > Does anyone have a clue why this is happening? Is this a compiler
> > > issue?  
> > 
> > Maybe the issue is that I assume this codepath is only for R40 and
> > I didn't add socid to it?
> 
> But that's clearly visible by this function only being called inside
> "if
> (socid == SOCID_R40)". And that works very well for the H3 ZQ
> calibration quirk, for instance.
> 
> > Could you try to add a socid parameter to
> > mctl_r40_detect_rank_count()?
> 
> I just tried that, and apart from looking weird it didn't do
> anything.
> 
> To be clear: Your code is absolutely fine, exactly the way it should
> be
> written. It's just that the compiler is playing stupid suddenly. I
> was
> thinking that your dummy readl might have upset it, but even with
> that
> commented out it's still happening.
> 
> > Or maybe it makes mctl_calc_rank_size() not inlined?
> 
> So the assembly looks like everything apart from mctl_set_cr() and
> mctl_auto_detect_dram_size_rank() is inlined. Those are extra because
> they are called multiple times and we are using -Os.
>  
> > (Well I think the code looks noop on non-R40)
> 
> Exactly that was my thinking: when compiling for the A64, it should
> be
> totally compiled out, and the object file should be the same before
> and
> after.
>  
> > BTW, original rank/width detection part won't get called on R40.
> > But
> > R40 is not where we are tight on code size, so I didn't bother to
> > ignore
> > it on R40.
> 
> I see. Yeah, I am not concerned about R40 either, but I want to avoid
> the A64 bloating up. 
> 
> > Or should we switch back to #if's and do not rely on compiler
> > behavior any longer?
> 
> I'd rather not do that. We are doing the right thing, and it works
> marvellously so far.
> 
> To be clear: it's not a show stopper, I was just curious why this
> happens.
> The code size is not really critical on the A64 at the moment, so I'd
> merge it as it, even if we don't find a solution. Maybe just leave a
> hint about it in the code should people need to come back to this.
> 
> I also asked some compiler buffs about it, but it's not exactly the
> simple reproducer case they like to deal with ;-)
> 
> Cheers,
> Andre
> 
> 
> 
> > 
> > > 
> > > Cheers,
> > > Andre
> > >   
> > > > Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
> > > > ---
> > > >  arch/arm/mach-sunxi/dram_sunxi_dw.c | 55  
> > > +++++++++++++++++++++++++----  
> > > >  1 file changed, 49 insertions(+), 6 deletions(-)
> > > > 
> > > > diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c  
> > > b/arch/arm/mach-sunxi/dram_sunxi_dw.c  
> > > > index 2b9d631d49..b86ae7cdf3 100644
> > > > --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c
> > > > +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c
> > > > @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid,
> > > > struct  
> > > dram_para *para)  
> > > >         }
> > > >  
> > > >         if (socid == SOCID_R40) {
> > > > -               if (para->dual_rank)
> > > > -                       panic("Dual rank memory not
> > > > supported\n");
> > > > -
> > > >                 /* Mux pin to A15 address line for single rank
> > > > memory. */
> > > > -               setbits_le32(&mctl_com->cr_r1,
> > > > MCTL_CR_R1_MUX_A15);
> > > > +               if (!para->dual_rank)
> > > > +                       setbits_le32(&mctl_com->cr_r1,
> > > > MCTL_CR_R1_MUX_A15);
> > > >         }
> > > >  }
> > > >  
> > > > @@ -702,8 +700,55 @@ static unsigned long
> > > > mctl_calc_rank_size(struct  
> > > rank_para *rank)  
> > > >         return (1UL << (rank->row_bits + rank->bank_bits)) *  
> > > rank->page_size;  
> > > >  }
> > > >  
> > > > +/*
> > > > + * Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now
> > > > (which  
> > > leads  
> > > > + * to failure), it's needed to detect the rank count of R40
> > > > in  
> > > another way.  
> > > > + *
> > > > + * The code here is modelled after time_out_detect() in BSP,
> > > > which  
> > > tries to  
> > > > + * access the memory and check for error code.
> > > > + *
> > > > + * TODO: auto detect half DQ width here
> > > > + */
> > > > +static void mctl_r40_detect_rank_count(struct dram_para *para)
> > > > +{
> > > > +       ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
> > > > +                          mctl_calc_rank_size(&para-
> > > > >ranks[0]);
> > > > +       struct sunxi_mctl_ctl_reg * const mctl_ctl =
> > > > +                       (struct sunxi_mctl_ctl_reg
> > > > *)SUNXI_DRAM_CTL0_BASE;
> > > > +
> > > > +       /* Enable read time out */
> > > > +       setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> > > > +
> > > > +       (void) readl((void *) rank1_base);
> > > > +       udelay(10);
> > > > +
> > > > +       if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
> > > > +               clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1
> > > > << 24);
> > > > +               para->dual_rank = 0;
> > > > +       }
> > > > +
> > > > +       /* Reset PHY FIFO to clear it */
> > > > +       clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> > > > +       udelay(100);
> > > > +       setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> > > > +
> > > > +       /* Clear error status */
> > > > +       setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
> > > > +
> > > > +       /* Clear time out flag */
> > > > +       clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
> > > > +
> > > > +       /* Disable read time out */
> > > > +       clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> > > > +}
> > > > +
> > > >  static void mctl_auto_detect_dram_size(uint16_t socid, struct 
> > > dram_para *para)  
> > > >  {
> > > > +       if (socid == SOCID_R40) {
> > > > +               mctl_r40_detect_rank_count(para);
> > > > +               mctl_set_cr(socid, para);
> > > > +       }
> > > > +

Trying to move this routine to sunxi_dram_init() fixes the problem.

And I think this is also a valid code sequence because originally the
mctl_auto_detect_dram_size only detects page/column (and bank when DDR2
support added).

BTW the mctl_r40_detect_rank_count() call looks innocent. The victim is
the mctl_set_cr() call. Maybe the code get complex enough (too many
function calls)?

> > > >         mctl_auto_detect_dram_size_rank(socid, para,  
> > > (ulong)CONFIG_SYS_SDRAM_BASE, &para->ranks[0]);  
> > > >  
> > > >         if ((socid == SOCID_A64 || socid == SOCID_R40) && para-
> > > > >dual_rank)  
> > > {  
> > > > @@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void)
> > > >         uint16_t socid = SOCID_H3;
> > > >  #elif defined(CONFIG_MACH_SUN8I_R40)
> > > >         uint16_t socid = SOCID_R40;
> > > > -       /* Currently we cannot support R40 with dual rank
> > > > memory */
> > > > -       para.dual_rank = 0;
> > > >  #elif defined(CONFIG_MACH_SUN8I_V3S)
> > > >         uint16_t socid = SOCID_V3S;
> > > >  #elif defined(CONFIG_MACH_SUN50I)  
>
Icenowy Zheng March 2, 2021, 6:08 p.m. UTC | #6
在 2021-03-02星期二的 15:19 +0000,Andre Przywara写道:
> On Tue, 02 Mar 2021 21:50:49 +0800
> Icenowy Zheng <icenowy@aosc.io> wrote:
> 
> Hi Icenowy,
> 
> > 于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara <
> > andre.przywara@arm.com> 写到:
> > > On Fri, 26 Feb 2021 00:13:25 +0800
> > > Icenowy Zheng <icenowy@aosc.io> wrote:
> > >  
> > > > Previously we do not have proper dual rank memory detection on
> > > > R40
> > > > (because we omitted PIR_QSGATE, which does not work on R40 with
> > > > our
> > > > configuration), and dual rank memory is just simply disabled as
> > > > early
> > > > R40 boards available (Banana Pi M2 Ultra and Berry) have single
> > > > rank
> > > > memory.
> > > > 
> > > > As a board with dual rank memory (Forlinx OKA40i-C) is now
> > > > known to  
> > > us,  
> > > > we need to have a way to do memory rank detection to support
> > > > that  
> > > board.  
> > > > 
> > > > Add some routine to detect memory rank by trying to access the
> > > > memory
> > > > in rank 1 and check for error status of the memory controller,
> > > > and  
> > > then  
> > > > enable dual rank memory on R40.
> > > > 
> > > > Similar routine can be used to detect half DQ width (which is
> > > > also
> > > > detected by PIR_QSGATE on other SoCs), but it's left
> > > > unimplemented
> > > > because there's no known R40 board with half DQ width now.  
> > > 
> > > So when looking at the SPL size I realised that this patch breaks
> > > the
> > > socid constant parameter propagation, especially for
> > > mctl_set_cr(). I
> > > don't see immediately why, this whole code here should be
> > > compiled out
> > > on any A64 builds, for instance. Instead GCC turns
> > > "<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes
> > > 0x1689
> > > around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding
> > > const
> > > everywhere, to amplify the constant nature of this value. Patch
> > > 1/2 added to the code size, but kept the const propagation (only
> > > one
> > > instance of 0x1689 in the disassembly). This patch here should be
> > > for
> > > free on A64, but adds 104 bytes.
> > > 
> > > Does anyone have a clue why this is happening? Is this a compiler
> > > issue?  
> > 
> > Maybe the issue is that I assume this codepath is only for R40 and
> > I didn't add socid to it?
> 
> But that's clearly visible by this function only being called inside
> "if
> (socid == SOCID_R40)". And that works very well for the H3 ZQ
> calibration quirk, for instance.
> 
> > Could you try to add a socid parameter to
> > mctl_r40_detect_rank_count()?
> 
> I just tried that, and apart from looking weird it didn't do
> anything.
> 
> To be clear: Your code is absolutely fine, exactly the way it should
> be
> written. It's just that the compiler is playing stupid suddenly. I
> was
> thinking that your dummy readl might have upset it, but even with
> that
> commented out it's still happening.
> 
> > Or maybe it makes mctl_calc_rank_size() not inlined?
> 
> So the assembly looks like everything apart from mctl_set_cr() and
> mctl_auto_detect_dram_size_rank() is inlined. Those are extra because
> they are called multiple times and we are using -Os.
>  
> > (Well I think the code looks noop on non-R40)
> 
> Exactly that was my thinking: when compiling for the A64, it should
> be
> totally compiled out, and the object file should be the same before
> and
> after.
>  
> > BTW, original rank/width detection part won't get called on R40.
> > But
> > R40 is not where we are tight on code size, so I didn't bother to
> > ignore
> > it on R40.
> 
> I see. Yeah, I am not concerned about R40 either, but I want to avoid
> the A64 bloating up. 
> 
> > Or should we switch back to #if's and do not rely on compiler
> > behavior any longer?
> 
> I'd rather not do that. We are doing the right thing, and it works
> marvellously so far.
> 
> To be clear: it's not a show stopper, I was just curious why this
> happens.
> The code size is not really critical on the A64 at the moment, so I'd
> merge it as it, even if we don't find a solution. Maybe just leave a
> hint about it in the code should people need to come back to this.
> 
> I also asked some compiler buffs about it, but it's not exactly the
> simple reproducer case they like to deal with ;-)

Well I tried to make a simpler reproducer:

```
#include <stdint.h>

__attribute__((noinline)) static void mctl_set_cr(uint16_t socid)
{
	if (socid == 0x1701)
		*((volatile uint16_t *)0x12345678) = 40;
	else if (socid == 0x1689)
		*((volatile uint16_t *)0x12345678) = 64;
}

static void test(uint16_t socid)
{
	if (socid == 0x1701)
		mctl_set_cr(socid);

	*((volatile uint16_t *)0x12345670) = 0xefe8;
}

int sunxi_dram_init()
{
	uint16_t socid = 0x1689;

	test(socid);
	mctl_set_cr(socid);

	return 0;
}
```

If compile this with `aarch64-linux-gnu-gcc test.c -c -Os`, then the
mctl_set_cr won't get constprop optimization.

In the same situation with real dram_sunxi_dw, if judge for 0x1701 is
moved from test() to sunxi_dram_init() then mctl_set_cr gets optimized
and codepath for setting 40 is omitted.

I think, the conditions that triggers the lose of the optimization is
below:
- Inside a function inlined to where the constant is defined (not the
function that defines the constant itself)
- Get called inside a if() block that the expression inside the
condition uses the constant

> 
> Cheers,
> Andre
> 
> 
> 
> > 
> > > 
> > > Cheers,
> > > Andre
> > >   
> > > > Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
> > > > ---
> > > >  arch/arm/mach-sunxi/dram_sunxi_dw.c | 55  
> > > +++++++++++++++++++++++++----  
> > > >  1 file changed, 49 insertions(+), 6 deletions(-)
> > > > 
> > > > diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c  
> > > b/arch/arm/mach-sunxi/dram_sunxi_dw.c  
> > > > index 2b9d631d49..b86ae7cdf3 100644
> > > > --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c
> > > > +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c
> > > > @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid,
> > > > struct  
> > > dram_para *para)  
> > > >         }
> > > >  
> > > >         if (socid == SOCID_R40) {
> > > > -               if (para->dual_rank)
> > > > -                       panic("Dual rank memory not
> > > > supported\n");
> > > > -
> > > >                 /* Mux pin to A15 address line for single rank
> > > > memory. */
> > > > -               setbits_le32(&mctl_com->cr_r1,
> > > > MCTL_CR_R1_MUX_A15);
> > > > +               if (!para->dual_rank)
> > > > +                       setbits_le32(&mctl_com->cr_r1,
> > > > MCTL_CR_R1_MUX_A15);
> > > >         }
> > > >  }
> > > >  
> > > > @@ -702,8 +700,55 @@ static unsigned long
> > > > mctl_calc_rank_size(struct  
> > > rank_para *rank)  
> > > >         return (1UL << (rank->row_bits + rank->bank_bits)) *  
> > > rank->page_size;  
> > > >  }
> > > >  
> > > > +/*
> > > > + * Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now
> > > > (which  
> > > leads  
> > > > + * to failure), it's needed to detect the rank count of R40
> > > > in  
> > > another way.  
> > > > + *
> > > > + * The code here is modelled after time_out_detect() in BSP,
> > > > which  
> > > tries to  
> > > > + * access the memory and check for error code.
> > > > + *
> > > > + * TODO: auto detect half DQ width here
> > > > + */
> > > > +static void mctl_r40_detect_rank_count(struct dram_para *para)
> > > > +{
> > > > +       ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
> > > > +                          mctl_calc_rank_size(&para-
> > > > >ranks[0]);
> > > > +       struct sunxi_mctl_ctl_reg * const mctl_ctl =
> > > > +                       (struct sunxi_mctl_ctl_reg
> > > > *)SUNXI_DRAM_CTL0_BASE;
> > > > +
> > > > +       /* Enable read time out */
> > > > +       setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> > > > +
> > > > +       (void) readl((void *) rank1_base);
> > > > +       udelay(10);
> > > > +
> > > > +       if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
> > > > +               clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1
> > > > << 24);
> > > > +               para->dual_rank = 0;
> > > > +       }
> > > > +
> > > > +       /* Reset PHY FIFO to clear it */
> > > > +       clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> > > > +       udelay(100);
> > > > +       setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> > > > +
> > > > +       /* Clear error status */
> > > > +       setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
> > > > +
> > > > +       /* Clear time out flag */
> > > > +       clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
> > > > +
> > > > +       /* Disable read time out */
> > > > +       clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> > > > +}
> > > > +
> > > >  static void mctl_auto_detect_dram_size(uint16_t socid, struct 
> > > dram_para *para)  
> > > >  {
> > > > +       if (socid == SOCID_R40) {
> > > > +               mctl_r40_detect_rank_count(para);
> > > > +               mctl_set_cr(socid, para);
> > > > +       }
> > > > +
> > > >         mctl_auto_detect_dram_size_rank(socid, para,  
> > > (ulong)CONFIG_SYS_SDRAM_BASE, &para->ranks[0]);  
> > > >  
> > > >         if ((socid == SOCID_A64 || socid == SOCID_R40) && para-
> > > > >dual_rank)  
> > > {  
> > > > @@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void)
> > > >         uint16_t socid = SOCID_H3;
> > > >  #elif defined(CONFIG_MACH_SUN8I_R40)
> > > >         uint16_t socid = SOCID_R40;
> > > > -       /* Currently we cannot support R40 with dual rank
> > > > memory */
> > > > -       para.dual_rank = 0;
> > > >  #elif defined(CONFIG_MACH_SUN8I_V3S)
> > > >         uint16_t socid = SOCID_V3S;
> > > >  #elif defined(CONFIG_MACH_SUN50I)  
>
Andre Przywara March 18, 2021, 1:42 a.m. UTC | #7
On Wed, 03 Mar 2021 01:49:53 +0800
Icenowy Zheng <icenowy@aosc.io> wrote:

Hi Icenowy,

many thanks for your research on this. I asked some local compiler
buffs, see below ...


> 在 2021-03-02星期二的 15:19 +0000,Andre Przywara写道:
> > On Tue, 02 Mar 2021 21:50:49 +0800
> > Icenowy Zheng <icenowy@aosc.io> wrote:
> > 
> > Hi Icenowy,
> >   
> > > 于 2021年3月2日 GMT+08:00 下午9:40:44, Andre Przywara <  
> > > andre.przywara@arm.com> 写到:
> > > > On Fri, 26 Feb 2021 00:13:25 +0800
> > > > Icenowy Zheng <icenowy@aosc.io> wrote:
> > > >    
> > > > > Previously we do not have proper dual rank memory detection on
> > > > > R40
> > > > > (because we omitted PIR_QSGATE, which does not work on R40 with
> > > > > our
> > > > > configuration), and dual rank memory is just simply disabled as
> > > > > early
> > > > > R40 boards available (Banana Pi M2 Ultra and Berry) have single
> > > > > rank
> > > > > memory.
> > > > > 
> > > > > As a board with dual rank memory (Forlinx OKA40i-C) is now
> > > > > known to    
> > > > us,    
> > > > > we need to have a way to do memory rank detection to support
> > > > > that    
> > > > board.    
> > > > > 
> > > > > Add some routine to detect memory rank by trying to access the
> > > > > memory
> > > > > in rank 1 and check for error status of the memory controller,
> > > > > and    
> > > > then    
> > > > > enable dual rank memory on R40.
> > > > > 
> > > > > Similar routine can be used to detect half DQ width (which is
> > > > > also
> > > > > detected by PIR_QSGATE on other SoCs), but it's left
> > > > > unimplemented
> > > > > because there's no known R40 board with half DQ width now.    
> > > > 
> > > > So when looking at the SPL size I realised that this patch breaks
> > > > the
> > > > socid constant parameter propagation, especially for
> > > > mctl_set_cr(). I
> > > > don't see immediately why, this whole code here should be
> > > > compiled out
> > > > on any A64 builds, for instance. Instead GCC turns
> > > > "<mctl_set_cr.constprop.0>:" into "<mctl_set_cr>:", and passes
> > > > 0x1689
> > > > around everytime. I tried GCC 10.2.0 and 9.2.0, also tried adding
> > > > const
> > > > everywhere, to amplify the constant nature of this value. Patch
> > > > 1/2 added to the code size, but kept the const propagation (only
> > > > one
> > > > instance of 0x1689 in the disassembly). This patch here should be
> > > > for
> > > > free on A64, but adds 104 bytes.
> > > > 
> > > > Does anyone have a clue why this is happening? Is this a compiler
> > > > issue?    
> > > 
> > > Maybe the issue is that I assume this codepath is only for R40 and
> > > I didn't add socid to it?  
> > 
> > But that's clearly visible by this function only being called inside
> > "if
> > (socid == SOCID_R40)". And that works very well for the H3 ZQ
> > calibration quirk, for instance.
> >   
> > > Could you try to add a socid parameter to
> > > mctl_r40_detect_rank_count()?  
> > 
> > I just tried that, and apart from looking weird it didn't do
> > anything.
> > 
> > To be clear: Your code is absolutely fine, exactly the way it should
> > be
> > written. It's just that the compiler is playing stupid suddenly. I
> > was
> > thinking that your dummy readl might have upset it, but even with
> > that
> > commented out it's still happening.
> >   
> > > Or maybe it makes mctl_calc_rank_size() not inlined?  
> > 
> > So the assembly looks like everything apart from mctl_set_cr() and
> > mctl_auto_detect_dram_size_rank() is inlined. Those are extra because
> > they are called multiple times and we are using -Os.
> >    
> > > (Well I think the code looks noop on non-R40)  
> > 
> > Exactly that was my thinking: when compiling for the A64, it should
> > be
> > totally compiled out, and the object file should be the same before
> > and
> > after.
> >    
> > > BTW, original rank/width detection part won't get called on R40.
> > > But
> > > R40 is not where we are tight on code size, so I didn't bother to
> > > ignore
> > > it on R40.  
> > 
> > I see. Yeah, I am not concerned about R40 either, but I want to avoid
> > the A64 bloating up. 
> >   
> > > Or should we switch back to #if's and do not rely on compiler
> > > behavior any longer?  
> > 
> > I'd rather not do that. We are doing the right thing, and it works
> > marvellously so far.
> > 
> > To be clear: it's not a show stopper, I was just curious why this
> > happens.
> > The code size is not really critical on the A64 at the moment, so I'd
> > merge it as it, even if we don't find a solution. Maybe just leave a
> > hint about it in the code should people need to come back to this.
> > 
> > I also asked some compiler buffs about it, but it's not exactly the
> > simple reproducer case they like to deal with ;-)
> > 
> > Cheers,
> > Andre
> > 
> > 
> >   
> > >   
> > > > 
> > > > Cheers,
> > > > Andre
> > > >     
> > > > > Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
> > > > > ---
> > > > >  arch/arm/mach-sunxi/dram_sunxi_dw.c | 55    
> > > > +++++++++++++++++++++++++----    
> > > > >  1 file changed, 49 insertions(+), 6 deletions(-)
> > > > > 
> > > > > diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c    
> > > > b/arch/arm/mach-sunxi/dram_sunxi_dw.c    
> > > > > index 2b9d631d49..b86ae7cdf3 100644
> > > > > --- a/arch/arm/mach-sunxi/dram_sunxi_dw.c
> > > > > +++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c
> > > > > @@ -414,11 +414,9 @@ static void mctl_set_cr(uint16_t socid,
> > > > > struct    
> > > > dram_para *para)    
> > > > >         }
> > > > >  
> > > > >         if (socid == SOCID_R40) {
> > > > > -               if (para->dual_rank)
> > > > > -                       panic("Dual rank memory not
> > > > > supported\n");
> > > > > -
> > > > >                 /* Mux pin to A15 address line for single rank
> > > > > memory. */
> > > > > -               setbits_le32(&mctl_com->cr_r1,
> > > > > MCTL_CR_R1_MUX_A15);
> > > > > +               if (!para->dual_rank)
> > > > > +                       setbits_le32(&mctl_com->cr_r1,
> > > > > MCTL_CR_R1_MUX_A15);
> > > > >         }
> > > > >  }
> > > > >  
> > > > > @@ -702,8 +700,55 @@ static unsigned long
> > > > > mctl_calc_rank_size(struct    
> > > > rank_para *rank)    
> > > > >         return (1UL << (rank->row_bits + rank->bank_bits)) *    
> > > > rank->page_size;    
> > > > >  }
> > > > >  
> > > > > +/*
> > > > > + * Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now
> > > > > (which    
> > > > leads    
> > > > > + * to failure), it's needed to detect the rank count of R40
> > > > > in    
> > > > another way.    
> > > > > + *
> > > > > + * The code here is modelled after time_out_detect() in BSP,
> > > > > which    
> > > > tries to    
> > > > > + * access the memory and check for error code.
> > > > > + *
> > > > > + * TODO: auto detect half DQ width here
> > > > > + */
> > > > > +static void mctl_r40_detect_rank_count(struct dram_para *para)
> > > > > +{
> > > > > +       ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
> > > > > +                          mctl_calc_rank_size(&para-  
> > > > > >ranks[0]);  
> > > > > +       struct sunxi_mctl_ctl_reg * const mctl_ctl =
> > > > > +                       (struct sunxi_mctl_ctl_reg
> > > > > *)SUNXI_DRAM_CTL0_BASE;
> > > > > +
> > > > > +       /* Enable read time out */
> > > > > +       setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> > > > > +
> > > > > +       (void) readl((void *) rank1_base);
> > > > > +       udelay(10);
> > > > > +
> > > > > +       if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
> > > > > +               clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1
> > > > > << 24);
> > > > > +               para->dual_rank = 0;
> > > > > +       }
> > > > > +
> > > > > +       /* Reset PHY FIFO to clear it */
> > > > > +       clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> > > > > +       udelay(100);
> > > > > +       setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
> > > > > +
> > > > > +       /* Clear error status */
> > > > > +       setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
> > > > > +
> > > > > +       /* Clear time out flag */
> > > > > +       clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
> > > > > +
> > > > > +       /* Disable read time out */
> > > > > +       clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
> > > > > +}
> > > > > +
> > > > >  static void mctl_auto_detect_dram_size(uint16_t socid, struct   
> > > > dram_para *para)    
> > > > >  {
> > > > > +       if (socid == SOCID_R40) {
> > > > > +               mctl_r40_detect_rank_count(para);
> > > > > +               mctl_set_cr(socid, para);
> > > > > +       }
> > > > > +  
> 
> Trying to move this routine to sunxi_dram_init() fixes the problem.
>
> And I think this is also a valid code sequence because originally the
> mctl_auto_detect_dram_size only detects page/column (and bank when DDR2
> support added).

Yes, I can confirm this. This is identical in functionality and fixes
the code size growth problem.

> 
> BTW the mctl_r40_detect_rank_count() call looks innocent. The victim is
> the mctl_set_cr() call. Maybe the code get complex enough (too many
> function calls)?

Yes, so this is what a colleague (Tamar Christina) said:
" ... so the reason is costing. The patch adds the following bit
+	if (socid == SOCID_R40) {
+		mctl_r40_detect_rank_count(para);
+		mctl_set_cr(socid, para);
+	}
which now makes a direct call to mctl_set_cr with a new constant value
for cloning SOCID_R40. Which makes IPA have to evaluate the costs of
specializing both SOCID_A64 and SOCID_R40 at the same time. It
determines that the benefit of doing so is quite small. It has to keep
2 copies of the function around and it can't replace all usages of
mctl_set_cr with the specialized version. The function is too small and
the gain from the specialization seemed very little.You can lower the
threshold with --param=ipa-cp-eval-threshold=300 if you really want it."

So this seems to be heuristics driven, and in a first steps it tries
to create multiple copies of the function, one of which would later be
garbage collected. However those multiple functions don't seem
worthwhile to the compiler in the first place now, so it discards that
idea, *before* it sees that all of them wouldn't be needed later.

I think your fix is good, I will take this version.

Cheers,
Andre

P.S. Thanks for taking the time to find the reproducer!


> 
> > > > >         mctl_auto_detect_dram_size_rank(socid, para,    
> > > > (ulong)CONFIG_SYS_SDRAM_BASE, &para->ranks[0]);    
> > > > >  
> > > > >         if ((socid == SOCID_A64 || socid == SOCID_R40) && para-  
> > > > > >dual_rank)    
> > > > {    
> > > > > @@ -854,8 +899,6 @@ unsigned long sunxi_dram_init(void)
> > > > >         uint16_t socid = SOCID_H3;
> > > > >  #elif defined(CONFIG_MACH_SUN8I_R40)
> > > > >         uint16_t socid = SOCID_R40;
> > > > > -       /* Currently we cannot support R40 with dual rank
> > > > > memory */
> > > > > -       para.dual_rank = 0;
> > > > >  #elif defined(CONFIG_MACH_SUN8I_V3S)
> > > > >         uint16_t socid = SOCID_V3S;
> > > > >  #elif defined(CONFIG_MACH_SUN50I)    
> >   
>
diff mbox series

Patch

diff --git a/arch/arm/mach-sunxi/dram_sunxi_dw.c b/arch/arm/mach-sunxi/dram_sunxi_dw.c
index 2b9d631d49..b86ae7cdf3 100644
--- a/arch/arm/mach-sunxi/dram_sunxi_dw.c
+++ b/arch/arm/mach-sunxi/dram_sunxi_dw.c
@@ -414,11 +414,9 @@  static void mctl_set_cr(uint16_t socid, struct dram_para *para)
 	}
 
 	if (socid == SOCID_R40) {
-		if (para->dual_rank)
-			panic("Dual rank memory not supported\n");
-
 		/* Mux pin to A15 address line for single rank memory. */
-		setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
+		if (!para->dual_rank)
+			setbits_le32(&mctl_com->cr_r1, MCTL_CR_R1_MUX_A15);
 	}
 }
 
@@ -702,8 +700,55 @@  static unsigned long mctl_calc_rank_size(struct rank_para *rank)
 	return (1UL << (rank->row_bits + rank->bank_bits)) * rank->page_size;
 }
 
+/*
+ * Because we cannot do mctl_phy_init(PIR_QSGATE) on R40 now (which leads
+ * to failure), it's needed to detect the rank count of R40 in another way.
+ *
+ * The code here is modelled after time_out_detect() in BSP, which tries to
+ * access the memory and check for error code.
+ *
+ * TODO: auto detect half DQ width here
+ */
+static void mctl_r40_detect_rank_count(struct dram_para *para)
+{
+	ulong rank1_base = (ulong) CONFIG_SYS_SDRAM_BASE +
+			   mctl_calc_rank_size(&para->ranks[0]);
+	struct sunxi_mctl_ctl_reg * const mctl_ctl =
+			(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
+
+	/* Enable read time out */
+	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
+
+	(void) readl((void *) rank1_base);
+	udelay(10);
+
+	if (readl(&mctl_ctl->pgsr[0]) & (0x1 << 13)) {
+		clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
+		para->dual_rank = 0;
+	}
+
+	/* Reset PHY FIFO to clear it */
+	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
+	udelay(100);
+	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 26);
+
+	/* Clear error status */
+	setbits_le32(&mctl_ctl->pgcr[0], 0x1 << 24);
+
+	/* Clear time out flag */
+	clrbits_le32(&mctl_ctl->pgsr[0], 0x1 << 13);
+
+	/* Disable read time out */
+	clrbits_le32(&mctl_ctl->pgcr[0], 0x1 << 25);
+}
+
 static void mctl_auto_detect_dram_size(uint16_t socid, struct dram_para *para)
 {
+	if (socid == SOCID_R40) {
+		mctl_r40_detect_rank_count(para);
+		mctl_set_cr(socid, para);
+	}
+
 	mctl_auto_detect_dram_size_rank(socid, para, (ulong)CONFIG_SYS_SDRAM_BASE, &para->ranks[0]);
 
 	if ((socid == SOCID_A64 || socid == SOCID_R40) && para->dual_rank) {
@@ -854,8 +899,6 @@  unsigned long sunxi_dram_init(void)
 	uint16_t socid = SOCID_H3;
 #elif defined(CONFIG_MACH_SUN8I_R40)
 	uint16_t socid = SOCID_R40;
-	/* Currently we cannot support R40 with dual rank memory */
-	para.dual_rank = 0;
 #elif defined(CONFIG_MACH_SUN8I_V3S)
 	uint16_t socid = SOCID_V3S;
 #elif defined(CONFIG_MACH_SUN50I)