diff mbox series

[U-Boot] ARM: socfpga: Clear PL310 early in SPL

Message ID 20190219125240.2833-1-marex@denx.de
State Superseded, archived
Delegated to: Simon Goldschmidt
Headers show
Series [U-Boot] ARM: socfpga: Clear PL310 early in SPL | expand

Commit Message

Marek Vasut Feb. 19, 2019, 12:52 p.m. UTC
On SoCFPGA Gen5 systems, it can rarely happen that a reboot from Linux
will result in stale data in PL310 L2 cache controller. Even if the L2
cache controller is disabled via the CTRL register CTRL_EN bit, those
data can interfere with operation of devices using DMA, like e.g. the
DWMMC controller. This can in turn cause e.g. SPL to fail reading data
from SD/MMC.

The obvious solution here would be to fully reset the L2 cache controller
via the reset manager MPUMODRST L2 bit, however this causes bus hang even
if executed entirely from L1 I-cache to avoid generating any bus traffic
through the L2 cache controller.

This patch thus configures and enables the L2 cache controller very early
in the SPL boot process, clears the L2 cache and disables the L2 cache
controller again.

The reason for doing it in SPL is because we need to avoid accessing any
of the potentially stale data in the L2 cache, and we are certain any of
the stale data will be below the OCRAM address range. To further reduce
bus traffic during the L2 cache invalidation, we enable L1 I-cache and
run the invalidation code entirely out of the L1 I-cache.

Signed-off-by: Marek Vasut <marex@denx.de>
Cc: Dalon Westergreen <dwesterg@gmail.com>
Cc: Dinh Nguyen <dinguyen@kernel.org>
---
 arch/arm/mach-socfpga/spl_gen5.c | 55 ++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

Comments

Simon Goldschmidt Feb. 19, 2019, 1:28 p.m. UTC | #1
On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de> wrote:
>
> On SoCFPGA Gen5 systems, it can rarely happen that a reboot from Linux
> will result in stale data in PL310 L2 cache controller. Even if the L2
> cache controller is disabled via the CTRL register CTRL_EN bit, those
> data can interfere with operation of devices using DMA, like e.g. the
> DWMMC controller. This can in turn cause e.g. SPL to fail reading data
> from SD/MMC.
>
> The obvious solution here would be to fully reset the L2 cache controller
> via the reset manager MPUMODRST L2 bit, however this causes bus hang even
> if executed entirely from L1 I-cache to avoid generating any bus traffic
> through the L2 cache controller.
>
> This patch thus configures and enables the L2 cache controller very early
> in the SPL boot process, clears the L2 cache and disables the L2 cache
> controller again.
>
> The reason for doing it in SPL is because we need to avoid accessing any
> of the potentially stale data in the L2 cache, and we are certain any of
> the stale data will be below the OCRAM address range. To further reduce
> bus traffic during the L2 cache invalidation, we enable L1 I-cache and
> run the invalidation code entirely out of the L1 I-cache.
>
> Signed-off-by: Marek Vasut <marex@denx.de>
> Cc: Dalon Westergreen <dwesterg@gmail.com>
> Cc: Dinh Nguyen <dinguyen@kernel.org>

We've tested this and it seems to fix the issue, so:

Tested-by: Simon Goldschmidt <simon.k.r.goldschmidt@gmail.com>

However, I don't really get why clearing the L2 cache later in U-Boot
isn't good enough.
Also, wouldn't this affect other platforms as well?

Regards,
Simon

> ---
>  arch/arm/mach-socfpga/spl_gen5.c | 55 ++++++++++++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
>
> diff --git a/arch/arm/mach-socfpga/spl_gen5.c b/arch/arm/mach-socfpga/spl_gen5.c
> index ccdc661d05..12d011e681 100644
> --- a/arch/arm/mach-socfpga/spl_gen5.c
> +++ b/arch/arm/mach-socfpga/spl_gen5.c
> @@ -63,6 +63,60 @@ u32 spl_boot_mode(const u32 boot_device)
>  }
>  #endif
>
> +static void socfpga_pl310_clear(void)
> +{
> +       u32 mask = 0xff, ena = 0;
> +
> +       icache_enable();
> +
> +       /* Disable the L2 cache */
> +       clrbits_le32(&pl310->pl310_ctrl, L2X0_CTRL_EN);
> +
> +       writel(0x111, &pl310->pl310_tag_latency_ctrl);
> +       writel(0x121, &pl310->pl310_data_latency_ctrl);
> +
> +       /* enable BRESP, instruction and data prefetch, full line of zeroes */
> +       setbits_le32(&pl310->pl310_aux_ctrl,
> +                    L310_AUX_CTRL_DATA_PREFETCH_MASK |
> +                    L310_AUX_CTRL_INST_PREFETCH_MASK |
> +                    L310_SHARED_ATT_OVERRIDE_ENABLE);
> +
> +       /* Enable the L2 cache */
> +       ena = readl(&pl310->pl310_ctrl);
> +       ena |= L2X0_CTRL_EN;
> +
> +       /*
> +        * Invalidate the PL310 L2 cache. Keep the invalidation code
> +        * entirely in L1 I-cache to avoid any bus traffic through
> +        * the L2.
> +        */
> +       asm volatile(
> +               ".align 5                       \n"
> +               "       b       3f              \n"
> +               "1:     str     %1,     [%4]    \n"
> +               "       dsb                     \n"
> +               "       isb                     \n"
> +               "       str     %0,     [%2]    \n"
> +               "       dsb                     \n"
> +               "       isb                     \n"
> +               "2:     ldr     %0,     [%2]    \n"
> +               "       cmp     %0,     #0      \n"
> +               "       bne     2b              \n"
> +               "       str     %0,     [%3]    \n"
> +               "       dsb                     \n"
> +               "       isb                     \n"
> +               "       b       4f              \n"
> +               "3:     b       1b              \n"
> +               "4:     nop                     \n"
> +       : "+r"(mask), "+r"(ena)
> +       : "r"(&pl310->pl310_inv_way),
> +         "r"(&pl310->pl310_cache_sync), "r"(&pl310->pl310_ctrl)
> +       : "memory", "cc");
> +
> +       /* Disable the L2 cache */
> +       clrbits_le32(&pl310->pl310_ctrl, L2X0_CTRL_EN);
> +}
> +
>  void board_init_f(ulong dummy)
>  {
>         const struct cm_config *cm_default_cfg = cm_get_default_config();
> @@ -85,6 +139,7 @@ void board_init_f(ulong dummy)
>         memset(__bss_start, 0, __bss_end - __bss_start);
>
>         socfpga_sdram_remap_zero();
> +       socfpga_pl310_clear();
>
>         debug("Freezing all I/O banks\n");
>         /* freeze all IO banks */
> --
> 2.19.2
Marek Vasut Feb. 19, 2019, 1:42 p.m. UTC | #2
On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
> On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de> wrote:
>>
>> On SoCFPGA Gen5 systems, it can rarely happen that a reboot from Linux
>> will result in stale data in PL310 L2 cache controller. Even if the L2
>> cache controller is disabled via the CTRL register CTRL_EN bit, those
>> data can interfere with operation of devices using DMA, like e.g. the
>> DWMMC controller. This can in turn cause e.g. SPL to fail reading data
>> from SD/MMC.
>>
>> The obvious solution here would be to fully reset the L2 cache controller
>> via the reset manager MPUMODRST L2 bit, however this causes bus hang even
>> if executed entirely from L1 I-cache to avoid generating any bus traffic
>> through the L2 cache controller.
>>
>> This patch thus configures and enables the L2 cache controller very early
>> in the SPL boot process, clears the L2 cache and disables the L2 cache
>> controller again.
>>
>> The reason for doing it in SPL is because we need to avoid accessing any
>> of the potentially stale data in the L2 cache, and we are certain any of
>> the stale data will be below the OCRAM address range. To further reduce
>> bus traffic during the L2 cache invalidation, we enable L1 I-cache and
>> run the invalidation code entirely out of the L1 I-cache.
>>
>> Signed-off-by: Marek Vasut <marex@denx.de>
>> Cc: Dalon Westergreen <dwesterg@gmail.com>
>> Cc: Dinh Nguyen <dinguyen@kernel.org>
> 
> We've tested this and it seems to fix the issue, so:
> 
> Tested-by: Simon Goldschmidt <simon.k.r.goldschmidt@gmail.com>
> 
> However, I don't really get why clearing the L2 cache later in U-Boot
> isn't good enough.

Because when U-Boot is running, it is already running from RAM, which is
on different PL310 master than OCRAM, and you're likely to hit the
corrupted cache lines on the DRAM master which is primed by prior Linux
operation. Such cacheline can be hit between the code enabling the PL310
and the code invalidating it, which is a C code, using stack and calling
functions, thus accessing memory which would likely reside in different
PL310 cachelines. If one of those cachelines contains invalid/corrupted
data, they can be provided to the CPU before the cacheline is invalidated.

To further reduce the likelihood of hitting any such cache line, the
code which enables the PL310 and invalidates it runs from primed L1
icache lines, thus generating no bus traffic at all.

> Also, wouldn't this affect other platforms as well?

I had the same concern, however I suspect it might have to do with the
reset implementation on SoCFPGA, which doesn't clear the L2CC, while
reset implementations on other SoCs likely do clear the L2CC. I am
however inquiring with Altera/Intel about this.

[...]
Simon Goldschmidt Feb. 19, 2019, 1:58 p.m. UTC | #3
Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de> geschrieben:

> On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
> > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de> wrote:
> >>
> >> On SoCFPGA Gen5 systems, it can rarely happen that a reboot from Linux
> >> will result in stale data in PL310 L2 cache controller. Even if the L2
> >> cache controller is disabled via the CTRL register CTRL_EN bit, those
> >> data can interfere with operation of devices using DMA, like e.g. the
> >> DWMMC controller. This can in turn cause e.g. SPL to fail reading data
> >> from SD/MMC.
> >>
> >> The obvious solution here would be to fully reset the L2 cache
> controller
> >> via the reset manager MPUMODRST L2 bit, however this causes bus hang
> even
> >> if executed entirely from L1 I-cache to avoid generating any bus traffic
> >> through the L2 cache controller.
> >>
> >> This patch thus configures and enables the L2 cache controller very
> early
> >> in the SPL boot process, clears the L2 cache and disables the L2 cache
> >> controller again.
> >>
> >> The reason for doing it in SPL is because we need to avoid accessing any
> >> of the potentially stale data in the L2 cache, and we are certain any of
> >> the stale data will be below the OCRAM address range. To further reduce
> >> bus traffic during the L2 cache invalidation, we enable L1 I-cache and
> >> run the invalidation code entirely out of the L1 I-cache.
> >>
> >> Signed-off-by: Marek Vasut <marex@denx.de>
> >> Cc: Dalon Westergreen <dwesterg@gmail.com>
> >> Cc: Dinh Nguyen <dinguyen@kernel.org>
> >
> > We've tested this and it seems to fix the issue, so:
> >
> > Tested-by: Simon Goldschmidt <simon.k.r.goldschmidt@gmail.com>
> >
> > However, I don't really get why clearing the L2 cache later in U-Boot
> > isn't good enough.
>
> Because when U-Boot is running, it is already running from RAM, which is
> on different PL310 master than OCRAM, and you're likely to hit the
> corrupted cache lines on the DRAM master which is primed by prior Linux
> operation. Such cacheline can be hit between the code enabling the PL310
> and the code invalidating it, which is a C code, using stack and calling
> functions, thus accessing memory which would likely reside in different
> PL310 cachelines. If one of those cachelines contains invalid/corrupted
> data, they can be provided to the CPU before the cacheline is invalidated.
>
> To further reduce the likelihood of hitting any such cache line, the
> code which enables the PL310 and invalidates it runs from primed L1
> icache lines, thus generating no bus traffic at all.
>

Ah, right. I somehow didn't realize that invalidating is done after
enabling:-)

> Also, wouldn't this affect other platforms as well?
>
> I had the same concern, however I suspect it might have to do with the
> reset implementation on SoCFPGA, which doesn't clear the L2CC, while
> reset implementations on other SoCs likely do clear the L2CC. I am
> however inquiring with Altera/Intel about this.
>

Ok, thanks for the explanation!

Regards,
Simon
Marek Vasut Feb. 19, 2019, 2 p.m. UTC | #4
On 2/19/19 2:58 PM, Simon Goldschmidt wrote:
> 
> 
> Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de
> <mailto:marex@denx.de>> geschrieben:
> 
>     On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
>     > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de
>     <mailto:marex@denx.de>> wrote:
>     >>
>     >> On SoCFPGA Gen5 systems, it can rarely happen that a reboot from
>     Linux
>     >> will result in stale data in PL310 L2 cache controller. Even if
>     the L2
>     >> cache controller is disabled via the CTRL register CTRL_EN bit, those
>     >> data can interfere with operation of devices using DMA, like e.g. the
>     >> DWMMC controller. This can in turn cause e.g. SPL to fail reading
>     data
>     >> from SD/MMC.
>     >>
>     >> The obvious solution here would be to fully reset the L2 cache
>     controller
>     >> via the reset manager MPUMODRST L2 bit, however this causes bus
>     hang even
>     >> if executed entirely from L1 I-cache to avoid generating any bus
>     traffic
>     >> through the L2 cache controller.
>     >>
>     >> This patch thus configures and enables the L2 cache controller
>     very early
>     >> in the SPL boot process, clears the L2 cache and disables the L2
>     cache
>     >> controller again.
>     >>
>     >> The reason for doing it in SPL is because we need to avoid
>     accessing any
>     >> of the potentially stale data in the L2 cache, and we are certain
>     any of
>     >> the stale data will be below the OCRAM address range. To further
>     reduce
>     >> bus traffic during the L2 cache invalidation, we enable L1
>     I-cache and
>     >> run the invalidation code entirely out of the L1 I-cache.
>     >>
>     >> Signed-off-by: Marek Vasut <marex@denx.de <mailto:marex@denx.de>>
>     >> Cc: Dalon Westergreen <dwesterg@gmail.com
>     <mailto:dwesterg@gmail.com>>
>     >> Cc: Dinh Nguyen <dinguyen@kernel.org <mailto:dinguyen@kernel.org>>
>     >
>     > We've tested this and it seems to fix the issue, so:
>     >
>     > Tested-by: Simon Goldschmidt <simon.k.r.goldschmidt@gmail.com
>     <mailto:simon.k.r.goldschmidt@gmail.com>>
>     >
>     > However, I don't really get why clearing the L2 cache later in U-Boot
>     > isn't good enough.
> 
>     Because when U-Boot is running, it is already running from RAM, which is
>     on different PL310 master than OCRAM, and you're likely to hit the
>     corrupted cache lines on the DRAM master which is primed by prior Linux
>     operation. Such cacheline can be hit between the code enabling the PL310
>     and the code invalidating it, which is a C code, using stack and calling
>     functions, thus accessing memory which would likely reside in different
>     PL310 cachelines. If one of those cachelines contains invalid/corrupted
>     data, they can be provided to the CPU before the cacheline is
>     invalidated.
> 
>     To further reduce the likelihood of hitting any such cache line, the
>     code which enables the PL310 and invalidates it runs from primed L1
>     icache lines, thus generating no bus traffic at all.
> 
> 
> Ah, right. I somehow didn't realize that invalidating is done after
> enabling:-)

Right. If it could be done before (like e.g. by whacking the L2CC
reset), that'd be fantastic.

>     > Also, wouldn't this affect other platforms as well?
> 
>     I had the same concern, however I suspect it might have to do with the
>     reset implementation on SoCFPGA, which doesn't clear the L2CC, while
>     reset implementations on other SoCs likely do clear the L2CC. I am
>     however inquiring with Altera/Intel about this.
> 
> 
> Ok, thanks for the explanation!

Sure
Simon Goldschmidt Feb. 19, 2019, 3:55 p.m. UTC | #5
Am 19.02.2019 um 15:00 schrieb Marek Vasut:
> On 2/19/19 2:58 PM, Simon Goldschmidt wrote:
>>
>>
>> Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de
>> <mailto:marex@denx.de>> geschrieben:
>>
>>      On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
>>      > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de
>>      <mailto:marex@denx.de>> wrote:
>>      >>
>>      >> On SoCFPGA Gen5 systems, it can rarely happen that a reboot from
>>      Linux
>>      >> will result in stale data in PL310 L2 cache controller. Even if
>>      the L2
>>      >> cache controller is disabled via the CTRL register CTRL_EN bit, those
>>      >> data can interfere with operation of devices using DMA, like e.g. the
>>      >> DWMMC controller. This can in turn cause e.g. SPL to fail reading
>>      data
>>      >> from SD/MMC.
>>      >>
>>      >> The obvious solution here would be to fully reset the L2 cache
>>      controller
>>      >> via the reset manager MPUMODRST L2 bit, however this causes bus
>>      hang even
>>      >> if executed entirely from L1 I-cache to avoid generating any bus
>>      traffic
>>      >> through the L2 cache controller.
>>      >>
>>      >> This patch thus configures and enables the L2 cache controller
>>      very early
>>      >> in the SPL boot process, clears the L2 cache and disables the L2
>>      cache
>>      >> controller again.
>>      >>
>>      >> The reason for doing it in SPL is because we need to avoid
>>      accessing any
>>      >> of the potentially stale data in the L2 cache, and we are certain
>>      any of
>>      >> the stale data will be below the OCRAM address range. To further
>>      reduce
>>      >> bus traffic during the L2 cache invalidation, we enable L1
>>      I-cache and
>>      >> run the invalidation code entirely out of the L1 I-cache.
>>      >>
>>      >> Signed-off-by: Marek Vasut <marex@denx.de <mailto:marex@denx.de>>
>>      >> Cc: Dalon Westergreen <dwesterg@gmail.com
>>      <mailto:dwesterg@gmail.com>>
>>      >> Cc: Dinh Nguyen <dinguyen@kernel.org <mailto:dinguyen@kernel.org>>
>>      >
>>      > We've tested this and it seems to fix the issue, so:
>>      >
>>      > Tested-by: Simon Goldschmidt <simon.k.r.goldschmidt@gmail.com
>>      <mailto:simon.k.r.goldschmidt@gmail.com>>

Seems like I was a little fast there: the patch does not apply cleanly 
to master. It did apply to our tree, obviously.

It's missing an include to <asm/pl310.h> and 'pl310' is undefined. 
Probably a result of rebasing it...

>>      >
>>      > However, I don't really get why clearing the L2 cache later in U-Boot
>>      > isn't good enough.
>>
>>      Because when U-Boot is running, it is already running from RAM, which is
>>      on different PL310 master than OCRAM, and you're likely to hit the
>>      corrupted cache lines on the DRAM master which is primed by prior Linux
>>      operation. Such cacheline can be hit between the code enabling the PL310
>>      and the code invalidating it, which is a C code, using stack and calling
>>      functions, thus accessing memory which would likely reside in different
>>      PL310 cachelines. If one of those cachelines contains invalid/corrupted
>>      data, they can be provided to the CPU before the cacheline is
>>      invalidated.
>>
>>      To further reduce the likelihood of hitting any such cache line, the
>>      code which enables the PL310 and invalidates it runs from primed L1
>>      icache lines, thus generating no bus traffic at all.
>>
>>
>> Ah, right. I somehow didn't realize that invalidating is done after
>> enabling:-)
> 
> Right. If it could be done before (like e.g. by whacking the L2CC
> reset), that'd be fantastic.

I haven't yet found a way to cleanly reproduce this on my socrates, so 
unfortunately, I currently cannot test any suggestions, right now :-(

Wouldn't it be better to place this function in some central position 
near the cache initialization code instead of running it from SPL? Or is 
it required to run it in SPL?

Also, it seems this patch enables the ICACHE in SPL, which wasn't 
enabled before. Not a bad thing, but might this have side effects?

Regards,
Simon

> 
>>      > Also, wouldn't this affect other platforms as well?
>>
>>      I had the same concern, however I suspect it might have to do with the
>>      reset implementation on SoCFPGA, which doesn't clear the L2CC, while
>>      reset implementations on other SoCs likely do clear the L2CC. I am
>>      however inquiring with Altera/Intel about this.
>>
>>
>> Ok, thanks for the explanation!
> 
> Sure
>
Marek Vasut Feb. 19, 2019, 4 p.m. UTC | #6
On 2/19/19 4:55 PM, Simon Goldschmidt wrote:
> Am 19.02.2019 um 15:00 schrieb Marek Vasut:
>> On 2/19/19 2:58 PM, Simon Goldschmidt wrote:
>>>
>>>
>>> Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de
>>> <mailto:marex@denx.de>> geschrieben:
>>>
>>>      On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
>>>      > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de
>>>      <mailto:marex@denx.de>> wrote:
>>>      >>
>>>      >> On SoCFPGA Gen5 systems, it can rarely happen that a reboot from
>>>      Linux
>>>      >> will result in stale data in PL310 L2 cache controller. Even if
>>>      the L2
>>>      >> cache controller is disabled via the CTRL register CTRL_EN
>>> bit, those
>>>      >> data can interfere with operation of devices using DMA, like
>>> e.g. the
>>>      >> DWMMC controller. This can in turn cause e.g. SPL to fail
>>> reading
>>>      data
>>>      >> from SD/MMC.
>>>      >>
>>>      >> The obvious solution here would be to fully reset the L2 cache
>>>      controller
>>>      >> via the reset manager MPUMODRST L2 bit, however this causes bus
>>>      hang even
>>>      >> if executed entirely from L1 I-cache to avoid generating any bus
>>>      traffic
>>>      >> through the L2 cache controller.
>>>      >>
>>>      >> This patch thus configures and enables the L2 cache controller
>>>      very early
>>>      >> in the SPL boot process, clears the L2 cache and disables the L2
>>>      cache
>>>      >> controller again.
>>>      >>
>>>      >> The reason for doing it in SPL is because we need to avoid
>>>      accessing any
>>>      >> of the potentially stale data in the L2 cache, and we are
>>> certain
>>>      any of
>>>      >> the stale data will be below the OCRAM address range. To further
>>>      reduce
>>>      >> bus traffic during the L2 cache invalidation, we enable L1
>>>      I-cache and
>>>      >> run the invalidation code entirely out of the L1 I-cache.
>>>      >>
>>>      >> Signed-off-by: Marek Vasut <marex@denx.de
>>> <mailto:marex@denx.de>>
>>>      >> Cc: Dalon Westergreen <dwesterg@gmail.com
>>>      <mailto:dwesterg@gmail.com>>
>>>      >> Cc: Dinh Nguyen <dinguyen@kernel.org
>>> <mailto:dinguyen@kernel.org>>
>>>      >
>>>      > We've tested this and it seems to fix the issue, so:
>>>      >
>>>      > Tested-by: Simon Goldschmidt <simon.k.r.goldschmidt@gmail.com
>>>      <mailto:simon.k.r.goldschmidt@gmail.com>>
> 
> Seems like I was a little fast there: the patch does not apply cleanly
> to master. It did apply to our tree, obviously.
> 
> It's missing an include to <asm/pl310.h> and 'pl310' is undefined.
> Probably a result of rebasing it...
> 
>>>      >
>>>      > However, I don't really get why clearing the L2 cache later in
>>> U-Boot
>>>      > isn't good enough.
>>>
>>>      Because when U-Boot is running, it is already running from RAM,
>>> which is
>>>      on different PL310 master than OCRAM, and you're likely to hit the
>>>      corrupted cache lines on the DRAM master which is primed by
>>> prior Linux
>>>      operation. Such cacheline can be hit between the code enabling
>>> the PL310
>>>      and the code invalidating it, which is a C code, using stack and
>>> calling
>>>      functions, thus accessing memory which would likely reside in
>>> different
>>>      PL310 cachelines. If one of those cachelines contains
>>> invalid/corrupted
>>>      data, they can be provided to the CPU before the cacheline is
>>>      invalidated.
>>>
>>>      To further reduce the likelihood of hitting any such cache line,
>>> the
>>>      code which enables the PL310 and invalidates it runs from primed L1
>>>      icache lines, thus generating no bus traffic at all.
>>>
>>>
>>> Ah, right. I somehow didn't realize that invalidating is done after
>>> enabling:-)
>>
>> Right. If it could be done before (like e.g. by whacking the L2CC
>> reset), that'd be fantastic.
> 
> I haven't yet found a way to cleanly reproduce this on my socrates, so
> unfortunately, I currently cannot test any suggestions, right now :-(

Did you ever have this hang happen on SoCrates ?

> Wouldn't it be better to place this function in some central position
> near the cache initialization code instead of running it from SPL? Or is
> it required to run it in SPL?

It's required in SPL as soon as possible, to avoid accessing the DRAM,
see above. The current location is close to the rest of PL310 init in SPL.

> Also, it seems this patch enables the ICACHE in SPL, which wasn't
> enabled before. Not a bad thing, but might this have side effects?
Actually, I-Cache is enabled in SPL:

http://git.denx.de/?p=u-boot.git;a=blob;f=arch/arm/cpu/armv7/start.S;h=0cb6dd39ccfc0584f4e6f01523fef9405c314763;hb=HEAD#l158

[...]
Simon Goldschmidt Feb. 19, 2019, 4:31 p.m. UTC | #7
Am 19.02.2019 um 17:00 schrieb Marek Vasut:
> On 2/19/19 4:55 PM, Simon Goldschmidt wrote:
>> Am 19.02.2019 um 15:00 schrieb Marek Vasut:
>>> On 2/19/19 2:58 PM, Simon Goldschmidt wrote:
>>>>
>>>>
>>>> Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de
>>>> <mailto:marex@denx.de>> geschrieben:
>>>>
>>>>       On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
>>>>       > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de
>>>>       <mailto:marex@denx.de>> wrote:
>>>>       >>
>>>>       >> On SoCFPGA Gen5 systems, it can rarely happen that a reboot from
>>>>       Linux
>>>>       >> will result in stale data in PL310 L2 cache controller. Even if
>>>>       the L2
>>>>       >> cache controller is disabled via the CTRL register CTRL_EN
>>>> bit, those
>>>>       >> data can interfere with operation of devices using DMA, like
>>>> e.g. the
>>>>       >> DWMMC controller. This can in turn cause e.g. SPL to fail
>>>> reading
>>>>       data
>>>>       >> from SD/MMC.
>>>>       >>
>>>>       >> The obvious solution here would be to fully reset the L2 cache
>>>>       controller
>>>>       >> via the reset manager MPUMODRST L2 bit, however this causes bus
>>>>       hang even
>>>>       >> if executed entirely from L1 I-cache to avoid generating any bus
>>>>       traffic
>>>>       >> through the L2 cache controller.
>>>>       >>
>>>>       >> This patch thus configures and enables the L2 cache controller
>>>>       very early
>>>>       >> in the SPL boot process, clears the L2 cache and disables the L2
>>>>       cache
>>>>       >> controller again.
>>>>       >>
>>>>       >> The reason for doing it in SPL is because we need to avoid
>>>>       accessing any
>>>>       >> of the potentially stale data in the L2 cache, and we are
>>>> certain
>>>>       any of
>>>>       >> the stale data will be below the OCRAM address range. To further
>>>>       reduce
>>>>       >> bus traffic during the L2 cache invalidation, we enable L1
>>>>       I-cache and
>>>>       >> run the invalidation code entirely out of the L1 I-cache.
>>>>       >>
>>>>       >> Signed-off-by: Marek Vasut <marex@denx.de
>>>> <mailto:marex@denx.de>>
>>>>       >> Cc: Dalon Westergreen <dwesterg@gmail.com
>>>>       <mailto:dwesterg@gmail.com>>
>>>>       >> Cc: Dinh Nguyen <dinguyen@kernel.org
>>>> <mailto:dinguyen@kernel.org>>
>>>>       >
>>>>       > We've tested this and it seems to fix the issue, so:
>>>>       >
>>>>       > Tested-by: Simon Goldschmidt <simon.k.r.goldschmidt@gmail.com
>>>>       <mailto:simon.k.r.goldschmidt@gmail.com>>
>>
>> Seems like I was a little fast there: the patch does not apply cleanly
>> to master. It did apply to our tree, obviously.
>>
>> It's missing an include to <asm/pl310.h> and 'pl310' is undefined.
>> Probably a result of rebasing it...
>>
>>>>       >
>>>>       > However, I don't really get why clearing the L2 cache later in
>>>> U-Boot
>>>>       > isn't good enough.
>>>>
>>>>       Because when U-Boot is running, it is already running from RAM,
>>>> which is
>>>>       on different PL310 master than OCRAM, and you're likely to hit the
>>>>       corrupted cache lines on the DRAM master which is primed by
>>>> prior Linux
>>>>       operation. Such cacheline can be hit between the code enabling
>>>> the PL310
>>>>       and the code invalidating it, which is a C code, using stack and
>>>> calling
>>>>       functions, thus accessing memory which would likely reside in
>>>> different
>>>>       PL310 cachelines. If one of those cachelines contains
>>>> invalid/corrupted
>>>>       data, they can be provided to the CPU before the cacheline is
>>>>       invalidated.
>>>>
>>>>       To further reduce the likelihood of hitting any such cache line,
>>>> the
>>>>       code which enables the PL310 and invalidates it runs from primed L1
>>>>       icache lines, thus generating no bus traffic at all.
>>>>
>>>>
>>>> Ah, right. I somehow didn't realize that invalidating is done after
>>>> enabling:-)
>>>
>>> Right. If it could be done before (like e.g. by whacking the L2CC
>>> reset), that'd be fantastic.
>>
>> I haven't yet found a way to cleanly reproduce this on my socrates, so
>> unfortunately, I currently cannot test any suggestions, right now :-(
> 
> Did you ever have this hang happen on SoCrates ?

I think so, yes.

What I did was running some lmb tests loading from mmc, removing the 
sdcard to write an updated version of U-Boot, replacing it and then 
typing 'reset'. It would then sometimes hang in cache initialization - I 
added debug prints throughout cache initialization and was debug prints 
from one function and then nothing more after a simple function call...

That would very much fit to your explanation. However, I cannot 
reproduce this currently :-(

> 
>> Wouldn't it be better to place this function in some central position
>> near the cache initialization code instead of running it from SPL? Or is
>> it required to run it in SPL?
> 
> It's required in SPL as soon as possible, to avoid accessing the DRAM,
> see above. The current location is close to the rest of PL310 init in SPL.

OK, I just did not understand why this is necessary before accessing DRAM.

> 
>> Also, it seems this patch enables the ICACHE in SPL, which wasn't
>> enabled before. Not a bad thing, but might this have side effects?
> Actually, I-Cache is enabled in SPL:
> 
> http://git.denx.de/?p=u-boot.git;a=blob;f=arch/arm/cpu/armv7/start.S;h=0cb6dd39ccfc0584f4e6f01523fef9405c314763;hb=HEAD#l158

Oh, right. Unless disabled via config option.

Regards,
Simon
Marek Vasut Feb. 19, 2019, 4:39 p.m. UTC | #8
On 2/19/19 5:31 PM, Simon Goldschmidt wrote:
> Am 19.02.2019 um 17:00 schrieb Marek Vasut:
>> On 2/19/19 4:55 PM, Simon Goldschmidt wrote:
>>> Am 19.02.2019 um 15:00 schrieb Marek Vasut:
>>>> On 2/19/19 2:58 PM, Simon Goldschmidt wrote:
>>>>>
>>>>>
>>>>> Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de
>>>>> <mailto:marex@denx.de>> geschrieben:
>>>>>
>>>>>       On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
>>>>>       > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de
>>>>>       <mailto:marex@denx.de>> wrote:
>>>>>       >>
>>>>>       >> On SoCFPGA Gen5 systems, it can rarely happen that a
>>>>> reboot from
>>>>>       Linux
>>>>>       >> will result in stale data in PL310 L2 cache controller.
>>>>> Even if
>>>>>       the L2
>>>>>       >> cache controller is disabled via the CTRL register CTRL_EN
>>>>> bit, those
>>>>>       >> data can interfere with operation of devices using DMA, like
>>>>> e.g. the
>>>>>       >> DWMMC controller. This can in turn cause e.g. SPL to fail
>>>>> reading
>>>>>       data
>>>>>       >> from SD/MMC.
>>>>>       >>
>>>>>       >> The obvious solution here would be to fully reset the L2
>>>>> cache
>>>>>       controller
>>>>>       >> via the reset manager MPUMODRST L2 bit, however this
>>>>> causes bus
>>>>>       hang even
>>>>>       >> if executed entirely from L1 I-cache to avoid generating
>>>>> any bus
>>>>>       traffic
>>>>>       >> through the L2 cache controller.
>>>>>       >>
>>>>>       >> This patch thus configures and enables the L2 cache
>>>>> controller
>>>>>       very early
>>>>>       >> in the SPL boot process, clears the L2 cache and disables
>>>>> the L2
>>>>>       cache
>>>>>       >> controller again.
>>>>>       >>
>>>>>       >> The reason for doing it in SPL is because we need to avoid
>>>>>       accessing any
>>>>>       >> of the potentially stale data in the L2 cache, and we are
>>>>> certain
>>>>>       any of
>>>>>       >> the stale data will be below the OCRAM address range. To
>>>>> further
>>>>>       reduce
>>>>>       >> bus traffic during the L2 cache invalidation, we enable L1
>>>>>       I-cache and
>>>>>       >> run the invalidation code entirely out of the L1 I-cache.
>>>>>       >>
>>>>>       >> Signed-off-by: Marek Vasut <marex@denx.de
>>>>> <mailto:marex@denx.de>>
>>>>>       >> Cc: Dalon Westergreen <dwesterg@gmail.com
>>>>>       <mailto:dwesterg@gmail.com>>
>>>>>       >> Cc: Dinh Nguyen <dinguyen@kernel.org
>>>>> <mailto:dinguyen@kernel.org>>
>>>>>       >
>>>>>       > We've tested this and it seems to fix the issue, so:
>>>>>       >
>>>>>       > Tested-by: Simon Goldschmidt <simon.k.r.goldschmidt@gmail.com
>>>>>       <mailto:simon.k.r.goldschmidt@gmail.com>>
>>>
>>> Seems like I was a little fast there: the patch does not apply cleanly
>>> to master. It did apply to our tree, obviously.
>>>
>>> It's missing an include to <asm/pl310.h> and 'pl310' is undefined.
>>> Probably a result of rebasing it...
>>>
>>>>>       >
>>>>>       > However, I don't really get why clearing the L2 cache later in
>>>>> U-Boot
>>>>>       > isn't good enough.
>>>>>
>>>>>       Because when U-Boot is running, it is already running from RAM,
>>>>> which is
>>>>>       on different PL310 master than OCRAM, and you're likely to
>>>>> hit the
>>>>>       corrupted cache lines on the DRAM master which is primed by
>>>>> prior Linux
>>>>>       operation. Such cacheline can be hit between the code enabling
>>>>> the PL310
>>>>>       and the code invalidating it, which is a C code, using stack and
>>>>> calling
>>>>>       functions, thus accessing memory which would likely reside in
>>>>> different
>>>>>       PL310 cachelines. If one of those cachelines contains
>>>>> invalid/corrupted
>>>>>       data, they can be provided to the CPU before the cacheline is
>>>>>       invalidated.
>>>>>
>>>>>       To further reduce the likelihood of hitting any such cache line,
>>>>> the
>>>>>       code which enables the PL310 and invalidates it runs from
>>>>> primed L1
>>>>>       icache lines, thus generating no bus traffic at all.
>>>>>
>>>>>
>>>>> Ah, right. I somehow didn't realize that invalidating is done after
>>>>> enabling:-)
>>>>
>>>> Right. If it could be done before (like e.g. by whacking the L2CC
>>>> reset), that'd be fantastic.
>>>
>>> I haven't yet found a way to cleanly reproduce this on my socrates, so
>>> unfortunately, I currently cannot test any suggestions, right now :-(
>>
>> Did you ever have this hang happen on SoCrates ?
> 
> I think so, yes.
> 
> What I did was running some lmb tests loading from mmc, removing the
> sdcard to write an updated version of U-Boot, replacing it and then
> typing 'reset'. It would then sometimes hang in cache initialization - I
> added debug prints throughout cache initialization and was debug prints
> from one function and then nothing more after a simple function call...
> 
> That would very much fit to your explanation. However, I cannot
> reproduce this currently :-(

So basically a board which uses DMA (like the DWMMC DMA) is likely to be
affected. Surely, the DMA does trigger PL310 operations.

>>> Wouldn't it be better to place this function in some central position
>>> near the cache initialization code instead of running it from SPL? Or is
>>> it required to run it in SPL?
>>
>> It's required in SPL as soon as possible, to avoid accessing the DRAM,
>> see above. The current location is close to the rest of PL310 init in
>> SPL.
> 
> OK, I just did not understand why this is necessary before accessing DRAM.

:)

>>> Also, it seems this patch enables the ICACHE in SPL, which wasn't
>>> enabled before. Not a bad thing, but might this have side effects?
>> Actually, I-Cache is enabled in SPL:
>>
>> http://git.denx.de/?p=u-boot.git;a=blob;f=arch/arm/cpu/armv7/start.S;h=0cb6dd39ccfc0584f4e6f01523fef9405c314763;hb=HEAD#l158
>>
> 
> Oh, right. Unless disabled via config option.

Yes
Simon Goldschmidt Feb. 19, 2019, 7:29 p.m. UTC | #9
Am 19.02.2019 um 17:39 schrieb Marek Vasut:
> On 2/19/19 5:31 PM, Simon Goldschmidt wrote:
>> Am 19.02.2019 um 17:00 schrieb Marek Vasut:
>>> On 2/19/19 4:55 PM, Simon Goldschmidt wrote:
>>>> Am 19.02.2019 um 15:00 schrieb Marek Vasut:
>>>>> On 2/19/19 2:58 PM, Simon Goldschmidt wrote:
>>>>>>
>>>>>>
>>>>>> Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de
>>>>>> <mailto:marex@denx.de>> geschrieben:
>>>>>>
>>>>>>        On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
>>>>>>        > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de
>>>>>>        <mailto:marex@denx.de>> wrote:
>>>>>>        >>
>>>>>>        >> On SoCFPGA Gen5 systems, it can rarely happen that a
>>>>>> reboot from
>>>>>>        Linux
>>>>>>        >> will result in stale data in PL310 L2 cache controller.
>>>>>> Even if
>>>>>>        the L2
>>>>>>        >> cache controller is disabled via the CTRL register CTRL_EN
>>>>>> bit, those
>>>>>>        >> data can interfere with operation of devices using DMA, like
>>>>>> e.g. the
>>>>>>        >> DWMMC controller. This can in turn cause e.g. SPL to fail
>>>>>> reading
>>>>>>        data
>>>>>>        >> from SD/MMC.
>>>>>>        >>
>>>>>>        >> The obvious solution here would be to fully reset the L2
>>>>>> cache
>>>>>>        controller
>>>>>>        >> via the reset manager MPUMODRST L2 bit, however this
>>>>>> causes bus
>>>>>>        hang even
>>>>>>        >> if executed entirely from L1 I-cache to avoid generating
>>>>>> any bus
>>>>>>        traffic
>>>>>>        >> through the L2 cache controller.
>>>>>>        >>
>>>>>>        >> This patch thus configures and enables the L2 cache
>>>>>> controller
>>>>>>        very early
>>>>>>        >> in the SPL boot process, clears the L2 cache and disables
>>>>>> the L2
>>>>>>        cache
>>>>>>        >> controller again.
>>>>>>        >>
>>>>>>        >> The reason for doing it in SPL is because we need to avoid
>>>>>>        accessing any
>>>>>>        >> of the potentially stale data in the L2 cache, and we are
>>>>>> certain
>>>>>>        any of
>>>>>>        >> the stale data will be below the OCRAM address range. To
>>>>>> further
>>>>>>        reduce
>>>>>>        >> bus traffic during the L2 cache invalidation, we enable L1
>>>>>>        I-cache and
>>>>>>        >> run the invalidation code entirely out of the L1 I-cache.
>>>>>>        >>
>>>>>>        >> Signed-off-by: Marek Vasut <marex@denx.de
>>>>>> <mailto:marex@denx.de>>
>>>>>>        >> Cc: Dalon Westergreen <dwesterg@gmail.com
>>>>>>        <mailto:dwesterg@gmail.com>>
>>>>>>        >> Cc: Dinh Nguyen <dinguyen@kernel.org
>>>>>> <mailto:dinguyen@kernel.org>>
>>>>>>        >
>>>>>>        > We've tested this and it seems to fix the issue, so:
>>>>>>        >
>>>>>>        > Tested-by: Simon Goldschmidt <simon.k.r.goldschmidt@gmail.com
>>>>>>        <mailto:simon.k.r.goldschmidt@gmail.com>>
>>>>
>>>> Seems like I was a little fast there: the patch does not apply cleanly
>>>> to master. It did apply to our tree, obviously.
>>>>
>>>> It's missing an include to <asm/pl310.h> and 'pl310' is undefined.
>>>> Probably a result of rebasing it...
>>>>
>>>>>>        >
>>>>>>        > However, I don't really get why clearing the L2 cache later in
>>>>>> U-Boot
>>>>>>        > isn't good enough.
>>>>>>
>>>>>>        Because when U-Boot is running, it is already running from RAM,
>>>>>> which is
>>>>>>        on different PL310 master than OCRAM, and you're likely to
>>>>>> hit the
>>>>>>        corrupted cache lines on the DRAM master which is primed by
>>>>>> prior Linux
>>>>>>        operation. Such cacheline can be hit between the code enabling
>>>>>> the PL310
>>>>>>        and the code invalidating it, which is a C code, using stack and
>>>>>> calling
>>>>>>        functions, thus accessing memory which would likely reside in
>>>>>> different
>>>>>>        PL310 cachelines. If one of those cachelines contains
>>>>>> invalid/corrupted
>>>>>>        data, they can be provided to the CPU before the cacheline is
>>>>>>        invalidated.
>>>>>>
>>>>>>        To further reduce the likelihood of hitting any such cache line,
>>>>>> the
>>>>>>        code which enables the PL310 and invalidates it runs from
>>>>>> primed L1
>>>>>>        icache lines, thus generating no bus traffic at all.
>>>>>>
>>>>>>
>>>>>> Ah, right. I somehow didn't realize that invalidating is done after
>>>>>> enabling:-)
>>>>>
>>>>> Right. If it could be done before (like e.g. by whacking the L2CC
>>>>> reset), that'd be fantastic.
>>>>
>>>> I haven't yet found a way to cleanly reproduce this on my socrates, so
>>>> unfortunately, I currently cannot test any suggestions, right now :-(
>>>
>>> Did you ever have this hang happen on SoCrates ?
>>
>> I think so, yes.
>>
>> What I did was running some lmb tests loading from mmc, removing the
>> sdcard to write an updated version of U-Boot, replacing it and then
>> typing 'reset'. It would then sometimes hang in cache initialization - I
>> added debug prints throughout cache initialization and was debug prints
>> from one function and then nothing more after a simple function call...
>>
>> That would very much fit to your explanation. However, I cannot
>> reproduce this currently :-(
> 
> So basically a board which uses DMA (like the DWMMC DMA) is likely to be
> affected. Surely, the DMA does trigger PL310 operations.
> 
>>>> Wouldn't it be better to place this function in some central position
>>>> near the cache initialization code instead of running it from SPL? Or is
>>>> it required to run it in SPL?
>>>
>>> It's required in SPL as soon as possible, to avoid accessing the DRAM,
>>> see above. The current location is close to the rest of PL310 init in
>>> SPL.
>>
>> OK, I just did not understand why this is necessary before accessing DRAM.
> 
> :)

Sorry, I phrased that wrong. I *do* (still) not understand why this is 
necessary before accessing DRAM. Sorry if I'm annoying you with that, 
but I'd like to understand.

Does the DWMMC DMA really trigger PL310 operations? I thought DMA would 
only affect cache if it goes through ACP/SCU?

Regards,
Simon

> 
>>>> Also, it seems this patch enables the ICACHE in SPL, which wasn't
>>>> enabled before. Not a bad thing, but might this have side effects?
>>> Actually, I-Cache is enabled in SPL:
>>>
>>> http://git.denx.de/?p=u-boot.git;a=blob;f=arch/arm/cpu/armv7/start.S;h=0cb6dd39ccfc0584f4e6f01523fef9405c314763;hb=HEAD#l158
>>>
>>
>> Oh, right. Unless disabled via config option.
> 
> Yes
>
Marek Vasut Feb. 19, 2019, 8:07 p.m. UTC | #10
On 2/19/19 8:29 PM, Simon Goldschmidt wrote:
> Am 19.02.2019 um 17:39 schrieb Marek Vasut:
>> On 2/19/19 5:31 PM, Simon Goldschmidt wrote:
>>> Am 19.02.2019 um 17:00 schrieb Marek Vasut:
>>>> On 2/19/19 4:55 PM, Simon Goldschmidt wrote:
>>>>> Am 19.02.2019 um 15:00 schrieb Marek Vasut:
>>>>>> On 2/19/19 2:58 PM, Simon Goldschmidt wrote:
>>>>>>>
>>>>>>>
>>>>>>> Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de
>>>>>>> <mailto:marex@denx.de>> geschrieben:
>>>>>>>
>>>>>>>        On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
>>>>>>>        > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de
>>>>>>>        <mailto:marex@denx.de>> wrote:
>>>>>>>        >>
>>>>>>>        >> On SoCFPGA Gen5 systems, it can rarely happen that a
>>>>>>> reboot from
>>>>>>>        Linux
>>>>>>>        >> will result in stale data in PL310 L2 cache controller.
>>>>>>> Even if
>>>>>>>        the L2
>>>>>>>        >> cache controller is disabled via the CTRL register CTRL_EN
>>>>>>> bit, those
>>>>>>>        >> data can interfere with operation of devices using DMA,
>>>>>>> like
>>>>>>> e.g. the
>>>>>>>        >> DWMMC controller. This can in turn cause e.g. SPL to fail
>>>>>>> reading
>>>>>>>        data
>>>>>>>        >> from SD/MMC.
>>>>>>>        >>
>>>>>>>        >> The obvious solution here would be to fully reset the L2
>>>>>>> cache
>>>>>>>        controller
>>>>>>>        >> via the reset manager MPUMODRST L2 bit, however this
>>>>>>> causes bus
>>>>>>>        hang even
>>>>>>>        >> if executed entirely from L1 I-cache to avoid generating
>>>>>>> any bus
>>>>>>>        traffic
>>>>>>>        >> through the L2 cache controller.
>>>>>>>        >>
>>>>>>>        >> This patch thus configures and enables the L2 cache
>>>>>>> controller
>>>>>>>        very early
>>>>>>>        >> in the SPL boot process, clears the L2 cache and disables
>>>>>>> the L2
>>>>>>>        cache
>>>>>>>        >> controller again.
>>>>>>>        >>
>>>>>>>        >> The reason for doing it in SPL is because we need to avoid
>>>>>>>        accessing any
>>>>>>>        >> of the potentially stale data in the L2 cache, and we are
>>>>>>> certain
>>>>>>>        any of
>>>>>>>        >> the stale data will be below the OCRAM address range. To
>>>>>>> further
>>>>>>>        reduce
>>>>>>>        >> bus traffic during the L2 cache invalidation, we enable L1
>>>>>>>        I-cache and
>>>>>>>        >> run the invalidation code entirely out of the L1 I-cache.
>>>>>>>        >>
>>>>>>>        >> Signed-off-by: Marek Vasut <marex@denx.de
>>>>>>> <mailto:marex@denx.de>>
>>>>>>>        >> Cc: Dalon Westergreen <dwesterg@gmail.com
>>>>>>>        <mailto:dwesterg@gmail.com>>
>>>>>>>        >> Cc: Dinh Nguyen <dinguyen@kernel.org
>>>>>>> <mailto:dinguyen@kernel.org>>
>>>>>>>        >
>>>>>>>        > We've tested this and it seems to fix the issue, so:
>>>>>>>        >
>>>>>>>        > Tested-by: Simon Goldschmidt
>>>>>>> <simon.k.r.goldschmidt@gmail.com
>>>>>>>        <mailto:simon.k.r.goldschmidt@gmail.com>>
>>>>>
>>>>> Seems like I was a little fast there: the patch does not apply cleanly
>>>>> to master. It did apply to our tree, obviously.
>>>>>
>>>>> It's missing an include to <asm/pl310.h> and 'pl310' is undefined.
>>>>> Probably a result of rebasing it...
>>>>>
>>>>>>>        >
>>>>>>>        > However, I don't really get why clearing the L2 cache
>>>>>>> later in
>>>>>>> U-Boot
>>>>>>>        > isn't good enough.
>>>>>>>
>>>>>>>        Because when U-Boot is running, it is already running from
>>>>>>> RAM,
>>>>>>> which is
>>>>>>>        on different PL310 master than OCRAM, and you're likely to
>>>>>>> hit the
>>>>>>>        corrupted cache lines on the DRAM master which is primed by
>>>>>>> prior Linux
>>>>>>>        operation. Such cacheline can be hit between the code
>>>>>>> enabling
>>>>>>> the PL310
>>>>>>>        and the code invalidating it, which is a C code, using
>>>>>>> stack and
>>>>>>> calling
>>>>>>>        functions, thus accessing memory which would likely reside in
>>>>>>> different
>>>>>>>        PL310 cachelines. If one of those cachelines contains
>>>>>>> invalid/corrupted
>>>>>>>        data, they can be provided to the CPU before the cacheline is
>>>>>>>        invalidated.
>>>>>>>
>>>>>>>        To further reduce the likelihood of hitting any such cache
>>>>>>> line,
>>>>>>> the
>>>>>>>        code which enables the PL310 and invalidates it runs from
>>>>>>> primed L1
>>>>>>>        icache lines, thus generating no bus traffic at all.
>>>>>>>
>>>>>>>
>>>>>>> Ah, right. I somehow didn't realize that invalidating is done after
>>>>>>> enabling:-)
>>>>>>
>>>>>> Right. If it could be done before (like e.g. by whacking the L2CC
>>>>>> reset), that'd be fantastic.
>>>>>
>>>>> I haven't yet found a way to cleanly reproduce this on my socrates, so
>>>>> unfortunately, I currently cannot test any suggestions, right now :-(
>>>>
>>>> Did you ever have this hang happen on SoCrates ?
>>>
>>> I think so, yes.
>>>
>>> What I did was running some lmb tests loading from mmc, removing the
>>> sdcard to write an updated version of U-Boot, replacing it and then
>>> typing 'reset'. It would then sometimes hang in cache initialization - I
>>> added debug prints throughout cache initialization and was debug prints
>>> from one function and then nothing more after a simple function call...
>>>
>>> That would very much fit to your explanation. However, I cannot
>>> reproduce this currently :-(
>>
>> So basically a board which uses DMA (like the DWMMC DMA) is likely to be
>> affected. Surely, the DMA does trigger PL310 operations.
>>
>>>>> Wouldn't it be better to place this function in some central position
>>>>> near the cache initialization code instead of running it from SPL?
>>>>> Or is
>>>>> it required to run it in SPL?
>>>>
>>>> It's required in SPL as soon as possible, to avoid accessing the DRAM,
>>>> see above. The current location is close to the rest of PL310 init in
>>>> SPL.
>>>
>>> OK, I just did not understand why this is necessary before accessing
>>> DRAM.
>>
>> :)
> 
> Sorry, I phrased that wrong. I *do* (still) not understand why this is
> necessary before accessing DRAM. Sorry if I'm annoying you with that,
> but I'd like to understand.

Because we know that once the L2 cache is enabled and until it's content
is invalidated, it may return stale entries (*). The stale entries in
the L2 cache can only come from a cacheable area. Linux treats the DRAM
as cacheable area, the registers and OCRAM are not cacheable. Therefore,
if we do not fetch anything from DRAM until the L2 cache is invalidated,
we will not get any stale entries in return, and thus we are safe.

The best way to assure that we don't access the DRAM is to execute from
OCRAM, so to do the invalidation in SPL. The best way to assure we won't
generate any traffic on the bus is to execute the code from primed L1
I-cache lines.

* U-Boot is complex code and spans multiple L2 cache lines. The code
which enables L2 cache can very well reside in a different L2 cache line
than the code which invalidates the L2 cache. Furthermore, the code also
spans multiple L1 I-cache lines.

The code which enables the L2 cache is likely in active L1 I-cache line,
however the code which invalidates the L2 cache is not in L1 I-cache
yet, so it needs to be fetched. But (!) now the L2 cache is enabled, so
the system first checks whether the code might be in the L2 cache ; if
there is a stale entry in just the right L2 cache line, the L2 cache
will refill the L1 I-cache with undefined stale content, which the CPU
will happily execute and crash.

Does it make sense now ? Just keep asking until it does make sense ...

> Does the DWMMC DMA really trigger PL310 operations?

No

> I thought DMA would only affect cache if it goes through ACP/SCU?

Correct

I suspect shuffling data between the SD card (DWMMC puts data directly
in DRAM using DMA) and CPU (which looks at the DRAM through multiple
levels of caches, so it has to invalidate/refill/flush the caches to
access the data delivered by DWMMC DMA in DRAM) just primes the L2 cache
with the right kind of entries to trigger the issue.

[...]
Simon Goldschmidt Feb. 19, 2019, 8:56 p.m. UTC | #11
Am 19.02.2019 um 21:07 schrieb Marek Vasut:
> On 2/19/19 8:29 PM, Simon Goldschmidt wrote:
>> Am 19.02.2019 um 17:39 schrieb Marek Vasut:
>>> On 2/19/19 5:31 PM, Simon Goldschmidt wrote:
>>>> Am 19.02.2019 um 17:00 schrieb Marek Vasut:
>>>>> On 2/19/19 4:55 PM, Simon Goldschmidt wrote:
>>>>>> Am 19.02.2019 um 15:00 schrieb Marek Vasut:
>>>>>>> On 2/19/19 2:58 PM, Simon Goldschmidt wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de
>>>>>>>> <mailto:marex@denx.de>> geschrieben:
>>>>>>>>
>>>>>>>>         On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
>>>>>>>>         > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut <marex@denx.de
>>>>>>>>         <mailto:marex@denx.de>> wrote:
>>>>>>>>         >>
>>>>>>>>         >> On SoCFPGA Gen5 systems, it can rarely happen that a
>>>>>>>> reboot from
>>>>>>>>         Linux
>>>>>>>>         >> will result in stale data in PL310 L2 cache controller.
>>>>>>>> Even if
>>>>>>>>         the L2
>>>>>>>>         >> cache controller is disabled via the CTRL register CTRL_EN
>>>>>>>> bit, those
>>>>>>>>         >> data can interfere with operation of devices using DMA,
>>>>>>>> like
>>>>>>>> e.g. the
>>>>>>>>         >> DWMMC controller. This can in turn cause e.g. SPL to fail
>>>>>>>> reading
>>>>>>>>         data
>>>>>>>>         >> from SD/MMC.
>>>>>>>>         >>
>>>>>>>>         >> The obvious solution here would be to fully reset the L2
>>>>>>>> cache
>>>>>>>>         controller
>>>>>>>>         >> via the reset manager MPUMODRST L2 bit, however this
>>>>>>>> causes bus
>>>>>>>>         hang even
>>>>>>>>         >> if executed entirely from L1 I-cache to avoid generating
>>>>>>>> any bus
>>>>>>>>         traffic
>>>>>>>>         >> through the L2 cache controller.
>>>>>>>>         >>
>>>>>>>>         >> This patch thus configures and enables the L2 cache
>>>>>>>> controller
>>>>>>>>         very early
>>>>>>>>         >> in the SPL boot process, clears the L2 cache and disables
>>>>>>>> the L2
>>>>>>>>         cache
>>>>>>>>         >> controller again.
>>>>>>>>         >>
>>>>>>>>         >> The reason for doing it in SPL is because we need to avoid
>>>>>>>>         accessing any
>>>>>>>>         >> of the potentially stale data in the L2 cache, and we are
>>>>>>>> certain
>>>>>>>>         any of
>>>>>>>>         >> the stale data will be below the OCRAM address range. To
>>>>>>>> further
>>>>>>>>         reduce
>>>>>>>>         >> bus traffic during the L2 cache invalidation, we enable L1
>>>>>>>>         I-cache and
>>>>>>>>         >> run the invalidation code entirely out of the L1 I-cache.
>>>>>>>>         >>
>>>>>>>>         >> Signed-off-by: Marek Vasut <marex@denx.de
>>>>>>>> <mailto:marex@denx.de>>
>>>>>>>>         >> Cc: Dalon Westergreen <dwesterg@gmail.com
>>>>>>>>         <mailto:dwesterg@gmail.com>>
>>>>>>>>         >> Cc: Dinh Nguyen <dinguyen@kernel.org
>>>>>>>> <mailto:dinguyen@kernel.org>>
>>>>>>>>         >
>>>>>>>>         > We've tested this and it seems to fix the issue, so:
>>>>>>>>         >
>>>>>>>>         > Tested-by: Simon Goldschmidt
>>>>>>>> <simon.k.r.goldschmidt@gmail.com
>>>>>>>>         <mailto:simon.k.r.goldschmidt@gmail.com>>
>>>>>>
>>>>>> Seems like I was a little fast there: the patch does not apply cleanly
>>>>>> to master. It did apply to our tree, obviously.
>>>>>>
>>>>>> It's missing an include to <asm/pl310.h> and 'pl310' is undefined.
>>>>>> Probably a result of rebasing it...
>>>>>>
>>>>>>>>         >
>>>>>>>>         > However, I don't really get why clearing the L2 cache
>>>>>>>> later in
>>>>>>>> U-Boot
>>>>>>>>         > isn't good enough.
>>>>>>>>
>>>>>>>>         Because when U-Boot is running, it is already running from
>>>>>>>> RAM,
>>>>>>>> which is
>>>>>>>>         on different PL310 master than OCRAM, and you're likely to
>>>>>>>> hit the
>>>>>>>>         corrupted cache lines on the DRAM master which is primed by
>>>>>>>> prior Linux
>>>>>>>>         operation. Such cacheline can be hit between the code
>>>>>>>> enabling
>>>>>>>> the PL310
>>>>>>>>         and the code invalidating it, which is a C code, using
>>>>>>>> stack and
>>>>>>>> calling
>>>>>>>>         functions, thus accessing memory which would likely reside in
>>>>>>>> different
>>>>>>>>         PL310 cachelines. If one of those cachelines contains
>>>>>>>> invalid/corrupted
>>>>>>>>         data, they can be provided to the CPU before the cacheline is
>>>>>>>>         invalidated.
>>>>>>>>
>>>>>>>>         To further reduce the likelihood of hitting any such cache
>>>>>>>> line,
>>>>>>>> the
>>>>>>>>         code which enables the PL310 and invalidates it runs from
>>>>>>>> primed L1
>>>>>>>>         icache lines, thus generating no bus traffic at all.
>>>>>>>>
>>>>>>>>
>>>>>>>> Ah, right. I somehow didn't realize that invalidating is done after
>>>>>>>> enabling:-)
>>>>>>>
>>>>>>> Right. If it could be done before (like e.g. by whacking the L2CC
>>>>>>> reset), that'd be fantastic.
>>>>>>
>>>>>> I haven't yet found a way to cleanly reproduce this on my socrates, so
>>>>>> unfortunately, I currently cannot test any suggestions, right now :-(
>>>>>
>>>>> Did you ever have this hang happen on SoCrates ?
>>>>
>>>> I think so, yes.
>>>>
>>>> What I did was running some lmb tests loading from mmc, removing the
>>>> sdcard to write an updated version of U-Boot, replacing it and then
>>>> typing 'reset'. It would then sometimes hang in cache initialization - I
>>>> added debug prints throughout cache initialization and was debug prints
>>>> from one function and then nothing more after a simple function call...
>>>>
>>>> That would very much fit to your explanation. However, I cannot
>>>> reproduce this currently :-(
>>>
>>> So basically a board which uses DMA (like the DWMMC DMA) is likely to be
>>> affected. Surely, the DMA does trigger PL310 operations.
>>>
>>>>>> Wouldn't it be better to place this function in some central position
>>>>>> near the cache initialization code instead of running it from SPL?
>>>>>> Or is
>>>>>> it required to run it in SPL?
>>>>>
>>>>> It's required in SPL as soon as possible, to avoid accessing the DRAM,
>>>>> see above. The current location is close to the rest of PL310 init in
>>>>> SPL.
>>>>
>>>> OK, I just did not understand why this is necessary before accessing
>>>> DRAM.
>>>
>>> :)
>>
>> Sorry, I phrased that wrong. I *do* (still) not understand why this is
>> necessary before accessing DRAM. Sorry if I'm annoying you with that,
>> but I'd like to understand.
> 
> Because we know that once the L2 cache is enabled and until it's content
> is invalidated, it may return stale entries (*). The stale entries in
> the L2 cache can only come from a cacheable area. Linux treats the DRAM
> as cacheable area, the registers and OCRAM are not cacheable. Therefore,
> if we do not fetch anything from DRAM until the L2 cache is invalidated,
> we will not get any stale entries in return, and thus we are safe.
> 
> The best way to assure that we don't access the DRAM is to execute from
> OCRAM, so to do the invalidation in SPL. The best way to assure we won't
> generate any traffic on the bus is to execute the code from primed L1
> I-cache lines.
> 
> * U-Boot is complex code and spans multiple L2 cache lines. The code
> which enables L2 cache can very well reside in a different L2 cache line
> than the code which invalidates the L2 cache. Furthermore, the code also
> spans multiple L1 I-cache lines.
> 
> The code which enables the L2 cache is likely in active L1 I-cache line,
> however the code which invalidates the L2 cache is not in L1 I-cache
> yet, so it needs to be fetched. But (!) now the L2 cache is enabled, so
> the system first checks whether the code might be in the L2 cache ; if
> there is a stale entry in just the right L2 cache line, the L2 cache
> will refill the L1 I-cache with undefined stale content, which the CPU
> will happily execute and crash.
> 
> Does it make sense now ? Just keep asking until it does make sense ...

Yes, absolutely! Thanks for the excellent explanation!

I hadn't payed attention to the fact that SPL runs from OCRAM and 
concentrated on DWMMC. Because what kept us racking our brains was the 
fact that this hang only occurs on one platform booting from eMMC, but 
not on a similar platform booting from SPI-NOR.

And yes, like you I'm wondering who the hell should be able to write 
mpumodrst.l2 if not the CPU... I also tried this but failed ;-)

Regards,
Simon

>> Does the DWMMC DMA really trigger PL310 operations?
> 
> No
> 
>> I thought DMA would only affect cache if it goes through ACP/SCU?
> 
> Correct
> 
> I suspect shuffling data between the SD card (DWMMC puts data directly
> in DRAM using DMA) and CPU (which looks at the DRAM through multiple
> levels of caches, so it has to invalidate/refill/flush the caches to
> access the data delivered by DWMMC DMA in DRAM) just primes the L2 cache
> with the right kind of entries to trigger the issue.
> 
> [...]
> 
>
Marek Vasut Feb. 19, 2019, 10:02 p.m. UTC | #12
On 2/19/19 9:56 PM, Simon Goldschmidt wrote:
> Am 19.02.2019 um 21:07 schrieb Marek Vasut:
>> On 2/19/19 8:29 PM, Simon Goldschmidt wrote:
>>> Am 19.02.2019 um 17:39 schrieb Marek Vasut:
>>>> On 2/19/19 5:31 PM, Simon Goldschmidt wrote:
>>>>> Am 19.02.2019 um 17:00 schrieb Marek Vasut:
>>>>>> On 2/19/19 4:55 PM, Simon Goldschmidt wrote:
>>>>>>> Am 19.02.2019 um 15:00 schrieb Marek Vasut:
>>>>>>>> On 2/19/19 2:58 PM, Simon Goldschmidt wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Am Di., 19. Feb. 2019, 14:45 hat Marek Vasut <marex@denx.de
>>>>>>>>> <mailto:marex@denx.de>> geschrieben:
>>>>>>>>>
>>>>>>>>>         On 2/19/19 2:28 PM, Simon Goldschmidt wrote:
>>>>>>>>>         > On Tue, Feb 19, 2019 at 1:53 PM Marek Vasut
>>>>>>>>> <marex@denx.de
>>>>>>>>>         <mailto:marex@denx.de>> wrote:
>>>>>>>>>         >>
>>>>>>>>>         >> On SoCFPGA Gen5 systems, it can rarely happen that a
>>>>>>>>> reboot from
>>>>>>>>>         Linux
>>>>>>>>>         >> will result in stale data in PL310 L2 cache controller.
>>>>>>>>> Even if
>>>>>>>>>         the L2
>>>>>>>>>         >> cache controller is disabled via the CTRL register
>>>>>>>>> CTRL_EN
>>>>>>>>> bit, those
>>>>>>>>>         >> data can interfere with operation of devices using DMA,
>>>>>>>>> like
>>>>>>>>> e.g. the
>>>>>>>>>         >> DWMMC controller. This can in turn cause e.g. SPL to
>>>>>>>>> fail
>>>>>>>>> reading
>>>>>>>>>         data
>>>>>>>>>         >> from SD/MMC.
>>>>>>>>>         >>
>>>>>>>>>         >> The obvious solution here would be to fully reset
>>>>>>>>> the L2
>>>>>>>>> cache
>>>>>>>>>         controller
>>>>>>>>>         >> via the reset manager MPUMODRST L2 bit, however this
>>>>>>>>> causes bus
>>>>>>>>>         hang even
>>>>>>>>>         >> if executed entirely from L1 I-cache to avoid
>>>>>>>>> generating
>>>>>>>>> any bus
>>>>>>>>>         traffic
>>>>>>>>>         >> through the L2 cache controller.
>>>>>>>>>         >>
>>>>>>>>>         >> This patch thus configures and enables the L2 cache
>>>>>>>>> controller
>>>>>>>>>         very early
>>>>>>>>>         >> in the SPL boot process, clears the L2 cache and
>>>>>>>>> disables
>>>>>>>>> the L2
>>>>>>>>>         cache
>>>>>>>>>         >> controller again.
>>>>>>>>>         >>
>>>>>>>>>         >> The reason for doing it in SPL is because we need to
>>>>>>>>> avoid
>>>>>>>>>         accessing any
>>>>>>>>>         >> of the potentially stale data in the L2 cache, and
>>>>>>>>> we are
>>>>>>>>> certain
>>>>>>>>>         any of
>>>>>>>>>         >> the stale data will be below the OCRAM address
>>>>>>>>> range. To
>>>>>>>>> further
>>>>>>>>>         reduce
>>>>>>>>>         >> bus traffic during the L2 cache invalidation, we
>>>>>>>>> enable L1
>>>>>>>>>         I-cache and
>>>>>>>>>         >> run the invalidation code entirely out of the L1
>>>>>>>>> I-cache.
>>>>>>>>>         >>
>>>>>>>>>         >> Signed-off-by: Marek Vasut <marex@denx.de
>>>>>>>>> <mailto:marex@denx.de>>
>>>>>>>>>         >> Cc: Dalon Westergreen <dwesterg@gmail.com
>>>>>>>>>         <mailto:dwesterg@gmail.com>>
>>>>>>>>>         >> Cc: Dinh Nguyen <dinguyen@kernel.org
>>>>>>>>> <mailto:dinguyen@kernel.org>>
>>>>>>>>>         >
>>>>>>>>>         > We've tested this and it seems to fix the issue, so:
>>>>>>>>>         >
>>>>>>>>>         > Tested-by: Simon Goldschmidt
>>>>>>>>> <simon.k.r.goldschmidt@gmail.com
>>>>>>>>>         <mailto:simon.k.r.goldschmidt@gmail.com>>
>>>>>>>
>>>>>>> Seems like I was a little fast there: the patch does not apply
>>>>>>> cleanly
>>>>>>> to master. It did apply to our tree, obviously.
>>>>>>>
>>>>>>> It's missing an include to <asm/pl310.h> and 'pl310' is undefined.
>>>>>>> Probably a result of rebasing it...
>>>>>>>
>>>>>>>>>         >
>>>>>>>>>         > However, I don't really get why clearing the L2 cache
>>>>>>>>> later in
>>>>>>>>> U-Boot
>>>>>>>>>         > isn't good enough.
>>>>>>>>>
>>>>>>>>>         Because when U-Boot is running, it is already running from
>>>>>>>>> RAM,
>>>>>>>>> which is
>>>>>>>>>         on different PL310 master than OCRAM, and you're likely to
>>>>>>>>> hit the
>>>>>>>>>         corrupted cache lines on the DRAM master which is
>>>>>>>>> primed by
>>>>>>>>> prior Linux
>>>>>>>>>         operation. Such cacheline can be hit between the code
>>>>>>>>> enabling
>>>>>>>>> the PL310
>>>>>>>>>         and the code invalidating it, which is a C code, using
>>>>>>>>> stack and
>>>>>>>>> calling
>>>>>>>>>         functions, thus accessing memory which would likely
>>>>>>>>> reside in
>>>>>>>>> different
>>>>>>>>>         PL310 cachelines. If one of those cachelines contains
>>>>>>>>> invalid/corrupted
>>>>>>>>>         data, they can be provided to the CPU before the
>>>>>>>>> cacheline is
>>>>>>>>>         invalidated.
>>>>>>>>>
>>>>>>>>>         To further reduce the likelihood of hitting any such cache
>>>>>>>>> line,
>>>>>>>>> the
>>>>>>>>>         code which enables the PL310 and invalidates it runs from
>>>>>>>>> primed L1
>>>>>>>>>         icache lines, thus generating no bus traffic at all.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Ah, right. I somehow didn't realize that invalidating is done
>>>>>>>>> after
>>>>>>>>> enabling:-)
>>>>>>>>
>>>>>>>> Right. If it could be done before (like e.g. by whacking the L2CC
>>>>>>>> reset), that'd be fantastic.
>>>>>>>
>>>>>>> I haven't yet found a way to cleanly reproduce this on my
>>>>>>> socrates, so
>>>>>>> unfortunately, I currently cannot test any suggestions, right now
>>>>>>> :-(
>>>>>>
>>>>>> Did you ever have this hang happen on SoCrates ?
>>>>>
>>>>> I think so, yes.
>>>>>
>>>>> What I did was running some lmb tests loading from mmc, removing the
>>>>> sdcard to write an updated version of U-Boot, replacing it and then
>>>>> typing 'reset'. It would then sometimes hang in cache
>>>>> initialization - I
>>>>> added debug prints throughout cache initialization and was debug
>>>>> prints
>>>>> from one function and then nothing more after a simple function
>>>>> call...
>>>>>
>>>>> That would very much fit to your explanation. However, I cannot
>>>>> reproduce this currently :-(
>>>>
>>>> So basically a board which uses DMA (like the DWMMC DMA) is likely
>>>> to be
>>>> affected. Surely, the DMA does trigger PL310 operations.
>>>>
>>>>>>> Wouldn't it be better to place this function in some central
>>>>>>> position
>>>>>>> near the cache initialization code instead of running it from SPL?
>>>>>>> Or is
>>>>>>> it required to run it in SPL?
>>>>>>
>>>>>> It's required in SPL as soon as possible, to avoid accessing the
>>>>>> DRAM,
>>>>>> see above. The current location is close to the rest of PL310 init in
>>>>>> SPL.
>>>>>
>>>>> OK, I just did not understand why this is necessary before accessing
>>>>> DRAM.
>>>>
>>>> :)
>>>
>>> Sorry, I phrased that wrong. I *do* (still) not understand why this is
>>> necessary before accessing DRAM. Sorry if I'm annoying you with that,
>>> but I'd like to understand.
>>
>> Because we know that once the L2 cache is enabled and until it's content
>> is invalidated, it may return stale entries (*). The stale entries in
>> the L2 cache can only come from a cacheable area. Linux treats the DRAM
>> as cacheable area, the registers and OCRAM are not cacheable. Therefore,
>> if we do not fetch anything from DRAM until the L2 cache is invalidated,
>> we will not get any stale entries in return, and thus we are safe.
>>
>> The best way to assure that we don't access the DRAM is to execute from
>> OCRAM, so to do the invalidation in SPL. The best way to assure we won't
>> generate any traffic on the bus is to execute the code from primed L1
>> I-cache lines.
>>
>> * U-Boot is complex code and spans multiple L2 cache lines. The code
>> which enables L2 cache can very well reside in a different L2 cache line
>> than the code which invalidates the L2 cache. Furthermore, the code also
>> spans multiple L1 I-cache lines.
>>
>> The code which enables the L2 cache is likely in active L1 I-cache line,
>> however the code which invalidates the L2 cache is not in L1 I-cache
>> yet, so it needs to be fetched. But (!) now the L2 cache is enabled, so
>> the system first checks whether the code might be in the L2 cache ; if
>> there is a stale entry in just the right L2 cache line, the L2 cache
>> will refill the L1 I-cache with undefined stale content, which the CPU
>> will happily execute and crash.
>>
>> Does it make sense now ? Just keep asking until it does make sense ...
> 
> Yes, absolutely! Thanks for the excellent explanation!
> 
> I hadn't payed attention to the fact that SPL runs from OCRAM and
> concentrated on DWMMC. Because what kept us racking our brains was the
> fact that this hang only occurs on one platform booting from eMMC, but
> not on a similar platform booting from SPI-NOR.

I think that's because the CQSPI doesn't do DMA into the memory, but
rather exposes the SPI NOR as a memory mapped chunk of register area.

> And yes, like you I'm wondering who the hell should be able to write
> mpumodrst.l2 if not the CPU... I also tried this but failed ;-)

Something in the FPGA, a well-programmed PL330 DMA or any other L3
master I think.
diff mbox series

Patch

diff --git a/arch/arm/mach-socfpga/spl_gen5.c b/arch/arm/mach-socfpga/spl_gen5.c
index ccdc661d05..12d011e681 100644
--- a/arch/arm/mach-socfpga/spl_gen5.c
+++ b/arch/arm/mach-socfpga/spl_gen5.c
@@ -63,6 +63,60 @@  u32 spl_boot_mode(const u32 boot_device)
 }
 #endif
 
+static void socfpga_pl310_clear(void)
+{
+	u32 mask = 0xff, ena = 0;
+
+	icache_enable();
+
+	/* Disable the L2 cache */
+	clrbits_le32(&pl310->pl310_ctrl, L2X0_CTRL_EN);
+
+	writel(0x111, &pl310->pl310_tag_latency_ctrl);
+	writel(0x121, &pl310->pl310_data_latency_ctrl);
+
+	/* enable BRESP, instruction and data prefetch, full line of zeroes */
+	setbits_le32(&pl310->pl310_aux_ctrl,
+		     L310_AUX_CTRL_DATA_PREFETCH_MASK |
+		     L310_AUX_CTRL_INST_PREFETCH_MASK |
+		     L310_SHARED_ATT_OVERRIDE_ENABLE);
+
+	/* Enable the L2 cache */
+	ena = readl(&pl310->pl310_ctrl);
+	ena |= L2X0_CTRL_EN;
+
+	/*
+	 * Invalidate the PL310 L2 cache. Keep the invalidation code
+	 * entirely in L1 I-cache to avoid any bus traffic through
+	 * the L2.
+	 */
+	asm volatile(
+		".align	5			\n"
+		"	b	3f		\n"
+		"1:	str	%1,	[%4]	\n"
+		"	dsb			\n"
+		"	isb			\n"
+		"	str	%0,	[%2]	\n"
+		"	dsb			\n"
+		"	isb			\n"
+		"2:	ldr	%0,	[%2]	\n"
+		"	cmp	%0,	#0	\n"
+		"	bne	2b		\n"
+		"	str	%0,	[%3]	\n"
+		"	dsb			\n"
+		"	isb			\n"
+		"	b	4f		\n"
+		"3:	b	1b		\n"
+		"4:	nop			\n"
+	: "+r"(mask), "+r"(ena)
+	: "r"(&pl310->pl310_inv_way),
+	  "r"(&pl310->pl310_cache_sync), "r"(&pl310->pl310_ctrl)
+	: "memory", "cc");
+
+	/* Disable the L2 cache */
+	clrbits_le32(&pl310->pl310_ctrl, L2X0_CTRL_EN);
+}
+
 void board_init_f(ulong dummy)
 {
 	const struct cm_config *cm_default_cfg = cm_get_default_config();
@@ -85,6 +139,7 @@  void board_init_f(ulong dummy)
 	memset(__bss_start, 0, __bss_end - __bss_start);
 
 	socfpga_sdram_remap_zero();
+	socfpga_pl310_clear();
 
 	debug("Freezing all I/O banks\n");
 	/* freeze all IO banks */