diff mbox

[U-Boot] nios2: convert cache flush to use dm cpu data

Message ID 1444119600-31999-1-git-send-email-thomas@wytron.com.tw
State Superseded
Delegated to: Thomas Chou
Headers show

Commit Message

Thomas Chou Oct. 6, 2015, 8:20 a.m. UTC
Convert cache flush to use dm cpu data. The cacheflush.c of
Linux nios2 arch is copied to arch/nios2/lib/cache.c to replace
the cache.S. The cache related functions in cpu.c is moved
to cache.c. Both flush_dcache() and flush_icache() are
replaced and removed. The flush_dcache_all() now flush icache
too, which is the same as what is done in Linux nios2 arch.

Signed-off-by: Thomas Chou <thomas@wytron.com.tw>
---
 arch/nios2/cpu/cpu.c           | 15 --------
 arch/nios2/include/asm/cache.h | 13 ++-----
 arch/nios2/lib/bootm.c         |  6 +--
 arch/nios2/lib/cache.S         | 68 ----------------------------------
 arch/nios2/lib/cache.c         | 84 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 88 insertions(+), 98 deletions(-)
 delete mode 100644 arch/nios2/lib/cache.S
 create mode 100644 arch/nios2/lib/cache.c

Comments

Marek Vasut Oct. 8, 2015, 9:39 p.m. UTC | #1
On Tuesday, October 06, 2015 at 10:20:00 AM, Thomas Chou wrote:
> Convert cache flush to use dm cpu data. The cacheflush.c of
> Linux nios2 arch is copied to arch/nios2/lib/cache.c to replace
> the cache.S. The cache related functions in cpu.c is moved
> to cache.c. Both flush_dcache() and flush_icache() are
> replaced and removed. The flush_dcache_all() now flush icache
> too, which is

... confusing as hell :-(

> the same as what is done in Linux nios2 arch.
> 
> Signed-off-by: Thomas Chou <thomas@wytron.com.tw>
> ---
>  arch/nios2/cpu/cpu.c           | 15 --------
>  arch/nios2/include/asm/cache.h | 13 ++-----
>  arch/nios2/lib/bootm.c         |  6 +--
>  arch/nios2/lib/cache.S         | 68 ----------------------------------
>  arch/nios2/lib/cache.c         | 84
> ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 88
> insertions(+), 98 deletions(-)
>  delete mode 100644 arch/nios2/lib/cache.S
>  create mode 100644 arch/nios2/lib/cache.c
> 
> diff --git a/arch/nios2/cpu/cpu.c b/arch/nios2/cpu/cpu.c
> index 5403c0d..8607c95 100644
> --- a/arch/nios2/cpu/cpu.c
> +++ b/arch/nios2/cpu/cpu.c
> @@ -29,21 +29,6 @@ int do_reset(cmd_tbl_t *cmdtp, int flag, int argc, char
> * const argv[]) return 0;
>  }
> 
> -int dcache_status(void)
> -{
> -	return 1;
> -}
> -
> -void dcache_enable(void)
> -{
> -	flush_dcache(CONFIG_SYS_DCACHE_SIZE, CONFIG_SYS_DCACHELINE_SIZE);
> -}
> -
> -void dcache_disable(void)
> -{
> -	flush_dcache(CONFIG_SYS_DCACHE_SIZE, CONFIG_SYS_DCACHELINE_SIZE);
> -}
> -
>  /*
>   * COPY EXCEPTION TRAMPOLINE -- copy the tramp to the
>   * exception address. Define CONFIG_ROM_STUBS to prevent
> diff --git a/arch/nios2/include/asm/cache.h
> b/arch/nios2/include/asm/cache.h index 9b87c9f..dde43cd 100644
> --- a/arch/nios2/include/asm/cache.h
> +++ b/arch/nios2/include/asm/cache.h
> @@ -8,18 +8,11 @@
>  #ifndef __ASM_NIOS2_CACHE_H_
>  #define __ASM_NIOS2_CACHE_H_
> 
> -extern void flush_dcache (unsigned long start, unsigned long size);
> -extern void flush_icache (unsigned long start, unsigned long size);
> -
>  /*
> - * Valid L1 data cache line sizes for the NIOS2 architecture are 4, 16,
> and 32 - * bytes.  If the board configuration has not specified one we
> default to the - * largest of these values for alignment of DMA buffers.
> + * Valid L1 data cache line sizes for the NIOS2 architecture are 4,
> + * 16, and 32 bytes. We default to the largest of these values for
> + * alignment of DMA buffers.
>   */
> -#ifdef CONFIG_SYS_CACHELINE_SIZE
> -#define ARCH_DMA_MINALIGN	CONFIG_SYS_CACHELINE_SIZE
> -#else
>  #define ARCH_DMA_MINALIGN	32
> -#endif
> 
>  #endif /* __ASM_NIOS2_CACHE_H_ */
> diff --git a/arch/nios2/lib/bootm.c b/arch/nios2/lib/bootm.c
> index c730a3f..4e5c269 100644
> --- a/arch/nios2/lib/bootm.c
> +++ b/arch/nios2/lib/bootm.c
> @@ -6,9 +6,6 @@
>   */
> 
>  #include <common.h>
> -#include <command.h>
> -#include <asm/byteorder.h>
> -#include <asm/cache.h>
> 
>  #define NIOS_MAGIC 0x534f494e /* enable command line and initrd passing */
> 
> @@ -40,8 +37,7 @@ int do_bootm_linux(int flag, int argc, char * const
> argv[], bootm_headers_t *ima
> 
>  	/* flushes data and instruction caches before calling the kernel */
>  	disable_interrupts();
> -	flush_dcache((ulong)kernel, CONFIG_SYS_DCACHE_SIZE);
> -	flush_icache((ulong)kernel, CONFIG_SYS_ICACHE_SIZE);
> +	flush_dcache_all();
> 
>  	debug("bootargs=%s @ 0x%lx\n", commandline, (ulong)&commandline);
>  	debug("initrd=0x%lx-0x%lx\n", (ulong)initrd_start, (ulong)initrd_end);
> diff --git a/arch/nios2/lib/cache.S b/arch/nios2/lib/cache.S
> deleted file mode 100644
> index 683f005..0000000
> --- a/arch/nios2/lib/cache.S
> +++ /dev/null
> @@ -1,68 +0,0 @@
> -/*
> - * (C) Copyright 2004, Psyent Corporation <www.psyent.com>
> - * Scott McNutt <smcnutt@psyent.com>
> - *
> - * SPDX-License-Identifier:	GPL-2.0+
> - */
> -
> -#include <config.h>
> -
> -	.text
> -
> -	.global flush_dcache
> -
> -flush_dcache:
> -	add	r5, r5, r4
> -	movhi	r8, %hi(CONFIG_SYS_DCACHELINE_SIZE)
> -	ori	r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE)
> -0:	flushd	0(r4)
> -	add	r4, r4, r8
> -	bltu	r4, r5, 0b
> -	ret
> -
> -
> -	.global flush_icache
> -
> -flush_icache:
> -	add	r5, r5, r4
> -	movhi	r8, %hi(CONFIG_SYS_ICACHELINE_SIZE)
> -	ori	r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE)
> -1:	flushi	r4
> -	add	r4, r4, r8
> -	bltu	r4, r5, 1b
> -	ret
> -
> -	.global flush_dcache_range
> -
> -flush_dcache_range:
> -	movhi	r8, %hi(CONFIG_SYS_DCACHELINE_SIZE)
> -	ori	r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE)
> -0:	flushd	0(r4)
> -	add	r4, r4, r8
> -	bltu	r4, r5, 0b
> -	ret
> -
> -	.global flush_cache
> -
> -flush_cache:
> -	add	r5, r5, r4
> -	mov	r9, r4
> -	mov	r10, r5
> -
> -	movhi	r8, %hi(CONFIG_SYS_DCACHELINE_SIZE)
> -	ori	r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE)
> -0:	flushd	0(r4)
> -	add	r4, r4, r8
> -	bltu	r4, r5, 0b
> -
> -	mov	r4, r9
> -	mov	r5, r10
> -	movhi	r8, %hi(CONFIG_SYS_ICACHELINE_SIZE)
> -	ori	r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE)
> -1:	flushi	r4
> -	add	r4, r4, r8
> -	bltu	r4, r5, 1b
> -
> -	sync
> -	flushp
> -	ret
> diff --git a/arch/nios2/lib/cache.c b/arch/nios2/lib/cache.c
> new file mode 100644
> index 0000000..6f26d8d
> --- /dev/null
> +++ b/arch/nios2/lib/cache.c
> @@ -0,0 +1,84 @@
> +/*
> + * Copyright (C) 2015 Thomas Chou <thomas@wytron.com.tw>
> + * Copyright (C) 2009, Wind River Systems Inc
> + * Implemented by fredrik.markstrom@gmail.com and ivarholmqvist@gmail.com
> + *
> + * SPDX-License-Identifier:	GPL-2.0+
> + */
> +
> +#include <common.h>
> +#include <asm/cache.h>
> +
> +DECLARE_GLOBAL_DATA_PTR;
> +
> +static void __flush_dcache_all(unsigned long start, unsigned long end)
> +{
> +	unsigned long addr;
> +
> +	start &= ~(gd->arch.dcache_line_size - 1);

I'd suggest to use GENMASK() here, but I don't think we picked this from Linux
just yet.

> +	end += (gd->arch.dcache_line_size - 1);
> +	end &= ~(gd->arch.dcache_line_size - 1);

Is this an attempt at poor-mans' rounding ? I think you want to implment 
something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and NOT
do any rounding here. The reason for that is that if you do rounding, you
might accidentally corrupt a piece of memory which was just delivered via
DMA before you did the flush.

> +	if (end > start + gd->arch.dcache_size)
> +		end = start + gd->arch.dcache_size;
> +
> +	for (addr = start; addr < end; addr += gd->arch.dcache_line_size) {
> +		__asm__ __volatile__ ("   flushd 0(%0)\n"
> +					: /* Outputs */
> +					: /* Inputs  */ "r"(addr)
> +					/* : No clobber */);
> +	}
> +}
> +
> +static void __flush_icache(unsigned long start, unsigned long end)
> +{
> +	unsigned long addr;
> +
> +	start &= ~(gd->arch.icache_line_size - 1);
> +	end += (gd->arch.icache_line_size - 1);
> +	end &= ~(gd->arch.icache_line_size - 1);
> +
> +	if (end > start + gd->arch.icache_size)
> +		end = start + gd->arch.icache_size;
> +
> +	for (addr = start; addr < end; addr += gd->arch.icache_line_size) {
> +		__asm__ __volatile__ ("   flushi %0\n"
> +					: /* Outputs */
> +					: /* Inputs  */ "r"(addr)
> +					/* : No clobber */);
> +	}
> +	__asm__ __volatile(" flushp\n");
> +}
> +
> +void flush_dcache_all(void)
> +{
> +	__flush_dcache_all(0, gd->arch.dcache_size);
> +	__flush_icache(0, gd->arch.icache_size);
> +}
> +
> +void flush_dcache_range(unsigned long start, unsigned long end)
> +{
> +	__flush_dcache_all(start, end);
> +}
> +
> +void flush_cache(unsigned long start, unsigned long size)
> +{
> +	__flush_dcache_all(start, start + size);
> +	__flush_icache(start, start + size);
> +}
> +
> +/* nios2 data cache is always enabled */
> +int dcache_status(void)
> +{
> +	return 1;
> +}
> +
> +void dcache_enable(void)
> +{
> +	flush_dcache_all();
> +}
> +
> +void dcache_disable(void)
> +{
> +	flush_dcache_all();
> +}
Ley Foon Tan Oct. 9, 2015, 2:49 a.m. UTC | #2
On Fri, Oct 9, 2015 at 5:39 AM, Marek Vasut <marex@denx.de> wrote:

>> +
>> +DECLARE_GLOBAL_DATA_PTR;
>> +
>> +static void __flush_dcache_all(unsigned long start, unsigned long end)
>> +{
>> +     unsigned long addr;
>> +
>> +     start &= ~(gd->arch.dcache_line_size - 1);
>
> I'd suggest to use GENMASK() here, but I don't think we picked this from Linux
> just yet.
>
>> +     end += (gd->arch.dcache_line_size - 1);
>> +     end &= ~(gd->arch.dcache_line_size - 1);
>
> Is this an attempt at poor-mans' rounding ? I think you want to implment
> something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and NOT
> do any rounding here. The reason for that is that if you do rounding, you
> might accidentally corrupt a piece of memory which was just delivered via
> DMA before you did the flush.
The code above is to convert the address to dcache line size.
arch/arm/cpu/arm926ejs/cache.c check_cache_range() will skip the cache
flushing if it is unaligned to cache line size. I'm not sure how
frequent U-boot access to non-aligned cache line size.

Regards
Ley Foon
Thomas Chou Oct. 9, 2015, 8 a.m. UTC | #3
Hi Marek,

On 10/09/2015 10:49 AM, Ley Foon Tan wrote:
>> Is this an attempt at poor-mans' rounding ? I think you want to implment
>> something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and NOT
>> do any rounding here. The reason for that is that if you do rounding, you
>> might accidentally corrupt a piece of memory which was just delivered via
>> DMA before you did the flush.
> The code above is to convert the address to dcache line size.
> arch/arm/cpu/arm926ejs/cache.c check_cache_range() will skip the cache
> flushing if it is unaligned to cache line size. I'm not sure how
> frequent U-boot access to non-aligned cache line size.

Thanks a lot for your looking into this, Ley Foon.

I think we take cache flushing in a different way to arm926ejs.

In nios2 driver programming, we would request all the DMA buffers be 
aligned to cache line. This is necessary to avoid the cache racing issue 
as you mention above.

In nios2, we don't skip the flushing when the inputs are not aligned 
like that of arm926ejs. We always flush all cache lines in the range, 
even if a single byte to flush is in request. So the inputs are rounded 
to get the lower and upper cache lines range inside the cache flush 
functions. The caller need not be aware of the detail.

In the copy_exception_trampoline() patch, both dcache and icache must be 
flushed at the exception target address. Though the flush range is only 
12 bytes, which won't be aligned.

Thank you for your review.

Best regards,
Thomas
Marek Vasut Oct. 9, 2015, 2:40 p.m. UTC | #4
On Friday, October 09, 2015 at 04:49:03 AM, Ley Foon Tan wrote:
> On Fri, Oct 9, 2015 at 5:39 AM, Marek Vasut <marex@denx.de> wrote:
> >> +
> >> +DECLARE_GLOBAL_DATA_PTR;
> >> +
> >> +static void __flush_dcache_all(unsigned long start, unsigned long end)
> >> +{
> >> +     unsigned long addr;
> >> +
> >> +     start &= ~(gd->arch.dcache_line_size - 1);
> > 
> > I'd suggest to use GENMASK() here, but I don't think we picked this from
> > Linux just yet.
> > 
> >> +     end += (gd->arch.dcache_line_size - 1);
> >> +     end &= ~(gd->arch.dcache_line_size - 1);
> > 
> > Is this an attempt at poor-mans' rounding ? I think you want to implment
> > something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and NOT
> > do any rounding here. The reason for that is that if you do rounding, you
> > might accidentally corrupt a piece of memory which was just delivered via
> > DMA before you did the flush.
> 
> The code above is to convert the address to dcache line size.

So it's aligning the unaligned accesses, correct?

> arch/arm/cpu/arm926ejs/cache.c check_cache_range() will skip the cache
> flushing if it is unaligned to cache line size.

That's right, unaligned cache flushes/invalidations should not happen, ever.

> I'm not sure how
> frequent U-boot access to non-aligned cache line size.

They must not ever happen, if some driver does them, the driver needs to be
fixed.

Best regards,
Marek Vasut
Marek Vasut Oct. 9, 2015, 2:42 p.m. UTC | #5
On Friday, October 09, 2015 at 10:00:26 AM, Thomas Chou wrote:
> Hi Marek,
> 
> On 10/09/2015 10:49 AM, Ley Foon Tan wrote:
> >> Is this an attempt at poor-mans' rounding ? I think you want to implment
> >> something like arch/arm/cpu/arm926ejs/cache.c check_cache_range() and
> >> NOT do any rounding here. The reason for that is that if you do
> >> rounding, you might accidentally corrupt a piece of memory which was
> >> just delivered via DMA before you did the flush.
> > 
> > The code above is to convert the address to dcache line size.
> > arch/arm/cpu/arm926ejs/cache.c check_cache_range() will skip the cache
> > flushing if it is unaligned to cache line size. I'm not sure how
> > frequent U-boot access to non-aligned cache line size.
> 
> Thanks a lot for your looking into this, Ley Foon.
> 
> I think we take cache flushing in a different way to arm926ejs.
> 
> In nios2 driver programming, we would request all the DMA buffers be
> aligned to cache line. This is necessary to avoid the cache racing issue
> as you mention above.

This is what ARM does as well.

> In nios2, we don't skip the flushing when the inputs are not aligned
> like that of arm926ejs. We always flush all cache lines in the range,
> even if a single byte to flush is in request. So the inputs are rounded
> to get the lower and upper cache lines range inside the cache flush
> functions. The caller need not be aware of the detail.

This is incorrect and all the places which produce these unaligned cache
operations must be fixed.

> In the copy_exception_trampoline() patch, both dcache and icache must be
> flushed at the exception target address. Though the flush range is only
> 12 bytes, which won't be aligned.
> 
> Thank you for your review.
> 
> Best regards,
> Thomas

Best regards,
Marek Vasut
Thomas Chou Oct. 10, 2015, 5:55 a.m. UTC | #6
Hi Marek,

On 10/09/2015 10:42 PM, Marek Vasut wrote:
>> In nios2, we don't skip the flushing when the inputs are not aligned
>> like that of arm926ejs. We always flush all cache lines in the range,
>> even if a single byte to flush is in request. So the inputs are rounded
>> to get the lower and upper cache lines range inside the cache flush
>> functions. The caller need not be aware of the detail.
>
> This is incorrect and all the places which produce these unaligned cache
> operations must be fixed.

I take a look into the cache flush operations in every arch of u-boot. 
It turns out that the arm926ejs is the only platform that does such 
cache line range check and skip. All other ARM and all other arch don't. 
And the cache flush in Linux don't.

Best regards,
Thomas
Thomas Chou Oct. 10, 2015, 6:32 a.m. UTC | #7
Hi Marek,

On 10/10/2015 01:55 PM, Thomas Chou wrote:
> Hi Marek,
>
> On 10/09/2015 10:42 PM, Marek Vasut wrote:
>>> In nios2, we don't skip the flushing when the inputs are not aligned
>>> like that of arm926ejs. We always flush all cache lines in the range,
>>> even if a single byte to flush is in request. So the inputs are rounded
>>> to get the lower and upper cache lines range inside the cache flush
>>> functions. The caller need not be aware of the detail.
>>
>> This is incorrect and all the places which produce these unaligned cache
>> operations must be fixed.
>
> I take a look into the cache flush operations in every arch of u-boot.
> It turns out that the arm926ejs is the only platform that does such
> cache line range check and skip. All other ARM and all other arch don't.
> And the cache flush in Linux don't.
>

+arm11, which is based on the same code.

I see your patch on this range check. I would prefer that the details of 
cache line configuration be kept inside the cache flush operators, and 
need not be exposed to drivers. As drivers might be used by different 
arch with different cache implementation, L1,L2..etc. It is not good for 
drivers to adjust the flush range before passing to cache flush operators.

Best regards,
Thomas
Marek Vasut Oct. 10, 2015, 6:12 p.m. UTC | #8
On Saturday, October 10, 2015 at 07:55:45 AM, Thomas Chou wrote:
> Hi Marek,

Hi,

> On 10/09/2015 10:42 PM, Marek Vasut wrote:
> >> In nios2, we don't skip the flushing when the inputs are not aligned
> >> like that of arm926ejs. We always flush all cache lines in the range,
> >> even if a single byte to flush is in request. So the inputs are rounded
> >> to get the lower and upper cache lines range inside the cache flush
> >> functions. The caller need not be aware of the detail.
> > 
> > This is incorrect and all the places which produce these unaligned cache
> > operations must be fixed.
> 
> I take a look into the cache flush operations in every arch of u-boot.
> It turns out that the arm926ejs is the only platform that does such
> cache line range check and skip. All other ARM and all other arch don't.

Yes, everyone else doesn't do the checks and if there is unaligned cache
flush/invalidation, that platform also suffers from various obscure and hard
to debug errors. I submitted patch to add the same check for ARMv7, but it
didn't receive attention :-( I should repost it I guess.

> And the cache flush in Linux don't.

Because the buffers there are always correctly aligned during allocation.

Best regards,
Marek Vasut
Marek Vasut Oct. 10, 2015, 6:18 p.m. UTC | #9
On Saturday, October 10, 2015 at 08:32:09 AM, Thomas Chou wrote:
> Hi Marek,

Hi,

> On 10/10/2015 01:55 PM, Thomas Chou wrote:
> > Hi Marek,
> > 
> > On 10/09/2015 10:42 PM, Marek Vasut wrote:
> >>> In nios2, we don't skip the flushing when the inputs are not aligned
> >>> like that of arm926ejs. We always flush all cache lines in the range,
> >>> even if a single byte to flush is in request. So the inputs are rounded
> >>> to get the lower and upper cache lines range inside the cache flush
> >>> functions. The caller need not be aware of the detail.
> >> 
> >> This is incorrect and all the places which produce these unaligned cache
> >> operations must be fixed.
> > 
> > I take a look into the cache flush operations in every arch of u-boot.
> > It turns out that the arm926ejs is the only platform that does such
> > cache line range check and skip. All other ARM and all other arch don't.
> > And the cache flush in Linux don't.
> 
> +arm11, which is based on the same code.
> 
> I see your patch on this range check. I would prefer that the details of
> cache line configuration be kept inside the cache flush operators, and
> need not be exposed to drivers.

Then you'd also need means to allocate variables to aligned memory location
to prevent invalid cache flush. (Linux does this with it's DMA API). We are
much simpler and thus this abstraction is still not available. I wonder if
the overhead of DMA API would be high or not for U-Boot.

> As drivers might be used by different
> arch with different cache implementation, L1,L2..etc. It is not good for
> drivers to adjust the flush range before passing to cache flush operators.

It is even worse if the cache flush operators permit incorrect cache flushes
or invalidations. Like I mentioned before, this can lead to hard to debug
problems when using DMA (at least on ARM).

Best regards,
Marek Vasut
Thomas Chou Oct. 11, 2015, 12:38 a.m. UTC | #10
Hi Marek,

On 10/11/2015 02:18 AM, Marek Vasut wrote:
> Then you'd also need means to allocate variables to aligned memory location
> to prevent invalid cache flush. (Linux does this with it's DMA API). We are
> much simpler and thus this abstraction is still not available. I wonder if
> the overhead of DMA API would be high or not for U-Boot.

I see most people use memalign(ARCH_DMA_MINALIGN, len) in u-boot to 
allocate DMA buffers so that they are cache aligned.

> It is even worse if the cache flush operators permit incorrect cache flushes
> or invalidations. Like I mentioned before, this can lead to hard to debug
> problems when using DMA (at least on ARM).

I would suggest debug check should be left as for debug only. The 
definition of common functions should be kept as it is more important 
than coding style.

I debugged DMA issues a lot in the past until I realized the importance 
of aligned buffers. So there should be a developer's guideline.

But it is even much more difficult when something you believed does not 
work as expected, what is taken as common sense. It will trap a lot of 
developers when they called your flush cache functions but was skipped 
just because, eg, the end of packets are not aligned which is usually 
the case.

I would suggest that, with the best of my knowledge, please change the 
range check to a debug probe, and restore the cache flush functions to 
the common definition.

Best regards,
Thomas
Marek Vasut Oct. 11, 2015, 12:15 p.m. UTC | #11
On Sunday, October 11, 2015 at 02:38:35 AM, Thomas Chou wrote:
> Hi Marek,

Hi,

> On 10/11/2015 02:18 AM, Marek Vasut wrote:
> > Then you'd also need means to allocate variables to aligned memory
> > location to prevent invalid cache flush. (Linux does this with it's DMA
> > API). We are much simpler and thus this abstraction is still not
> > available. I wonder if the overhead of DMA API would be high or not for
> > U-Boot.
> 
> I see most people use memalign(ARCH_DMA_MINALIGN, len) in u-boot to
> allocate DMA buffers so that they are cache aligned.

That and include/memalign.h , which contains macros that are used to align
variables.

> > It is even worse if the cache flush operators permit incorrect cache
> > flushes or invalidations. Like I mentioned before, this can lead to hard
> > to debug problems when using DMA (at least on ARM).
> 
> I would suggest debug check should be left as for debug only. The
> definition of common functions should be kept as it is more important
> than coding style.

Uh yes, that's what arm926 cache functions do, they're debug only.

> I debugged DMA issues a lot in the past until I realized the importance
> of aligned buffers. So there should be a developer's guideline.

For what exactly?

> But it is even much more difficult when something you believed does not
> work as expected, what is taken as common sense. It will trap a lot of
> developers when they called your flush cache functions but was skipped
> just because, eg, the end of packets are not aligned which is usually
> the case.

This is good, it should bite them, because this is a bug. If, on the other
hand, you will paper over such bugs by adding crap to the cache ops, there
will be even worse bugs coming for you, like variables which are sitting in
the same cacheline as your unaligned buffer that you want to invalidate or
flush will possibly get trashed by such cache operation.

Consider this:

cacheline 0: [ variable A ; buffer B ......... ]
cacheline 1: [ buffer B ......... ; Empty .... ]

Now you do the following:

1) Variable A = 0;
2) Flush buffer B (which is unaligned, so flush cacheline 0 and 1)
3) Start DMA to buffer B
4) Variable A = 1;
5) Check if DMA finished, it did
6) Invalidate buffer B ... oh, but it's unaligned, let's invalidate
   everything around it, which is cacheline 0 and 1.
7) What is the value of variable A ? Oh, it's fetched from memory and
   it's 0 there, even though we did set it to 1 ...

> I would suggest that, with the best of my knowledge, please change the
> range check to a debug probe, and restore the cache flush functions to
> the common definition.

See above, does my example make it clear why we should never ever hide
bugs in the cache ops code ?

Best regards,
Marek Vasut
Thomas Chou Oct. 12, 2015, 12:34 a.m. UTC | #12
Hi Marek,

On 10/11/2015 08:15 PM, Marek Vasut wrote:
> On Sunday, October 11, 2015 at 02:38:35 AM, Thomas Chou wrote:
>> Hi Marek,
>
> Hi,
>
>> On 10/11/2015 02:18 AM, Marek Vasut wrote:
>>> Then you'd also need means to allocate variables to aligned memory
>>> location to prevent invalid cache flush. (Linux does this with it's DMA
>>> API). We are much simpler and thus this abstraction is still not
>>> available. I wonder if the overhead of DMA API would be high or not for
>>> U-Boot.
>>
>> I see most people use memalign(ARCH_DMA_MINALIGN, len) in u-boot to
>> allocate DMA buffers so that they are cache aligned.
>
> That and include/memalign.h , which contains macros that are used to align
> variables.

Yes. It is malloc_cache_aligned(), which should be used to allocate DMA 
buffer. Thank you for the pointer.

>
>>> It is even worse if the cache flush operators permit incorrect cache
>>> flushes or invalidations. Like I mentioned before, this can lead to hard
>>> to debug problems when using DMA (at least on ARM).
>>
>> I would suggest debug check should be left as for debug only. The
>> definition of common functions should be kept as it is more important
>> than coding style.
>
> Uh yes, that's what arm926 cache functions do, they're debug only.
>
>> I debugged DMA issues a lot in the past until I realized the importance
>> of aligned buffers. So there should be a developer's guideline.
>
> For what exactly?

For u-boot, every DMA buffer must be allocated with 
malloc_cache_aligned(). Then there will be not variables and DMA buffers 
cache racing issues as you describe below.

>
>> But it is even much more difficult when something you believed does not
>> work as expected, what is taken as common sense. It will trap a lot of
>> developers when they called your flush cache functions but was skipped
>> just because, eg, the end of packets are not aligned which is usually
>> the case.
>
> This is good, it should bite them, because this is a bug. If, on the other
> hand, you will paper over such bugs by adding crap to the cache ops, there
> will be even worse bugs coming for you, like variables which are sitting in
> the same cacheline as your unaligned buffer that you want to invalidate or
> flush will possibly get trashed by such cache operation.
>
> Consider this:
>
> cacheline 0: [ variable A ; buffer B ......... ]
> cacheline 1: [ buffer B ......... ; Empty .... ]
>
> Now you do the following:
>
> 1) Variable A = 0;
> 2) Flush buffer B (which is unaligned, so flush cacheline 0 and 1)
> 3) Start DMA to buffer B
> 4) Variable A = 1;
> 5) Check if DMA finished, it did
> 6) Invalidate buffer B ... oh, but it's unaligned, let's invalidate
>     everything around it, which is cacheline 0 and 1.
> 7) What is the value of variable A ? Oh, it's fetched from memory and
>     it's 0 there, even though we did set it to 1 ...
>
>> I would suggest that, with the best of my knowledge, please change the
>> range check to a debug probe, and restore the cache flush functions to
>> the common definition.
>
> See above, does my example make it clear why we should never ever hide
> bugs in the cache ops code ?

It is the drivers' responsibility to follow the guide line above. If 
there is such a bug, it is not the cache flush ops bug. It is a driver's 
bug. You may add a probe to show the bug from caller, but you may not 
call it a bug of cache ops and skip the flush. Given that it is quite 
common that the return of such cache ops is not checked, few (if not 
none) will ever know that the flush was skipped.

Best regards,
Thomas
Marek Vasut Oct. 12, 2015, 10:30 a.m. UTC | #13
On Monday, October 12, 2015 at 02:34:16 AM, Thomas Chou wrote:
> Hi Marek,

Hi!

> On 10/11/2015 08:15 PM, Marek Vasut wrote:
> > On Sunday, October 11, 2015 at 02:38:35 AM, Thomas Chou wrote:
> >> Hi Marek,
> > 
> > Hi,
> > 
> >> On 10/11/2015 02:18 AM, Marek Vasut wrote:
> >>> Then you'd also need means to allocate variables to aligned memory
> >>> location to prevent invalid cache flush. (Linux does this with it's DMA
> >>> API). We are much simpler and thus this abstraction is still not
> >>> available. I wonder if the overhead of DMA API would be high or not for
> >>> U-Boot.
> >> 
> >> I see most people use memalign(ARCH_DMA_MINALIGN, len) in u-boot to
> >> allocate DMA buffers so that they are cache aligned.
> > 
> > That and include/memalign.h , which contains macros that are used to
> > align variables.
> 
> Yes. It is malloc_cache_aligned(), which should be used to allocate DMA
> buffer. Thank you for the pointer.

There are also DEFINE_CACHE_ALIGN_BUFFER() and ALLOC_CACHE_ALIGN_BUFFER()
macros which can be used to allocate such stuff on stack. And you sometimes
do want to allocate things on stack instead of using malloc().

> >>> It is even worse if the cache flush operators permit incorrect cache
> >>> flushes or invalidations. Like I mentioned before, this can lead to
> >>> hard to debug problems when using DMA (at least on ARM).
> >> 
> >> I would suggest debug check should be left as for debug only. The
> >> definition of common functions should be kept as it is more important
> >> than coding style.
> > 
> > Uh yes, that's what arm926 cache functions do, they're debug only.
> > 
> >> I debugged DMA issues a lot in the past until I realized the importance
> >> of aligned buffers. So there should be a developer's guideline.
> > 
> > For what exactly?
> 
> For u-boot, every DMA buffer must be allocated with
> malloc_cache_aligned(). Then there will be not variables and DMA buffers
> cache racing issues as you describe below.

Sometimes you might want to allocate DMA buffers on stack, for example if
you don't have mallocator running yet or if it's more convenient for some
reason. So forcing everyone to allocate DMA buffers using malloc is not
gonna slide I'm afraid.

> >> But it is even much more difficult when something you believed does not
> >> work as expected, what is taken as common sense. It will trap a lot of
> >> developers when they called your flush cache functions but was skipped
> >> just because, eg, the end of packets are not aligned which is usually
> >> the case.
> > 
> > This is good, it should bite them, because this is a bug. If, on the
> > other hand, you will paper over such bugs by adding crap to the cache
> > ops, there will be even worse bugs coming for you, like variables which
> > are sitting in the same cacheline as your unaligned buffer that you want
> > to invalidate or flush will possibly get trashed by such cache
> > operation.
> > 
> > Consider this:
> > 
> > cacheline 0: [ variable A ; buffer B ......... ]
> > cacheline 1: [ buffer B ......... ; Empty .... ]
> > 
> > Now you do the following:
> > 
> > 1) Variable A = 0;
> > 2) Flush buffer B (which is unaligned, so flush cacheline 0 and 1)
> > 3) Start DMA to buffer B
> > 4) Variable A = 1;
> > 5) Check if DMA finished, it did
> > 6) Invalidate buffer B ... oh, but it's unaligned, let's invalidate
> > 
> >     everything around it, which is cacheline 0 and 1.
> > 
> > 7) What is the value of variable A ? Oh, it's fetched from memory and
> > 
> >     it's 0 there, even though we did set it to 1 ...
> >> 
> >> I would suggest that, with the best of my knowledge, please change the
> >> range check to a debug probe, and restore the cache flush functions to
> >> the common definition.
> > 
> > See above, does my example make it clear why we should never ever hide
> > bugs in the cache ops code ?
> 
> It is the drivers' responsibility to follow the guide line above. If
> there is such a bug, it is not the cache flush ops bug. It is a driver's
> bug. You may add a probe to show the bug from caller, but you may not
> call it a bug of cache ops and skip the flush. Given that it is quite
> common that the return of such cache ops is not checked, few (if not
> none) will ever know that the flush was skipped.

The cache flush ops is the best place to scream death and murder if someone
tries such unaligned cache operation, so maybe you should even do a printf()
there to weed such crappy drivers out for the 2016.01 release.

I agree it's the responsibility of the driver, so if the driver doesn't do
things right, it's a bug and the behavior of cache ops is undefined, which
might as well be that we do the safer thing here and flush nothing.

Best regards,
Marek Vasut
Thomas Chou Oct. 12, 2015, 1:12 p.m. UTC | #14
Hi Marek,

On 10/12/2015 06:30 PM, Marek Vasut wrote:
> There are also DEFINE_CACHE_ALIGN_BUFFER() and ALLOC_CACHE_ALIGN_BUFFER()
> macros which can be used to allocate such stuff on stack. And you sometimes
> do want to allocate things on stack instead of using malloc().

Thanks for sharing this.

> Sometimes you might want to allocate DMA buffers on stack, for example if
> you don't have mallocator running yet or if it's more convenient for some
> reason. So forcing everyone to allocate DMA buffers using malloc is not
> gonna slide I'm afraid.

The same rule can be applied to buffer allocated on stack, with the 
macro you mentioned above. In all, cache line aware allocation on heap 
or on stack must be used for DMA buffer.

> The cache flush ops is the best place to scream death and murder if someone
> tries such unaligned cache operation, so maybe you should even do a printf()
> there to weed such crappy drivers out for the 2016.01 release.
>
> I agree it's the responsibility of the driver, so if the driver doesn't do
> things right, it's a bug and the behavior of cache ops is undefined, which
> might as well be that we do the safer thing here and flush nothing.

It won't be safer to flush nothing. Sooner or later the cache will be 
flushed due to data access, even if the cache flush ops is skip.

To solve problem like this, the only solution is to enforce the rule to 
allocate DMA buffer. It is wrong to skip the flush.

Best regards,
Thomas
Marek Vasut Oct. 12, 2015, 1:29 p.m. UTC | #15
On Monday, October 12, 2015 at 03:12:18 PM, Thomas Chou wrote:
> Hi Marek,
> 
> On 10/12/2015 06:30 PM, Marek Vasut wrote:
> > There are also DEFINE_CACHE_ALIGN_BUFFER() and ALLOC_CACHE_ALIGN_BUFFER()
> > macros which can be used to allocate such stuff on stack. And you
> > sometimes do want to allocate things on stack instead of using malloc().
> 
> Thanks for sharing this.
> 
> > Sometimes you might want to allocate DMA buffers on stack, for example if
> > you don't have mallocator running yet or if it's more convenient for some
> > reason. So forcing everyone to allocate DMA buffers using malloc is not
> > gonna slide I'm afraid.
> 
> The same rule can be applied to buffer allocated on stack, with the
> macro you mentioned above. In all, cache line aware allocation on heap
> or on stack must be used for DMA buffer.

That's correct, they must be used. But sadly, this is not yet the case in
all the drivers, which we need to rectify. And how best to rectify this
than to scream when someone does such a thing, right ?

> > The cache flush ops is the best place to scream death and murder if
> > someone tries such unaligned cache operation, so maybe you should even
> > do a printf() there to weed such crappy drivers out for the 2016.01
> > release.
> > 
> > I agree it's the responsibility of the driver, so if the driver doesn't
> > do things right, it's a bug and the behavior of cache ops is undefined,
> > which might as well be that we do the safer thing here and flush
> > nothing.
> 
> It won't be safer to flush nothing. Sooner or later the cache will be
> flushed due to data access, even if the cache flush ops is skip.

That is bad bad bad, that's even nastier. We really need to fix the drivers,
not paper over it in the cache ops.

> To solve problem like this, the only solution is to enforce the rule to
> allocate DMA buffer. It is wrong to skip the flush.

I absolutelly agree we need aligned allocations for DMA memory areas. But,
we also shouldn't hide bugs. And I believe aligning the incorrect arguments
to cache functions is not the way to go. We should check the arguments and
if someone tries an unaligned cache op, we should scream. What do you think?

btw. I think you won't get way too many cache warnings nowadays and we can
fix those few remaining way before the 2016.01 is out.

Best regards,
Marek Vasut
Wolfgang Denk Oct. 12, 2015, 1:49 p.m. UTC | #16
Dear Marek,

In message <201510121529.45730.marex@denx.de> you wrote:
>
> That is bad bad bad, that's even nastier. We really need to fix the drivers,
> not paper over it in the cache ops.

Full ACK here.

> > To solve problem like this, the only solution is to enforce the rule to
> > allocate DMA buffer. It is wrong to skip the flush.
> 
> I absolutelly agree we need aligned allocations for DMA memory areas. But,
> we also shouldn't hide bugs. And I believe aligning the incorrect arguments
> to cache functions is not the way to go. We should check the arguments and
> if someone tries an unaligned cache op, we should scream. What do you think?

Again, full ACK.

We should make sure we get clear, unmistakable error messages for such
bugs, and not silent non-deterministic behaviour.

Best regards,

Wolfgang Denk
Thomas Chou Oct. 13, 2015, 1:04 a.m. UTC | #17
Hi Marek,

On 10/12/2015 09:29 PM, Marek Vasut wrote:
> On Monday, October 12, 2015 at 03:12:18 PM, Thomas Chou wrote:
>> Hi Marek,
>>
>> On 10/12/2015 06:30 PM, Marek Vasut wrote:
>>> There are also DEFINE_CACHE_ALIGN_BUFFER() and ALLOC_CACHE_ALIGN_BUFFER()
>>> macros which can be used to allocate such stuff on stack. And you
>>> sometimes do want to allocate things on stack instead of using malloc().
>>
>> Thanks for sharing this.
>>
>>> Sometimes you might want to allocate DMA buffers on stack, for example if
>>> you don't have mallocator running yet or if it's more convenient for some
>>> reason. So forcing everyone to allocate DMA buffers using malloc is not
>>> gonna slide I'm afraid.
>>
>> The same rule can be applied to buffer allocated on stack, with the
>> macro you mentioned above. In all, cache line aware allocation on heap
>> or on stack must be used for DMA buffer.
>
> That's correct, they must be used. But sadly, this is not yet the case in
> all the drivers, which we need to rectify. And how best to rectify this
> than to scream when someone does such a thing, right ?
>

Given that there are not so many drivers using DMA in u-boot, as 
compared to Linux. I would suggest we can walk through and rectify the 
allocation of DMA buffers.

>>> The cache flush ops is the best place to scream death and murder if
>>> someone tries such unaligned cache operation, so maybe you should even
>>> do a printf() there to weed such crappy drivers out for the 2016.01
>>> release.
>>>
>>> I agree it's the responsibility of the driver, so if the driver doesn't
>>> do things right, it's a bug and the behavior of cache ops is undefined,
>>> which might as well be that we do the safer thing here and flush
>>> nothing.
>>
>> It won't be safer to flush nothing. Sooner or later the cache will be
>> flushed due to data access, even if the cache flush ops is skip.
>
> That is bad bad bad, that's even nastier. We really need to fix the drivers,
> not paper over it in the cache ops.

I know how bad it is, with over 35 years work with DMA. grin..

>
>> To solve problem like this, the only solution is to enforce the rule to
>> allocate DMA buffer. It is wrong to skip the flush.
>
> I absolutelly agree we need aligned allocations for DMA memory areas. But,
> we also shouldn't hide bugs. And I believe aligning the incorrect arguments
> to cache functions is not the way to go. We should check the arguments and
> if someone tries an unaligned cache op, we should scream. What do you think?
>
> btw. I think you won't get way too many cache warnings nowadays and we can
> fix those few remaining way before the 2016.01 is out.

I would suggest the "cache alignment check and skip" be removed from 
cache flush ops, and say out the DMA buffer allocation rule loudly in 
README, and enforce it by guardianship. Please allow me to restate the 
reasons,

1. The cache flush ops are commonly used. Please refer to the "Cache and 
TLB Flushing Under Linux" doc, linux/Documentation/cachetlb.txt. 
Violating the defined interface is much worse than violating coding 
style. It will certainly impact the portability of u-boot. And might 
introduce more bug than resolve.

2. We all agree that enforcing DMA buffer allocation to cache aligned is 
the only real solution. Adding such "check and skip" to cache flush ops 
cannot prevent the flush or solve the problem.

3. Though the flush size of block device are usually aligned, the size 
of packet are not. Asking the packet drivers to adjust the flush size 
does not make sense. It is the job of cache flush ops. The debug probe 
should not override the original purpose. It should be spelled for 
common understanding.

It is free to your consideration. As it is free and open software. :)

Best regards,
Thomas
Marek Vasut Oct. 16, 2015, 11:03 p.m. UTC | #18
On Tuesday, October 13, 2015 at 03:04:44 AM, Thomas Chou wrote:
> Hi Marek,

Hi!

> On 10/12/2015 09:29 PM, Marek Vasut wrote:
> > On Monday, October 12, 2015 at 03:12:18 PM, Thomas Chou wrote:
> >> Hi Marek,
> >> 
> >> On 10/12/2015 06:30 PM, Marek Vasut wrote:
> >>> There are also DEFINE_CACHE_ALIGN_BUFFER() and
> >>> ALLOC_CACHE_ALIGN_BUFFER() macros which can be used to allocate such
> >>> stuff on stack. And you sometimes do want to allocate things on stack
> >>> instead of using malloc().
> >> 
> >> Thanks for sharing this.
> >> 
> >>> Sometimes you might want to allocate DMA buffers on stack, for example
> >>> if you don't have mallocator running yet or if it's more convenient
> >>> for some reason. So forcing everyone to allocate DMA buffers using
> >>> malloc is not gonna slide I'm afraid.
> >> 
> >> The same rule can be applied to buffer allocated on stack, with the
> >> macro you mentioned above. In all, cache line aware allocation on heap
> >> or on stack must be used for DMA buffer.
> > 
> > That's correct, they must be used. But sadly, this is not yet the case in
> > all the drivers, which we need to rectify. And how best to rectify this
> > than to scream when someone does such a thing, right ?
> 
> Given that there are not so many drivers using DMA in u-boot, as
> compared to Linux. I would suggest we can walk through and rectify the
> allocation of DMA buffers.

That's what we're pretty much trying to do -- fix all the DMA-using drivers
to behave correctly.

> >>> The cache flush ops is the best place to scream death and murder if
> >>> someone tries such unaligned cache operation, so maybe you should even
> >>> do a printf() there to weed such crappy drivers out for the 2016.01
> >>> release.
> >>> 
> >>> I agree it's the responsibility of the driver, so if the driver doesn't
> >>> do things right, it's a bug and the behavior of cache ops is undefined,
> >>> which might as well be that we do the safer thing here and flush
> >>> nothing.
> >> 
> >> It won't be safer to flush nothing. Sooner or later the cache will be
> >> flushed due to data access, even if the cache flush ops is skip.
> > 
> > That is bad bad bad, that's even nastier. We really need to fix the
> > drivers, not paper over it in the cache ops.
> 
> I know how bad it is, with over 35 years work with DMA. grin..

I can feel your pain here ;-/

> >> To solve problem like this, the only solution is to enforce the rule to
> >> allocate DMA buffer. It is wrong to skip the flush.
> > 
> > I absolutelly agree we need aligned allocations for DMA memory areas.
> > But, we also shouldn't hide bugs. And I believe aligning the incorrect
> > arguments to cache functions is not the way to go. We should check the
> > arguments and if someone tries an unaligned cache op, we should scream.
> > What do you think?
> > 
> > btw. I think you won't get way too many cache warnings nowadays and we
> > can fix those few remaining way before the 2016.01 is out.
> 
> I would suggest the "cache alignment check and skip" be removed from
> cache flush ops, and say out the DMA buffer allocation rule loudly in
> README, and enforce it by guardianship.

What exactly do you envision by this "guardianship" ?

> Please allow me to restate the reasons,
> 
> 1. The cache flush ops are commonly used. Please refer to the "Cache and
> TLB Flushing Under Linux" doc, linux/Documentation/cachetlb.txt.
> Violating the defined interface is much worse than violating coding
> style. It will certainly impact the portability of u-boot. And might
> introduce more bug than resolve.

I agree with this one.

> 2. We all agree that enforcing DMA buffer allocation to cache aligned is
> the only real solution. Adding such "check and skip" to cache flush ops
> cannot prevent the flush or solve the problem.

We should probably check-scream-skip here.

> 3. Though the flush size of block device are usually aligned, the size
> of packet are not. Asking the packet drivers to adjust the flush size
> does not make sense. It is the job of cache flush ops. The debug probe
> should not override the original purpose. It should be spelled for
> common understanding.

The socket buffer(s) should be aligned, so network packets should be fine.

> It is free to your consideration. As it is free and open software. :)
> 
> Best regards,
> Thomas
Thomas Chou Oct. 17, 2015, 3:22 a.m. UTC | #19
Hi Marek,

On 10/17/2015 07:03 AM, Marek Vasut wrote:
>> I would suggest the "cache alignment check and skip" be removed from
>> cache flush ops, and say out the DMA buffer allocation rule loudly in
>> README, and enforce it by guardianship.
>
> What exactly do you envision by this "guardianship" ?

I mean the reviews of custodians.

>
>> Please allow me to restate the reasons,
>>
>> 1. The cache flush ops are commonly used. Please refer to the "Cache and
>> TLB Flushing Under Linux" doc, linux/Documentation/cachetlb.txt.
>> Violating the defined interface is much worse than violating coding
>> style. It will certainly impact the portability of u-boot. And might
>> introduce more bug than resolve.
>
> I agree with this one.
>
>> 2. We all agree that enforcing DMA buffer allocation to cache aligned is
>> the only real solution. Adding such "check and skip" to cache flush ops
>> cannot prevent the flush or solve the problem.
>
> We should probably check-scream-skip here.
>
>> 3. Though the flush size of block device are usually aligned, the size
>> of packet are not. Asking the packet drivers to adjust the flush size
>> does not make sense. It is the job of cache flush ops. The debug probe
>> should not override the original purpose. It should be spelled for
>> common understanding.
>
> The socket buffer(s) should be aligned, so network packets should be fine.

While the start of socket buffer might be aligned, the size of the 
transfer might not for the send ops. It is depended on the net/tcp/ip 
packets size.

For example, with tftp, there is a lot of unaligned end of packets.

tftp d1000000 u-boot-dtb.bin

flush unaligned d7ff7020-d7ff704e
[repeat ..]

So, such an alarm may be false. And such a skip can be bug.

In fact, for my own projects, I have changed the memory allocation to 
always cache aligned. And I rarely worry about it ever after.

I look at the net.c of u-boot. There are packets buffer allocated on BSS 
and stack. I would suggest avoid such programming, and use aligned 
memory allocation stead.

Best regards,
Thomas
Marek Vasut Oct. 17, 2015, 11:44 a.m. UTC | #20
On Saturday, October 17, 2015 at 05:22:41 AM, Thomas Chou wrote:
> Hi Marek,

Hi!

> On 10/17/2015 07:03 AM, Marek Vasut wrote:
> >> I would suggest the "cache alignment check and skip" be removed from
> >> cache flush ops, and say out the DMA buffer allocation rule loudly in
> >> README, and enforce it by guardianship.
> > 
> > What exactly do you envision by this "guardianship" ?
> 
> I mean the reviews of custodians.

Wouldn't an automated check be better ? It's easier and costs almost nothing.
Besides, custodians are not perfect and cannot detect all the issues.

> >> Please allow me to restate the reasons,
> >> 
> >> 1. The cache flush ops are commonly used. Please refer to the "Cache and
> >> TLB Flushing Under Linux" doc, linux/Documentation/cachetlb.txt.
> >> Violating the defined interface is much worse than violating coding
> >> style. It will certainly impact the portability of u-boot. And might
> >> introduce more bug than resolve.
> > 
> > I agree with this one.
> > 
> >> 2. We all agree that enforcing DMA buffer allocation to cache aligned is
> >> the only real solution. Adding such "check and skip" to cache flush ops
> >> cannot prevent the flush or solve the problem.
> > 
> > We should probably check-scream-skip here.
> > 
> >> 3. Though the flush size of block device are usually aligned, the size
> >> of packet are not. Asking the packet drivers to adjust the flush size
> >> does not make sense. It is the job of cache flush ops. The debug probe
> >> should not override the original purpose. It should be spelled for
> >> common understanding.
> > 
> > The socket buffer(s) should be aligned, so network packets should be
> > fine.
> 
> While the start of socket buffer might be aligned, the size of the
> transfer might not for the send ops. It is depended on the net/tcp/ip
> packets size.

Aurgh :-( Now I see what you mean. This is purely bad, very bad. Here is
a real possibility for corruption of variables close to the allocated DMA
buffer, right ?

> For example, with tftp, there is a lot of unaligned end of packets.
> 
> tftp d1000000 u-boot-dtb.bin
> 
> flush unaligned d7ff7020-d7ff704e
> [repeat ..]
> 
> So, such an alarm may be false. And such a skip can be bug.
> 
> In fact, for my own projects, I have changed the memory allocation to
> always cache aligned. And I rarely worry about it ever after.
> 
> I look at the net.c of u-boot. There are packets buffer allocated on BSS
> and stack. I would suggest avoid such programming, and use aligned
> memory allocation stead.

The stack allocation there is used because it's slightly faster and you don't
need mallocator for that. I guess this is a topic for a broader discussion and
we should include Tom and others into it. Would you mind starting another thread
on the ML and CCing me, Tom Rini, Simon Glass etc please ?

Best regards,
Marek Vasut
diff mbox

Patch

diff --git a/arch/nios2/cpu/cpu.c b/arch/nios2/cpu/cpu.c
index 5403c0d..8607c95 100644
--- a/arch/nios2/cpu/cpu.c
+++ b/arch/nios2/cpu/cpu.c
@@ -29,21 +29,6 @@  int do_reset(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[])
 	return 0;
 }
 
-int dcache_status(void)
-{
-	return 1;
-}
-
-void dcache_enable(void)
-{
-	flush_dcache(CONFIG_SYS_DCACHE_SIZE, CONFIG_SYS_DCACHELINE_SIZE);
-}
-
-void dcache_disable(void)
-{
-	flush_dcache(CONFIG_SYS_DCACHE_SIZE, CONFIG_SYS_DCACHELINE_SIZE);
-}
-
 /*
  * COPY EXCEPTION TRAMPOLINE -- copy the tramp to the
  * exception address. Define CONFIG_ROM_STUBS to prevent
diff --git a/arch/nios2/include/asm/cache.h b/arch/nios2/include/asm/cache.h
index 9b87c9f..dde43cd 100644
--- a/arch/nios2/include/asm/cache.h
+++ b/arch/nios2/include/asm/cache.h
@@ -8,18 +8,11 @@ 
 #ifndef __ASM_NIOS2_CACHE_H_
 #define __ASM_NIOS2_CACHE_H_
 
-extern void flush_dcache (unsigned long start, unsigned long size);
-extern void flush_icache (unsigned long start, unsigned long size);
-
 /*
- * Valid L1 data cache line sizes for the NIOS2 architecture are 4, 16, and 32
- * bytes.  If the board configuration has not specified one we default to the
- * largest of these values for alignment of DMA buffers.
+ * Valid L1 data cache line sizes for the NIOS2 architecture are 4,
+ * 16, and 32 bytes. We default to the largest of these values for
+ * alignment of DMA buffers.
  */
-#ifdef CONFIG_SYS_CACHELINE_SIZE
-#define ARCH_DMA_MINALIGN	CONFIG_SYS_CACHELINE_SIZE
-#else
 #define ARCH_DMA_MINALIGN	32
-#endif
 
 #endif /* __ASM_NIOS2_CACHE_H_ */
diff --git a/arch/nios2/lib/bootm.c b/arch/nios2/lib/bootm.c
index c730a3f..4e5c269 100644
--- a/arch/nios2/lib/bootm.c
+++ b/arch/nios2/lib/bootm.c
@@ -6,9 +6,6 @@ 
  */
 
 #include <common.h>
-#include <command.h>
-#include <asm/byteorder.h>
-#include <asm/cache.h>
 
 #define NIOS_MAGIC 0x534f494e /* enable command line and initrd passing */
 
@@ -40,8 +37,7 @@  int do_bootm_linux(int flag, int argc, char * const argv[], bootm_headers_t *ima
 
 	/* flushes data and instruction caches before calling the kernel */
 	disable_interrupts();
-	flush_dcache((ulong)kernel, CONFIG_SYS_DCACHE_SIZE);
-	flush_icache((ulong)kernel, CONFIG_SYS_ICACHE_SIZE);
+	flush_dcache_all();
 
 	debug("bootargs=%s @ 0x%lx\n", commandline, (ulong)&commandline);
 	debug("initrd=0x%lx-0x%lx\n", (ulong)initrd_start, (ulong)initrd_end);
diff --git a/arch/nios2/lib/cache.S b/arch/nios2/lib/cache.S
deleted file mode 100644
index 683f005..0000000
--- a/arch/nios2/lib/cache.S
+++ /dev/null
@@ -1,68 +0,0 @@ 
-/*
- * (C) Copyright 2004, Psyent Corporation <www.psyent.com>
- * Scott McNutt <smcnutt@psyent.com>
- *
- * SPDX-License-Identifier:	GPL-2.0+
- */
-
-#include <config.h>
-
-	.text
-
-	.global flush_dcache
-
-flush_dcache:
-	add	r5, r5, r4
-	movhi	r8, %hi(CONFIG_SYS_DCACHELINE_SIZE)
-	ori	r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE)
-0:	flushd	0(r4)
-	add	r4, r4, r8
-	bltu	r4, r5, 0b
-	ret
-
-
-	.global flush_icache
-
-flush_icache:
-	add	r5, r5, r4
-	movhi	r8, %hi(CONFIG_SYS_ICACHELINE_SIZE)
-	ori	r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE)
-1:	flushi	r4
-	add	r4, r4, r8
-	bltu	r4, r5, 1b
-	ret
-
-	.global flush_dcache_range
-
-flush_dcache_range:
-	movhi	r8, %hi(CONFIG_SYS_DCACHELINE_SIZE)
-	ori	r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE)
-0:	flushd	0(r4)
-	add	r4, r4, r8
-	bltu	r4, r5, 0b
-	ret
-
-	.global flush_cache
-
-flush_cache:
-	add	r5, r5, r4
-	mov	r9, r4
-	mov	r10, r5
-
-	movhi	r8, %hi(CONFIG_SYS_DCACHELINE_SIZE)
-	ori	r8, r8, %lo(CONFIG_SYS_DCACHELINE_SIZE)
-0:	flushd	0(r4)
-	add	r4, r4, r8
-	bltu	r4, r5, 0b
-
-	mov	r4, r9
-	mov	r5, r10
-	movhi	r8, %hi(CONFIG_SYS_ICACHELINE_SIZE)
-	ori	r8, r8, %lo(CONFIG_SYS_ICACHELINE_SIZE)
-1:	flushi	r4
-	add	r4, r4, r8
-	bltu	r4, r5, 1b
-
-	sync
-	flushp
-	ret
diff --git a/arch/nios2/lib/cache.c b/arch/nios2/lib/cache.c
new file mode 100644
index 0000000..6f26d8d
--- /dev/null
+++ b/arch/nios2/lib/cache.c
@@ -0,0 +1,84 @@ 
+/*
+ * Copyright (C) 2015 Thomas Chou <thomas@wytron.com.tw>
+ * Copyright (C) 2009, Wind River Systems Inc
+ * Implemented by fredrik.markstrom@gmail.com and ivarholmqvist@gmail.com
+ *
+ * SPDX-License-Identifier:	GPL-2.0+
+ */
+
+#include <common.h>
+#include <asm/cache.h>
+
+DECLARE_GLOBAL_DATA_PTR;
+
+static void __flush_dcache_all(unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+
+	start &= ~(gd->arch.dcache_line_size - 1);
+	end += (gd->arch.dcache_line_size - 1);
+	end &= ~(gd->arch.dcache_line_size - 1);
+
+	if (end > start + gd->arch.dcache_size)
+		end = start + gd->arch.dcache_size;
+
+	for (addr = start; addr < end; addr += gd->arch.dcache_line_size) {
+		__asm__ __volatile__ ("   flushd 0(%0)\n"
+					: /* Outputs */
+					: /* Inputs  */ "r"(addr)
+					/* : No clobber */);
+	}
+}
+
+static void __flush_icache(unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+
+	start &= ~(gd->arch.icache_line_size - 1);
+	end += (gd->arch.icache_line_size - 1);
+	end &= ~(gd->arch.icache_line_size - 1);
+
+	if (end > start + gd->arch.icache_size)
+		end = start + gd->arch.icache_size;
+
+	for (addr = start; addr < end; addr += gd->arch.icache_line_size) {
+		__asm__ __volatile__ ("   flushi %0\n"
+					: /* Outputs */
+					: /* Inputs  */ "r"(addr)
+					/* : No clobber */);
+	}
+	__asm__ __volatile(" flushp\n");
+}
+
+void flush_dcache_all(void)
+{
+	__flush_dcache_all(0, gd->arch.dcache_size);
+	__flush_icache(0, gd->arch.icache_size);
+}
+
+void flush_dcache_range(unsigned long start, unsigned long end)
+{
+	__flush_dcache_all(start, end);
+}
+
+void flush_cache(unsigned long start, unsigned long size)
+{
+	__flush_dcache_all(start, start + size);
+	__flush_icache(start, start + size);
+}
+
+/* nios2 data cache is always enabled */
+int dcache_status(void)
+{
+	return 1;
+}
+
+void dcache_enable(void)
+{
+	flush_dcache_all();
+}
+
+void dcache_disable(void)
+{
+	flush_dcache_all();
+}