
[U-Boot,2/8] armv7: cache maintenance operations for armv7

Message ID: 1293018898-13253-3-git-send-email-aneesh@ti.com
State: Changes Requested
Delegated to: Albert ARIBAUD

Commit Message

Aneesh V Dec. 22, 2010, 11:54 a.m. UTC
- Add a framework for layered cache maintenance
	- separate out SOC specific outer cache maintenance from
	  maintenance of caches known to CPU

- Add generic ARMv7 cache maintenance operations that affect all
  caches known to ARMv7 CPUs. For instance, in Cortex-A8 these
  operations will affect both L1 and L2 caches. In Cortex-A9
  they will affect only the L1 cache

- D-cache operations supported:
	- Invalidate entire D-cache
	- Invalidate D-cache range
	- Flush (clean & invalidate) entire D-cache
	- Flush D-cache range
- I-cache operations supported:
	- Invalidate entire I-cache

- Add maintenance functions for TLB, branch predictor array etc.

- Enable -march=armv7-a so that armv7 assembly instructions can be
  used

Signed-off-by: Aneesh V <aneesh@ti.com>
---
 arch/arm/cpu/armv7/Makefile   |    2 +-
 arch/arm/cpu/armv7/cache_v7.c |  359 +++++++++++++++++++++++++++++++++++++++++
 arch/arm/cpu/armv7/config.mk  |    2 +-
 arch/arm/include/asm/armv7.h  |   63 +++++++
 include/common.h              |    5 +-
 5 files changed, 428 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm/cpu/armv7/cache_v7.c
 create mode 100644 arch/arm/include/asm/armv7.h

Comments

Albert ARIBAUD Jan. 8, 2011, 6:36 a.m. UTC | #1
Hi Aneesh,

On 22/12/2010 12:54, Aneesh V wrote:
> - Add a framework for layered cache maintenance
> 	- separate out SOC specific outer cache maintenance from
> 	  maintenance of caches known to CPU
>
> - Add generic ARMv7 cache maintenance operations that affect all
>    caches known to ARMv7 CPUs. For instance in Cortex-A8 these
>    opertions will affect both L1 and L2 caches. In Cortex-A9
>    these will affect only L1 cache
>
> - D-cache operations supported:
> 	- Invalidate entire D-cache
> 	- Invalidate D-cache range
> 	- Flush(clean&  invalidate) entire D-cache
> 	- Flush D-cache range
> - I-cache operations supported:
> 	- Invalidate entire I-cache
>
> - Add maintenance functions for TLB, branch predictor array etc.
>
> - Enable -march=armv7-a so that armv7 assembly instructions can be
>    used
>
> Signed-off-by: Aneesh V<aneesh@ti.com>
> ---
>   arch/arm/cpu/armv7/Makefile   |    2 +-
>   arch/arm/cpu/armv7/cache_v7.c |  359 +++++++++++++++++++++++++++++++++++++++++
>   arch/arm/cpu/armv7/config.mk  |    2 +-
>   arch/arm/include/asm/armv7.h  |   63 +++++++
>   include/common.h              |    5 +-
>   5 files changed, 428 insertions(+), 3 deletions(-)
>   create mode 100644 arch/arm/cpu/armv7/cache_v7.c
>   create mode 100644 arch/arm/include/asm/armv7.h
>
> diff --git a/arch/arm/cpu/armv7/Makefile b/arch/arm/cpu/armv7/Makefile
> index 8c0e915..299792a 100644
> --- a/arch/arm/cpu/armv7/Makefile
> +++ b/arch/arm/cpu/armv7/Makefile
> @@ -26,7 +26,7 @@ include $(TOPDIR)/config.mk
>   LIB	= $(obj)lib$(CPU).o
>
>   START	:= start.o
> -COBJS	:= cpu.o
> +COBJS	:= cpu.o cache_v7.o
>   COBJS  += syslib.o
>
>   SRCS	:= $(START:.o=.S) $(COBJS:.o=.c)
> diff --git a/arch/arm/cpu/armv7/cache_v7.c b/arch/arm/cpu/armv7/cache_v7.c
> new file mode 100644
> index 0000000..0521d66
> --- /dev/null
> +++ b/arch/arm/cpu/armv7/cache_v7.c
> @@ -0,0 +1,359 @@
> +/*
> + * (C) Copyright 2010
> + * Texas Instruments Incorporated - http://www.ti.com/
> + * Aneesh V<aneesh@ti.com>
> + *
> + * See file CREDITS for list of people who contributed to this
> + * project.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation; either version 2 of
> + * the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
> + * MA 02111-1307 USA
> + */
> +#include<linux/types.h>
> +#include<common.h>
> +#include<asm/armv7.h>
> +
> +#define ARMV7_DCACHE_INVAL_ALL		1
> +#define ARMV7_DCACHE_CLEAN_INVAL_ALL	2
> +#define ARMV7_DCACHE_INVAL_RANGE	3
> +#define ARMV7_DCACHE_CLEAN_INVAL_RANGE	4
> +
> +struct v7_outer_cache_ops v7_outer_cache;
> +
> +#ifndef CONFIG_SYS_NO_DCACHE
> +/*
> + * Write the level and type you want to Cache Size Selection Register(CSSELR)
> + * to get size details from Current Cache Size ID Register(CCSIDR)
> + */
> +static void set_csselr(u32 level, u32 type)
> +{	u32 csselr = level<<  1 | type;
> +	/* Write to Cache Size Selection Register(CSSELR) */
> +	asm volatile ("mcr p15, 2, %0, c0, c0, 0" : : "r" (csselr));
> +}
> +
> +static u32 get_ccsidr(void)
> +{
> +	u32 ccsidr;
> +	/* Read current CP15 Cache Size ID Register */
> +	asm volatile ("mrc p15, 1, %0, c0, c0, 0" : "=r" (ccsidr));
> +	return ccsidr;
> +}
> +
> +static u32 get_clidr(void)
> +{
> +	u32 clidr;
> +	/* Read current CP15 Cache Level ID Register */
> +	asm volatile ("mrc p15,1,%0,c0,c0,1" : "=r" (clidr));
> +	return clidr;
> +}
> +
> +/* round up the input number to a power of 2 and get the log2 */
> +static inline u32 log_2_round_up(u32 num)
> +{
> +	/* count leading zeros */
> +	asm volatile ("CLZ %0, %0" : "+r" (num));
> +
> +	/* position of most significant 1 */
> +	num = 31 - num;
> +
> +	return num;
> +}
> +
> +static void v7_inval_dcache_level_setway(u32 level, u32 num_sets,
> +					 u32 num_ways, u32 way_shift,
> +					 u32 log2_line_len)
> +{
> +	int way, set, setway;
> +	/*
> +	 * For optimal assembly code:
> +	 *	a. count down
> +	 *	b. have bigger loop inside
> +	 */

Out of curiosity, can you elaborate on why the compiler would optimize 
better in these cases?

> +	for (way = num_ways - 1; way>= 0 ; way--)
> +		for (set = num_sets - 1; set>= 0; set--) {

Please fix the whitespacing around operators. The best way to "catch 'em 
all" is to run Linux's checkpatch.pl (I do this with option --no-tree) 
on all patches that you submit to U-Boot, fix all warnings and errors, 
and if some are left that you think should not be fixed, mention them 
and explain why they are wrongly emitted.
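
For instance, from the U-Boot top directory (the patch file name below is 
just an example):

  $ /path/to/linux/scripts/checkpatch.pl --no-tree \
        0002-armv7-cache-maintenance-operations-for-armv7.patch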

> +			setway = (level<<  1) | (set<<  log2_line_len) |
> +				 (way<<  way_shift);
> +			/* Invalidate data/unified cache line by set/way */
> +			asm volatile ("	mcr p15, 0, %0, c7, c6, 2"
> +					: : "r" (setway));
> +		}
> +	/* Make sure the operation is complete */
> +	asm volatile ("DMB");
> +}
> +
> +static void v7_clean_inval_dcache_level_setway(u32 level, u32 num_sets,
> +					       u32 num_ways, u32 way_shift,
> +					       u32 log2_line_len)
> +{
> +	int way, set, setway;
> +	/*
> +	 * For optimal assembly code:
> +	 *	a. count down
> +	 *	b. have bigger loop inside
> +	 */
> +	for (way = num_ways - 1; way>= 0 ; way--)
> +		for (set = num_sets - 1; set>= 0; set--) {
> +			setway = (level<<  1) | (set<<  log2_line_len) |
> +				 (way<<  way_shift);
> +			/*
> +			 * Clean&  Invalidate data/unified
> +			 * cache line by set/way
> +			 */
> +			asm volatile ("	mcr p15, 0, %0, c7, c14, 2"
> +					: : "r" (setway));
> +		}
> +	/* Make sure the operation is complete */
> +	asm volatile ("DMB");
> +}
> +
> +static void v7_maint_dcache_level_setway(u32 level, u32 operation)
> +{
> +	u32 ccsidr;
> +	u32 num_sets, num_ways, log2_line_len, log2_num_ways;
> +	u32 way_shift;
> +	set_csselr(level, ARMV7_CSSELR_IND_DATA_UNIFIED);
> +
> +	ccsidr = get_ccsidr();
> +
> +	log2_line_len = mask_n_get(ccsidr, 0, 2) + 2;
> +	/* Converting from words to bytes */
> +	log2_line_len += 2;
> +
> +	num_ways  = mask_n_get(ccsidr, 3, 12) + 1;
> +	num_sets  = mask_n_get(ccsidr, 13, 27) + 1;
> +	/*
> +	 * According to ARMv7 ARM number of sets and number of ways need
> +	 * not be a power of 2
> +	 */
> +	log2_num_ways = log_2_round_up(num_ways);
> +
> +	way_shift = (32 - log2_num_ways);
> +	if (operation == ARMV7_DCACHE_INVAL_ALL)
> +		v7_inval_dcache_level_setway(level, num_sets, num_ways,
> +				      way_shift, log2_line_len);
> +	else if (operation == ARMV7_DCACHE_CLEAN_INVAL_ALL)
> +		v7_clean_inval_dcache_level_setway(level, num_sets, num_ways,
> +						   way_shift, log2_line_len);
> +}
> +
> +static void v7_maint_dcache_all(u32 operation)
> +{
> +	u32 level, cache_type, level_start_bit = 0;
> +
> +	u32 clidr = get_clidr();
> +
> +	for (level = 0; level<  7; level++) {
> +		cache_type = mask_n_get(clidr, level_start_bit,
> +					level_start_bit + 2);
> +		if ((cache_type == ARMV7_CLIDR_CTYPE_DATA_ONLY) ||
> +		    (cache_type == ARMV7_CLIDR_CTYPE_INSTRUCTION_DATA) ||
> +		    (cache_type == ARMV7_CLIDR_CTYPE_UNIFIED))
> +			v7_maint_dcache_level_setway(level, operation);
> +		level_start_bit += 3;
> +	}
> +}
> +
> +static void v7_dcache_clean_inval_range(u32 start,
> +					u32 stop, u32 line_len)
> +{
> +	u32 mva;
> +	/* Align start to cache line boundary */
> +	start&= ~(line_len - 1);
> +	for (mva = start; mva<  stop; mva = mva + line_len)
> +		/* DCCIMVAC - Clean&  Invalidate data cache by MVA to PoC */
> +		asm volatile ("mcr p15, 0, %0, c7, c14, 1" : : "r" (mva));
> +}
> +
> +static void v7_dcache_inval_range(u32 start, u32 stop, u32 line_len)
> +{
> +	u32 mva;
> +
> +	/*
> +	 * If start address is not aligned to cache-line flush the first
> +	 * line to prevent affecting somebody else's buffer
> +	 */
> +	if (start&  (line_len - 1)) {
> +		v7_dcache_clean_inval_range(start, start + 1, line_len);
> +		/* move to next cache line */
> +		start = (start + line_len - 1)&  ~(line_len - 1);
> +	}
> +
> +	/*
> +	 * If stop address is not aligned to cache-line flush the last
> +	 * line to prevent affecting somebody else's buffer
> +	 */
> +	if (stop&  (line_len - 1)) {
> +		v7_dcache_clean_inval_range(stop, stop + 1, line_len);
> +		/* align to the beginning of this cache line */
> +		stop&= ~(line_len - 1);
> +	}
> +
> +	for (mva = start; mva<  stop; mva = mva + line_len)
> +		/* DCIMVAC - Invalidate data cache by MVA to PoC */
> +		asm volatile ("mcr p15, 0, %0, c7, c6, 1" : : "r" (mva));
> +}
> +
> +static void v7_dcache_maint_range(u32 start, u32 stop, u32 range_op)
> +{
> +	u32 line_len, ccsidr;
> +	ccsidr = get_ccsidr();
> +	line_len = mask_n_get(ccsidr, 0, 2) + 2;
> +	/* Converting from words to bytes */
> +	line_len += 2;
> +	/* converting from log2(linelen) to linelen */
> +	line_len = 1<<  line_len;
> +
> +	switch (range_op) {
> +	case ARMV7_DCACHE_CLEAN_INVAL_RANGE:
> +		v7_dcache_clean_inval_range(start, stop, line_len);
> +		break;
> +	case ARMV7_DCACHE_INVAL_RANGE:
> +		v7_dcache_inval_range(start, stop, line_len);
> +		break;
> +	}
> +
> +	/* Make sure the operation is complete */
> +	asm volatile ("DMB");
> +}
> +
> +/* Invalidate TLB */
> +static void v7_inval_tlb(void)
> +{
> +	/* Invalidate entire unified TLB */
> +	asm volatile ("mcr p15, 0, %0, c8, c7, 0" : : "r" (0));
> +	/* Invalidate entire data TLB */
> +	asm volatile ("mcr p15, 0, %0, c8, c6, 0" : : "r" (0));
> +	/* Invalidate entire instruction TLB */
> +	asm volatile ("mcr p15, 0, %0, c8, c5, 0" : : "r" (0));
> +	/* Full system DSB - make sure that the invalidation is complete */
> +	asm volatile ("DSB");
> +	/* Full system ISB - make sure the instruction stream sees it */
> +	asm volatile ("ISB");
> +}
> +
> +void invalidate_dcache_all(void)
> +{
> +	v7_maint_dcache_all(ARMV7_DCACHE_INVAL_ALL);
> +	if (v7_outer_cache.inval_all)
> +		v7_outer_cache.inval_all();

Why use pointers here rather than weak functions?

> +}
> +
> +/*
> + * Performs a clean&  invalidation of the entire data cache
> + * at all levels
> + */
> +void flush_dcache_all(void)
> +{
> +	v7_maint_dcache_all(ARMV7_DCACHE_CLEAN_INVAL_ALL);
> +	if (v7_outer_cache.flush_all)
> +		v7_outer_cache.flush_all();
> +}
> +
> +/*
> + * Invalidates range in all levels of D-cache/unified cache used:
> + * Affects the range [start, stop - 1]
> + */
> +void invalidate_dcache_range(unsigned long start, unsigned long stop)
> +{
> +
> +	v7_dcache_maint_range(start, stop, ARMV7_DCACHE_INVAL_RANGE);
> +	if (v7_outer_cache.inval_range)
> +		/* physical address is same as virtual address */
> +		v7_outer_cache.inval_range(start, stop);
> +}
> +
> +/*
> + * Flush range(clean&  invalidate) from all levels of D-cache/unified
> + * cache used:
> + * Affects the range [start, stop - 1]
> + */
> +void flush_dcache_range(unsigned long start, unsigned long stop)
> +{
> +	v7_dcache_maint_range(start, stop, ARMV7_DCACHE_CLEAN_INVAL_RANGE);
> +	if (v7_outer_cache.flush_range)
> +		/* physical address is same as virtual address */
> +		v7_outer_cache.flush_range(start, stop);
> +}
> +static void __v7_setup_outer_cache_ops(void)
> +{
> +	puts("v7_setup_outer_cache_ops: dummy implementation! "
> +	     "real implementation not available!!\n");
> +}
> +
> +void v7_setup_outer_cache_ops(void)
> +	__attribute__((weak, alias("__v7_setup_outer_cache_ops")));
> +
> +void arm_init_before_mmu(void)
> +{
> +	v7_setup_outer_cache_ops();
> +	if (v7_outer_cache.enable)
> +		v7_outer_cache.enable();
> +	invalidate_dcache_all();
> +	v7_inval_tlb();
> +}
> +#else
> +void invalidate_dcache_all(void)
> +{
> +}
> +
> +void flush_dcache_all(void)
> +{
> +}
> +
> +void invalidate_dcache_range(unsigned long start, unsigned long stop)
> +{
> +}
> +
> +void flush_dcache_range(unsigned long start, unsigned long stop)
> +{
> +}
> +
> +void arm_init_before_mmu(void)
> +{
> +}
> +#endif
> +
> +#ifndef CONFIG_SYS_NO_ICACHE
> +/* Invalidate entire I-cache and branch predictor array */
> +void invalidate_icache_all(void)
> +{
> +	/*
> +	 * Invalidate all instruction caches to PoU.
> +	 * Also flushes branch target cache.
> +	 */
> +	asm volatile ("mcr p15, 0, %0, c7, c5, 0" : : "r" (0));
> +
> +	/* Invalidate entire branch predictor array */
> +	asm volatile ("mcr p15, 0, %0, c7, c5, 6" : : "r" (0));
> +
> +	/* Full system DSB - make sure that the invalidation is complete */
> +	asm volatile ("DSB");
> +	/* Full system ISB - make sure the instruction stream sees it */
> +	asm volatile ("ISB");
> +}
> +#else
> +void invalidate_icache_all(void)
> +{
> +}
> +#endif
> +
> +/*
> + * Flush range from all levels of d-cache/unified-cache used:
> + * Affects the range [start, start + size - 1]
> + */
> +void  flush_cache(unsigned long start, unsigned long size)
> +{
> +	flush_dcache_range(start, start + size);
> +}

This function is the only one that is defined to something non-empty 
when CONFIG_SYS_NO_DCACHE is defined. Why is it not in the big #ifndef 
for dcache above?

> diff --git a/arch/arm/cpu/armv7/config.mk b/arch/arm/cpu/armv7/config.mk
> index 49ac9c7..7f9b171 100644
> --- a/arch/arm/cpu/armv7/config.mk
> +++ b/arch/arm/cpu/armv7/config.mk
> @@ -23,7 +23,7 @@
>   PLATFORM_RELFLAGS += -fno-common -ffixed-r8 -msoft-float
>
>   # Make ARMv5 to allow more compilers to work, even though its v7a.
> -PLATFORM_CPPFLAGS += -march=armv5
> +PLATFORM_CPPFLAGS += -march=armv7-a

Did you check that this does not break any board using armv7?

>   # =========================================================================
>   #
>   # Supply options according to compiler version
> diff --git a/arch/arm/include/asm/armv7.h b/arch/arm/include/asm/armv7.h
> new file mode 100644
> index 0000000..57409b6
> --- /dev/null
> +++ b/arch/arm/include/asm/armv7.h
> @@ -0,0 +1,63 @@
> +/*
> + * (C) Copyright 2010
> + * Texas Instruments Incorporated - http://www.ti.com/
> + *
> + * Aneesh V<aneesh@ti.com>
> + *
> + * See file CREDITS for list of people who contributed to this
> + * project.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation; either version 2 of
> + * the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
> + * MA 02111-1307 USA
> + */
> +#ifndef ARMV7_H
> +#define ARMV7_H
> +
> +#include<linux/types.h>
> +
> +/*
> + * Values for InD field in CSSELR
> + * Selects the type of cache
> + */
> +#define ARMV7_CSSELR_IND_DATA_UNIFIED	0
> +#define ARMV7_CSSELR_IND_INSTRUCTION	1
> +
> +/* Values for Ctype fields in CLIDR */
> +#define ARMV7_CLIDR_CTYPE_NO_CACHE		0
> +#define ARMV7_CLIDR_CTYPE_INSTRUCTION_ONLY	1
> +#define ARMV7_CLIDR_CTYPE_DATA_ONLY		2
> +#define ARMV7_CLIDR_CTYPE_INSTRUCTION_DATA	3
> +#define ARMV7_CLIDR_CTYPE_UNIFIED		4
> +
> +/* some utility macros */
> +#define mask(start, end) \
> +	(((1<<  ((end) - (start) + 1)) - 1)<<  (start))
> +
> +#define mask_n_get(reg, start, end) \
> +	(((reg)&  mask(start, end))>>  (start))

Seeing as these functions are only used in the ARMv7 cache C file, they 
should be moved there.

> +struct v7_outer_cache_ops {
> +	void (*enable)(void);
> +	void (*disable)(void);
> +	void (*flush_all)(void);
> +	void (*inval_all)(void);
> +	void (*flush_range)(u32 start, u32 end);
> +	void (*inval_range)(u32 start, u32 end);
> +};
> +
> +extern struct v7_outer_cache_ops v7_outer_cache;
> +
> +void v7_setup_outer_cache_ops(void);
> +#endif
> diff --git a/include/common.h b/include/common.h
> index 189ad81..d750ff9 100644
> --- a/include/common.h
> +++ b/include/common.h
> @@ -411,6 +411,7 @@ void	icache_disable(void);
>   int	dcache_status (void);
>   void	dcache_enable (void);
>   void	dcache_disable(void);
> +void	mmu_disable(void);
>   void	relocate_code (ulong, gd_t *, ulong) __attribute__ ((noreturn));
>   ulong	get_endaddr   (void);
>   void	trap_init     (ulong);
> @@ -603,9 +604,11 @@ ulong	video_setmem (ulong);
>
>   /* arch/$(ARCH)/lib/cache.c */
>   void	flush_cache   (unsigned long, unsigned long);
> +void	flush_dcache_all(void);
>   void	flush_dcache_range(unsigned long start, unsigned long stop);
>   void	invalidate_dcache_range(unsigned long start, unsigned long stop);
> -
> +void	invalidate_dcache_all(void);
> +void	invalidate_icache_all(void);
>
>   /* arch/$(ARCH)/lib/ticks.S */
>   unsigned long long get_ticks(void);

Regards,
Albert ARIBAUD Jan. 8, 2011, 8:40 a.m. UTC | #2
On 08/01/2011 07:36, Albert ARIBAUD wrote:

>> --- a/arch/arm/cpu/armv7/config.mk
>> +++ b/arch/arm/cpu/armv7/config.mk
>> @@ -23,7 +23,7 @@
>>    PLATFORM_RELFLAGS += -fno-common -ffixed-r8 -msoft-float
>>
>>    # Make ARMv5 to allow more compilers to work, even though its v7a.
>> -PLATFORM_CPPFLAGS += -march=armv5
>> +PLATFORM_CPPFLAGS += -march=armv7-a
>
> Did you check that this does not break any board using armv7?

Actually, I should have said "Did you check that it does not break 
building with any common toolchain?", considering the comment above the 
line. And indeed, it breaks the build with the ELDK 4.2 toolchain.

Regards,
Aneesh V Jan. 8, 2011, 10:06 a.m. UTC | #3
Hi Albert,

On Saturday 08 January 2011 12:06 PM, Albert ARIBAUD wrote:
> Hi Aneesh,
>
> On 22/12/2010 12:54, Aneesh V wrote:
>> - Add a framework for layered cache maintenance
>> 	- separate out SOC specific outer cache maintenance from
>> 	  maintenance of caches known to CPU
>>
>> - Add generic ARMv7 cache maintenance operations that affect all
>>     caches known to ARMv7 CPUs. For instance in Cortex-A8 these
>>     opertions will affect both L1 and L2 caches. In Cortex-A9
>>     these will affect only L1 cache
>>
>> - D-cache operations supported:
>> 	- Invalidate entire D-cache
>> 	- Invalidate D-cache range
>> 	- Flush(clean&   invalidate) entire D-cache
>> 	- Flush D-cache range
>> - I-cache operations supported:
>> 	- Invalidate entire I-cache
>>
>> - Add maintenance functions for TLB, branch predictor array etc.
>>
>> - Enable -march=armv7-a so that armv7 assembly instructions can be
>>     used
>>
>> Signed-off-by: Aneesh V<aneesh@ti.com>
>> ---
>>    arch/arm/cpu/armv7/Makefile   |    2 +-
>>    arch/arm/cpu/armv7/cache_v7.c |  359 +++++++++++++++++++++++++++++++++++++++++
>>    arch/arm/cpu/armv7/config.mk  |    2 +-
>>    arch/arm/include/asm/armv7.h  |   63 +++++++
>>    include/common.h              |    5 +-
>>    5 files changed, 428 insertions(+), 3 deletions(-)
>>    create mode 100644 arch/arm/cpu/armv7/cache_v7.c
>>    create mode 100644 arch/arm/include/asm/armv7.h
>>
>> diff --git a/arch/arm/cpu/armv7/Makefile b/arch/arm/cpu/armv7/Makefile
>> index 8c0e915..299792a 100644
>> --- a/arch/arm/cpu/armv7/Makefile
>> +++ b/arch/arm/cpu/armv7/Makefile
>> @@ -26,7 +26,7 @@ include $(TOPDIR)/config.mk
>>    LIB	= $(obj)lib$(CPU).o
>>
>>    START	:= start.o
>> -COBJS	:= cpu.o
>> +COBJS	:= cpu.o cache_v7.o
>>    COBJS  += syslib.o
>>
>>    SRCS	:= $(START:.o=.S) $(COBJS:.o=.c)
>> diff --git a/arch/arm/cpu/armv7/cache_v7.c b/arch/arm/cpu/armv7/cache_v7.c
>> new file mode 100644
>> index 0000000..0521d66
>> --- /dev/null
>> +++ b/arch/arm/cpu/armv7/cache_v7.c
>> @@ -0,0 +1,359 @@
>> +/*
>> + * (C) Copyright 2010
>> + * Texas Instruments Incorporated - http://www.ti.com/
>> + * Aneesh V<aneesh@ti.com>
>> + *
>> + * See file CREDITS for list of people who contributed to this
>> + * project.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License as
>> + * published by the Free Software Foundation; either version 2 of
>> + * the License, or (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
>> + * MA 02111-1307 USA
>> + */
>> +#include<linux/types.h>
>> +#include<common.h>
>> +#include<asm/armv7.h>
>> +
>> +#define ARMV7_DCACHE_INVAL_ALL		1
>> +#define ARMV7_DCACHE_CLEAN_INVAL_ALL	2
>> +#define ARMV7_DCACHE_INVAL_RANGE	3
>> +#define ARMV7_DCACHE_CLEAN_INVAL_RANGE	4
>> +
>> +struct v7_outer_cache_ops v7_outer_cache;
>> +
>> +#ifndef CONFIG_SYS_NO_DCACHE
>> +/*
>> + * Write the level and type you want to Cache Size Selection Register(CSSELR)
>> + * to get size details from Current Cache Size ID Register(CCSIDR)
>> + */
>> +static void set_csselr(u32 level, u32 type)
>> +{	u32 csselr = level<<   1 | type;
>> +	/* Write to Cache Size Selection Register(CSSELR) */
>> +	asm volatile ("mcr p15, 2, %0, c0, c0, 0" : : "r" (csselr));
>> +}
>> +
>> +static u32 get_ccsidr(void)
>> +{
>> +	u32 ccsidr;
>> +	/* Read current CP15 Cache Size ID Register */
>> +	asm volatile ("mrc p15, 1, %0, c0, c0, 0" : "=r" (ccsidr));
>> +	return ccsidr;
>> +}
>> +
>> +static u32 get_clidr(void)
>> +{
>> +	u32 clidr;
>> +	/* Read current CP15 Cache Level ID Register */
>> +	asm volatile ("mrc p15,1,%0,c0,c0,1" : "=r" (clidr));
>> +	return clidr;
>> +}
>> +
>> +/* round up the input number to a power of 2 and get the log2 */
>> +static inline u32 log_2_round_up(u32 num)
>> +{
>> +	/* count leading zeros */
>> +	asm volatile ("CLZ %0, %0" : "+r" (num));
>> +
>> +	/* position of most significant 1 */
>> +	num = 31 - num;
>> +
>> +	return num;
>> +}
>> +
>> +static void v7_inval_dcache_level_setway(u32 level, u32 num_sets,
>> +					 u32 num_ways, u32 way_shift,
>> +					 u32 log2_line_len)
>> +{
>> +	int way, set, setway;
>> +	/*
>> +	 * For optimal assembly code:
>> +	 *	a. count down
>> +	 *	b. have bigger loop inside
>> +	 */
>
> Out of curiosity, can you elaborate on why the compiler would optimize
> better in these cases?

When counting down, the termination check is against 0, so the compiler
can just decrement the loop count with a 'subs' and branch with a 'bne'.
When counting up it has to compare against a non-zero value, so you need
one extra 'cmp' instruction :-)

The bigger loop goes inside because that reduces how often the outer
parameter changes and hence the overall number of instructions executed.
Consider this:
1. We encode both loop counters, along with other data, into a register
that is finally written to the CP15 register.
2. The outer loop has the code for shifting and ORing the outer variable
into this register.
3. The inner loop has the code for shifting and ORing the inner variable.
Step (3) has to be executed 'ways x sets' times anyway, but having the
bigger loop inside makes sure that step (2) is executed fewer times, as
the sketch below shows.

With these tweaks the assembly code generated by this C code is as good
as the original hand-written assembly code, at least with my compiler.
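
To make that concrete, the intended shape of the loop is roughly this (a
simplified sketch of v7_inval_dcache_level_setway(), with the way-dependent
bits shown explicitly hoisted; the patch itself leaves that hoisting to the
compiler):

	for (way = num_ways - 1; way >= 0; way--) {	/* subs + branch, no cmp */
		u32 way_bits = (level << 1) | (way << way_shift);
		/* bigger (set) loop inside: runs ways x sets times anyway */
		for (set = num_sets - 1; set >= 0; set--) {
			setway = way_bits | (set << log2_line_len);
			/* DCISW - invalidate data/unified cache line by set/way */
			asm volatile ("mcr p15, 0, %0, c7, c6, 2" : : "r" (setway));
		}
	}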


>
>> +	for (way = num_ways - 1; way>= 0 ; way--)
>> +		for (set = num_sets - 1; set>= 0; set--) {
>
> Please fix whitespacing around operators. The best way to ''catch'em
> all'' is to run Linux' checkpatch.pl (I do this with option --no-tree)
> on all patches that you submit to u-boot and, fix all warning and errors
> and if some are left that you think should not be fixed, mention them
> and explain why they're wrongly emitted.

I religiously do checkpatch whenever I send out a patch. Please note 
that my original mail seems to be fine. I saved it and ran checkpatch 
again. No errors, no warnings! Something amiss?

Best regards,
Aneesh
Aneesh V Jan. 8, 2011, 1:17 p.m. UTC | #4
On Saturday 08 January 2011 12:06 PM, Albert ARIBAUD wrote:
> Hi Aneesh,
Pressed the Send button too fast last time. Missed answering the last
few questions.

<snip..>
>> +
>> +void invalidate_dcache_all(void)
>> +{
>> +	v7_maint_dcache_all(ARMV7_DCACHE_INVAL_ALL);
>> +	if (v7_outer_cache.inval_all)
>> +		v7_outer_cache.inval_all();
>
> Why use pointers here rather than weak functions?

In fact, I hadn't thought about it. Maybe I was biased by the Linux
implementation. The only reason I can think of is that a pointer gives the
flexibility of doing this assignment at run-time, for example if we had a
multi-platform U-Boot that detects the SoC at run-time.
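
For illustration, the run-time assignment would then live in the SoC's
override of the weak hook from this patch (the pl310_* names below are
hypothetical, just to show the idea):

void v7_setup_outer_cache_ops(void)
{
	/* could be selected from an ID register read at run-time */
	v7_outer_cache.enable      = pl310_enable;
	v7_outer_cache.disable     = pl310_disable;
	v7_outer_cache.flush_all   = pl310_flush_all;
	v7_outer_cache.inval_all   = pl310_inval_all;
	v7_outer_cache.flush_range = pl310_flush_range;
	v7_outer_cache.inval_range = pl310_inval_range;
}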

>
>> +}
>> +
>> +/*
>> + * Performs a clean&   invalidation of the entire data cache
>> + * at all levels
>> + */
>> +void flush_dcache_all(void)
>> +{
>> +	v7_maint_dcache_all(ARMV7_DCACHE_CLEAN_INVAL_ALL);
>> +	if (v7_outer_cache.flush_all)
>> +		v7_outer_cache.flush_all();
>> +}
>> +
>> +/*
>> + * Invalidates range in all levels of D-cache/unified cache used:
>> + * Affects the range [start, stop - 1]
>> + */
>> +void invalidate_dcache_range(unsigned long start, unsigned long stop)
>> +{
>> +
>> +	v7_dcache_maint_range(start, stop, ARMV7_DCACHE_INVAL_RANGE);
>> +	if (v7_outer_cache.inval_range)
>> +		/* physical address is same as virtual address */
>> +		v7_outer_cache.inval_range(start, stop);
>> +}
>> +
>> +/*
>> + * Flush range(clean&   invalidate) from all levels of D-cache/unified
>> + * cache used:
>> + * Affects the range [start, stop - 1]
>> + */
>> +void flush_dcache_range(unsigned long start, unsigned long stop)
>> +{
>> +	v7_dcache_maint_range(start, stop, ARMV7_DCACHE_CLEAN_INVAL_RANGE);
>> +	if (v7_outer_cache.flush_range)
>> +		/* physical address is same as virtual address */
>> +		v7_outer_cache.flush_range(start, stop);
>> +}
>> +static void __v7_setup_outer_cache_ops(void)
>> +{
>> +	puts("v7_setup_outer_cache_ops: dummy implementation! "
>> +	     "real implementation not available!!\n");
>> +}
>> +
>> +void v7_setup_outer_cache_ops(void)
>> +	__attribute__((weak, alias("__v7_setup_outer_cache_ops")));
>> +
>> +void arm_init_before_mmu(void)
>> +{
>> +	v7_setup_outer_cache_ops();
>> +	if (v7_outer_cache.enable)
>> +		v7_outer_cache.enable();
>> +	invalidate_dcache_all();
>> +	v7_inval_tlb();
>> +}
>> +#else
>> +void invalidate_dcache_all(void)
>> +{
>> +}
>> +
>> +void flush_dcache_all(void)
>> +{
>> +}
>> +
>> +void invalidate_dcache_range(unsigned long start, unsigned long stop)
>> +{
>> +}
>> +
>> +void flush_dcache_range(unsigned long start, unsigned long stop)
>> +{
>> +}
>> +
>> +void arm_init_before_mmu(void)
>> +{
>> +}
>> +#endif
>> +
>> +#ifndef CONFIG_SYS_NO_ICACHE
>> +/* Invalidate entire I-cache and branch predictor array */
>> +void invalidate_icache_all(void)
>> +{
>> +	/*
>> +	 * Invalidate all instruction caches to PoU.
>> +	 * Also flushes branch target cache.
>> +	 */
>> +	asm volatile ("mcr p15, 0, %0, c7, c5, 0" : : "r" (0));
>> +
>> +	/* Invalidate entire branch predictor array */
>> +	asm volatile ("mcr p15, 0, %0, c7, c5, 6" : : "r" (0));
>> +
>> +	/* Full system DSB - make sure that the invalidation is complete */
>> +	asm volatile ("DSB");
>> +	/* Full system ISB - make sure the instruction stream sees it */
>> +	asm volatile ("ISB");
>> +}
>> +#else
>> +void invalidate_icache_all(void)
>> +{
>> +}
>> +#endif
>> +
>> +/*
>> + * Flush range from all levels of d-cache/unified-cache used:
>> + * Affects the range [start, start + size - 1]
>> + */
>> +void  flush_cache(unsigned long start, unsigned long size)
>> +{
>> +	flush_dcache_range(start, start + size);
>> +}
>
> This function is the only one which is defined to something non-empty
> when CONFIG_SYS_NO_DCACHE is defined. Why is it not in the big #ifndef
> for dcache above ?

Although this function is non-empty, flush_dcache_range() is in turn
empty. Effect will be the same, right?

>
>> diff --git a/arch/arm/cpu/armv7/config.mk b/arch/arm/cpu/armv7/config.mk
>> index 49ac9c7..7f9b171 100644
>> --- a/arch/arm/cpu/armv7/config.mk
>> +++ b/arch/arm/cpu/armv7/config.mk
>> @@ -23,7 +23,7 @@
>>    PLATFORM_RELFLAGS += -fno-common -ffixed-r8 -msoft-float
>>
>>    # Make ARMv5 to allow more compilers to work, even though its v7a.
>> -PLATFORM_CPPFLAGS += -march=armv5
>> +PLATFORM_CPPFLAGS += -march=armv7-a
>
> Did you check that this does not break any board using armv7?

I checked only the CodeSourcery toolchain.
A Linux kernel build for a v7 architecture processor uses armv7-a. Is it
not fair to assume that the toolchain used for the bootloader also supports
it? Do we have to support those outdated compilers?

>>    # =========================================================================
>>    #
>>    # Supply options according to compiler version
>> diff --git a/arch/arm/include/asm/armv7.h b/arch/arm/include/asm/armv7.h
>> new file mode 100644
>> index 0000000..57409b6
>> --- /dev/null
>> +++ b/arch/arm/include/asm/armv7.h
>> @@ -0,0 +1,63 @@
>> +/*
>> + * (C) Copyright 2010
>> + * Texas Instruments Incorporated - http://www.ti.com/
>> + *
>> + * Aneesh V<aneesh@ti.com>
>> + *
>> + * See file CREDITS for list of people who contributed to this
>> + * project.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License as
>> + * published by the Free Software Foundation; either version 2 of
>> + * the License, or (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
>> + * MA 02111-1307 USA
>> + */
>> +#ifndef ARMV7_H
>> +#define ARMV7_H
>> +
>> +#include<linux/types.h>
>> +
>> +/*
>> + * Values for InD field in CSSELR
>> + * Selects the type of cache
>> + */
>> +#define ARMV7_CSSELR_IND_DATA_UNIFIED	0
>> +#define ARMV7_CSSELR_IND_INSTRUCTION	1
>> +
>> +/* Values for Ctype fields in CLIDR */
>> +#define ARMV7_CLIDR_CTYPE_NO_CACHE		0
>> +#define ARMV7_CLIDR_CTYPE_INSTRUCTION_ONLY	1
>> +#define ARMV7_CLIDR_CTYPE_DATA_ONLY		2
>> +#define ARMV7_CLIDR_CTYPE_INSTRUCTION_DATA	3
>> +#define ARMV7_CLIDR_CTYPE_UNIFIED		4
>> +
>> +/* some utility macros */
>> +#define mask(start, end) \
>> +	(((1<<   ((end) - (start) + 1)) - 1)<<   (start))
>> +
>> +#define mask_n_get(reg, start, end) \
>> +	(((reg)&   mask(start, end))>>   (start))
>
> Seeing as these functions are only used in the ARMv7 cache C file, they
> should be moved there.

I plan to use a modified version of mask_n_get() and its set counterpart
mask_n_set() in more files in my subsequent work.

Can I keep them here, or should I move them to an OMAP-specific
header file, or to a more generic header file? Please
suggest.

Best regards,
Aneesh
Albert ARIBAUD Jan. 8, 2011, 2:06 p.m. UTC | #5
On 08/01/2011 14:17, Aneesh V wrote:

>> Why use pointers here rather than weak functions?
>
> In fact, I hadn't thought about it. Maybe I was biased by the Linux
> implementation.The only reason I can think of is that pointer gives the
> flexibility of doing this assignment at run-time. Let's say we had a
> multi-platform u-boot that detects the SOC at run-time?

I know we consider multi-board U-Boot binaries when the boards are variants 
of a given SoC; that's one reason why we wanted relocation. I'm not sure 
about multi-SoC binaries when the SoC is a variant of a given CPU, though. 
Wolfgang, your opinion?

>>> +void flush_cache(unsigned long start, unsigned long size)
>>> +{
>>> + flush_dcache_range(start, start + size);
>>> +}
>>
>> This function is the only one which is defined to something non-empty
>> when CONFIG_SYS_NO_DCACHE is defined. Why is it not in the big #ifndef
>> for dcache above ?
>
> Although this function is non-empty, flush_dcache_range() is in turn
> empty. Effect will be the same, right?

Yes the effect is the same, but your patch results in a non-trivially 
empty function; I'd prefer it to be visibly empty when we compile 
without cache support.
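
Something along these lines (sketch only, keeping the existing bodies):

#ifndef CONFIG_SYS_NO_DCACHE
/* ... real implementations ... */
void flush_cache(unsigned long start, unsigned long size)
{
	flush_dcache_range(start, start + size);
}
#else
/* ... empty stubs ... */
void flush_cache(unsigned long start, unsigned long size)
{
}
#endif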

>>> diff --git a/arch/arm/cpu/armv7/config.mk b/arch/arm/cpu/armv7/config.mk
>>> index 49ac9c7..7f9b171 100644
>>> --- a/arch/arm/cpu/armv7/config.mk
>>> +++ b/arch/arm/cpu/armv7/config.mk
>>> @@ -23,7 +23,7 @@
>>> PLATFORM_RELFLAGS += -fno-common -ffixed-r8 -msoft-float
>>>
>>> # Make ARMv5 to allow more compilers to work, even though its v7a.
>>> -PLATFORM_CPPFLAGS += -march=armv5
>>> +PLATFORM_CPPFLAGS += -march=armv7-a
>>
>> Did you check that this does not break any board using armv7?
>
> I checked only Codesourcery tool chain.
> Linux kernel build for a v7 architecture processor uses armv7-a. Is it
> not fair to assume that the toolchain used for bootloader also supports
> it? Do we have to support those out-dated compilers?

Just because Linux uses armv7-a for a v7 arch does not mean we must have 
it for u-boot. For starters, U-boot does not always boot Linux. :)

As for out-dated compilers, that's the question I'm asking: do we 
consider e.g. ELDK 4.2 as outdated or not? It won't accept armv7-a.

>>> +/* some utility macros */
>>> +#define mask(start, end) \
>>> + (((1<< ((end) - (start) + 1)) - 1)<< (start))
>>> +
>>> +#define mask_n_get(reg, start, end) \
>>> + (((reg)& mask(start, end))>> (start))
>>
>> Seeing as these functions are only used in the ARMv7 cache C file, they
>> should be moved there.
>
> I plan to use a modified version of mask_n_get() and its set couterpart
> mask_n_set() in my subsequent works in more files.
>
> Can I keep it here itself or should I move it to an OMAP specific
> header file or can I move it to a more generic header file? Please
> suggest.

They're very generic actually. I think they should go into a generic bit 
manipulation header, and be named a... bit... more explicitly. For 
instance, the name 'mask' does not show that the macro creates a range 
of 'one' bits from start to end.

> Best regards,
> Aneesh

Regards,
Wolfgang Denk Jan. 9, 2011, 10:41 p.m. UTC | #6
Dear Albert ARIBAUD,

In message <4D286F58.9010605@free.fr> you wrote:
> 
> I know we consider multi-board u-boot binaries when boards are variant 
> of a given SoC, that's one reason why we wanted relocation. I'm not sure 
> about multi-SoC when SoC is a variant of a given cpu, though. Wolfgang, 
> your opinion?

Unless we see a specific example which uses this feature, we should
not add provisions that make the code more complicated than needed.

And when we start supporting such a feature, we should probably do
this based on a device tree approach.

> > Although this function is non-empty, flush_dcache_range() is in turn
> > empty. Effect will be the same, right?
> 
> Yes the effect is the same, but your patch results in a non-trivially 
> empty function; I'd prefer it to be visibly empty when we compile 
> without cache support.

Yes, that's my opinion, too.


> Just because Linux uses armv7-a for a v7 arch does not mean we must have 
> it for u-boot. For starters, U-boot does not always boot Linux. :)
> 
> As for out-dated compilers, that's the question I'm asking: do we 
> consider e.g. ELDK 4.2 as outdated or not? It won't accept armv7-a.

That's a catch question.

Yes, ELDK 4.2 is outdated.  But it is still one of the most stable
versions of a tool chain known to me, especially when it comes to
using the very same tool versions across several architectures.

I cannot see any benefit of this code change that would justify such
a breakage.


Best regards,

Wolfgang Denk
Aneesh V Jan. 10, 2011, 4:56 a.m. UTC | #7
Dear Wolfgang,

On Monday 10 January 2011 04:11 AM, Wolfgang Denk wrote:
> Dear Albert ARIBAUD,
>
> In message<4D286F58.9010605@free.fr>  you wrote:
>>
>> I know we consider multi-board u-boot binaries when boards are variant
>> of a given SoC, that's one reason why we wanted relocation. I'm not sure
>> about multi-SoC when SoC is a variant of a given cpu, though. Wolfgang,
>> your opinion?
>
> Unless we see a specific example which uses this feature, we should
> not add provisions that make the code more complicated than needed.

Agree. But do you think the pointer based approach makes it overly
complex?

>
> And when we start supporting such a feature, we should probably do
> this based on a device tree approach.
>
>>> Although this function is non-empty, flush_dcache_range() is in turn
>>> empty. Effect will be the same, right?
>>
>> Yes the effect is the same, but your patch results in a non-trivially
>> empty function; I'd prefer it to be visibly empty when we compile
>> without cache support.
>
> Yes, that's my opinion, too.

Agree.

>
>
>> Just because Linux uses armv7-a for a v7 arch does not mean we must have
>> it for u-boot. For starters, U-boot does not always boot Linux. :)
>>
>> As for out-dated compilers, that's the question I'm asking: do we
>> consider e.g. ELDK 4.2 as outdated or not? It won't accept armv7-a.
>
> That's a catch question.
>
> Yes, ELDK 4.2 is outdated.  But it is still one of the most stable
> versions of a tool chain known to me, especially when it comes to
> using the very same tool versions across several architectures.
>
> I cannot see any benefits of this code change that would justiify such
> a breakage.

Agree. The only benefit is that I can use the memory barrier
instructions directly without hand-coding the respective machine encodings.
But I can do that if it helps avoid the compiler breakage.
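
For reference, one way to keep the barriers while staying on -march=armv5
would be the CP15 barrier operations, roughly like this (encodings to be
double-checked against the ARM ARM before use):

/* CP15 equivalents of DMB/DSB/ISB, usable without -march=armv7-a */
#define CP15DMB	asm volatile ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0))
#define CP15DSB	asm volatile ("mcr p15, 0, %0, c7, c10, 4" : : "r" (0))
#define CP15ISB	asm volatile ("mcr p15, 0, %0, c7, c5, 4" : : "r" (0))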

>
>
> Best regards,
>
> Wolfgang Denk
>
Aneesh V Jan. 12, 2011, 9:08 a.m. UTC | #8
On Saturday 08 January 2011 07:36 PM, Albert ARIBAUD wrote:
> On 08/01/2011 14:17, Aneesh V wrote:
>
<snip..>

>>>> +/* some utility macros */
>>>> +#define mask(start, end) \
>>>> + (((1<< ((end) - (start) + 1)) - 1)<< (start))
>>>> +
>>>> +#define mask_n_get(reg, start, end) \
>>>> + (((reg)& mask(start, end))>> (start))
>>>
>>> Seeing as these functions are only used in the ARMv7 cache C file, they
>>> should be moved there.
>>
>> I plan to use a modified version of mask_n_get() and its set couterpart
>> mask_n_set() in my subsequent works in more files.
>>
>> Can I keep it here itself or should I move it to an OMAP specific
>> header file or can I move it to a more generic header file? Please
>> suggest.
>
> They're very generic actually. I think they should go to a gereric bit
> manipulation header, and be named a... bit... more explicitly. For
> instance, the name 'mask' does not show that the macro creates a range
> of 'one' bits from start to end.
>

What I need is something like below:

#define get_bit_field(nr, start, mask)\
	(((nr) & (mask)) >> (start))

#define set_bit_field(nr, start, mask, val)\
	(nr) = ((nr) & ~(mask)) | (((val) << (start)) & (mask))

Can these go in a generic header? If so, can I add them to
"include/linux/bitops.h"


Best regards,
Aneesh
Albert ARIBAUD Jan. 12, 2011, 7:18 p.m. UTC | #9
(I realize I did not answer the other ones)

On 08/01/2011 11:06, Aneesh V wrote:

>> Out of curiosity, can you elaborate on why the compiler would optimize
>> better in these cases?
>
> While counting down the termination condition check is against 0. So
> you can just decrement the loop count using a 'subs' and do a 'bne'.
> When you count up you have to do a comparison with a non-zero value. So
> you will need one 'cmp' instruction extra:-)

I would not try to be too smart about what instructions are generated 
and how by a compiler such as gcc which has rather complex code 
generation optimizations.

> bigger loop inside because that reduces the frequency at which your
> outer parameter changes and hence the overall number of instructions
> executed. Consider this:
> 1. We encode both the loop counts along with other data into a register
> that is finally written to CP15 register.
> 2. outer loop has the code for shifting and ORing the outer variable to
> this register.
> 3. Inner loop has the code for shifting and ORing the inner variable.
> Step (3) has to be executed 'way x set' number of times anyways.
> But having bigger loop inside makes sure that 2 is executed fewer times!

Here too it seems like you're underestimating the compiler's optimizing 
capabilities -- your explanation seems to amount to extracting a 
constant calculation from a loop, something that is rather usual in code 
optimizing.

> With these tweaks the assembly code generated by this C code is as good
> as the original hand-written assembly code with my compiler.

How about other compilers?

>>> +	for (way = num_ways - 1; way>= 0 ; way--)
>>> +		for (set = num_sets - 1; set>= 0; set--) {
>>
>> Please fix whitespacing around operators. The best way to ''catch'em
>> all'' is to run Linux' checkpatch.pl (I do this with option --no-tree)
>> on all patches that you submit to u-boot and, fix all warning and errors
>> and if some are left that you think should not be fixed, mention them
>> and explain why they're wrongly emitted.
>
> I religiously do checkpatch whenever I send out a patch. Please note
> that my original mail seems to be fine. I saved it and ran checkpatch
> again. No errors, no warnings! Something amiss?

Well, something like "set>= 0" is quite surprising as it has 
inconsistent spacing around a binary operator. But you're right, 
checkpatch does not detect it. Can you fix them manually?

> Best regards,
> Aneesh

Regards,
Albert ARIBAUD Jan. 12, 2011, 7:23 p.m. UTC | #10
On 12/01/2011 10:08, Aneesh V wrote:
> On Saturday 08 January 2011 07:36 PM, Albert ARIBAUD wrote:
>> On 08/01/2011 14:17, Aneesh V wrote:
>>
> <snip..>
>
>>>>> +/* some utility macros */
>>>>> +#define mask(start, end) \
>>>>> + (((1<< ((end) - (start) + 1)) - 1)<< (start))
>>>>> +
>>>>> +#define mask_n_get(reg, start, end) \
>>>>> + (((reg)& mask(start, end))>> (start))
>>>>
>>>> Seeing as these functions are only used in the ARMv7 cache C file, they
>>>> should be moved there.
>>>
>>> I plan to use a modified version of mask_n_get() and its set couterpart
>>> mask_n_set() in my subsequent works in more files.
>>>
>>> Can I keep it here itself or should I move it to an OMAP specific
>>> header file or can I move it to a more generic header file? Please
>>> suggest.
>>
>> They're very generic actually. I think they should go to a gereric bit
>> manipulation header, and be named a... bit... more explicitly. For
>> instance, the name 'mask' does not show that the macro creates a range
>> of 'one' bits from start to end.
>
> What I need is something like below:
>
> #define get_bit_field(nr, start, mask)\
> (((nr) & (mask)) >> (start))
>
> #define set_bit_field(nr, start, mask, val)\
> (nr) = ((nr) & ~(mask)) | (((val) << (start)) & (mask))
>
> Can these go in a generic header? If so, can I add them to
> "include/linux/bitops.h"

After some more thought, I am wondering if a *generic* field setting and 
getting macro is really useful. So far everyone is fine with at most 
defining field-specific macros.

> Best regards,
> Aneesh

Regards,
Aneesh V Jan. 13, 2011, 11:10 a.m. UTC | #11
On Thursday 13 January 2011 12:48 AM, Albert ARIBAUD wrote:
> (I realize I did not answer the other ones)
>
> On 08/01/2011 11:06, Aneesh V wrote:
>
>>> Out of curiosity, can you elaborate on why the compiler would optimize
>>> better in these cases?
>>
>> While counting down the termination condition check is against 0. So
>> you can just decrement the loop count using a 'subs' and do a 'bne'.
>> When you count up you have to do a comparison with a non-zero value. So
>> you will need one 'cmp' instruction extra:-)
>
> I would not try to be too smart about what instructions are generated
> and how by a compiler such as gcc which has rather complex code
> generation optimizations.

IMHO, on ARM comparing with 0 is always going to be more efficient than
comparing with a non-zero number for a termination condition, assuming
a decent compiler.

>
>> bigger loop inside because that reduces the frequency at which your
>> outer parameter changes and hence the overall number of instructions
>> executed. Consider this:
>> 1. We encode both the loop counts along with other data into a register
>> that is finally written to CP15 register.
>> 2. outer loop has the code for shifting and ORing the outer variable to
>> this register.
>> 3. Inner loop has the code for shifting and ORing the inner variable.
>> Step (3) has to be executed 'way x set' number of times anyways.
>> But having bigger loop inside makes sure that 2 is executed fewer times!
>

It's not a constant calculation; it's based on the loop index. And this
optimization does not rely on compiler specifics; it is a logic-level
optimization that should generally give good results with all
compilers. Perhaps I was wrong in stating that it helps in getting
better assembly; it just helps run-time efficiency.

 > Here too it seems like you're underestimating the compiler's optimizing
 > capabilities -- your explanation seems to amount to extracting a
 > constant calculation from a loop, something that is rather usual in code
 > optimizing.

Actually, in my experience (in this same context) GCC does a terrible
job at this! For instance:

+		for (set = num_sets - 1; set >= 0; set--) {
+			setway = (level << 1) | (set << log2_line_len) |
+				 (way << way_shift);

Here, way_shift = 32 - log2_num_ways

But if you substitute the latter expression for way_shift, GCC will put the 
subtraction instruction inside the loop, even though it is clearly loop 
invariant. So I had to hoist it out of the loop explicitly (see the sketch 
below). In fact, I was thinking of giving this feedback to GCC.
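
In other words, the two variants look like this (sketch):

	/* subtraction kept inside the loop by GCC, once per iteration */
	setway = (level << 1) | (set << log2_line_len) |
		 (way << (32 - log2_num_ways));

	/* what I ended up writing - subtraction hoisted by hand */
	way_shift = 32 - log2_num_ways;
	...
	setway = (level << 1) | (set << log2_line_len) | (way << way_shift);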

>
>> With these tweaks the assembly code generated by this C code is as good
>> as the original hand-written assembly code with my compiler.
>
> How about other compilers?
>

I haven't tested other compilers. However, as I mentioned above, the
latter is a logic optimization; the former should hopefully help
all ARM compilers.

As you know, the existing code for cache maintenance was in
assembly. When I moved it to C I wanted to make sure that the generated
code is as good as the original assembly for this critical piece of
code (I expected some criticism about moving it to C :-)). That's
why I checked the generated code and made these, hopefully minor, tweaks
to improve it. I hope they don't have any serious drawbacks.

Best regards,
Aneesh
Aneesh V Jan. 13, 2011, 12:05 p.m. UTC | #12
On Thursday 13 January 2011 12:53 AM, Albert ARIBAUD wrote:
> On 12/01/2011 10:08, Aneesh V wrote:
>> On Saturday 08 January 2011 07:36 PM, Albert ARIBAUD wrote:
>>> On 08/01/2011 14:17, Aneesh V wrote:
>>>
>> <snip..>
>>
>>>>>> +/* some utility macros */
>>>>>> +#define mask(start, end) \
>>>>>> + (((1<< ((end) - (start) + 1)) - 1)<< (start))
>>>>>> +
>>>>>> +#define mask_n_get(reg, start, end) \
>>>>>> + (((reg)& mask(start, end))>> (start))
>>>>>
>>>>> Seeing as these functions are only used in the ARMv7 cache C file,
>>>>> they
>>>>> should be moved there.
>>>>
>>>> I plan to use a modified version of mask_n_get() and its set couterpart
>>>> mask_n_set() in my subsequent works in more files.
>>>>
>>>> Can I keep it here itself or should I move it to an OMAP specific
>>>> header file or can I move it to a more generic header file? Please
>>>> suggest.
>>>
>>> They're very generic actually. I think they should go to a gereric bit
>>> manipulation header, and be named a... bit... more explicitly. For
>>> instance, the name 'mask' does not show that the macro creates a range
>>> of 'one' bits from start to end.
>>
>> What I need is something like below:
>>
>> #define get_bit_field(nr, start, mask)\
>> (((nr) & (mask)) >> (start))
>>
>> #define set_bit_field(nr, start, mask, val)\
>> (nr) = ((nr) & ~(mask)) | (((val) << (start)) & (mask))
>>
>> Can these go in a generic header? If so, can I add them to
>> "include/linux/bitops.h"
>
> After some more thought, I am wondering if a *generic* field setting and
> getting macro is really useful. So far everyone is fine with at most
> defining field-specific macros.

Is it going to be easy if you have many fields to deal with?

However, I agree that the above may be specific to our needs.

What may be of more generic interest is something like this, with
the mask generated automatically:
#define get_bit_field(nr, start, end)
#define set_bit_field(nr, start, end, val)
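
e.g., reusing the mask() macro from this patch, the bodies could be
something like (sketch):

#define get_bit_field(nr, start, end) \
	(((nr) & mask(start, end)) >> (start))

#define set_bit_field(nr, start, end, val) \
	(nr) = ((nr) & ~mask(start, end)) | (((val) << (start)) & mask(start, end))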

However, in our case I am already given the mask and the start position for
each field (automatically generated from a hardware database), so I prefer
the former versions.

If that doesn't look useful for generic use, I will put them in
OMAP-specific headers.

Best regards,
Aneesh
Aneesh V Jan. 13, 2011, 12:14 p.m. UTC | #13
On Thursday 13 January 2011 12:48 AM, Albert ARIBAUD wrote:
<snip ..>

>>>> + for (way = num_ways - 1; way>= 0 ; way--)
>>>> + for (set = num_sets - 1; set>= 0; set--) {
>>>
>>> Please fix whitespacing around operators. The best way to ''catch'em
>>> all'' is to run Linux' checkpatch.pl (I do this with option --no-tree)
>>> on all patches that you submit to u-boot and, fix all warning and errors
>>> and if some are left that you think should not be fixed, mention them
>>> and explain why they're wrongly emitted.
>>
>> I religiously do checkpatch whenever I send out a patch. Please note
>> that my original mail seems to be fine. I saved it and ran checkpatch
>> again. No errors, no warnings! Something amiss?
>
> Well, something like "set>= 0" is quite surprising as it has
> inconsistent spacing around a binary operators. But you're right,
> checkpatch does not detect it. Can you fix them manually?

Checkpatch does find such issues. I was trying to say that my original
mail doesn't have any spacing issues. The problem seems to have
appeared in your reply. Is your mail client doing something funny?

>
>> Best regards,
>> Aneesh
>
> Regards,
Albert ARIBAUD Jan. 13, 2011, 1:14 p.m. UTC | #14
On 13/01/2011 13:05, Aneesh V wrote:

>>> What I need is something like below:
>>>
>>> #define get_bit_field(nr, start, mask)\
>>> (((nr) & (mask)) >> (start))
>>>
>>> #define set_bit_field(nr, start, mask, val)\
>>> (nr) = ((nr) & ~(mask)) | (((val) << (start)) & (mask))
>>>
>>> Can these go in a generic header? If so, can I add them to
>>> "include/linux/bitops.h"
>>
>> After some more thought, I am wondering if a *generic* field setting and
>> getting macro is really useful. So far everyone is fine with at most
>> defining field-specific macros.
>
> Is it going to be easy if you have many fields to deal with?

I don't see how the generic macros ease anything. Instead of defining say

#define get_field_F(x)  ((x >> F_start) & F_mask)
#define set_field_F(x,v)  { x = (x & ~(F_mask << F_start)) | ((v) << F_start); }

You'd have

#define get_field_F(x)  get_bit_field(x, F_start, F_mask)
#define set_field_F(x,v) set_bit_field(x, F_start, F_mask, v);

Which does not seem to bring any simplicity to me.

> However, I agree that the above may be specific to our needs.
>
> What may be of more generic interest may be something like this with
> the mask automatically generated:
> #define get_bit_field(nr, start, end)
> #define set_bit_field(nr, start, end, val)
>
> However, in our case I am already given the mask and start position for
> each field (automatically generated from hw database). So, I prefer the
> former versions.
>
> If that doesn't look useful for generic use I will put them in
> OMAP specific headers.
>
> Best regards,
> Aneesh

Regards,
Aneesh V Jan. 13, 2011, 2:30 p.m. UTC | #15
On Thursday 13 January 2011 06:44 PM, Albert ARIBAUD wrote:
> On 13/01/2011 13:05, Aneesh V wrote:
>
>>>> What I need is something like below:
>>>>
>>>> #define get_bit_field(nr, start, mask)\
>>>> (((nr) & (mask)) >> (start))
>>>>
>>>> #define set_bit_field(nr, start, mask, val)\
>>>> (nr) = ((nr) & ~(mask)) | (((val) << (start)) & (mask))
>>>>
>>>> Can these go in a generic header? If so, can I add them to
>>>> "include/linux/bitops.h"
>>>
>>> After some more thought, I am wondering if a *generic* field setting and
>>> getting macro is really useful. So far everyone is fine with at most
>>> defining field-specific macros.
>>
>> Is it going to be easy if you have many fields to deal with?
>
> I don't see how the generic macros ease anything. Instead of defining say
>
> #define get_field_F(x) ((x >> F_start) & F_mask)
> #define set_field_F(x,v) { x = (x ~ F_mask ) | (v << F_start) }
>
> You'd have
>
> #define get_field_F(x) get_bit_field(x, F_start, F_mask)
> #define set_field_F(x,v) set_bit_field(x, F_start, F_mask, v);
>
> Which does not seem to bring any simplicity to me.

I wouldn't define get_field_F.
Instead, I would just use set_bit_field(x, F_start, F_mask, v) directly
in the code, with F_start and F_mask defined in the header files
(automatically generated).

Even if it were manual, isn't it easier to define just F_start and F_mask
per field than to define a get_field_F per field?

Perhaps my requirement is different. If this scheme is not used by
many, I shall put these macros in OMAP specific headers.

best regards,
Aneesh
Albert ARIBAUD Jan. 13, 2011, 5:06 p.m. UTC | #16
On 13/01/2011 15:30, Aneesh V wrote:

> Perhaps my requirement is different. If this scheme is not used by
> many, I shall put these macros in OMAP specific headers.

Yes, I'd prefer that, finally.

> best regards,
> Aneesh

Regards,
Albert ARIBAUD Jan. 13, 2011, 5:12 p.m. UTC | #17
On 13/01/2011 13:14, Aneesh V wrote:
> On Thursday 13 January 2011 12:48 AM, Albert ARIBAUD wrote:
> <snip ..>
>
>>>>> + for (way = num_ways - 1; way>= 0 ; way--)
>>>>> + for (set = num_sets - 1; set>= 0; set--) {
>>>>
>>>> Please fix whitespacing around operators. The best way to ''catch'em
>>>> all'' is to run Linux' checkpatch.pl (I do this with option --no-tree)
>>>> on all patches that you submit to u-boot and, fix all warning and
>>>> errors
>>>> and if some are left that you think should not be fixed, mention them
>>>> and explain why they're wrongly emitted.
>>>
>>> I religiously do checkpatch whenever I send out a patch. Please note
>>> that my original mail seems to be fine. I saved it and ran checkpatch
>>> again. No errors, no warnings! Something amiss?
>>
>> Well, something like "set>= 0" is quite surprising as it has
>> inconsistent spacing around a binary operators. But you're right,
>> checkpatch does not detect it. Can you fix them manually?
>
> Checkpatch does find such issues. I was trying to say that my original
> mail doesn't have any spacing issues. The problem seems to have
> appeared in your reply. Is your mail client doing something funny?

You're right. The issue appears when doing a "reply" with Thunderbird. 
Seems to me like a recent change in behavior, as I recall having replied 
to mails with this kind of spacing preserved. Sorry for the noise.

Regards,
Wolfgang Denk Jan. 17, 2011, 9:47 p.m. UTC | #18
Dear Aneesh V,

In message <4D2A9164.5020107@ti.com> you wrote:
> 
> > Unless we see a specific example which uses this feature, we should
> > not add provisions that make the code more complicated than needed.
> 
> Agree. But do you think the pointer based approach makes it overly
> complex?

Not overly complex, but more complex than needed.

Best regards,

Wolfgang Denk
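
For reference, a minimal sketch of the pointer-based approach being
discussed here, built on the v7_outer_cache_ops struct and the weak
v7_setup_outer_cache_ops() hook from the patch below; the SoC-side
function names and bodies are hypothetical:

#include <asm/armv7.h>

static void soc_l2_flush_all(void)
{
	/* an SoC's L2 controller would be driven here */
}

static void soc_l2_inval_all(void)
{
	/* an SoC's L2 controller would be driven here */
}

/* overrides the weak default defined in cache_v7.c */
void v7_setup_outer_cache_ops(void)
{
	v7_outer_cache.flush_all = soc_l2_flush_all;
	v7_outer_cache.inval_all = soc_l2_inval_all;
	/* unused hooks stay NULL and are skipped by the NULL checks */
}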
Aneesh V March 1, 2011, 11:54 a.m. UTC | #19
Hi Albert,

On Saturday 08 January 2011 12:06 PM, Albert ARIBAUD wrote:
> Hi Aneesh,
[snip ..]
>> +/* some utility macros */
>> +#define mask(start, end) \
>> +	(((1 << ((end) - (start) + 1)) - 1) << (start))
>> +
>> +#define mask_n_get(reg, start, end) \
>> +	(((reg) & mask(start, end)) >> (start))
>
> Seeing as these functions are only used in the ARMv7 cache C file, they
> should be moved there.
>

I am working on v2 of this series.

We had agreed on moving these macros to OMAP-specific headers. But I
now realize that I want to use them from cache_v7.c, so they cannot
live in an OMAP header. I have used them in other places too in my
recent work, so they need to be in a common header file. I am putting
these macros and some other utilities I need in a new file
"arch/arm/include/asm/utils.h". Is that fine?
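
To illustrate why cache_v7.c needs them, here is a standalone sketch of
the same CCSIDR field decoding that the patch below performs with these
macros; the sample register value and the main() wrapper are
hypothetical:

#include <stdio.h>

/* utility macros as defined in asm/armv7.h in the patch below */
#define mask(start, end) \
	(((1 << ((end) - (start) + 1)) - 1) << (start))

#define mask_n_get(reg, start, end) \
	(((reg) & mask(start, end)) >> (start))

int main(void)
{
	unsigned int ccsidr = 0x701fe01a;	/* sample CCSIDR value */
	unsigned int log2_line_len, num_ways, num_sets;

	/* same field extraction as v7_maint_dcache_level_setway() */
	log2_line_len = mask_n_get(ccsidr, 0, 2) + 2 + 2; /* words -> bytes */
	num_ways = mask_n_get(ccsidr, 3, 12) + 1;
	num_sets = mask_n_get(ccsidr, 13, 27) + 1;

	printf("line: %u bytes, %u ways, %u sets\n",
	       1u << log2_line_len, num_ways, num_sets);

	return 0;
}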

Best regards,
Aneesh
Albert ARIBAUD March 1, 2011, 1:36 p.m. UTC | #20
Le 01/03/2011 12:54, Aneesh V a écrit :
> Hi Albert,
>
> On Saturday 08 January 2011 12:06 PM, Albert ARIBAUD wrote:
>> Hi Aneesh,
> [snip ..]
>>> +/* some utility macros */
>>> +#define mask(start, end) \
>>> + (((1 << ((end) - (start) + 1)) - 1) << (start))
>>> +
>>> +#define mask_n_get(reg, start, end) \
>>> + (((reg) & mask(start, end)) >> (start))
>>
>> Seeing as these functions are only used in the ARMv7 cache C file, they
>> should be moved there.
>>
>
> I am working on v2 of this series.
>
> We had agreed on moving these macros to OMAP-specific headers. But I
> now realize that I want to use them from cache_v7.c, so they cannot
> live in an OMAP header. I have used them in other places too in my
> recent work, so they need to be in a common header file. I am putting
> these macros and some other utilities I need in a new file
> "arch/arm/include/asm/utils.h". Is that fine?

I need to look at the code that uses these macros again. I'll do that 
tonight.

> Best regards,
> Aneesh

Amicalement,
diff mbox

Patch

diff --git a/arch/arm/cpu/armv7/Makefile b/arch/arm/cpu/armv7/Makefile
index 8c0e915..299792a 100644
--- a/arch/arm/cpu/armv7/Makefile
+++ b/arch/arm/cpu/armv7/Makefile
@@ -26,7 +26,7 @@  include $(TOPDIR)/config.mk
 LIB	= $(obj)lib$(CPU).o
 
 START	:= start.o
-COBJS	:= cpu.o
+COBJS	:= cpu.o cache_v7.o
 COBJS  += syslib.o
 
 SRCS	:= $(START:.o=.S) $(COBJS:.o=.c)
diff --git a/arch/arm/cpu/armv7/cache_v7.c b/arch/arm/cpu/armv7/cache_v7.c
new file mode 100644
index 0000000..0521d66
--- /dev/null
+++ b/arch/arm/cpu/armv7/cache_v7.c
@@ -0,0 +1,359 @@ 
+/*
+ * (C) Copyright 2010
+ * Texas Instruments Incorporated - http://www.ti.com/
+ * Aneesh V <aneesh@ti.com>
+ *
+ * See file CREDITS for list of people who contributed to this
+ * project.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ */
+#include <linux/types.h>
+#include <common.h>
+#include <asm/armv7.h>
+
+#define ARMV7_DCACHE_INVAL_ALL		1
+#define ARMV7_DCACHE_CLEAN_INVAL_ALL	2
+#define ARMV7_DCACHE_INVAL_RANGE	3
+#define ARMV7_DCACHE_CLEAN_INVAL_RANGE	4
+
+struct v7_outer_cache_ops v7_outer_cache;
+
+#ifndef CONFIG_SYS_NO_DCACHE
+/*
+ * Write the level and type you want to Cache Size Selection Register(CSSELR)
+ * to get size details from Current Cache Size ID Register(CCSIDR)
+ */
+static void set_csselr(u32 level, u32 type)
+{
+	u32 csselr = level << 1 | type;
+	/* Write to Cache Size Selection Register(CSSELR) */
+	asm volatile ("mcr p15, 2, %0, c0, c0, 0" : : "r" (csselr));
+}
+
+static u32 get_ccsidr(void)
+{
+	u32 ccsidr;
+	/* Read current CP15 Cache Size ID Register */
+	asm volatile ("mrc p15, 1, %0, c0, c0, 0" : "=r" (ccsidr));
+	return ccsidr;
+}
+
+static u32 get_clidr(void)
+{
+	u32 clidr;
+	/* Read current CP15 Cache Level ID Register */
+	asm volatile ("mrc p15,1,%0,c0,c0,1" : "=r" (clidr));
+	return clidr;
+}
+
+/* round up the input number to a power of 2 and get the log2 */
+static inline u32 log_2_round_up(u32 num)
+{
+	/* count leading zeros */
+	asm volatile ("CLZ %0, %0" : "+r" (num));
+
+	/* position of most significant 1 */
+	num = 31 - num;
+
+	return num;
+}
+
+static void v7_inval_dcache_level_setway(u32 level, u32 num_sets,
+					 u32 num_ways, u32 way_shift,
+					 u32 log2_line_len)
+{
+	int way, set, setway;
+	/*
+	 * For optimal assembly code:
+	 *	a. count down
+	 *	b. have bigger loop inside
+	 */
+	for (way = num_ways - 1; way >= 0 ; way--)
+		for (set = num_sets - 1; set >= 0; set--) {
+			setway = (level << 1) | (set << log2_line_len) |
+				 (way << way_shift);
+			/* Invalidate data/unified cache line by set/way */
+			asm volatile ("	mcr p15, 0, %0, c7, c6, 2"
+					: : "r" (setway));
+		}
+	/* Make sure the operation is complete */
+	asm volatile ("DMB");
+}
+
+static void v7_clean_inval_dcache_level_setway(u32 level, u32 num_sets,
+					       u32 num_ways, u32 way_shift,
+					       u32 log2_line_len)
+{
+	int way, set, setway;
+	/*
+	 * For optimal assembly code:
+	 *	a. count down
+	 *	b. have bigger loop inside
+	 */
+	for (way = num_ways - 1; way >= 0 ; way--)
+		for (set = num_sets - 1; set >= 0; set--) {
+			setway = (level << 1) | (set << log2_line_len) |
+				 (way << way_shift);
+			/*
+			 * Clean & Invalidate data/unified
+			 * cache line by set/way
+			 */
+			asm volatile ("	mcr p15, 0, %0, c7, c14, 2"
+					: : "r" (setway));
+		}
+	/* Make sure the operation is complete */
+	asm volatile ("DMB");
+}
+
+static void v7_maint_dcache_level_setway(u32 level, u32 operation)
+{
+	u32 ccsidr;
+	u32 num_sets, num_ways, log2_line_len, log2_num_ways;
+	u32 way_shift;
+	set_csselr(level, ARMV7_CSSELR_IND_DATA_UNIFIED);
+
+	ccsidr = get_ccsidr();
+
+	log2_line_len = mask_n_get(ccsidr, 0, 2) + 2;
+	/* Converting from words to bytes */
+	log2_line_len += 2;
+
+	num_ways  = mask_n_get(ccsidr, 3, 12) + 1;
+	num_sets  = mask_n_get(ccsidr, 13, 27) + 1;
+	/*
+	 * According to ARMv7 ARM number of sets and number of ways need
+	 * not be a power of 2
+	 */
+	log2_num_ways = log_2_round_up(num_ways);
+
+	way_shift = (32 - log2_num_ways);
+	if (operation == ARMV7_DCACHE_INVAL_ALL)
+		v7_inval_dcache_level_setway(level, num_sets, num_ways,
+				      way_shift, log2_line_len);
+	else if (operation == ARMV7_DCACHE_CLEAN_INVAL_ALL)
+		v7_clean_inval_dcache_level_setway(level, num_sets, num_ways,
+						   way_shift, log2_line_len);
+}
+
+static void v7_maint_dcache_all(u32 operation)
+{
+	u32 level, cache_type, level_start_bit = 0;
+
+	u32 clidr = get_clidr();
+
+	for (level = 0; level < 7; level++) {
+		cache_type = mask_n_get(clidr, level_start_bit,
+					level_start_bit + 2);
+		if ((cache_type == ARMV7_CLIDR_CTYPE_DATA_ONLY) ||
+		    (cache_type == ARMV7_CLIDR_CTYPE_INSTRUCTION_DATA) ||
+		    (cache_type == ARMV7_CLIDR_CTYPE_UNIFIED))
+			v7_maint_dcache_level_setway(level, operation);
+		level_start_bit += 3;
+	}
+}
+
+static void v7_dcache_clean_inval_range(u32 start,
+					u32 stop, u32 line_len)
+{
+	u32 mva;
+	/* Align start to cache line boundary */
+	start &= ~(line_len - 1);
+	for (mva = start; mva < stop; mva = mva + line_len)
+		/* DCCIMVAC - Clean & Invalidate data cache by MVA to PoC */
+		asm volatile ("mcr p15, 0, %0, c7, c14, 1" : : "r" (mva));
+}
+
+static void v7_dcache_inval_range(u32 start, u32 stop, u32 line_len)
+{
+	u32 mva;
+
+	/*
+	 * If start address is not aligned to cache-line flush the first
+	 * line to prevent affecting somebody else's buffer
+	 */
+	if (start & (line_len - 1)) {
+		v7_dcache_clean_inval_range(start, start + 1, line_len);
+		/* move to next cache line */
+		start = (start + line_len - 1) & ~(line_len - 1);
+	}
+
+	/*
+	 * If stop address is not aligned to cache-line flush the last
+	 * line to prevent affecting somebody else's buffer
+	 */
+	if (stop & (line_len - 1)) {
+		v7_dcache_clean_inval_range(stop, stop + 1, line_len);
+		/* align to the beginning of this cache line */
+		stop &= ~(line_len - 1);
+	}
+
+	for (mva = start; mva < stop; mva = mva + line_len)
+		/* DCIMVAC - Invalidate data cache by MVA to PoC */
+		asm volatile ("mcr p15, 0, %0, c7, c6, 1" : : "r" (mva));
+}
+
+static void v7_dcache_maint_range(u32 start, u32 stop, u32 range_op)
+{
+	u32 line_len, ccsidr;
+	ccsidr = get_ccsidr();
+	line_len = mask_n_get(ccsidr, 0, 2) + 2;
+	/* Converting from words to bytes */
+	line_len += 2;
+	/* converting from log2(linelen) to linelen */
+	line_len = 1 << line_len;
+
+	switch (range_op) {
+	case ARMV7_DCACHE_CLEAN_INVAL_RANGE:
+		v7_dcache_clean_inval_range(start, stop, line_len);
+		break;
+	case ARMV7_DCACHE_INVAL_RANGE:
+		v7_dcache_inval_range(start, stop, line_len);
+		break;
+	}
+
+	/* Make sure the operation is complete */
+	asm volatile ("DMB");
+}
+
+/* Invalidate TLB */
+static void v7_inval_tlb(void)
+{
+	/* Invalidate entire unified TLB */
+	asm volatile ("mcr p15, 0, %0, c8, c7, 0" : : "r" (0));
+	/* Invalidate entire data TLB */
+	asm volatile ("mcr p15, 0, %0, c8, c6, 0" : : "r" (0));
+	/* Invalidate entire instruction TLB */
+	asm volatile ("mcr p15, 0, %0, c8, c5, 0" : : "r" (0));
+	/* Full system DSB - make sure that the invalidation is complete */
+	asm volatile ("DSB");
+	/* Full system ISB - make sure the instruction stream sees it */
+	asm volatile ("ISB");
+}
+
+void invalidate_dcache_all(void)
+{
+	v7_maint_dcache_all(ARMV7_DCACHE_INVAL_ALL);
+	if (v7_outer_cache.inval_all)
+		v7_outer_cache.inval_all();
+}
+
+/*
+ * Performs a clean & invalidation of the entire data cache
+ * at all levels
+ */
+void flush_dcache_all(void)
+{
+	v7_maint_dcache_all(ARMV7_DCACHE_CLEAN_INVAL_ALL);
+	if (v7_outer_cache.flush_all)
+		v7_outer_cache.flush_all();
+}
+
+/*
+ * Invalidates range in all levels of D-cache/unified cache used:
+ * Affects the range [start, stop - 1]
+ */
+void invalidate_dcache_range(unsigned long start, unsigned long stop)
+{
+
+	v7_dcache_maint_range(start, stop, ARMV7_DCACHE_INVAL_RANGE);
+	if (v7_outer_cache.inval_range)
+		/* physical address is same as virtual address */
+		v7_outer_cache.inval_range(start, stop);
+}
+
+/*
+ * Flush range(clean & invalidate) from all levels of D-cache/unified
+ * cache used:
+ * Affects the range [start, stop - 1]
+ */
+void flush_dcache_range(unsigned long start, unsigned long stop)
+{
+	v7_dcache_maint_range(start, stop, ARMV7_DCACHE_CLEAN_INVAL_RANGE);
+	if (v7_outer_cache.flush_range)
+		/* physical address is same as virtual address */
+		v7_outer_cache.flush_range(start, stop);
+}
+
+static void __v7_setup_outer_cache_ops(void)
+{
+	puts("v7_setup_outer_cache_ops: dummy implementation! "
+	     "real implementation not available!!\n");
+}
+
+void v7_setup_outer_cache_ops(void)
+	__attribute__((weak, alias("__v7_setup_outer_cache_ops")));
+
+void arm_init_before_mmu(void)
+{
+	v7_setup_outer_cache_ops();
+	if (v7_outer_cache.enable)
+		v7_outer_cache.enable();
+	invalidate_dcache_all();
+	v7_inval_tlb();
+}
+#else
+void invalidate_dcache_all(void)
+{
+}
+
+void flush_dcache_all(void)
+{
+}
+
+void invalidate_dcache_range(unsigned long start, unsigned long stop)
+{
+}
+
+void flush_dcache_range(unsigned long start, unsigned long stop)
+{
+}
+
+void arm_init_before_mmu(void)
+{
+}
+#endif
+
+#ifndef CONFIG_SYS_NO_ICACHE
+/* Invalidate entire I-cache and branch predictor array */
+void invalidate_icache_all(void)
+{
+	/*
+	 * Invalidate all instruction caches to PoU.
+	 * Also flushes branch target cache.
+	 */
+	asm volatile ("mcr p15, 0, %0, c7, c5, 0" : : "r" (0));
+
+	/* Invalidate entire branch predictor array */
+	asm volatile ("mcr p15, 0, %0, c7, c5, 6" : : "r" (0));
+
+	/* Full system DSB - make sure that the invalidation is complete */
+	asm volatile ("DSB");
+	/* Full system ISB - make sure the instruction stream sees it */
+	asm volatile ("ISB");
+}
+#else
+void invalidate_icache_all(void)
+{
+}
+#endif
+
+/*
+ * Flush range from all levels of d-cache/unified-cache used:
+ * Affects the range [start, start + size - 1]
+ */
+void  flush_cache(unsigned long start, unsigned long size)
+{
+	flush_dcache_range(start, start + size);
+}
diff --git a/arch/arm/cpu/armv7/config.mk b/arch/arm/cpu/armv7/config.mk
index 49ac9c7..7f9b171 100644
--- a/arch/arm/cpu/armv7/config.mk
+++ b/arch/arm/cpu/armv7/config.mk
@@ -23,7 +23,7 @@ 
 PLATFORM_RELFLAGS += -fno-common -ffixed-r8 -msoft-float
 
 # Make ARMv5 to allow more compilers to work, even though its v7a.
-PLATFORM_CPPFLAGS += -march=armv5
+PLATFORM_CPPFLAGS += -march=armv7-a
 # =========================================================================
 #
 # Supply options according to compiler version
diff --git a/arch/arm/include/asm/armv7.h b/arch/arm/include/asm/armv7.h
new file mode 100644
index 0000000..57409b6
--- /dev/null
+++ b/arch/arm/include/asm/armv7.h
@@ -0,0 +1,63 @@ 
+/*
+ * (C) Copyright 2010
+ * Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * Aneesh V <aneesh@ti.com>
+ *
+ * See file CREDITS for list of people who contributed to this
+ * project.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ */
+#ifndef ARMV7_H
+#define ARMV7_H
+
+#include <linux/types.h>
+
+/*
+ * Values for InD field in CSSELR
+ * Selects the type of cache
+ */
+#define ARMV7_CSSELR_IND_DATA_UNIFIED	0
+#define ARMV7_CSSELR_IND_INSTRUCTION	1
+
+/* Values for Ctype fields in CLIDR */
+#define ARMV7_CLIDR_CTYPE_NO_CACHE		0
+#define ARMV7_CLIDR_CTYPE_INSTRUCTION_ONLY	1
+#define ARMV7_CLIDR_CTYPE_DATA_ONLY		2
+#define ARMV7_CLIDR_CTYPE_INSTRUCTION_DATA	3
+#define ARMV7_CLIDR_CTYPE_UNIFIED		4
+
+/* some utility macros */
+#define mask(start, end) \
+	(((1 << ((end) - (start) + 1)) - 1) << (start))
+
+#define mask_n_get(reg, start, end) \
+	(((reg) & mask(start, end)) >> (start))
+
+struct v7_outer_cache_ops {
+	void (*enable)(void);
+	void (*disable)(void);
+	void (*flush_all)(void);
+	void (*inval_all)(void);
+	void (*flush_range)(u32 start, u32 end);
+	void (*inval_range)(u32 start, u32 end);
+};
+
+extern struct v7_outer_cache_ops v7_outer_cache;
+
+void v7_setup_outer_cache_ops(void);
+#endif
diff --git a/include/common.h b/include/common.h
index 189ad81..d750ff9 100644
--- a/include/common.h
+++ b/include/common.h
@@ -411,6 +411,7 @@  void	icache_disable(void);
 int	dcache_status (void);
 void	dcache_enable (void);
 void	dcache_disable(void);
+void	mmu_disable(void);
 void	relocate_code (ulong, gd_t *, ulong) __attribute__ ((noreturn));
 ulong	get_endaddr   (void);
 void	trap_init     (ulong);
@@ -603,9 +604,11 @@  ulong	video_setmem (ulong);
 
 /* arch/$(ARCH)/lib/cache.c */
 void	flush_cache   (unsigned long, unsigned long);
+void	flush_dcache_all(void);
 void	flush_dcache_range(unsigned long start, unsigned long stop);
 void	invalidate_dcache_range(unsigned long start, unsigned long stop);
-
+void	invalidate_dcache_all(void);
+void	invalidate_icache_all(void);
 
 /* arch/$(ARCH)/lib/ticks.S */
 unsigned long long get_ticks(void);
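
For illustration, a minimal usage sketch (not part of the patch) of the
range operations declared above, as a caller might use them around a
DMA transfer; the buffer and the transfer itself are hypothetical:

#include <common.h>

#define DMA_BUF_SIZE	4096

static char dma_buf[DMA_BUF_SIZE];

void example_dma_transfer(void)
{
	/* CPU -> device: clean the lines so the data is visible in RAM */
	flush_dcache_range((unsigned long)dma_buf,
			   (unsigned long)dma_buf + DMA_BUF_SIZE);

	/* ... start the (hypothetical) DMA transfer here ... */

	/* device -> CPU: discard stale cache lines before reading */
	invalidate_dcache_range((unsigned long)dma_buf,
				(unsigned long)dma_buf + DMA_BUF_SIZE);
}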