[AARCH64] Add support for new control bits CTR_EL0.DIC and CTR_EL0.IDC
diff mbox series

Message ID 1567235533-5731-1-git-send-email-zhangshaokun@hisilicon.com
State New
Headers show
Series
  • [AARCH64] Add support for new control bits CTR_EL0.DIC and CTR_EL0.IDC
Related show

Commit Message

Shaokun Zhang Aug. 31, 2019, 7:12 a.m. UTC
The DCache clean & ICache invalidation requirements for instructions
to be data coherent are discoverable through new fields in CTR_EL0.
Let's support the two bits: if they are set, the CPU core will
not execute the unnecessary DCache clean or ICache invalidation
instructions.

2019-08-31  Shaokun Zhang  <zhangshaokun@hisilicon.com>

    * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC in
__aarch64_sync_cache_range function.
---
 libgcc/config/aarch64/sync-cache.c | 56 ++++++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 21 deletions(-)

Comments

Kyrill Tkachov Sept. 2, 2019, 2:24 p.m. UTC | #1
Hi Shaokun,

On 8/31/19 8:12 AM, Shaokun Zhang wrote:
> The DCache clean & ICache invalidation requirements for instructions
> to be data coherence are discoverable through new fields in CTR_EL0.
> Let's support the two bits if they are enabled, the CPU core will
> not execute the unnecessary DCache clean or Icache Invalidation
> instructions.
>
> 2019-08-31  Shaokun Zhang <zhangshaokun@hisilicon.com>
>
>     * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC in
> __aarch64_sync_cache_range function.

As mentioned in the RFC, I think this is ok but...


> ---
>  libgcc/config/aarch64/sync-cache.c | 56 
> ++++++++++++++++++++++++--------------
>  1 file changed, 35 insertions(+), 21 deletions(-)
>
> diff --git a/libgcc/config/aarch64/sync-cache.c 
> b/libgcc/config/aarch64/sync-cache.c
> index 791f5e42ff44..0b057efbdcab 100644
> --- a/libgcc/config/aarch64/sync-cache.c
> +++ b/libgcc/config/aarch64/sync-cache.c
> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along 
> with this program;
>  see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
>  <http://www.gnu.org/licenses/>. */
>
> +#define CTR_IDC_SHIFT           28
> +#define CTR_DIC_SHIFT           29
> +
>  void __aarch64_sync_cache_range (const void *, const void *);
>
>  void
> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, 
> const void *end)
>    icache_lsize = 4 << (cache_info & 0xF);
>    dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
>
> -  /* Loop over the address range, clearing one cache line at once.
> -     Data cache must be flushed to unification first to make sure the
> -     instruction cache fetches the updated data.  'end' is exclusive,
> -     as per the GNU definition of __clear_cache.  */
> +  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of 
> Unification is
> +     not required for instruction to data coherence.  */
> +
> +  if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) {
> +    /* Loop over the address range, clearing one cache line at once.
> +       Data cache must be flushed to unification first to make sure the
> +       instruction cache fetches the updated data.  'end' is exclusive,
> +       as per the GNU definition of __clear_cache.  */
>
> -  /* Make the start address of the loop cache aligned.  */
> -  address = (const char*) ((__UINTPTR_TYPE__) base
> -                          & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
> +    /* Make the start address of the loop cache aligned. */
> +    address = (const char*) ((__UINTPTR_TYPE__) base
> +                            & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>
> -  for (; address < (const char *) end; address += dcache_lsize)
> -    asm volatile ("dc\tcvau, %0"
> -                 :
> -                 : "r" (address)
> -                 : "memory");
> +    for (; address < (const char *) end; address += dcache_lsize)
> +      asm volatile ("dc\tcvau, %0"
> +                   :
> +                   : "r" (address)
> +                   : "memory");
> +  }
>
>    asm volatile ("dsb\tish" : : : "memory");
>
> -  /* Make the start address of the loop cache aligned.  */
> -  address = (const char*) ((__UINTPTR_TYPE__) base
> -                          & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
> +  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the 
> Point of
> +     Unification is not required for instruction to data coherence.  */
> +
> +  if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) {
> +    /* Make the start address of the loop cache aligned. */
> +    address = (const char*) ((__UINTPTR_TYPE__) base
> +                            & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>
> -  for (; address < (const char *) end; address += icache_lsize)
> -    asm volatile ("ic\tivau, %0"
> -                 :
> -                 : "r" (address)
> -                 : "memory");
> +    for (; address < (const char *) end; address += icache_lsize)
> +      asm volatile ("ic\tivau, %0"
> +                   :
> +                   : "r" (address)
> +                   : "memory");
>
> -  asm volatile ("dsb\tish; isb" : : : "memory");
> +    asm volatile ("dsb\tish" : : : "memory");
> +  }
> +  asm volatile("isb" : : : "memory")

... a semicolon has gone missing here after the asm volatile so this 
will not compile. A problem with the mail client?

Thanks,

Kyrill



>  }
> -- 
> 2.7.4
>
Kyrill Tkachov Sept. 2, 2019, 2:31 p.m. UTC | #2
Hi Shaokun

On 8/31/19 8:12 AM, Shaokun Zhang wrote:
> The DCache clean & ICache invalidation requirements for instructions
> to be data coherence are discoverable through new fields in CTR_EL0.
> Let's support the two bits if they are enabled, the CPU core will
> not execute the unnecessary DCache clean or Icache Invalidation
> instructions.
>
> 2019-08-31  Shaokun Zhang <zhangshaokun@hisilicon.com>
>
>     * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC in
> __aarch64_sync_cache_range function.

Sorry, I just tried compiling this to look at the assembly output. I 
think there's a couple of issues...


> ---
>  libgcc/config/aarch64/sync-cache.c | 56 
> ++++++++++++++++++++++++--------------
>  1 file changed, 35 insertions(+), 21 deletions(-)
>
> diff --git a/libgcc/config/aarch64/sync-cache.c 
> b/libgcc/config/aarch64/sync-cache.c
> index 791f5e42ff44..0b057efbdcab 100644
> --- a/libgcc/config/aarch64/sync-cache.c
> +++ b/libgcc/config/aarch64/sync-cache.c
> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along 
> with this program;
>  see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
>  <http://www.gnu.org/licenses/>. */
>
> +#define CTR_IDC_SHIFT           28
> +#define CTR_DIC_SHIFT           29
> +
>  void __aarch64_sync_cache_range (const void *, const void *);
>
>  void
> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, 
> const void *end)
>    icache_lsize = 4 << (cache_info & 0xF);
>    dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
>
> -  /* Loop over the address range, clearing one cache line at once.
> -     Data cache must be flushed to unification first to make sure the
> -     instruction cache fetches the updated data.  'end' is exclusive,
> -     as per the GNU definition of __clear_cache.  */
> +  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of 
> Unification is
> +     not required for instruction to data coherence.  */
> +
> +  if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) {


By the C rules, this will evaluate to 0 always and the whole path is 
eliminated. What you want here is:

   if (((cache_info >> CTR_IDC_SHIFT) & 0x1) == 0x0)


> +    /* Loop over the address range, clearing one cache line at once.
> +       Data cache must be flushed to unification first to make sure the
> +       instruction cache fetches the updated data.  'end' is exclusive,
> +       as per the GNU definition of __clear_cache.  */
>
> -  /* Make the start address of the loop cache aligned.  */
> -  address = (const char*) ((__UINTPTR_TYPE__) base
> -                          & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
> +    /* Make the start address of the loop cache aligned. */
> +    address = (const char*) ((__UINTPTR_TYPE__) base
> +                            & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>
> -  for (; address < (const char *) end; address += dcache_lsize)
> -    asm volatile ("dc\tcvau, %0"
> -                 :
> -                 : "r" (address)
> -                 : "memory");
> +    for (; address < (const char *) end; address += dcache_lsize)
> +      asm volatile ("dc\tcvau, %0"
> +                   :
> +                   : "r" (address)
> +                   : "memory");
> +  }
>
>    asm volatile ("dsb\tish" : : : "memory");
>
> -  /* Make the start address of the loop cache aligned.  */
> -  address = (const char*) ((__UINTPTR_TYPE__) base
> -                          & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
> +  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the 
> Point of
> +     Unification is not required for instruction to data coherence.  */
> +
> +  if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) {

Same here, this should be:

   if (((cache_info >> CTR_DIC_SHIFT) & 0x1) == 0x0)


Thanks,

Kyrill

> +    /* Make the start address of the loop cache aligned. */
> +    address = (const char*) ((__UINTPTR_TYPE__) base
> +                            & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>
> -  for (; address < (const char *) end; address += icache_lsize)
> -    asm volatile ("ic\tivau, %0"
> -                 :
> -                 : "r" (address)
> -                 : "memory");
> +    for (; address < (const char *) end; address += icache_lsize)
> +      asm volatile ("ic\tivau, %0"
> +                   :
> +                   : "r" (address)
> +                   : "memory");
>
> -  asm volatile ("dsb\tish; isb" : : : "memory");
> +    asm volatile ("dsb\tish" : : : "memory");
> +  }
> +  asm volatile("isb" : : : "memory")
>  }
> -- 
> 2.7.4
>
Shaokun Zhang Sept. 3, 2019, 7:15 a.m. UTC | #3
Hi Kyrill,

On 2019/9/2 22:24, Kyrill Tkachov wrote:
> Hi Shaokun,
> 
> On 8/31/19 8:12 AM, Shaokun Zhang wrote:
>> The DCache clean & ICache invalidation requirements for instructions
>> to be data coherence are discoverable through new fields in CTR_EL0.
>> Let's support the two bits if they are enabled, the CPU core will
>> not execute the unnecessary DCache clean or Icache Invalidation
>> instructions.
>>
>> 2019-08-31  Shaokun Zhang  <zhangshaokun@hisilicon.com>
>>
>>     * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC in
>> __aarch64_sync_cache_range function.
> 
> As mentioned in the RFC, I think this is ok but...
> 
> 
>> ---
>>  libgcc/config/aarch64/sync-cache.c | 56 ++++++++++++++++++++++++--------------
>>  1 file changed, 35 insertions(+), 21 deletions(-)
>>
>> diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c
>> index 791f5e42ff44..0b057efbdcab 100644
>> --- a/libgcc/config/aarch64/sync-cache.c
>> +++ b/libgcc/config/aarch64/sync-cache.c
>> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along with this program;
>>  see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>  <http://www.gnu.org/licenses/>.  */
>>  
>> +#define CTR_IDC_SHIFT           28
>> +#define CTR_DIC_SHIFT           29
>> +
>>  void __aarch64_sync_cache_range (const void *, const void *);
>>  
>>  void
>> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, const void *end)
>>    icache_lsize = 4 << (cache_info & 0xF);
>>    dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
>>  
>> -  /* Loop over the address range, clearing one cache line at once.
>> -     Data cache must be flushed to unification first to make sure the
>> -     instruction cache fetches the updated data.  'end' is exclusive,
>> -     as per the GNU definition of __clear_cache.  */
>> +  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is
>> +     not required for instruction to data coherence.  */
>> +
>> +  if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) {
>> +    /* Loop over the address range, clearing one cache line at once.
>> +       Data cache must be flushed to unification first to make sure the
>> +       instruction cache fetches the updated data.  'end' is exclusive,
>> +       as per the GNU definition of __clear_cache.  */
>>  
>> -  /* Make the start address of the loop cache aligned.  */
>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>> -                          & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>> +    /* Make the start address of the loop cache aligned.  */
>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>> +                            & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>>  
>> -  for (; address < (const char *) end; address += dcache_lsize)
>> -    asm volatile ("dc\tcvau, %0"
>> -                 :
>> -                 : "r" (address)
>> -                 : "memory");
>> +    for (; address < (const char *) end; address += dcache_lsize)
>> +      asm volatile ("dc\tcvau, %0"
>> +                   :
>> +                   : "r" (address)
>> +                   : "memory");
>> +  }
>>  
>>    asm volatile ("dsb\tish" : : : "memory");
>>  
>> -  /* Make the start address of the loop cache aligned.  */
>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>> -                          & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>> +  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of
>> +     Unification is not required for instruction to data coherence.  */
>> +
>> +  if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) {
>> +    /* Make the start address of the loop cache aligned.  */
>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>> +                            & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>>  
>> -  for (; address < (const char *) end; address += icache_lsize)
>> -    asm volatile ("ic\tivau, %0"
>> -                 :
>> -                 : "r" (address)
>> -                 : "memory");
>> +    for (; address < (const char *) end; address += icache_lsize)
>> +      asm volatile ("ic\tivau, %0"
>> +                   :
>> +                   : "r" (address)
>> +                   : "memory");
>>  
>> -  asm volatile ("dsb\tish; isb" : : : "memory");
>> +    asm volatile ("dsb\tish" : : : "memory");
>> +  }
>> +  asm volatile("isb" : : : "memory")
> 
> ... a semicolon has gone missing here after the asm volatile so this will not compile. A problem with the mail client?
> 

Apologies that it seems that I made the wrong branch when I compiled the patch between several branches. :-(

zhangshaokun@ubuntu:~/gcc_installdir/bin$ ./gcc --version
gcc (GCC) 10.0.0 20190831 (experimental)
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

zhangshaokun@ubuntu:~/gcc_installdir/bin$

I have fixed it and double check its branch, it is ok now. ;-)

Thanks,
Shaokun


> Thanks,
> 
> Kyrill
> 
> 
> 
>>  }
>> -- 
>> 2.7.4
>>
Shaokun Zhang Sept. 3, 2019, 7:18 a.m. UTC | #4
Hi Kyrill,

On 2019/9/2 22:31, Kyrill Tkachov wrote:
> Hi Shaokun
> 
> On 8/31/19 8:12 AM, Shaokun Zhang wrote:
>> The DCache clean & ICache invalidation requirements for instructions
>> to be data coherence are discoverable through new fields in CTR_EL0.
>> Let's support the two bits if they are enabled, the CPU core will
>> not execute the unnecessary DCache clean or Icache Invalidation
>> instructions.
>>
>> 2019-08-31  Shaokun Zhang <zhangshaokun@hisilicon.com>
>>
>>     * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC in
>> __aarch64_sync_cache_range function.
> 
> Sorry, I just tried compiling this to look at the assembly output. I think there's a couple of issues...
> 

Hmm, sorry for this -Wparentheses issue and I have fixed them in next version.
About all the problems, I'm so sorry that I shall double check it before I send
it to community.

Thanks,
Shaokun

> 
>> ---
>>  libgcc/config/aarch64/sync-cache.c | 56 ++++++++++++++++++++++++--------------
>>  1 file changed, 35 insertions(+), 21 deletions(-)
>>
>> diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c
>> index 791f5e42ff44..0b057efbdcab 100644
>> --- a/libgcc/config/aarch64/sync-cache.c
>> +++ b/libgcc/config/aarch64/sync-cache.c
>> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along with this program;
>>  see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
>>  <http://www.gnu.org/licenses/>. */
>>
>> +#define CTR_IDC_SHIFT           28
>> +#define CTR_DIC_SHIFT           29
>> +
>>  void __aarch64_sync_cache_range (const void *, const void *);
>>
>>  void
>> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, const void *end)
>>    icache_lsize = 4 << (cache_info & 0xF);
>>    dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
>>
>> -  /* Loop over the address range, clearing one cache line at once.
>> -     Data cache must be flushed to unification first to make sure the
>> -     instruction cache fetches the updated data.  'end' is exclusive,
>> -     as per the GNU definition of __clear_cache.  */
>> +  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is
>> +     not required for instruction to data coherence.  */
>> +
>> +  if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) {
> 
> 
> By the C rules, this will evaluate to 0 always and the whole path is eliminated. What you want here is:
> 
>   if (((cache_info >> CTR_IDC_SHIFT) & 0x1) == 0x0)
> 
> 
>> +    /* Loop over the address range, clearing one cache line at once.
>> +       Data cache must be flushed to unification first to make sure the
>> +       instruction cache fetches the updated data.  'end' is exclusive,
>> +       as per the GNU definition of __clear_cache.  */
>>
>> -  /* Make the start address of the loop cache aligned.  */
>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>> -                          & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>> +    /* Make the start address of the loop cache aligned. */
>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>> +                            & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>>
>> -  for (; address < (const char *) end; address += dcache_lsize)
>> -    asm volatile ("dc\tcvau, %0"
>> -                 :
>> -                 : "r" (address)
>> -                 : "memory");
>> +    for (; address < (const char *) end; address += dcache_lsize)
>> +      asm volatile ("dc\tcvau, %0"
>> +                   :
>> +                   : "r" (address)
>> +                   : "memory");
>> +  }
>>
>>    asm volatile ("dsb\tish" : : : "memory");
>>
>> -  /* Make the start address of the loop cache aligned.  */
>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>> -                          & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>> +  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of
>> +     Unification is not required for instruction to data coherence.  */
>> +
>> +  if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) {
> 
> Same here, this should be:
> 
>   if (((cache_info >> CTR_DIC_SHIFT) & 0x1) == 0x0)
> 
> 
> Thanks,
> 
> Kyrill
> 
>> +    /* Make the start address of the loop cache aligned. */
>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>> +                            & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>>
>> -  for (; address < (const char *) end; address += icache_lsize)
>> -    asm volatile ("ic\tivau, %0"
>> -                 :
>> -                 : "r" (address)
>> -                 : "memory");
>> +    for (; address < (const char *) end; address += icache_lsize)
>> +      asm volatile ("ic\tivau, %0"
>> +                   :
>> +                   : "r" (address)
>> +                   : "memory");
>>
>> -  asm volatile ("dsb\tish; isb" : : : "memory");
>> +    asm volatile ("dsb\tish" : : : "memory");
>> +  }
>> +  asm volatile("isb" : : : "memory")
>>  }
>> -- 
>> 2.7.4
>>
> 
> .
>

Patch
diff mbox series

diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c
index 791f5e42ff44..0b057efbdcab 100644
--- a/libgcc/config/aarch64/sync-cache.c
+++ b/libgcc/config/aarch64/sync-cache.c
@@ -23,6 +23,9 @@  a copy of the GCC Runtime Library Exception along with this program;
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */
 
+#define CTR_IDC_SHIFT           28
+#define CTR_DIC_SHIFT           29
+
 void __aarch64_sync_cache_range (const void *, const void *);
 
 void
@@ -41,32 +44,43 @@  __aarch64_sync_cache_range (const void *base, const void *end)
   icache_lsize = 4 << (cache_info & 0xF);
   dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
 
-  /* Loop over the address range, clearing one cache line at once.
-     Data cache must be flushed to unification first to make sure the
-     instruction cache fetches the updated data.  'end' is exclusive,
-     as per the GNU definition of __clear_cache.  */
+  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is
+     not required for instruction to data coherence.  */
+
+  if (((cache_info >> CTR_IDC_SHIFT) & 0x1) == 0x0) {
+    /* Loop over the address range, clearing one cache line at once.
+       Data cache must be flushed to unification first to make sure the
+       instruction cache fetches the updated data.  'end' is exclusive,
+       as per the GNU definition of __clear_cache.  */
 
-  /* Make the start address of the loop cache aligned.  */
-  address = (const char*) ((__UINTPTR_TYPE__) base
-			   & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
+    /* Make the start address of the loop cache aligned.  */
+    address = (const char*) ((__UINTPTR_TYPE__) base
+			     & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
 
-  for (; address < (const char *) end; address += dcache_lsize)
-    asm volatile ("dc\tcvau, %0"
-		  :
-		  : "r" (address)
-		  : "memory");
+    for (; address < (const char *) end; address += dcache_lsize)
+      asm volatile ("dc\tcvau, %0"
+		    :
+		    : "r" (address)
+		    : "memory");
+  }
 
   asm volatile ("dsb\tish" : : : "memory");
 
-  /* Make the start address of the loop cache aligned.  */
-  address = (const char*) ((__UINTPTR_TYPE__) base
-			   & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
+  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of
+     Unification is not required for instruction to data coherence.  */
+
+  if (((cache_info >> CTR_DIC_SHIFT) & 0x1) == 0x0) {
+    /* Make the start address of the loop cache aligned.  */
+    address = (const char*) ((__UINTPTR_TYPE__) base
+			     & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
 
-  for (; address < (const char *) end; address += icache_lsize)
-    asm volatile ("ic\tivau, %0"
-		  :
-		  : "r" (address)
-		  : "memory");
+    for (; address < (const char *) end; address += icache_lsize)
+      asm volatile ("ic\tivau, %0"
+		    :
+		    : "r" (address)
+		    : "memory");
 
-  asm volatile ("dsb\tish; isb" : : : "memory");
+    asm volatile ("dsb\tish" : : : "memory");
+  }
+  asm volatile("isb" : : : "memory");
 }