diff mbox series

[RFC,AARCH64] Add support for new control bits CTR_EL0.DIC and CTR_EL0.IDC

Message ID 1566483040-28204-1-git-send-email-zhangshaokun@hisilicon.com
State New
Headers show
Series [RFC,AARCH64] Add support for new control bits CTR_EL0.DIC and CTR_EL0.IDC | expand

Commit Message

Shaokun Zhang Aug. 22, 2019, 2:10 p.m. UTC
The DCache clean & ICache invalidation requirements for instructions
to be data coherence are discoverable through new fields in CTR_EL0.
Let's support the two bits if they are enabled, then we can get some
performance benefit from this feature.

2019-08-22  Shaokun Zhang  <zhangshaokun@hisilicon.com>

    * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC

---
 libgcc/config/aarch64/sync-cache.c | 56 ++++++++++++++++++++++++--------------
 1 file changed, 35 insertions(+), 21 deletions(-)

Comments

Kyrill Tkachov Aug. 27, 2019, 10:16 a.m. UTC | #1
Hi Shaokun,

On 8/22/19 3:10 PM, Shaokun Zhang wrote:
> The DCache clean & ICache invalidation requirements for instructions
> to be data coherence are discoverable through new fields in CTR_EL0.
> Let's support the two bits if they are enabled, then we can get some
> performance benefit from this feature.
>
> 2019-08-22  Shaokun Zhang <zhangshaokun@hisilicon.com>
>
>     * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC
>
This needs to mention __aarch64_sync_cache_range as the function being 
changed.


> ---
>  libgcc/config/aarch64/sync-cache.c | 56 
> ++++++++++++++++++++++++--------------
>  1 file changed, 35 insertions(+), 21 deletions(-)
>
> diff --git a/libgcc/config/aarch64/sync-cache.c 
> b/libgcc/config/aarch64/sync-cache.c
> index 791f5e42ff44..0b057efbdcab 100644
> --- a/libgcc/config/aarch64/sync-cache.c
> +++ b/libgcc/config/aarch64/sync-cache.c
> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along 
> with this program;
>  see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
>  <http://www.gnu.org/licenses/>. */
>
> +#define CTR_IDC_SHIFT           28
> +#define CTR_DIC_SHIFT           29
> +
>  void __aarch64_sync_cache_range (const void *, const void *);
>
>  void
> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, 
> const void *end)
>    icache_lsize = 4 << (cache_info & 0xF);
>    dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
>
> -  /* Loop over the address range, clearing one cache line at once.
> -     Data cache must be flushed to unification first to make sure the
> -     instruction cache fetches the updated data.  'end' is exclusive,
> -     as per the GNU definition of __clear_cache.  */
> +  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of 
> Unification is
> +     not required for instruction to data coherence.  */
> +
> +  if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) {
> +    /* Loop over the address range, clearing one cache line at once.
> +       Data cache must be flushed to unification first to make sure the
> +       instruction cache fetches the updated data.  'end' is exclusive,
> +       as per the GNU definition of __clear_cache.  */
>
> -  /* Make the start address of the loop cache aligned.  */
> -  address = (const char*) ((__UINTPTR_TYPE__) base
> -                          & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
> +    /* Make the start address of the loop cache aligned. */
> +    address = (const char*) ((__UINTPTR_TYPE__) base
> +                            & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>
> -  for (; address < (const char *) end; address += dcache_lsize)
> -    asm volatile ("dc\tcvau, %0"
> -                 :
> -                 : "r" (address)
> -                 : "memory");
> +    for (; address < (const char *) end; address += dcache_lsize)
> +      asm volatile ("dc\tcvau, %0"
> +                   :
> +                   : "r" (address)
> +                   : "memory");
> +  }
>
>    asm volatile ("dsb\tish" : : : "memory");
>
> -  /* Make the start address of the loop cache aligned.  */
> -  address = (const char*) ((__UINTPTR_TYPE__) base
> -                          & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
> +  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the 
> Point of
> +     Unification is not required for instruction to data coherence.  */
> +
> +  if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) {
> +    /* Make the start address of the loop cache aligned. */
> +    address = (const char*) ((__UINTPTR_TYPE__) base
> +                            & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>
> -  for (; address < (const char *) end; address += icache_lsize)
> -    asm volatile ("ic\tivau, %0"
> -                 :
> -                 : "r" (address)
> -                 : "memory");
> +    for (; address < (const char *) end; address += icache_lsize)
> +      asm volatile ("ic\tivau, %0"
> +                   :
> +                   : "r" (address)
> +                   : "memory");
>
> -  asm volatile ("dsb\tish; isb" : : : "memory");
> +    asm volatile ("dsb\tish" : : : "memory");
> +  }
> +  asm volatile("isb" : : : "memory")
>  }

This looks ok to me (but you'll need approval from the maintainers).

There is a question of whether we need the barriers if both DIC and IDC 
are 1 (in which case no cache-maintentance instructions are emitted).

I think we still want them to ensure the writes have been completed and 
the fetches from the updated cache are up-to-date.

For arch versions before CTR_EL0.{DIC, IDC} these bits are reserved as 
zero so the code will do the right thing on those targets.

How has this patch been tested? Do you also have any performance results 
you can share?

Thanks,

Kyrill


> -- 
> 2.7.4
>
Shaokun Zhang Aug. 28, 2019, 8:47 a.m. UTC | #2
Hi Kyrill,

On 2019/8/27 18:16, Kyrill Tkachov wrote:
> Hi Shaokun,
> 
> On 8/22/19 3:10 PM, Shaokun Zhang wrote:
>> The DCache clean & ICache invalidation requirements for instructions
>> to be data coherence are discoverable through new fields in CTR_EL0.
>> Let's support the two bits if they are enabled, then we can get some
>> performance benefit from this feature.
>>
>> 2019-08-22  Shaokun Zhang <zhangshaokun@hisilicon.com>
>>
>>     * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC
>>
> This needs to mention __aarch64_sync_cache_range as the function being changed.
> 

Ok, I will update in next version.

> 
>> ---
>>  libgcc/config/aarch64/sync-cache.c | 56 ++++++++++++++++++++++++--------------
>>  1 file changed, 35 insertions(+), 21 deletions(-)
>>
>> diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c
>> index 791f5e42ff44..0b057efbdcab 100644
>> --- a/libgcc/config/aarch64/sync-cache.c
>> +++ b/libgcc/config/aarch64/sync-cache.c
>> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along with this program;
>>  see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
>>  <http://www.gnu.org/licenses/>. */
>>
>> +#define CTR_IDC_SHIFT           28
>> +#define CTR_DIC_SHIFT           29
>> +
>>  void __aarch64_sync_cache_range (const void *, const void *);
>>
>>  void
>> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, const void *end)
>>    icache_lsize = 4 << (cache_info & 0xF);
>>    dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
>>
>> -  /* Loop over the address range, clearing one cache line at once.
>> -     Data cache must be flushed to unification first to make sure the
>> -     instruction cache fetches the updated data.  'end' is exclusive,
>> -     as per the GNU definition of __clear_cache.  */
>> +  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is
>> +     not required for instruction to data coherence.  */
>> +
>> +  if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) {
>> +    /* Loop over the address range, clearing one cache line at once.
>> +       Data cache must be flushed to unification first to make sure the
>> +       instruction cache fetches the updated data.  'end' is exclusive,
>> +       as per the GNU definition of __clear_cache.  */
>>
>> -  /* Make the start address of the loop cache aligned.  */
>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>> -                          & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>> +    /* Make the start address of the loop cache aligned. */
>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>> +                            & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>>
>> -  for (; address < (const char *) end; address += dcache_lsize)
>> -    asm volatile ("dc\tcvau, %0"
>> -                 :
>> -                 : "r" (address)
>> -                 : "memory");
>> +    for (; address < (const char *) end; address += dcache_lsize)
>> +      asm volatile ("dc\tcvau, %0"
>> +                   :
>> +                   : "r" (address)
>> +                   : "memory");
>> +  }
>>
>>    asm volatile ("dsb\tish" : : : "memory");
>>
>> -  /* Make the start address of the loop cache aligned.  */
>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>> -                          & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>> +  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of
>> +     Unification is not required for instruction to data coherence.  */
>> +
>> +  if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) {
>> +    /* Make the start address of the loop cache aligned. */
>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>> +                            & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>>
>> -  for (; address < (const char *) end; address += icache_lsize)
>> -    asm volatile ("ic\tivau, %0"
>> -                 :
>> -                 : "r" (address)
>> -                 : "memory");
>> +    for (; address < (const char *) end; address += icache_lsize)
>> +      asm volatile ("ic\tivau, %0"
>> +                   :
>> +                   : "r" (address)
>> +                   : "memory");
>>
>> -  asm volatile ("dsb\tish; isb" : : : "memory");
>> +    asm volatile ("dsb\tish" : : : "memory");
>> +  }
>> +  asm volatile("isb" : : : "memory")
>>  }
> 
> This looks ok to me (but you'll need approval from the maintainers).
> 
> There is a question of whether we need the barriers if both DIC and IDC are 1 (in which case no cache-maintentance instructions are emitted).
> 
> I think we still want them to ensure the writes have been completed and the fetches from the updated cache are up-to-date.
> 

At the beginning, I'm not sure how to deal with the barrier instructions if DIC and IDC
are 1, So I sent one mail to discuss it, it is a pity that no response.
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg217561.html

From the arm ARM document:
    STR W11, [X1]         ; W11 contains a new instruction to be stored in program memory
    DC CVAU, X1           ; clean to PoU makes the new instruction visible to instruction cache
    DSB ISH
    IC IVAU, X1           ; ensures instruction cache/branch predictor discards stale data
    DSB ISH               ; ensures completion of the invalidation
    ISB                   ; ensures instruction fetch path sees new instruction cache state

AFAIK, the first "DSB ISH" is used ensure STR and DC CVAU instruction; Even if IDC is 1,
"DSB ISH" shall be kept for STR instruction. The second "DSB ISH" and "ISB" are explained
in comments, so I think "ISB" shall be kept also.

> For arch versions before CTR_EL0.{DIC, IDC} these bits are reserved as zero so the code will do the right thing on those targets.
> 
> How has this patch been tested? Do you also have any performance results you can share?
> 

I sent the patch as RFC because our cpu core which supports DIC and IDC is designing,
so there is no performance result at the moment.

Thanks,
Shaokun

> Thanks,
> 
> Kyrill
> 
> 
>> -- 
>> 2.7.4
>>
> 
> .
>
Kyrill Tkachov Aug. 28, 2019, 8:57 a.m. UTC | #3
Hi Shaokun,

On 8/28/19 9:47 AM, Shaokun Zhang wrote:
> Hi Kyrill,
>
> On 2019/8/27 18:16, Kyrill Tkachov wrote:
>> Hi Shaokun,
>>
>> On 8/22/19 3:10 PM, Shaokun Zhang wrote:
>>> The DCache clean & ICache invalidation requirements for instructions
>>> to be data coherence are discoverable through new fields in CTR_EL0.
>>> Let's support the two bits if they are enabled, then we can get some
>>> performance benefit from this feature.
>>>
>>> 2019-08-22  Shaokun Zhang <zhangshaokun@hisilicon.com>
>>>
>>>      * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC
>>>
>> This needs to mention __aarch64_sync_cache_range as the function being changed.
>>
> Ok, I will update in next version.
>
>>> ---
>>>   libgcc/config/aarch64/sync-cache.c | 56 ++++++++++++++++++++++++--------------
>>>   1 file changed, 35 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c
>>> index 791f5e42ff44..0b057efbdcab 100644
>>> --- a/libgcc/config/aarch64/sync-cache.c
>>> +++ b/libgcc/config/aarch64/sync-cache.c
>>> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along with this program;
>>>   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
>>>   <http://www.gnu.org/licenses/>. */
>>>
>>> +#define CTR_IDC_SHIFT           28
>>> +#define CTR_DIC_SHIFT           29
>>> +
>>>   void __aarch64_sync_cache_range (const void *, const void *);
>>>
>>>   void
>>> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, const void *end)
>>>     icache_lsize = 4 << (cache_info & 0xF);
>>>     dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
>>>
>>> -  /* Loop over the address range, clearing one cache line at once.
>>> -     Data cache must be flushed to unification first to make sure the
>>> -     instruction cache fetches the updated data.  'end' is exclusive,
>>> -     as per the GNU definition of __clear_cache.  */
>>> +  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is
>>> +     not required for instruction to data coherence.  */
>>> +
>>> +  if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) {
>>> +    /* Loop over the address range, clearing one cache line at once.
>>> +       Data cache must be flushed to unification first to make sure the
>>> +       instruction cache fetches the updated data.  'end' is exclusive,
>>> +       as per the GNU definition of __clear_cache.  */
>>>
>>> -  /* Make the start address of the loop cache aligned.  */
>>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>>> -                          & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>>> +    /* Make the start address of the loop cache aligned. */
>>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>>> +                            & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>>>
>>> -  for (; address < (const char *) end; address += dcache_lsize)
>>> -    asm volatile ("dc\tcvau, %0"
>>> -                 :
>>> -                 : "r" (address)
>>> -                 : "memory");
>>> +    for (; address < (const char *) end; address += dcache_lsize)
>>> +      asm volatile ("dc\tcvau, %0"
>>> +                   :
>>> +                   : "r" (address)
>>> +                   : "memory");
>>> +  }
>>>
>>>     asm volatile ("dsb\tish" : : : "memory");
>>>
>>> -  /* Make the start address of the loop cache aligned.  */
>>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>>> -                          & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>>> +  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of
>>> +     Unification is not required for instruction to data coherence.  */
>>> +
>>> +  if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) {
>>> +    /* Make the start address of the loop cache aligned. */
>>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>>> +                            & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>>>
>>> -  for (; address < (const char *) end; address += icache_lsize)
>>> -    asm volatile ("ic\tivau, %0"
>>> -                 :
>>> -                 : "r" (address)
>>> -                 : "memory");
>>> +    for (; address < (const char *) end; address += icache_lsize)
>>> +      asm volatile ("ic\tivau, %0"
>>> +                   :
>>> +                   : "r" (address)
>>> +                   : "memory");
>>>
>>> -  asm volatile ("dsb\tish; isb" : : : "memory");
>>> +    asm volatile ("dsb\tish" : : : "memory");
>>> +  }
>>> +  asm volatile("isb" : : : "memory")
>>>   }
>> This looks ok to me (but you'll need approval from the maintainers).
>>
>> There is a question of whether we need the barriers if both DIC and IDC are 1 (in which case no cache-maintentance instructions are emitted).
>>
>> I think we still want them to ensure the writes have been completed and the fetches from the updated cache are up-to-date.
>>
> At the beginning, I'm not sure how to deal with the barrier instructions if DIC and IDC
> are 1, So I sent one mail to discuss it, it is a pity that no response.
> https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg217561.html

Sorry for that. August is a tricky month due to summer vacation in many 
places and mail can slip through the cracks...

I recommend you ping your question if you haven't had a reply within a 
week or so.

>
>  From the arm ARM document:
>      STR W11, [X1]         ; W11 contains a new instruction to be stored in program memory
>      DC CVAU, X1           ; clean to PoU makes the new instruction visible to instruction cache
>      DSB ISH
>      IC IVAU, X1           ; ensures instruction cache/branch predictor discards stale data
>      DSB ISH               ; ensures completion of the invalidation
>      ISB                   ; ensures instruction fetch path sees new instruction cache state
>
> AFAIK, the first "DSB ISH" is used ensure STR and DC CVAU instruction; Even if IDC is 1,
> "DSB ISH" shall be kept for STR instruction. The second "DSB ISH" and "ISB" are explained
> in comments, so I think "ISB" shall be kept also.


Thanks, that's my understanding as well.


>> For arch versions before CTR_EL0.{DIC, IDC} these bits are reserved as zero so the code will do the right thing on those targets.
>>
>> How has this patch been tested? Do you also have any performance results you can share?
>>
> I sent the patch as RFC because our cpu core which supports DIC and IDC is designing,
> so there is no performance result at the moment.


That's okay. In the meantime a bootstrap and test cycle on a current 
aarch64 machine would be good to make sure existing functionality 
doesn't break.

Thanks,

Kyrill

>
> Thanks,
> Shaokun
>
>> Thanks,
>>
>> Kyrill
>>
>>
>>> -- 
>>> 2.7.4
>>>
>> .
>>
Shaokun Zhang Aug. 28, 2019, 9:27 a.m. UTC | #4
Hi Kyrill,

On 2019/8/28 16:57, Kyrill Tkachov wrote:
> Hi Shaokun,
> 
> On 8/28/19 9:47 AM, Shaokun Zhang wrote:
>> Hi Kyrill,
>>
>> On 2019/8/27 18:16, Kyrill Tkachov wrote:
>>> Hi Shaokun,
>>>
>>> On 8/22/19 3:10 PM, Shaokun Zhang wrote:
>>>> The DCache clean & ICache invalidation requirements for instructions
>>>> to be data coherence are discoverable through new fields in CTR_EL0.
>>>> Let's support the two bits if they are enabled, then we can get some
>>>> performance benefit from this feature.
>>>>
>>>> 2019-08-22  Shaokun Zhang <zhangshaokun@hisilicon.com>
>>>>
>>>>      * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC
>>>>
>>> This needs to mention __aarch64_sync_cache_range as the function being changed.
>>>
>> Ok, I will update in next version.
>>
>>>> ---
>>>>   libgcc/config/aarch64/sync-cache.c | 56 ++++++++++++++++++++++++--------------
>>>>   1 file changed, 35 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c
>>>> index 791f5e42ff44..0b057efbdcab 100644
>>>> --- a/libgcc/config/aarch64/sync-cache.c
>>>> +++ b/libgcc/config/aarch64/sync-cache.c
>>>> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along with this program;
>>>>   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
>>>>   <http://www.gnu.org/licenses/>. */
>>>>
>>>> +#define CTR_IDC_SHIFT           28
>>>> +#define CTR_DIC_SHIFT           29
>>>> +
>>>>   void __aarch64_sync_cache_range (const void *, const void *);
>>>>
>>>>   void
>>>> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, const void *end)
>>>>     icache_lsize = 4 << (cache_info & 0xF);
>>>>     dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
>>>>
>>>> -  /* Loop over the address range, clearing one cache line at once.
>>>> -     Data cache must be flushed to unification first to make sure the
>>>> -     instruction cache fetches the updated data.  'end' is exclusive,
>>>> -     as per the GNU definition of __clear_cache.  */
>>>> +  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is
>>>> +     not required for instruction to data coherence.  */
>>>> +
>>>> +  if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) {
>>>> +    /* Loop over the address range, clearing one cache line at once.
>>>> +       Data cache must be flushed to unification first to make sure the
>>>> +       instruction cache fetches the updated data.  'end' is exclusive,
>>>> +       as per the GNU definition of __clear_cache.  */
>>>>
>>>> -  /* Make the start address of the loop cache aligned.  */
>>>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>>>> -                          & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>>>> +    /* Make the start address of the loop cache aligned. */
>>>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>>>> +                            & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
>>>>
>>>> -  for (; address < (const char *) end; address += dcache_lsize)
>>>> -    asm volatile ("dc\tcvau, %0"
>>>> -                 :
>>>> -                 : "r" (address)
>>>> -                 : "memory");
>>>> +    for (; address < (const char *) end; address += dcache_lsize)
>>>> +      asm volatile ("dc\tcvau, %0"
>>>> +                   :
>>>> +                   : "r" (address)
>>>> +                   : "memory");
>>>> +  }
>>>>
>>>>     asm volatile ("dsb\tish" : : : "memory");
>>>>
>>>> -  /* Make the start address of the loop cache aligned.  */
>>>> -  address = (const char*) ((__UINTPTR_TYPE__) base
>>>> -                          & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>>>> +  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of
>>>> +     Unification is not required for instruction to data coherence.  */
>>>> +
>>>> +  if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) {
>>>> +    /* Make the start address of the loop cache aligned. */
>>>> +    address = (const char*) ((__UINTPTR_TYPE__) base
>>>> +                            & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
>>>>
>>>> -  for (; address < (const char *) end; address += icache_lsize)
>>>> -    asm volatile ("ic\tivau, %0"
>>>> -                 :
>>>> -                 : "r" (address)
>>>> -                 : "memory");
>>>> +    for (; address < (const char *) end; address += icache_lsize)
>>>> +      asm volatile ("ic\tivau, %0"
>>>> +                   :
>>>> +                   : "r" (address)
>>>> +                   : "memory");
>>>>
>>>> -  asm volatile ("dsb\tish; isb" : : : "memory");
>>>> +    asm volatile ("dsb\tish" : : : "memory");
>>>> +  }
>>>> +  asm volatile("isb" : : : "memory")
>>>>   }
>>> This looks ok to me (but you'll need approval from the maintainers).
>>>
>>> There is a question of whether we need the barriers if both DIC and IDC are 1 (in which case no cache-maintentance instructions are emitted).
>>>
>>> I think we still want them to ensure the writes have been completed and the fetches from the updated cache are up-to-date.
>>>
>> At the beginning, I'm not sure how to deal with the barrier instructions if DIC and IDC
>> are 1, So I sent one mail to discuss it, it is a pity that no response.
>> https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg217561.html
> 
> Sorry for that. August is a tricky month due to summer vacation in many places and mail can slip through the cracks...
> 

Oh, thanks for more information about it, Got it.

> I recommend you ping your question if you haven't had a reply within a week or so.
> 

Good idea, I will follow it in the future work.

>>
>>  From the arm ARM document:
>>      STR W11, [X1]         ; W11 contains a new instruction to be stored in program memory
>>      DC CVAU, X1           ; clean to PoU makes the new instruction visible to instruction cache
>>      DSB ISH
>>      IC IVAU, X1           ; ensures instruction cache/branch predictor discards stale data
>>      DSB ISH               ; ensures completion of the invalidation
>>      ISB                   ; ensures instruction fetch path sees new instruction cache state
>>
>> AFAIK, the first "DSB ISH" is used ensure STR and DC CVAU instruction; Even if IDC is 1,
>> "DSB ISH" shall be kept for STR instruction. The second "DSB ISH" and "ISB" are explained
>> in comments, so I think "ISB" shall be kept also.
> 
> 
> Thanks, that's my understanding as well.
> 
> 
>>> For arch versions before CTR_EL0.{DIC, IDC} these bits are reserved as zero so the code will do the right thing on those targets.
>>>
>>> How has this patch been tested? Do you also have any performance results you can share?
>>>
>> I sent the patch as RFC because our cpu core which supports DIC and IDC is designing,
>> so there is no performance result at the moment.
> 
> 
> That's okay. In the meantime a bootstrap and test cycle on a current aarch64 machine would be good to make sure existing functionality doesn't break.
> 

I will update the ChangeLog and do the formal patch if the maintainers are happy on it.

Cheers,
Shaokun

> Thanks,
> 
> Kyrill
> 
>>
>> Thanks,
>> Shaokun
>>
>>> Thanks,
>>>
>>> Kyrill
>>>
>>>
>>>> -- 
>>>> 2.7.4
>>>>
>>> .
>>>
> 
> .
>
diff mbox series

Patch

diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c
index 791f5e42ff44..0b057efbdcab 100644
--- a/libgcc/config/aarch64/sync-cache.c
+++ b/libgcc/config/aarch64/sync-cache.c
@@ -23,6 +23,9 @@  a copy of the GCC Runtime Library Exception along with this program;
 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 <http://www.gnu.org/licenses/>.  */
 
+#define CTR_IDC_SHIFT           28
+#define CTR_DIC_SHIFT           29
+
 void __aarch64_sync_cache_range (const void *, const void *);
 
 void
@@ -41,32 +44,43 @@  __aarch64_sync_cache_range (const void *base, const void *end)
   icache_lsize = 4 << (cache_info & 0xF);
   dcache_lsize = 4 << ((cache_info >> 16) & 0xF);
 
-  /* Loop over the address range, clearing one cache line at once.
-     Data cache must be flushed to unification first to make sure the
-     instruction cache fetches the updated data.  'end' is exclusive,
-     as per the GNU definition of __clear_cache.  */
+  /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is
+     not required for instruction to data coherence.  */
+
+  if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) {
+    /* Loop over the address range, clearing one cache line at once.
+       Data cache must be flushed to unification first to make sure the
+       instruction cache fetches the updated data.  'end' is exclusive,
+       as per the GNU definition of __clear_cache.  */
 
-  /* Make the start address of the loop cache aligned.  */
-  address = (const char*) ((__UINTPTR_TYPE__) base
-			   & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
+    /* Make the start address of the loop cache aligned.  */
+    address = (const char*) ((__UINTPTR_TYPE__) base
+			     & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1));
 
-  for (; address < (const char *) end; address += dcache_lsize)
-    asm volatile ("dc\tcvau, %0"
-		  :
-		  : "r" (address)
-		  : "memory");
+    for (; address < (const char *) end; address += dcache_lsize)
+      asm volatile ("dc\tcvau, %0"
+		    :
+		    : "r" (address)
+		    : "memory");
+  }
 
   asm volatile ("dsb\tish" : : : "memory");
 
-  /* Make the start address of the loop cache aligned.  */
-  address = (const char*) ((__UINTPTR_TYPE__) base
-			   & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
+  /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of
+     Unification is not required for instruction to data coherence.  */
+
+  if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) {
+    /* Make the start address of the loop cache aligned.  */
+    address = (const char*) ((__UINTPTR_TYPE__) base
+			     & ~ (__UINTPTR_TYPE__) (icache_lsize - 1));
 
-  for (; address < (const char *) end; address += icache_lsize)
-    asm volatile ("ic\tivau, %0"
-		  :
-		  : "r" (address)
-		  : "memory");
+    for (; address < (const char *) end; address += icache_lsize)
+      asm volatile ("ic\tivau, %0"
+		    :
+		    : "r" (address)
+		    : "memory");
 
-  asm volatile ("dsb\tish; isb" : : : "memory");
+    asm volatile ("dsb\tish" : : : "memory");
+  }
+  asm volatile("isb" : : : "memory")
 }