Message ID | 1567235533-5731-1-git-send-email-zhangshaokun@hisilicon.com |
---|---|
State | New |
Headers | show |
Series | [AARCH64] Add support for new control bits CTR_EL0.DIC and CTR_EL0.IDC | expand |
Hi Shaokun, On 8/31/19 8:12 AM, Shaokun Zhang wrote: > The DCache clean & ICache invalidation requirements for instructions > to be data coherence are discoverable through new fields in CTR_EL0. > Let's support the two bits if they are enabled, the CPU core will > not execute the unnecessary DCache clean or Icache Invalidation > instructions. > > 2019-08-31 Shaokun Zhang <zhangshaokun@hisilicon.com> > > * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC in > __aarch64_sync_cache_range function. As mentioned in the RFC, I think this is ok but... > --- > libgcc/config/aarch64/sync-cache.c | 56 > ++++++++++++++++++++++++-------------- > 1 file changed, 35 insertions(+), 21 deletions(-) > > diff --git a/libgcc/config/aarch64/sync-cache.c > b/libgcc/config/aarch64/sync-cache.c > index 791f5e42ff44..0b057efbdcab 100644 > --- a/libgcc/config/aarch64/sync-cache.c > +++ b/libgcc/config/aarch64/sync-cache.c > @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along > with this program; > see the files COPYING3 and COPYING.RUNTIME respectively. If not, see > <http://www.gnu.org/licenses/>. */ > > +#define CTR_IDC_SHIFT 28 > +#define CTR_DIC_SHIFT 29 > + > void __aarch64_sync_cache_range (const void *, const void *); > > void > @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, > const void *end) > icache_lsize = 4 << (cache_info & 0xF); > dcache_lsize = 4 << ((cache_info >> 16) & 0xF); > > - /* Loop over the address range, clearing one cache line at once. > - Data cache must be flushed to unification first to make sure the > - instruction cache fetches the updated data. 'end' is exclusive, > - as per the GNU definition of __clear_cache. */ > + /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of > Unification is > + not required for instruction to data coherence. */ > + > + if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) { > + /* Loop over the address range, clearing one cache line at once. 
> + Data cache must be flushed to unification first to make sure the > + instruction cache fetches the updated data. 'end' is exclusive, > + as per the GNU definition of __clear_cache. */ > > - /* Make the start address of the loop cache aligned. */ > - address = (const char*) ((__UINTPTR_TYPE__) base > - & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); > + /* Make the start address of the loop cache aligned. */ > + address = (const char*) ((__UINTPTR_TYPE__) base > + & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); > > - for (; address < (const char *) end; address += dcache_lsize) > - asm volatile ("dc\tcvau, %0" > - : > - : "r" (address) > - : "memory"); > + for (; address < (const char *) end; address += dcache_lsize) > + asm volatile ("dc\tcvau, %0" > + : > + : "r" (address) > + : "memory"); > + } > > asm volatile ("dsb\tish" : : : "memory"); > > - /* Make the start address of the loop cache aligned. */ > - address = (const char*) ((__UINTPTR_TYPE__) base > - & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); > + /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the > Point of > + Unification is not required for instruction to data coherence. */ > + > + if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) { > + /* Make the start address of the loop cache aligned. */ > + address = (const char*) ((__UINTPTR_TYPE__) base > + & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); > > - for (; address < (const char *) end; address += icache_lsize) > - asm volatile ("ic\tivau, %0" > - : > - : "r" (address) > - : "memory"); > + for (; address < (const char *) end; address += icache_lsize) > + asm volatile ("ic\tivau, %0" > + : > + : "r" (address) > + : "memory"); > > - asm volatile ("dsb\tish; isb" : : : "memory"); > + asm volatile ("dsb\tish" : : : "memory"); > + } > + asm volatile("isb" : : : "memory") ... a semicolon has gone missing here after the asm volatile so this will not compile. A problem with the mail client? Thanks, Kyrill > } > -- > 2.7.4 >
Hi Shaokun On 8/31/19 8:12 AM, Shaokun Zhang wrote: > The DCache clean & ICache invalidation requirements for instructions > to be data coherence are discoverable through new fields in CTR_EL0. > Let's support the two bits if they are enabled, the CPU core will > not execute the unnecessary DCache clean or Icache Invalidation > instructions. > > 2019-08-31 Shaokun Zhang <zhangshaokun@hisilicon.com> > > * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC in > __aarch64_sync_cache_range function. Sorry, I just tried compiling this to look at the assembly output. I think there's a couple of issues... > --- > libgcc/config/aarch64/sync-cache.c | 56 > ++++++++++++++++++++++++-------------- > 1 file changed, 35 insertions(+), 21 deletions(-) > > diff --git a/libgcc/config/aarch64/sync-cache.c > b/libgcc/config/aarch64/sync-cache.c > index 791f5e42ff44..0b057efbdcab 100644 > --- a/libgcc/config/aarch64/sync-cache.c > +++ b/libgcc/config/aarch64/sync-cache.c > @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along > with this program; > see the files COPYING3 and COPYING.RUNTIME respectively. If not, see > <http://www.gnu.org/licenses/>. */ > > +#define CTR_IDC_SHIFT 28 > +#define CTR_DIC_SHIFT 29 > + > void __aarch64_sync_cache_range (const void *, const void *); > > void > @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, > const void *end) > icache_lsize = 4 << (cache_info & 0xF); > dcache_lsize = 4 << ((cache_info >> 16) & 0xF); > > - /* Loop over the address range, clearing one cache line at once. > - Data cache must be flushed to unification first to make sure the > - instruction cache fetches the updated data. 'end' is exclusive, > - as per the GNU definition of __clear_cache. */ > + /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of > Unification is > + not required for instruction to data coherence. 
*/ > + > + if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) { By the C rules, this will evaluate to 0 always and the whole path is eliminated. What you want here is: if (((cache_info >> CTR_IDC_SHIFT) & 0x1) == 0x0) > + /* Loop over the address range, clearing one cache line at once. > + Data cache must be flushed to unification first to make sure the > + instruction cache fetches the updated data. 'end' is exclusive, > + as per the GNU definition of __clear_cache. */ > > - /* Make the start address of the loop cache aligned. */ > - address = (const char*) ((__UINTPTR_TYPE__) base > - & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); > + /* Make the start address of the loop cache aligned. */ > + address = (const char*) ((__UINTPTR_TYPE__) base > + & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); > > - for (; address < (const char *) end; address += dcache_lsize) > - asm volatile ("dc\tcvau, %0" > - : > - : "r" (address) > - : "memory"); > + for (; address < (const char *) end; address += dcache_lsize) > + asm volatile ("dc\tcvau, %0" > + : > + : "r" (address) > + : "memory"); > + } > > asm volatile ("dsb\tish" : : : "memory"); > > - /* Make the start address of the loop cache aligned. */ > - address = (const char*) ((__UINTPTR_TYPE__) base > - & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); > + /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the > Point of > + Unification is not required for instruction to data coherence. */ > + > + if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) { Same here, this should be: if (((cache_info >> CTR_DIC_SHIFT) & 0x1) == 0x0) Thanks, Kyrill > + /* Make the start address of the loop cache aligned. 
*/ > + address = (const char*) ((__UINTPTR_TYPE__) base > + & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); > > - for (; address < (const char *) end; address += icache_lsize) > - asm volatile ("ic\tivau, %0" > - : > - : "r" (address) > - : "memory"); > + for (; address < (const char *) end; address += icache_lsize) > + asm volatile ("ic\tivau, %0" > + : > + : "r" (address) > + : "memory"); > > - asm volatile ("dsb\tish; isb" : : : "memory"); > + asm volatile ("dsb\tish" : : : "memory"); > + } > + asm volatile("isb" : : : "memory") > } > -- > 2.7.4 >
Hi Kyrill, On 2019/9/2 22:24, Kyrill Tkachov wrote: > Hi Shaokun, > > On 8/31/19 8:12 AM, Shaokun Zhang wrote: >> The DCache clean & ICache invalidation requirements for instructions >> to be data coherence are discoverable through new fields in CTR_EL0. >> Let's support the two bits if they are enabled, the CPU core will >> not execute the unnecessary DCache clean or Icache Invalidation >> instructions. >> >> 2019-08-31 Shaokun Zhang <zhangshaokun@hisilicon.com> >> >> * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC in >> __aarch64_sync_cache_range function. > > As mentioned in the RFC, I think this is ok but... > > >> --- >> libgcc/config/aarch64/sync-cache.c | 56 ++++++++++++++++++++++++-------------- >> 1 file changed, 35 insertions(+), 21 deletions(-) >> >> diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c >> index 791f5e42ff44..0b057efbdcab 100644 >> --- a/libgcc/config/aarch64/sync-cache.c >> +++ b/libgcc/config/aarch64/sync-cache.c >> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along with this program; >> see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >> <http://www.gnu.org/licenses/>. */ >> >> +#define CTR_IDC_SHIFT 28 >> +#define CTR_DIC_SHIFT 29 >> + >> void __aarch64_sync_cache_range (const void *, const void *); >> >> void >> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, const void *end) >> icache_lsize = 4 << (cache_info & 0xF); >> dcache_lsize = 4 << ((cache_info >> 16) & 0xF); >> >> - /* Loop over the address range, clearing one cache line at once. >> - Data cache must be flushed to unification first to make sure the >> - instruction cache fetches the updated data. 'end' is exclusive, >> - as per the GNU definition of __clear_cache. */ >> + /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is >> + not required for instruction to data coherence. 
*/ >> + >> + if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) { >> + /* Loop over the address range, clearing one cache line at once. >> + Data cache must be flushed to unification first to make sure the >> + instruction cache fetches the updated data. 'end' is exclusive, >> + as per the GNU definition of __clear_cache. */ >> >> - /* Make the start address of the loop cache aligned. */ >> - address = (const char*) ((__UINTPTR_TYPE__) base >> - & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); >> + /* Make the start address of the loop cache aligned. */ >> + address = (const char*) ((__UINTPTR_TYPE__) base >> + & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); >> >> - for (; address < (const char *) end; address += dcache_lsize) >> - asm volatile ("dc\tcvau, %0" >> - : >> - : "r" (address) >> - : "memory"); >> + for (; address < (const char *) end; address += dcache_lsize) >> + asm volatile ("dc\tcvau, %0" >> + : >> + : "r" (address) >> + : "memory"); >> + } >> >> asm volatile ("dsb\tish" : : : "memory"); >> >> - /* Make the start address of the loop cache aligned. */ >> - address = (const char*) ((__UINTPTR_TYPE__) base >> - & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); >> + /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of >> + Unification is not required for instruction to data coherence. */ >> + >> + if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) { >> + /* Make the start address of the loop cache aligned. 
*/ >> + address = (const char*) ((__UINTPTR_TYPE__) base >> + & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); >> >> - for (; address < (const char *) end; address += icache_lsize) >> - asm volatile ("ic\tivau, %0" >> - : >> - : "r" (address) >> - : "memory"); >> + for (; address < (const char *) end; address += icache_lsize) >> + asm volatile ("ic\tivau, %0" >> + : >> + : "r" (address) >> + : "memory"); >> >> - asm volatile ("dsb\tish; isb" : : : "memory"); >> + asm volatile ("dsb\tish" : : : "memory"); >> + } >> + asm volatile("isb" : : : "memory") > > ... a semicolon has gone missing here after the asm volatile so this will not compiler. A problem with the mail client? > Apologies that it seems that I made the wrong branch when I compiled the patch between several branches. :-( zhangshaokun@ubuntu:~/gcc_installdir/bin$ ./gcc --version gcc (GCC) 10.0.0 20190831 (experimental) Copyright (C) 2019 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. zhangshaokun@ubuntu:~/gcc_installdir/bin$ I have fixed it and double check its branch, it is ok now. ;-) Thanks, Shaokun > Thanks, > > Kyrill > > > >> } >> -- >> 2.7.4 >>
Hi Kyrill, On 2019/9/2 22:31, Kyrill Tkachov wrote: > Hi Shaokun > > On 8/31/19 8:12 AM, Shaokun Zhang wrote: >> The DCache clean & ICache invalidation requirements for instructions >> to be data coherence are discoverable through new fields in CTR_EL0. >> Let's support the two bits if they are enabled, the CPU core will >> not execute the unnecessary DCache clean or Icache Invalidation >> instructions. >> >> 2019-08-31 Shaokun Zhang <zhangshaokun@hisilicon.com> >> >> * config/aarch64/sync-cache.c: Support CTR_EL0.IDC and CTR_EL0.DIC in >> __aarch64_sync_cache_range function. > > Sorry, I just tried compiling this to look at the assembly output. I think there's a couple of issues... > Hmm, sorry for this -Wparentheses issue and I have fixed them in next version. About all the problems, I'm so sorry that I shall double check it before I send it to community. Thanks, Shaokun > >> --- >> libgcc/config/aarch64/sync-cache.c | 56 ++++++++++++++++++++++++-------------- >> 1 file changed, 35 insertions(+), 21 deletions(-) >> >> diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c >> index 791f5e42ff44..0b057efbdcab 100644 >> --- a/libgcc/config/aarch64/sync-cache.c >> +++ b/libgcc/config/aarch64/sync-cache.c >> @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along with this program; >> see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >> <http://www.gnu.org/licenses/>. */ >> >> +#define CTR_IDC_SHIFT 28 >> +#define CTR_DIC_SHIFT 29 >> + >> void __aarch64_sync_cache_range (const void *, const void *); >> >> void >> @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, const void *end) >> icache_lsize = 4 << (cache_info & 0xF); >> dcache_lsize = 4 << ((cache_info >> 16) & 0xF); >> >> - /* Loop over the address range, clearing one cache line at once. >> - Data cache must be flushed to unification first to make sure the >> - instruction cache fetches the updated data. 
'end' is exclusive, >> - as per the GNU definition of __clear_cache. */ >> + /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is >> + not required for instruction to data coherence. */ >> + >> + if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) { > > > By the C rules, this will evaluate to 0 always and the whole path is eliminated. What you want here is: > > if (((cache_info >> CTR_IDC_SHIFT) & 0x1) == 0x0) > > >> + /* Loop over the address range, clearing one cache line at once. >> + Data cache must be flushed to unification first to make sure the >> + instruction cache fetches the updated data. 'end' is exclusive, >> + as per the GNU definition of __clear_cache. */ >> >> - /* Make the start address of the loop cache aligned. */ >> - address = (const char*) ((__UINTPTR_TYPE__) base >> - & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); >> + /* Make the start address of the loop cache aligned. */ >> + address = (const char*) ((__UINTPTR_TYPE__) base >> + & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); >> >> - for (; address < (const char *) end; address += dcache_lsize) >> - asm volatile ("dc\tcvau, %0" >> - : >> - : "r" (address) >> - : "memory"); >> + for (; address < (const char *) end; address += dcache_lsize) >> + asm volatile ("dc\tcvau, %0" >> + : >> + : "r" (address) >> + : "memory"); >> + } >> >> asm volatile ("dsb\tish" : : : "memory"); >> >> - /* Make the start address of the loop cache aligned. */ >> - address = (const char*) ((__UINTPTR_TYPE__) base >> - & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); >> + /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of >> + Unification is not required for instruction to data coherence. */ >> + >> + if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) { > > Same here, this should be: > > if (((cache_info >> CTR_DIC_SHIFT) & 0x1) == 0x0) > > > Thanks, > > Kyrill > >> + /* Make the start address of the loop cache aligned. 
*/ >> + address = (const char*) ((__UINTPTR_TYPE__) base >> + & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); >> >> - for (; address < (const char *) end; address += icache_lsize) >> - asm volatile ("ic\tivau, %0" >> - : >> - : "r" (address) >> - : "memory"); >> + for (; address < (const char *) end; address += icache_lsize) >> + asm volatile ("ic\tivau, %0" >> + : >> + : "r" (address) >> + : "memory"); >> >> - asm volatile ("dsb\tish; isb" : : : "memory"); >> + asm volatile ("dsb\tish" : : : "memory"); >> + } >> + asm volatile("isb" : : : "memory") >> } >> -- >> 2.7.4 >> > > . >
diff --git a/libgcc/config/aarch64/sync-cache.c b/libgcc/config/aarch64/sync-cache.c index 791f5e42ff44..0b057efbdcab 100644 --- a/libgcc/config/aarch64/sync-cache.c +++ b/libgcc/config/aarch64/sync-cache.c @@ -23,6 +23,9 @@ a copy of the GCC Runtime Library Exception along with this program; see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ +#define CTR_IDC_SHIFT 28 +#define CTR_DIC_SHIFT 29 + void __aarch64_sync_cache_range (const void *, const void *); void @@ -41,32 +44,43 @@ __aarch64_sync_cache_range (const void *base, const void *end) icache_lsize = 4 << (cache_info & 0xF); dcache_lsize = 4 << ((cache_info >> 16) & 0xF); - /* Loop over the address range, clearing one cache line at once. - Data cache must be flushed to unification first to make sure the - instruction cache fetches the updated data. 'end' is exclusive, - as per the GNU definition of __clear_cache. */ + /* If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification is + not required for instruction to data coherence. */ + + if ((cache_info >> CTR_IDC_SHIFT) & 0x1 == 0x0) { + /* Loop over the address range, clearing one cache line at once. + Data cache must be flushed to unification first to make sure the + instruction cache fetches the updated data. 'end' is exclusive, + as per the GNU definition of __clear_cache. */ - /* Make the start address of the loop cache aligned. */ - address = (const char*) ((__UINTPTR_TYPE__) base - & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); + /* Make the start address of the loop cache aligned. 
*/ + address = (const char*) ((__UINTPTR_TYPE__) base + & ~ (__UINTPTR_TYPE__) (dcache_lsize - 1)); - for (; address < (const char *) end; address += dcache_lsize) - asm volatile ("dc\tcvau, %0" - : - : "r" (address) - : "memory"); + for (; address < (const char *) end; address += dcache_lsize) + asm volatile ("dc\tcvau, %0" + : + : "r" (address) + : "memory"); + } asm volatile ("dsb\tish" : : : "memory"); - /* Make the start address of the loop cache aligned. */ - address = (const char*) ((__UINTPTR_TYPE__) base - & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); + /* If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point of + Unification is not required for instruction to data coherence. */ + + if ((cache_info >> CTR_DIC_SHIFT) & 0x1 == 0x0) { + /* Make the start address of the loop cache aligned. */ + address = (const char*) ((__UINTPTR_TYPE__) base + & ~ (__UINTPTR_TYPE__) (icache_lsize - 1)); - for (; address < (const char *) end; address += icache_lsize) - asm volatile ("ic\tivau, %0" - : - : "r" (address) - : "memory"); + for (; address < (const char *) end; address += icache_lsize) + asm volatile ("ic\tivau, %0" + : + : "r" (address) + : "memory"); - asm volatile ("dsb\tish; isb" : : : "memory"); + asm volatile ("dsb\tish" : : : "memory"); + } + asm volatile("isb" : : : "memory") }