diff mbox series

[4/4] powerpc/64: reuse PPC32 static inline flush_dcache_range()

Message ID d6f628ffdeb9c7863da722a8f6ef2949e57bb360.1557824379.git.christophe.leroy@c-s.fr (mailing list archive)
State Accepted
Commit 22e9c88d486a0536d337d6e0973968be0a4cd4b2
Headers show
Series [1/4] powerpc/64: flush_inval_dcache_range() becomesflush_dcache_range() | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch next (8150a153c013aa2dd1ffae43370b89ac1347a7fb)
snowpatch_ozlabs/build-ppc64le success Build succeeded
snowpatch_ozlabs/build-ppc64be success Build succeeded
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-pmac32 fail build failed!
snowpatch_ozlabs/checkpatch success total: 0 errors, 0 warnings, 0 checks, 86 lines checked

Commit Message

Christophe Leroy May 14, 2019, 9:05 a.m. UTC
This patch drops the assembly PPC64 version of flush_dcache_range()
and re-uses the PPC32 static inline version.

With GCC 8.1, the following code is generated:

void flush_test(unsigned long start, unsigned long stop)
{
	flush_dcache_range(start, stop);
}

0000000000000130 <.flush_test>:
 130:	3d 22 00 00 	addis   r9,r2,0
			132: R_PPC64_TOC16_HA	.data+0x8
 134:	81 09 00 00 	lwz     r8,0(r9)
			136: R_PPC64_TOC16_LO	.data+0x8
 138:	3d 22 00 00 	addis   r9,r2,0
			13a: R_PPC64_TOC16_HA	.data+0xc
 13c:	80 e9 00 00 	lwz     r7,0(r9)
			13e: R_PPC64_TOC16_LO	.data+0xc
 140:	7d 48 00 d0 	neg     r10,r8
 144:	7d 43 18 38 	and     r3,r10,r3
 148:	7c 00 04 ac 	hwsync
 14c:	4c 00 01 2c 	isync
 150:	39 28 ff ff 	addi    r9,r8,-1
 154:	7c 89 22 14 	add     r4,r9,r4
 158:	7c 83 20 50 	subf    r4,r3,r4
 15c:	7c 89 3c 37 	srd.    r9,r4,r7
 160:	41 82 00 1c 	beq     17c <.flush_test+0x4c>
 164:	7d 29 03 a6 	mtctr   r9
 168:	60 00 00 00 	nop
 16c:	60 00 00 00 	nop
 170:	7c 00 18 ac 	dcbf    0,r3
 174:	7c 63 42 14 	add     r3,r3,r8
 178:	42 00 ff f8 	bdnz    170 <.flush_test+0x40>
 17c:	7c 00 04 ac 	hwsync
 180:	4c 00 01 2c 	isync
 184:	4e 80 00 20 	blr
 188:	60 00 00 00 	nop
 18c:	60 00 00 00 	nop

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/include/asm/cache.h      | 10 ++++++++++
 arch/powerpc/include/asm/cacheflush.h | 14 ++++++++------
 arch/powerpc/kernel/misc_64.S         | 29 -----------------------------
 3 files changed, 18 insertions(+), 35 deletions(-)

Comments

Michael Ellerman July 8, 2019, 1:19 a.m. UTC | #1
On Tue, 2019-05-14 at 09:05:16 UTC, Christophe Leroy wrote:
> This patch drops the assembly PPC64 version of flush_dcache_range()
> and re-uses the PPC32 static inline version.
> 
> With GCC 8.1, the following code is generated:
> 
> void flush_test(unsigned long start, unsigned long stop)
> {
> 	flush_dcache_range(start, stop);
> }
> 
> 0000000000000130 <.flush_test>:
>  130:	3d 22 00 00 	addis   r9,r2,0
> 			132: R_PPC64_TOC16_HA	.data+0x8
>  134:	81 09 00 00 	lwz     r8,0(r9)
> 			136: R_PPC64_TOC16_LO	.data+0x8
>  138:	3d 22 00 00 	addis   r9,r2,0
> 			13a: R_PPC64_TOC16_HA	.data+0xc
>  13c:	80 e9 00 00 	lwz     r7,0(r9)
> 			13e: R_PPC64_TOC16_LO	.data+0xc
>  140:	7d 48 00 d0 	neg     r10,r8
>  144:	7d 43 18 38 	and     r3,r10,r3
>  148:	7c 00 04 ac 	hwsync
>  14c:	4c 00 01 2c 	isync
>  150:	39 28 ff ff 	addi    r9,r8,-1
>  154:	7c 89 22 14 	add     r4,r9,r4
>  158:	7c 83 20 50 	subf    r4,r3,r4
>  15c:	7c 89 3c 37 	srd.    r9,r4,r7
>  160:	41 82 00 1c 	beq     17c <.flush_test+0x4c>
>  164:	7d 29 03 a6 	mtctr   r9
>  168:	60 00 00 00 	nop
>  16c:	60 00 00 00 	nop
>  170:	7c 00 18 ac 	dcbf    0,r3
>  174:	7c 63 42 14 	add     r3,r3,r8
>  178:	42 00 ff f8 	bdnz    170 <.flush_test+0x40>
>  17c:	7c 00 04 ac 	hwsync
>  180:	4c 00 01 2c 	isync
>  184:	4e 80 00 20 	blr
>  188:	60 00 00 00 	nop
>  18c:	60 00 00 00 	nop
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/22e9c88d486a0536d337d6e0973968be0a4cd4b2

cheers
Aneesh Kumar K V July 8, 2019, 2:21 p.m. UTC | #2
Christophe Leroy <christophe.leroy@c-s.fr> writes:

> This patch drops the assembly PPC64 version of flush_dcache_range()
> and re-uses the PPC32 static inline version.
>
> With GCC 8.1, the following code is generated:
>
> void flush_test(unsigned long start, unsigned long stop)
> {
> 	flush_dcache_range(start, stop);
> }
>
> 0000000000000130 <.flush_test>:
>  130:	3d 22 00 00 	addis   r9,r2,0
> 			132: R_PPC64_TOC16_HA	.data+0x8
>  134:	81 09 00 00 	lwz     r8,0(r9)
> 			136: R_PPC64_TOC16_LO	.data+0x8
>  138:	3d 22 00 00 	addis   r9,r2,0
> 			13a: R_PPC64_TOC16_HA	.data+0xc
>  13c:	80 e9 00 00 	lwz     r7,0(r9)
> 			13e: R_PPC64_TOC16_LO	.data+0xc
>  140:	7d 48 00 d0 	neg     r10,r8
>  144:	7d 43 18 38 	and     r3,r10,r3
>  148:	7c 00 04 ac 	hwsync
>  14c:	4c 00 01 2c 	isync
>  150:	39 28 ff ff 	addi    r9,r8,-1
>  154:	7c 89 22 14 	add     r4,r9,r4
>  158:	7c 83 20 50 	subf    r4,r3,r4
>  15c:	7c 89 3c 37 	srd.    r9,r4,r7
>  160:	41 82 00 1c 	beq     17c <.flush_test+0x4c>
>  164:	7d 29 03 a6 	mtctr   r9
>  168:	60 00 00 00 	nop
>  16c:	60 00 00 00 	nop
>  170:	7c 00 18 ac 	dcbf    0,r3
>  174:	7c 63 42 14 	add     r3,r3,r8
>  178:	42 00 ff f8 	bdnz    170 <.flush_test+0x40>
>  17c:	7c 00 04 ac 	hwsync
>  180:	4c 00 01 2c 	isync
>  184:	4e 80 00 20 	blr
>  188:	60 00 00 00 	nop
>  18c:	60 00 00 00 	nop
>
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
>  arch/powerpc/include/asm/cache.h      | 10 ++++++++++
>  arch/powerpc/include/asm/cacheflush.h | 14 ++++++++------
>  arch/powerpc/kernel/misc_64.S         | 29 -----------------------------
>  3 files changed, 18 insertions(+), 35 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
> index 0009a0a82e86..45e3137ccd71 100644
> --- a/arch/powerpc/include/asm/cache.h
> +++ b/arch/powerpc/include/asm/cache.h
> @@ -54,6 +54,16 @@ struct ppc64_caches {
>  };
>  
>  extern struct ppc64_caches ppc64_caches;
> +
> +static inline u32 l1_cache_shift(void)
> +{
> +	return ppc64_caches.l1d.log_block_size;
> +}
> +
> +static inline u32 l1_cache_bytes(void)
> +{
> +	return ppc64_caches.l1d.block_size;
> +}
>  #else
>  static inline u32 l1_cache_shift(void)
>  {
> diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
> index d405f18441cd..3cd7ce3dec8b 100644
> --- a/arch/powerpc/include/asm/cacheflush.h
> +++ b/arch/powerpc/include/asm/cacheflush.h
> @@ -57,7 +57,6 @@ static inline void __flush_dcache_icache_phys(unsigned long physaddr)
>  }
>  #endif
>  
> -#ifdef CONFIG_PPC32
>  /*
>   * Write any modified data cache blocks out to memory and invalidate them.
>   * Does not invalidate the corresponding instruction cache blocks.
> @@ -70,9 +69,17 @@ static inline void flush_dcache_range(unsigned long start, unsigned long stop)
>  	unsigned long size = stop - (unsigned long)addr + (bytes - 1);
>  	unsigned long i;
>  
> +	if (IS_ENABLED(CONFIG_PPC64)) {
> +		mb();	/* sync */
> +		isync();
> +	}
> +
>  	for (i = 0; i < size >> shift; i++, addr += bytes)
>  		dcbf(addr);
>  	mb();	/* sync */
> +
> +	if (IS_ENABLED(CONFIG_PPC64))
> +		isync();
>  }


Was checking with Michael about why we need that extra isync. Michael 
pointed this came via 

https://github.com/mpe/linux-fullhistory/commit/faa5ee3743ff9b6df9f9a03600e34fdae596cfb2#diff-67c7ffa8e420c7d4206cae4a9e888e14

for 970 which doesn't have coherent icache. So possibly isync there is
to flush the prefetch instructions? But even so we would need an icbi
there before that isync.

So overall wondering why we need that extra barriers there. 

>  
>  /*
> @@ -112,11 +119,6 @@ static inline void invalidate_dcache_range(unsigned long start,
>  	mb();	/* sync */
>  }
>  

-aneesh
Oliver O'Halloran July 9, 2019, 2:20 a.m. UTC | #3
On Tue, Jul 9, 2019 at 12:22 AM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>
> > *snip*
> > +     if (IS_ENABLED(CONFIG_PPC64))
> > +             isync();
> >  }
>
>
> Was checking with Michael about why we need that extra isync. Michael
> pointed this came via
>
> https://github.com/mpe/linux-fullhistory/commit/faa5ee3743ff9b6df9f9a03600e34fdae596cfb2#diff-67c7ffa8e420c7d4206cae4a9e888e14
>
> for 970 which doesn't have coherent icache. So possibly isync there is
> to flush the prefetch instructions? But even so we would need an icbi
> there before that isync.

I don't think it's that, there's some magic in flush_icache_range() to
handle dropping prefetched instructions on 970.

> So overall wondering why we need that extra barriers there.

I think the isync is needed there because the architecture only
requires sync to provide ordering. A sync alone doesn't guarantee the
dcbfs have actually completed so the isync is necessary to ensure the
flushed cache lines are back in memory. That said, as far as I know
all the IBM book3s chips from power4 onwards will wait for pending
dcbfs when they hit a sync, but that might change in the future.

If it's a problem we could add a cpu-feature section around the isync
to no-op it in the common case. However, when I had a look with perf
it always showed that the sync was the hotspot so I don't think it'll
help much.

Oliver
Aneesh Kumar K V July 9, 2019, 2:51 a.m. UTC | #4
On 7/9/19 7:50 AM, Oliver O'Halloran wrote:
> On Tue, Jul 9, 2019 at 12:22 AM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>>
>>> *snip*
>>> +     if (IS_ENABLED(CONFIG_PPC64))
>>> +             isync();
>>>   }
>>
>>
>> Was checking with Michael about why we need that extra isync. Michael
>> pointed this came via
>>
>> https://github.com/mpe/linux-fullhistory/commit/faa5ee3743ff9b6df9f9a03600e34fdae596cfb2#diff-67c7ffa8e420c7d4206cae4a9e888e14
>>
>> for 970 which doesn't have coherent icache. So possibly isync there is
>> to flush the prefetch instructions? But even so we would need an icbi
>> there before that isync.
> 
> I don't think it's that, there's some magic in flush_icache_range() to
> handle dropping prefetched instructions on 970.
> 
>> So overall wondering why we need that extra barriers there.
> 
> I think the isync is needed there because the architecture only
> requires sync to provide ordering. A sync alone doesn't guarantee the
> dcbfs have actually completed so the isync is necessary to ensure the
> flushed cache lines are back in memory. That said, as far as I know
> all the IBM book3s chips from power4 onwards will wait for pending
> dcbfs when they hit a sync, but that might change in the future.
> 

ISA doesn't list that as the sequence. Only place where isync was 
mentioned was w.r.t  icbi where want to discards the prefetch.



> If it's a problem we could add a cpu-feature section around the isync
> to no-op it in the common case. However, when I had a look with perf
> it always showed that the sync was the hotspot so I don't think it'll
> help much.
> 

What about the preceding barriers (sync; isync;) before dcbf? Why are 
they needed?

-aneesh
Oliver O'Halloran July 9, 2019, 5:29 a.m. UTC | #5
On Tue, Jul 9, 2019 at 12:52 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 7/9/19 7:50 AM, Oliver O'Halloran wrote:
> > On Tue, Jul 9, 2019 at 12:22 AM Aneesh Kumar K.V
> > <aneesh.kumar@linux.ibm.com> wrote:
> >>
> >> Christophe Leroy <christophe.leroy@c-s.fr> writes:
> >>
> >>> *snip*
> >>> +     if (IS_ENABLED(CONFIG_PPC64))
> >>> +             isync();
> >>>   }
> >>
> >>
> >> Was checking with Michael about why we need that extra isync. Michael
> >> pointed this came via
> >>
> >> https://github.com/mpe/linux-fullhistory/commit/faa5ee3743ff9b6df9f9a03600e34fdae596cfb2#diff-67c7ffa8e420c7d4206cae4a9e888e14
> >>
> >> for 970 which doesn't have coherent icache. So possibly isync there is
> >> to flush the prefetch instructions? But even so we would need an icbi
> >> there before that isync.
> >
> > I don't think it's that, there's some magic in flush_icache_range() to
> > handle dropping prefetched instructions on 970.
> >
> >> So overall wondering why we need that extra barriers there.
> >
> > I think the isync is needed there because the architecture only
> > requires sync to provide ordering. A sync alone doesn't guarantee the
> > dcbfs have actually completed so the isync is necessary to ensure the
> > flushed cache lines are back in memory. That said, as far as I know
> > all the IBM book3s chips from power4 onwards will wait for pending
> > dcbfs when they hit a sync, but that might change in the future.
> >
>
> ISA doesn't list that as the sequence. Only place where isync was
> mentioned was w.r.t  icbi where want to discards the prefetch.

doesn't list that as the sequence for what?

> > If it's a problem we could add a cpu-feature section around the isync
> > to no-op it in the common case. However, when I had a look with perf
> > it always showed that the sync was the hotspot so I don't think it'll
> > help much.
> >
>
> What about the preceding barriers (sync; isync;) before dcbf? Why are
> they needed?

Dunno, the sync might just be to ensure ordering between prior stores
and the dcbf.

>
> -aneesh
Segher Boessenkool July 9, 2019, 5:04 p.m. UTC | #6
On Tue, Jul 09, 2019 at 08:21:54AM +0530, Aneesh Kumar K.V wrote:
> On 7/9/19 7:50 AM, Oliver O'Halloran wrote:
> >I don't think it's that, there's some magic in flush_icache_range() to
> >handle dropping prefetched instructions on 970.
> >
> >>So overall wondering why we need that extra barriers there.
> >
> >I think the isync is needed there because the architecture only
> >requires sync to provide ordering. A sync alone doesn't guarantee the
> >dcbfs have actually completed so the isync is necessary to ensure the
> >flushed cache lines are back in memory. That said, as far as I know
> >all the IBM book3s chips from power4 onwards will wait for pending
> >dcbfs when they hit a sync, but that might change in the future.
> >
> 
> ISA doesn't list that as the sequence. Only place where isync was 
> mentioned was w.r.t  icbi where want to discards the prefetch.

You need an isync to guarantee all icbi insns before the isync have been
performed before any code after the isync is fetched.  Killing the
prefetch is just part of it.

> >If it's a problem we could add a cpu-feature section around the isync
> >to no-op it in the common case. However, when I had a look with perf
> >it always showed that the sync was the hotspot so I don't think it'll
> >help much.
> 
> What about the preceding barriers (sync; isync;) before dcbf? Why are 
> they needed?

This isn't very generic code.  The code seems to be trying to do
coherency in software.  Like you needed to do for DART on U3/U4, or for
some of the PMU/SMU communication -- both are through main memory, but
both are not cache coherent.  Which means all rules go out of the
window.

To do this properly you need some platform-specific code, for example
to kill hardware and software prefetch streams.  Or hope^Wguarantee
those never touch your communication buffers.


I recommend you keep the original function, maybe with a more specific
name, for the DART etc. code; and have all normal(*) dcbf users use a
new more normal function, with just a single sync instruction.


Segher


(*) As far as anything using dcbf can be called "normal"!
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 0009a0a82e86..45e3137ccd71 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -54,6 +54,16 @@  struct ppc64_caches {
 };
 
 extern struct ppc64_caches ppc64_caches;
+
+static inline u32 l1_cache_shift(void)
+{
+	return ppc64_caches.l1d.log_block_size;
+}
+
+static inline u32 l1_cache_bytes(void)
+{
+	return ppc64_caches.l1d.block_size;
+}
 #else
 static inline u32 l1_cache_shift(void)
 {
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index d405f18441cd..3cd7ce3dec8b 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -57,7 +57,6 @@  static inline void __flush_dcache_icache_phys(unsigned long physaddr)
 }
 #endif
 
-#ifdef CONFIG_PPC32
 /*
  * Write any modified data cache blocks out to memory and invalidate them.
  * Does not invalidate the corresponding instruction cache blocks.
@@ -70,9 +69,17 @@  static inline void flush_dcache_range(unsigned long start, unsigned long stop)
 	unsigned long size = stop - (unsigned long)addr + (bytes - 1);
 	unsigned long i;
 
+	if (IS_ENABLED(CONFIG_PPC64)) {
+		mb();	/* sync */
+		isync();
+	}
+
 	for (i = 0; i < size >> shift; i++, addr += bytes)
 		dcbf(addr);
 	mb();	/* sync */
+
+	if (IS_ENABLED(CONFIG_PPC64))
+		isync();
 }
 
 /*
@@ -112,11 +119,6 @@  static inline void invalidate_dcache_range(unsigned long start,
 	mb();	/* sync */
 }
 
-#endif /* CONFIG_PPC32 */
-#ifdef CONFIG_PPC64
-extern void flush_dcache_range(unsigned long start, unsigned long stop);
-#endif
-
 #define copy_to_user_page(vma, page, vaddr, dst, src, len) \
 	do { \
 		memcpy(dst, src, len); \
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index a4fd536efb44..1b0a42c50ef1 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -115,35 +115,6 @@  _ASM_NOKPROBE_SYMBOL(flush_icache_range)
 EXPORT_SYMBOL(flush_icache_range)
 
 /*
- * Like above, but only do the D-cache.
- *
- * flush_dcache_range(unsigned long start, unsigned long stop)
- *
- *    flush all bytes from start to stop-1 inclusive
- */
-
-_GLOBAL_TOC(flush_dcache_range)
- 	ld	r10,PPC64_CACHES@toc(r2)
-	lwz	r7,DCACHEL1BLOCKSIZE(r10)	/* Get dcache block size */
-	addi	r5,r7,-1
-	andc	r6,r3,r5		/* round low to line bdy */
-	subf	r8,r6,r4		/* compute length */
-	add	r8,r8,r5		/* ensure we get enough */
-	lwz	r9,DCACHEL1LOGBLOCKSIZE(r10)/* Get log-2 of dcache block size */
-	srw.	r8,r8,r9		/* compute line count */
-	beqlr				/* nothing to do? */
-	sync
-	isync
-	mtctr	r8
-0:	dcbf	0,r6
-	add	r6,r6,r7
-	bdnz	0b
-	sync
-	isync
-	blr
-EXPORT_SYMBOL(flush_dcache_range)
-
-/*
  * Flush a particular page from the data cache to RAM.
  * Note: this is necessary because the instruction cache does *not*
  * snoop from the data cache.