diff mbox series

[RFC] powerpc: Convert ____flush_dcache_icache_phys() to C

Message ID de7a813c71c4823797bb351bea8be15acae83be2.1565970465.git.christophe.leroy@c-s.fr (mailing list archive)
State Rejected
Headers show
Series [RFC] powerpc: Convert ____flush_dcache_icache_phys() to C | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch warning Failed to apply on branch next (c9633332103e55bc73d80d07ead28b95a22a85a3)
snowpatch_ozlabs/apply_patch fail Failed to apply to any branch

Commit Message

Christophe Leroy Aug. 16, 2019, 3:52 p.m. UTC
Resulting code (8xx with 16 bytes per cacheline and 16k pages)

0000016c <__flush_dcache_icache_phys>:
 16c:	54 63 00 22 	rlwinm  r3,r3,0,0,17
 170:	7d 20 00 a6 	mfmsr   r9
 174:	39 40 04 00 	li      r10,1024
 178:	55 28 07 34 	rlwinm  r8,r9,0,28,26
 17c:	7c 67 1b 78 	mr      r7,r3
 180:	7d 49 03 a6 	mtctr   r10
 184:	7d 00 01 24 	mtmsr   r8
 188:	4c 00 01 2c 	isync
 18c:	7c 00 18 6c 	dcbst   0,r3
 190:	38 63 00 10 	addi    r3,r3,16
 194:	42 00 ff f8 	bdnz    18c <__flush_dcache_icache_phys+0x20>
 198:	7c 00 04 ac 	hwsync
 19c:	7d 49 03 a6 	mtctr   r10
 1a0:	7c 00 3f ac 	icbi    0,r7
 1a4:	38 e7 00 10 	addi    r7,r7,16
 1a8:	42 00 ff f8 	bdnz    1a0 <__flush_dcache_icache_phys+0x34>
 1ac:	7c 00 04 ac 	hwsync
 1b0:	7d 20 01 24 	mtmsr   r9
 1b4:	4c 00 01 2c 	isync
 1b8:	4e 80 00 20 	blr

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 This patch is on top of Alastair's series "powerpc: convert cache asm to C"
 Patch 3 of that series should touch __flush_dcache_icache_phys and this
 patch could come just after patch 3.

 arch/powerpc/include/asm/cacheflush.h |  8 +++++
 arch/powerpc/mm/mem.c                 | 55 ++++++++++++++++++++++++++++-------
 2 files changed, 53 insertions(+), 10 deletions(-)

Comments

Alastair D'Silva Aug. 20, 2019, 4:36 a.m. UTC | #1
On Fri, 2019-08-16 at 15:52 +0000, Christophe Leroy wrote:
> Resulting code (8xx with 16 bytes per cacheline and 16k pages)
> 
> 0000016c <__flush_dcache_icache_phys>:
>  16c:	54 63 00 22 	rlwinm  r3,r3,0,0,17
>  170:	7d 20 00 a6 	mfmsr   r9
>  174:	39 40 04 00 	li      r10,1024
>  178:	55 28 07 34 	rlwinm  r8,r9,0,28,26
>  17c:	7c 67 1b 78 	mr      r7,r3
>  180:	7d 49 03 a6 	mtctr   r10
>  184:	7d 00 01 24 	mtmsr   r8
>  188:	4c 00 01 2c 	isync
>  18c:	7c 00 18 6c 	dcbst   0,r3
>  190:	38 63 00 10 	addi    r3,r3,16
>  194:	42 00 ff f8 	bdnz    18c <__flush_dcache_icache_phys+0x20>
>  198:	7c 00 04 ac 	hwsync
>  19c:	7d 49 03 a6 	mtctr   r10
>  1a0:	7c 00 3f ac 	icbi    0,r7
>  1a4:	38 e7 00 10 	addi    r7,r7,16
>  1a8:	42 00 ff f8 	bdnz    1a0 <__flush_dcache_icache_phys+0x34>
>  1ac:	7c 00 04 ac 	hwsync
>  1b0:	7d 20 01 24 	mtmsr   r9
>  1b4:	4c 00 01 2c 	isync
>  1b8:	4e 80 00 20 	blr
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
>  This patch is on top of Alastair's series "powerpc: convert cache
> asm to C"
>  Patch 3 of that series should touch __flush_dcache_icache_phys and
> this
>  patch could come just after patch 3.
> 
>  arch/powerpc/include/asm/cacheflush.h |  8 +++++
>  arch/powerpc/mm/mem.c                 | 55
> ++++++++++++++++++++++++++++-------
>  2 files changed, 53 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/cacheflush.h
> b/arch/powerpc/include/asm/cacheflush.h
> index 1826bf2cc137..bf4f2dc4eb76 100644
> --- a/arch/powerpc/include/asm/cacheflush.h
> +++ b/arch/powerpc/include/asm/cacheflush.h
> @@ -47,6 +47,14 @@ void flush_icache_user_range(struct vm_area_struct
> *vma,
>  				    struct page *page, unsigned long
> addr,
>  				    int len);
>  void flush_dcache_icache_page(struct page *page);
> +#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
> +void __flush_dcache_icache_phys(unsigned long physaddr);
> +#else
> +static inline void __flush_dcache_icache_phys(unsigned long
> physaddr)
> +{
> +	BUG();
> +}
> +#endif
>  
>  /**
>   * flush_dcache_range(): Write any modified data cache blocks out to
> memory and invalidate them.
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 43be99de7c9a..43009f9227c4 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -402,6 +402,50 @@ void flush_dcache_page(struct page *page)
>  }
>  EXPORT_SYMBOL(flush_dcache_page);
>  
> +#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
> +void __flush_dcache_icache_phys(unsigned long physaddr)
> +{
> +	unsigned long bytes = l1_dcache_bytes();
> +	unsigned long nb = PAGE_SIZE / bytes;
> +	unsigned long addr = physaddr & PAGE_MASK;
> +	unsigned long msr, msr0;
> +	unsigned long loop1 = addr, loop2 = addr;
> +
> +	if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
> +		/* For a snooping icache, we still need a dummy icbi to
> purge all the
> +		 * prefetched instructions from the ifetch buffers. We
> also need a sync
> +		 * before the icbi to order the the actual stores to
> memory that might
> +		 * have modified instructions with the icbi.
> +		 */
> +		mb(); /* sync */
> +		icbi((void *)addr);
> +		mb(); /* sync */
> +		isync();
> +		return;
> +	}
> +	msr0 = mfmsr();
> +	msr = msr0 & ~MSR_DR;
> +	asm volatile(
> +	    "	mtctr %2;"
> +	    "	mtmsr %3;"
> +	    "	isync;"
> +	    "0:	dcbst	0, %0;"
> +	    "	addi	%0, %0, %4;"
> +	    "	bdnz	0b;"
> +	    "	sync;"
> +	    "	mtctr %2;"
> +	    "1:	icbi	0, %1;"
> +	    "	addi	%1, %1, %4;"
> +	    "	bdnz	1b;"
> +	    "	sync;"
> +	    "	mtmsr %5;"
> +	    "	isync;"
> +	    : "+r" (loop1), "+r" (loop2)
> +	    : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
> +	    : "ctr", "memory");
> +}
> +#endif
> +
>  void flush_dcache_icache_page(struct page *page)
>  {
>  #ifdef CONFIG_HUGETLB_PAGE
> @@ -419,16 +463,7 @@ void flush_dcache_icache_page(struct page *page)
>  		__flush_dcache_icache(start);
>  		kunmap_atomic(start);
>  	} else {
> -		unsigned long msr = mfmsr();
> -
> -		/* Clear the DR bit so that we operate on physical
> -		 * rather than virtual addresses
> -		 */
> -		mtmsr(msr & ~(MSR_DR));
> -
> -		__flush_dcache_icache((void *)physaddr);
> -
> -		mtmsr(msr);
> +		__flush_dcache_icache_phys(page_to_pfn(page) <<
> PAGE_SHIFT);
>  	}
>  #endif
>  }


Thanks Christophe,

I'm trying a somewhat different approach that requires less knowledge
of assembler. Handling of CPU_FTR_COHERENT_ICACHE is outside this
function. The code below is not a patch as my tree is a bit messy,
sorry:

/**
 * flush_dcache_icache_phys() - Flush a page by it's physical address
 * @addr: the physical address of the page
 */
static void flush_dcache_icache_phys(unsigned long addr)
{
	register unsigned long msr;
	register unsigned long dlines = PAGE_SIZE >> l1_dcache_shift();
	register unsigned long dbytes = l1_dcache_bytes();
	register unsigned long ilines = PAGE_SIZE >> l1_icache_shift();
	register unsigned long ibytes = l1_icache_bytes();
	register unsigned long i;
	register unsigned long address = addr;

	/*
	 * Clear the DR bit so that we operate on physical
	 * rather than virtual addresses
	 */
	msr = mfmsr();
	mtmsr(msr & ~(MSR_DR));

	/* Write out the data cache */
	for (i = 0; i < dlines; i++, address += dbytes)
		dcbst((void *)address);

	/* Invalidate the instruction cache */
	address = addr;
	for (i = 0; i < ilines; i++, address += ibytes)
		icbi((void *)address);

	mtmsr(msr);
}

void test_flush_phys(unsigned long addr)
{
	flush_dcache_icache_phys(addr);
}


This gives the following assembler (using pmac32_defconfig):
000003cc <test_flush_phys>:
 3cc:   94 21 ff f0     stwu    r1,-16(r1)
 3d0:   7d 00 00 a6     mfmsr   r8
 3d4:   55 09 07 34     rlwinm  r9,r8,0,28,26
 3d8:   7d 20 01 24     mtmsr   r9
 3dc:   39 20 00 80     li      r9,128
 3e0:   7d 29 03 a6     mtctr   r9
 3e4:   39 43 10 00     addi    r10,r3,4096
 3e8:   7c 69 1b 78     mr      r9,r3
 3ec:   7c 00 48 6c     dcbst   0,r9
 3f0:   39 29 00 20     addi    r9,r9,32
 3f4:   42 00 ff f8     bdnz    3ec <test_flush_phys+0x20>
 3f8:   7c 00 1f ac     icbi    0,r3
 3fc:   38 63 00 20     addi    r3,r3,32
 400:   7f 8a 18 40     cmplw   cr7,r10,r3
 404:   40 9e ff f4     bne     cr7,3f8 <test_flush_phys+0x2c>
 408:   7d 00 01 24     mtmsr   r8
 40c:   38 21 00 10     addi    r1,r1,16
 410:   4e 80 00 20     blr
Christophe Leroy Aug. 21, 2019, 8:27 p.m. UTC | #2
Le 20/08/2019 à 06:36, Alastair D'Silva a écrit :
> On Fri, 2019-08-16 at 15:52 +0000, Christophe Leroy wrote:

[...]

> 
> 
> Thanks Christophe,
> 
> I'm trying a somewhat different approach that requires less knowledge
> of assembler. Handling of CPU_FTR_COHERENT_ICACHE is outside this
> function. The code below is not a patch as my tree is a bit messy,
> sorry:

Can we be 100% sure that GCC won't add any code accessing some global 
data or stack while the Data MMU is OFF ?

Christophe


> 
> /**
>   * flush_dcache_icache_phys() - Flush a page by it's physical address
>   * @addr: the physical address of the page
>   */
> static void flush_dcache_icache_phys(unsigned long addr)
> {
> 	register unsigned long msr;
> 	register unsigned long dlines = PAGE_SIZE >> l1_dcache_shift();
> 	register unsigned long dbytes = l1_dcache_bytes();
> 	register unsigned long ilines = PAGE_SIZE >> l1_icache_shift();
> 	register unsigned long ibytes = l1_icache_bytes();
> 	register unsigned long i;
> 	register unsigned long address = addr;
> 
> 	/*
> 	 * Clear the DR bit so that we operate on physical
> 	 * rather than virtual addresses
> 	 */
> 	msr = mfmsr();
> 	mtmsr(msr & ~(MSR_DR));
> 
> 	/* Write out the data cache */
> 	for (i = 0; i < dlines; i++, address += dbytes)
> 		dcbst((void *)address);
> 
> 	/* Invalidate the instruction cache */
> 	address = addr;
> 	for (i = 0; i < ilines; i++, address += ibytes)
> 		icbi((void *)address);
> 
> 	mtmsr(msr);
> }
> 
> void test_flush_phys(unsigned long addr)
> {
> 	flush_dcache_icache_phys(addr);
> }
> 
> 
> This gives the following assembler (using pmac32_defconfig):
> 000003cc <test_flush_phys>:
>   3cc:   94 21 ff f0     stwu    r1,-16(r1)
>   3d0:   7d 00 00 a6     mfmsr   r8
>   3d4:   55 09 07 34     rlwinm  r9,r8,0,28,26
>   3d8:   7d 20 01 24     mtmsr   r9
>   3dc:   39 20 00 80     li      r9,128
>   3e0:   7d 29 03 a6     mtctr   r9
>   3e4:   39 43 10 00     addi    r10,r3,4096
>   3e8:   7c 69 1b 78     mr      r9,r3
>   3ec:   7c 00 48 6c     dcbst   0,r9
>   3f0:   39 29 00 20     addi    r9,r9,32
>   3f4:   42 00 ff f8     bdnz    3ec <test_flush_phys+0x20>
>   3f8:   7c 00 1f ac     icbi    0,r3
>   3fc:   38 63 00 20     addi    r3,r3,32
>   400:   7f 8a 18 40     cmplw   cr7,r10,r3
>   404:   40 9e ff f4     bne     cr7,3f8 <test_flush_phys+0x2c>
>   408:   7d 00 01 24     mtmsr   r8
>   40c:   38 21 00 10     addi    r1,r1,16
>   410:   4e 80 00 20     blr
> 
>
Alastair D'Silva Aug. 22, 2019, 12:27 a.m. UTC | #3
On Wed, 2019-08-21 at 22:27 +0200, Christophe Leroy wrote:
> 
> Le 20/08/2019 à 06:36, Alastair D'Silva a écrit :
> > On Fri, 2019-08-16 at 15:52 +0000, Christophe Leroy wrote:
> 
> [...]
> 
> > 
> > Thanks Christophe,
> > 
> > I'm trying a somewhat different approach that requires less
> > knowledge
> > of assembler. Handling of CPU_FTR_COHERENT_ICACHE is outside this
> > function. The code below is not a patch as my tree is a bit messy,
> > sorry:
> 
> Can we be 100% sure that GCC won't add any code accessing some
> global 
> data or stack while the Data MMU is OFF ?
> 
> Christophe
> 

+mpe

I'm not sure how we would go about making such a guarantee, but I've
tied every variable used to a register and addr is passed in a
register, so there is no stack usage, and every call in there only
operates on it's operands.

The calls to the inline cache helpers (for the PPC32 case) are all
constants, so I can't see a reasonable scenario where there would be a
function call and reordered to after the DR bit is turned off, but I
guess if we want to be paranoid, we could always add an mb() call
before the DR bit is manipulated to prevent the compiler from
reordering across the section where the data MMU is disabled.


> 
> > /**
> >   * flush_dcache_icache_phys() - Flush a page by it's physical
> > address
> >   * @addr: the physical address of the page
> >   */
> > static void flush_dcache_icache_phys(unsigned long addr)
> > {
> > 	register unsigned long msr;
> > 	register unsigned long dlines = PAGE_SIZE >> l1_dcache_shift();
> > 	register unsigned long dbytes = l1_dcache_bytes();
> > 	register unsigned long ilines = PAGE_SIZE >> l1_icache_shift();
> > 	register unsigned long ibytes = l1_icache_bytes();
> > 	register unsigned long i;
> > 	register unsigned long address = addr;
> > 
> > 	/*
> > 	 * Clear the DR bit so that we operate on physical
> > 	 * rather than virtual addresses
> > 	 */
> > 	msr = mfmsr();
> > 	mtmsr(msr & ~(MSR_DR));
> > 
> > 	/* Write out the data cache */
> > 	for (i = 0; i < dlines; i++, address += dbytes)
> > 		dcbst((void *)address);
> > 
> > 	/* Invalidate the instruction cache */
> > 	address = addr;
> > 	for (i = 0; i < ilines; i++, address += ibytes)
> > 		icbi((void *)address);
> > 
> > 	mtmsr(msr);
> > }
> > 
> > void test_flush_phys(unsigned long addr)
> > {
> > 	flush_dcache_icache_phys(addr);
> > }
> > 
> > 
> > This gives the following assembler (using pmac32_defconfig):
> > 000003cc <test_flush_phys>:
> >   3cc:   94 21 ff f0     stwu    r1,-16(r1)
> >   3d0:   7d 00 00 a6     mfmsr   r8
> >   3d4:   55 09 07 34     rlwinm  r9,r8,0,28,26
> >   3d8:   7d 20 01 24     mtmsr   r9
> >   3dc:   39 20 00 80     li      r9,128
> >   3e0:   7d 29 03 a6     mtctr   r9
> >   3e4:   39 43 10 00     addi    r10,r3,4096
> >   3e8:   7c 69 1b 78     mr      r9,r3
> >   3ec:   7c 00 48 6c     dcbst   0,r9
> >   3f0:   39 29 00 20     addi    r9,r9,32
> >   3f4:   42 00 ff f8     bdnz    3ec <test_flush_phys+0x20>
> >   3f8:   7c 00 1f ac     icbi    0,r3
> >   3fc:   38 63 00 20     addi    r3,r3,32
> >   400:   7f 8a 18 40     cmplw   cr7,r10,r3
> >   404:   40 9e ff f4     bne     cr7,3f8 <test_flush_phys+0x2c>
> >   408:   7d 00 01 24     mtmsr   r8
> >   40c:   38 21 00 10     addi    r1,r1,16
> >   410:   4e 80 00 20     blr
> > 
> >
Christophe Leroy Aug. 22, 2019, 5:06 a.m. UTC | #4
Le 22/08/2019 à 02:27, Alastair D'Silva a écrit :
> On Wed, 2019-08-21 at 22:27 +0200, Christophe Leroy wrote:
>>
>> Le 20/08/2019 à 06:36, Alastair D'Silva a écrit :
>>> On Fri, 2019-08-16 at 15:52 +0000, Christophe Leroy wrote:
>>
>> [...]
>>
>>>
>>> Thanks Christophe,
>>>
>>> I'm trying a somewhat different approach that requires less
>>> knowledge
>>> of assembler. Handling of CPU_FTR_COHERENT_ICACHE is outside this
>>> function. The code below is not a patch as my tree is a bit messy,
>>> sorry:
>>
>> Can we be 100% sure that GCC won't add any code accessing some
>> global
>> data or stack while the Data MMU is OFF ?
>>
>> Christophe
>>
> 
> +mpe
> 
> I'm not sure how we would go about making such a guarantee, but I've
> tied every variable used to a register and addr is passed in a
> register, so there is no stack usage, and every call in there only
> operates on it's operands.
> 
> The calls to the inline cache helpers (for the PPC32 case) are all
> constants, so I can't see a reasonable scenario where there would be a
> function call and reordered to after the DR bit is turned off, but I
> guess if we want to be paranoid, we could always add an mb() call
> before the DR bit is manipulated to prevent the compiler from
> reordering across the section where the data MMU is disabled.
> 
> 

Anyway, I think the benefit of converting that function to C is pretty 
small. flush_dcache_range() and friends were converted to C mainly in 
order to inline them. But this __flush_dcache_icache_phys() is too big 
to be worth inlining, yet small and stable enough to remain in assembly 
for the time being.

So I suggest you keep it aside your series for now, just move 
PURGE_PREFETCHED_INS inside it directly as it will be the only remaining 
user of it.

Christophe
Alastair D'Silva Aug. 22, 2019, 5:47 a.m. UTC | #5
On Thu, 2019-08-22 at 07:06 +0200, Christophe Leroy wrote:
> 
> Le 22/08/2019 à 02:27, Alastair D'Silva a écrit :
> > On Wed, 2019-08-21 at 22:27 +0200, Christophe Leroy wrote:
> > > Le 20/08/2019 à 06:36, Alastair D'Silva a écrit :
> > > > On Fri, 2019-08-16 at 15:52 +0000, Christophe Leroy wrote:
> > > 
> > > [...]
> > > 
> > > > Thanks Christophe,
> > > > 
> > > > I'm trying a somewhat different approach that requires less
> > > > knowledge
> > > > of assembler. Handling of CPU_FTR_COHERENT_ICACHE is outside
> > > > this
> > > > function. The code below is not a patch as my tree is a bit
> > > > messy,
> > > > sorry:
> > > 
> > > Can we be 100% sure that GCC won't add any code accessing some
> > > global
> > > data or stack while the Data MMU is OFF ?
> > > 
> > > Christophe
> > > 
> > 
> > +mpe
> > 
> > I'm not sure how we would go about making such a guarantee, but
> > I've
> > tied every variable used to a register and addr is passed in a
> > register, so there is no stack usage, and every call in there only
> > operates on it's operands.
> > 
> > The calls to the inline cache helpers (for the PPC32 case) are all
> > constants, so I can't see a reasonable scenario where there would
> > be a
> > function call and reordered to after the DR bit is turned off, but
> > I
> > guess if we want to be paranoid, we could always add an mb() call
> > before the DR bit is manipulated to prevent the compiler from
> > reordering across the section where the data MMU is disabled.
> > 
> > 
> 
> Anyway, I think the benefit of converting that function to C is
> pretty 
> small. flush_dcache_range() and friends were converted to C mainly
> in 
> order to inline them. But this __flush_dcache_icache_phys() is too
> big 
> to be worth inlining, yet small and stable enough to remain in
> assembly 
> for the time being.
> 
I disagree on this point, after converting it to C, using
44x/currituck.defconfig, the compiler definitely will inline it (noting
that there is only 1 caller of it):

00000134 <flush_dcache_icache_page>:
 134:   94 21 ff f0     stwu    r1,-16(r1)
 138:   3d 20 00 00     lis     r9,0
 13c:   81 29 00 00     lwz     r9,0(r9)
 140:   7c 08 02 a6     mflr    r0
 144:   38 81 00 0c     addi    r4,r1,12
 148:   90 01 00 14     stw     r0,20(r1)
 14c:   91 21 00 0c     stw     r9,12(r1)
 150:   48 00 00 01     bl      150 <flush_dcache_icache_page+0x1c>
 154:   39 00 00 20     li      r8,32
 158:   39 43 10 00     addi    r10,r3,4096
 15c:   7c 69 1b 78     mr      r9,r3
 160:   7d 09 03 a6     mtctr   r8
 164:   7c 00 48 6c     dcbst   0,r9
 168:   39 29 00 80     addi    r9,r9,128
 16c:   42 00 ff f8     bdnz    164 <flush_dcache_icache_page+0x30>
 170:   7c 00 04 ac     hwsync
 174:   7c 69 1b 78     mr      r9,r3
 178:   7c 00 4f ac     icbi    0,r9
 17c:   39 29 00 80     addi    r9,r9,128
 180:   7f 8a 48 40     cmplw   cr7,r10,r9
 184:   40 9e ff f4     bne     cr7,178 <flush_dcache_icache_page+0x44>
 188:   7c 00 04 ac     hwsync
 18c:   4c 00 01 2c     isync
 190:   80 01 00 14     lwz     r0,20(r1)
 194:   38 21 00 10     addi    r1,r1,16
 198:   7c 08 03 a6     mtlr    r0
 19c:   48 00 00 00     b       19c <flush_dcache_icache_page+0x68>


> So I suggest you keep it aside your series for now, just move 
> PURGE_PREFETCHED_INS inside it directly as it will be the only
> remaining 
> user of it.
> 
> Christophe
Michael Ellerman Sept. 2, 2019, 1:48 a.m. UTC | #6
"Alastair D'Silva" <alastair@au1.ibm.com> writes:
> On Wed, 2019-08-21 at 22:27 +0200, Christophe Leroy wrote:
>> 
>> Le 20/08/2019 à 06:36, Alastair D'Silva a écrit :
>> > On Fri, 2019-08-16 at 15:52 +0000, Christophe Leroy wrote:
>> 
>> [...]
>> 
>> > 
>> > Thanks Christophe,
>> > 
>> > I'm trying a somewhat different approach that requires less
>> > knowledge
>> > of assembler. Handling of CPU_FTR_COHERENT_ICACHE is outside this
>> > function. The code below is not a patch as my tree is a bit messy,
>> > sorry:
>> 
>> Can we be 100% sure that GCC won't add any code accessing some
>> global data or stack while the Data MMU is OFF ?
>
> +mpe
>
> I'm not sure how we would go about making such a guarantee, but I've
> tied every variable used to a register and addr is passed in a
> register, so there is no stack usage, and every call in there only
> operates on it's operands.

That's not safe, I can believe it happens to work but the compiler
people will laugh at us if it ever breaks.

Let's leave it in asm.

cheers
Segher Boessenkool Sept. 2, 2019, 11:11 a.m. UTC | #7
On Mon, Sep 02, 2019 at 11:48:59AM +1000, Michael Ellerman wrote:
> "Alastair D'Silva" <alastair@au1.ibm.com> writes:
> > On Wed, 2019-08-21 at 22:27 +0200, Christophe Leroy wrote:
> >> Can we be 100% sure that GCC won't add any code accessing some
> >> global data or stack while the Data MMU is OFF ?
> >
> > +mpe
> >
> > I'm not sure how we would go about making such a guarantee, but I've
> > tied every variable used to a register and addr is passed in a
> > register, so there is no stack usage, and every call in there only
> > operates on it's operands.
> 
> That's not safe, I can believe it happens to work but the compiler
> people will laugh at us if it ever breaks.

Yes.  Sorry.

> Let's leave it in asm.

+1

The asm is simpler, more readable, more maintainable, and perhaps more
performant even.  Plus the being-laughed-at issue.


Segher
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index 1826bf2cc137..bf4f2dc4eb76 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -47,6 +47,14 @@  void flush_icache_user_range(struct vm_area_struct *vma,
 				    struct page *page, unsigned long addr,
 				    int len);
 void flush_dcache_icache_page(struct page *page);
+#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
+void __flush_dcache_icache_phys(unsigned long physaddr);
+#else
+static inline void __flush_dcache_icache_phys(unsigned long physaddr)
+{
+	BUG();
+}
+#endif
 
 /**
  * flush_dcache_range(): Write any modified data cache blocks out to memory and invalidate them.
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 43be99de7c9a..43009f9227c4 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -402,6 +402,50 @@  void flush_dcache_page(struct page *page)
 }
 EXPORT_SYMBOL(flush_dcache_page);
 
+#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
+void __flush_dcache_icache_phys(unsigned long physaddr)
+{
+	unsigned long bytes = l1_dcache_bytes();
+	unsigned long nb = PAGE_SIZE / bytes;
+	unsigned long addr = physaddr & PAGE_MASK;
+	unsigned long msr, msr0;
+	unsigned long loop1 = addr, loop2 = addr;
+
+	if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
+		/* For a snooping icache, we still need a dummy icbi to purge all the
+		 * prefetched instructions from the ifetch buffers. We also need a sync
+		 * before the icbi to order the the actual stores to memory that might
+		 * have modified instructions with the icbi.
+		 */
+		mb(); /* sync */
+		icbi((void *)addr);
+		mb(); /* sync */
+		isync();
+		return;
+	}
+	msr0 = mfmsr();
+	msr = msr0 & ~MSR_DR;
+	asm volatile(
+	    "	mtctr %2;"
+	    "	mtmsr %3;"
+	    "	isync;"
+	    "0:	dcbst	0, %0;"
+	    "	addi	%0, %0, %4;"
+	    "	bdnz	0b;"
+	    "	sync;"
+	    "	mtctr %2;"
+	    "1:	icbi	0, %1;"
+	    "	addi	%1, %1, %4;"
+	    "	bdnz	1b;"
+	    "	sync;"
+	    "	mtmsr %5;"
+	    "	isync;"
+	    : "+r" (loop1), "+r" (loop2)
+	    : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
+	    : "ctr", "memory");
+}
+#endif
+
 void flush_dcache_icache_page(struct page *page)
 {
 #ifdef CONFIG_HUGETLB_PAGE
@@ -419,16 +463,7 @@  void flush_dcache_icache_page(struct page *page)
 		__flush_dcache_icache(start);
 		kunmap_atomic(start);
 	} else {
-		unsigned long msr = mfmsr();
-
-		/* Clear the DR bit so that we operate on physical
-		 * rather than virtual addresses
-		 */
-		mtmsr(msr & ~(MSR_DR));
-
-		__flush_dcache_icache((void *)physaddr);
-
-		mtmsr(msr);
+		__flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
 	}
 #endif
 }