Patchwork powerpc: provide __bswapdi2

login
register
mail settings
Submitter David Woodhouse
Date May 10, 2013, 9:18 p.m.
Message ID <1368220707.3378.57.camel@shinybook.infradead.org>
Download mbox | patch
Permalink /patch/243067/
State Superseded
Headers show

Comments

David Woodhouse - May 10, 2013, 9:18 p.m.
From: David Woodhouse <David.Woodhouse@intel.com>

Some versions of GCC apparently expect this to be provided by libgcc.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
Untested.
Anton Blanchard - May 13, 2013, 6:48 a.m.
On Fri, 10 May 2013 22:18:27 +0100
David Woodhouse <dwmw2@infradead.org> wrote:

> From: David Woodhouse <David.Woodhouse@intel.com>
> 
> Some versions of GCC apparently expect this to be provided by libgcc.

Thanks Dave. We were discussing this with Alan Modra and he doesn't
think the 64bit target should ever emit a call to __bswapdi2. Did you
only see it on 32bit, or 64bit as well?

Alan: I notice Dave is adding calls to __builtin_bswap, perhaps some
versions of the 64bit compiler did emit __bswapdi2 calls for that.

Anton

> 
> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> ---
> Untested.
> 
> diff --git a/arch/powerpc/kernel/misc_32.S
> b/arch/powerpc/kernel/misc_32.S index 19e096b..f077dc2 100644
> --- a/arch/powerpc/kernel/misc_32.S
> +++ b/arch/powerpc/kernel/misc_32.S
> @@ -657,6 +657,17 @@ _GLOBAL(__ucmpdi2)
>  	li	r3,2
>  	blr
>  
> +_GLOBAL(__bswapdi2)
> +	rlwinm	10,4,8,0xffffffff
> +	rlwinm	11,3,8,0xffffffff
> +	rlwimi	10,4,24,0,7
> +	rlwimi	11,3,24,0,7
> +	rlwimi	10,4,24,16,23
> +	rlwimi	11,3,24,16,23
> +	mr	4,11
> +	mr	3,10
> +	blr
> +
>  _GLOBAL(abs)
>  	srawi	r4,r3,31
>  	xor	r3,r3,r4
> diff --git a/arch/powerpc/kernel/misc_64.S
> b/arch/powerpc/kernel/misc_64.S index 5cfa800..3b2e6e8 100644
> --- a/arch/powerpc/kernel/misc_64.S
> +++ b/arch/powerpc/kernel/misc_64.S
> @@ -234,6 +234,18 @@ _GLOBAL(__flush_dcache_icache)
>  	isync
>  	blr
>  
> +_GLOBAL(__bswapdi2)
> +	srdi	8,3,32
> +	rlwinm	7,3,8,0xffffffff
> +	rlwimi	7,3,24,0,7
> +	rlwinm	9,8,8,0xffffffff
> +	rlwimi	7,3,24,16,23
> +	rlwimi	9,8,24,0,7
> +	rlwimi	9,8,24,16,23
> +	sldi	7,7,32
> +	or	7,7,9
> +	mr	3,7
> +	blr
>  
>  #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
>  /*
> diff --git a/arch/powerpc/kernel/ppc_ksyms.c
> b/arch/powerpc/kernel/ppc_ksyms.c index 78b8766..c296665 100644
> --- a/arch/powerpc/kernel/ppc_ksyms.c
> +++ b/arch/powerpc/kernel/ppc_ksyms.c
> @@ -143,7 +143,8 @@ EXPORT_SYMBOL(__lshrdi3);
>  int __ucmpdi2(unsigned long long, unsigned long long);
>  EXPORT_SYMBOL(__ucmpdi2);
>  #endif
> -
> +long long __bswapdi2(long long);
> +EXPORT_SYMBOL(__bswapdi2);
>  EXPORT_SYMBOL(memcpy);
>  EXPORT_SYMBOL(memset);
>  EXPORT_SYMBOL(memmove);
>
Michael Neuling - May 13, 2013, 7:09 a.m.
David Woodhouse <dwmw2@infradead.org> wrote:

> From: David Woodhouse <David.Woodhouse@intel.com>
> 
> Some versions of GCC apparently expect this to be provided by libgcc.
> 
> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> ---
> Untested.
> 
> diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
> index 19e096b..f077dc2 100644
> --- a/arch/powerpc/kernel/misc_32.S
> +++ b/arch/powerpc/kernel/misc_32.S
> @@ -657,6 +657,17 @@ _GLOBAL(__ucmpdi2)
>  	li	r3,2
>  	blr
>  
> +_GLOBAL(__bswapdi2)
> +	rlwinm	10,4,8,0xffffffff
> +	rlwinm	11,3,8,0xffffffff
> +	rlwimi	10,4,24,0,7
> +	rlwimi	11,3,24,0,7
> +	rlwimi	10,4,24,16,23
> +	rlwimi	11,3,24,16,23
> +	mr	4,11
> +	mr	3,10
> +	blr
> +

This doesn't work for me but the below does:

_GLOBAL(__bswapdi2)
	rotlwi  r9,r4,8
	rotlwi  r10,r3,8
	rlwimi  r9,r4,24,0,7
	rlwimi  r10,r3,24,0,7
	rlwimi  r9,r4,24,16,23
	rlwimi  r10,r3,24,16,23
	mr      r4,r10
	mr      r3,r9
	blr

stolen from GCC -O2 output of:
  unsigned long long __bswapdi2(unsigned long long x)
  {
	 return ((x & 0x00000000000000ffULL) << 56) |
		((x & 0x000000000000ff00ULL) << 40) |
		((x & 0x0000000000ff0000ULL) << 24) |
		((x & 0x00000000ff000000ULL) <<  8) |
		((x & 0x000000ff00000000ULL) >>  8) |
		((x & 0x0000ff0000000000ULL) >> 24) |
		((x & 0x00ff000000000000ULL) >> 40) |
		((x & 0xff00000000000000ULL) >> 56);
  }

>  _GLOBAL(abs)
>  	srawi	r4,r3,31
>  	xor	r3,r3,r4
> diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
> index 5cfa800..3b2e6e8 100644
> --- a/arch/powerpc/kernel/misc_64.S
> +++ b/arch/powerpc/kernel/misc_64.S
> @@ -234,6 +234,18 @@ _GLOBAL(__flush_dcache_icache)
>  	isync
>  	blr
>  
> +_GLOBAL(__bswapdi2)
> +	srdi	8,3,32
> +	rlwinm	7,3,8,0xffffffff
> +	rlwimi	7,3,24,0,7
> +	rlwinm	9,8,8,0xffffffff
> +	rlwimi	7,3,24,16,23
> +	rlwimi	9,8,24,0,7
> +	rlwimi	9,8,24,16,23
> +	sldi	7,7,32
> +	or	7,7,9
> +	mr	3,7
> +	blr

This works but we should add "r" to the register names.

I'll repost

Mikey

>  
>  #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
>  /*
> diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
> index 78b8766..c296665 100644
> --- a/arch/powerpc/kernel/ppc_ksyms.c
> +++ b/arch/powerpc/kernel/ppc_ksyms.c
> @@ -143,7 +143,8 @@ EXPORT_SYMBOL(__lshrdi3);
>  int __ucmpdi2(unsigned long long, unsigned long long);
>  EXPORT_SYMBOL(__ucmpdi2);
>  #endif
> -
> +long long __bswapdi2(long long);
> +EXPORT_SYMBOL(__bswapdi2);
>  EXPORT_SYMBOL(memcpy);
>  EXPORT_SYMBOL(memset);
>  EXPORT_SYMBOL(memmove);
> 
> -- 
> dwmw2
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
Alan Modra - May 13, 2013, 7:20 a.m.
On Mon, May 13, 2013 at 04:48:19PM +1000, Anton Blanchard wrote:
> On Fri, 10 May 2013 22:18:27 +0100
> David Woodhouse <dwmw2@infradead.org> wrote:
> 
> > From: David Woodhouse <David.Woodhouse@intel.com>
> > 
> > Some versions of GCC apparently expect this to be provided by libgcc.
> 
> Thanks Dave. We were discussing this with Alan Modra and he doesn't
> think the 64bit target should ever emit a call to __bswapdi2. Did you
> only see it on 32bit, or 64bit as well?
> 
> Alan: I notice Dave is adding calls to __builtin_bswap, perhaps some
> versions of the 64bit compiler did emit __bswapdi2 calls for that.

I did a little digging, and it looks like gcc-4.4 will emit __bswapdi2
calls.  Support in rs6000.md appeared 2009-06-25.
Gabriel Paubert - May 13, 2013, 7:33 a.m.
On Mon, May 13, 2013 at 05:09:59PM +1000, Michael Neuling wrote:
> David Woodhouse <dwmw2@infradead.org> wrote:
> 
> > From: David Woodhouse <David.Woodhouse@intel.com>
> > 
> > Some versions of GCC apparently expect this to be provided by libgcc.
> > 
> > Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
> > ---
> > Untested.
> > 
> > diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
> > index 19e096b..f077dc2 100644
> > --- a/arch/powerpc/kernel/misc_32.S
> > +++ b/arch/powerpc/kernel/misc_32.S
> > @@ -657,6 +657,17 @@ _GLOBAL(__ucmpdi2)
> >  	li	r3,2
> >  	blr
> >  
> > +_GLOBAL(__bswapdi2)
> > +	rlwinm	10,4,8,0xffffffff
> > +	rlwinm	11,3,8,0xffffffff
> > +	rlwimi	10,4,24,0,7
> > +	rlwimi	11,3,24,0,7
> > +	rlwimi	10,4,24,16,23
> > +	rlwimi	11,3,24,16,23
> > +	mr	4,11
> > +	mr	3,10
> > +	blr
> > +
> 
> This doesn't work for me but the below does:
> 
> _GLOBAL(__bswapdi2)
> 	rotlwi  r9,r4,8
> 	rotlwi  r10,r3,8
> 	rlwimi  r9,r4,24,0,7
> 	rlwimi  r10,r3,24,0,7
> 	rlwimi  r9,r4,24,16,23
> 	rlwimi  r10,r3,24,16,23
> 	mr      r4,r10
> 	mr      r3,r9
> 	blr
> 

Actually, I'd swap the two mr instructions to never
have an instruction that uses the result from the
previous one. 


> stolen from GCC -02 output of:
>   unsigned long long __bswapdi2(unsigned long long x)
>   {
> 	 return ((x & 0x00000000000000ffULL) << 56) |
> 		((x & 0x000000000000ff00ULL) << 40) |
> 		((x & 0x0000000000ff0000ULL) << 24) |
> 		((x & 0x00000000ff000000ULL) <<  8) |
> 		((x & 0x000000ff00000000ULL) >>  8) |
> 		((x & 0x0000ff0000000000ULL) >> 24) |
> 		((x & 0x00ff000000000000ULL) >> 40) |
> 		((x & 0xff00000000000000ULL) >> 56);
>   }
> 
> >  _GLOBAL(abs)
> >  	srawi	r4,r3,31
> >  	xor	r3,r3,r4
> > diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
> > index 5cfa800..3b2e6e8 100644
> > --- a/arch/powerpc/kernel/misc_64.S
> > +++ b/arch/powerpc/kernel/misc_64.S
> > @@ -234,6 +234,18 @@ _GLOBAL(__flush_dcache_icache)
> >  	isync
> >  	blr
> >  
> > +_GLOBAL(__bswapdi2)
> > +	srdi	8,3,32
> > +	rlwinm	7,3,8,0xffffffff
> > +	rlwimi	7,3,24,0,7
> > +	rlwinm	9,8,8,0xffffffff
> > +	rlwimi	7,3,24,16,23
> > +	rlwimi	9,8,24,0,7
> > +	rlwimi	9,8,24,16,23
> > +	sldi	7,7,32
> > +	or	7,7,9
> > +	mr	3,7
> > +	blr
> 
> This works but we should add "r" to the register names.
> 

And merge the last two instructions as a single "or r3,r7,r9".


	Gabriel
David Woodhouse - May 13, 2013, 10:27 a.m.
On Mon, 2013-05-13 at 16:50 +0930, Alan Modra wrote:
> I did a little digging, and it looks like gcc-4.4 will emit __bswapdi2
> calls.  Support in rs6000.md appeared 2009-06-25.

That's where it was seen. I don't have anything that ancient any more so
it didn't show up in my testing, but Al saw it on a Debian system.
David Woodhouse - May 13, 2013, 10:33 a.m.
On Mon, 2013-05-13 at 17:09 +1000, Michael Neuling wrote:
> 
> This doesn't work for me but the below does:
 ...
> 
> stolen from GCC -02 output of:
>   unsigned long long __bswapdi2(unsigned long long x)

Hm, so was mine:

[dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32
	.file	"bswapdi2.c"
	.section	".text"
	.align 2
	.p2align 4,,15
	.globl __bswapdi2
	.type	__bswapdi2, @function
__bswapdi2:
	rlwinm 10,4,8,0xffffffff
	rlwinm 11,3,8,0xffffffff
	rlwimi 10,4,24,0,7
	rlwimi 11,3,24,0,7
	rlwimi 10,4,24,16,23
	rlwimi 11,3,24,16,23
	mr 4,11
	mr 3,10
	blr
	.size	__bswapdi2,.-__bswapdi2
	.ident	"GCC: (GNU) 4.7.2 20121105 (Red Hat 4.7.2-2.aa.20121114svn)"

On Mon, 2013-05-13 at 09:33 +0200, Gabriel Paubert wrote:
> Actually, I'd swap the two mr instructions to never
> have an instruction that uses the result from the
> previous one. 

Bad GCC. No biscuit.

Should we file a PR?
David Woodhouse - May 13, 2013, 10:38 a.m.
On Mon, 2013-05-13 at 11:33 +0100, David Woodhouse wrote:
> 
> On Mon, 2013-05-13 at 09:33 +0200, Gabriel Paubert wrote:
> > Actually, I'd swap the two mr instructions to never
> > have an instruction that uses the result from the
> > previous one. 
> 
> Bad GCC. No biscuit.
> 
> Should we file a PR? 

Maybe not. If you tell it to tune for an in-order machine like Cell, it
swaps them round. Although now I'm confused about which of POWER[567]
were in-order:

[dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32  | grep -B1 mr
	rlwimi 11,3,24,16,23
	mr 4,11
	mr 3,10
[dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=cell | grep -B1 mr
	rlwimi 11,3,24,16,23
	mr 3,10
	mr 4,11
[dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power5 | grep -B1 mr
	rlwimi 11,3,24,16,23
	mr 3,10
	mr 4,11
[dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power6 | grep -B1 mr
	rlwimi 11,3,24,16,23
	mr 4,11
	mr 3,10
[dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power7 | grep -B1 mr
	rlwimi 11,3,24,16,23
	mr 4,11
	mr 3,10
Joakim Tjernlund - May 13, 2013, 10:51 a.m.
"Linuxppc-dev" 
<linuxppc-dev-bounces+joakim.tjernlund=transmode.se@lists.ozlabs.org> 
wrote 2013/05/13 12:38:13:
> 
> On Mon, 2013-05-13 at 11:33 +0100, David Woodhouse wrote:
> > 
> > On Mon, 2013-05-13 at 09:33 +0200, Gabriel Paubert wrote:
> > > Actually, I'd swap the two mr instructions to never
> > > have an instruction that uses the result from the
> > > previous one. 
> > 
> > Bad GCC. No biscuit.
> > 
> > Should we file a PR? 
> 
> Maybe not. If you tell it to tune for an in-order machine like Cell, it
> swaps them round. Although now I'm confused about which of POWER[567]
> were in-order:
> 
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32  | grep -B1 mr
>    rlwimi 11,3,24,16,23
>    mr 4,11
>    mr 3,10
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=cell | grep -B1 mr
>    rlwimi 11,3,24,16,23
>    mr 3,10
>    mr 4,11
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power5 | grep -B1 mr
>    rlwimi 11,3,24,16,23
>    mr 3,10
>    mr 4,11
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power6 | grep -B1 mr
>    rlwimi 11,3,24,16,23
>    mr 4,11
>    mr 3,10
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power7 | grep -B1 mr
>    rlwimi 11,3,24,16,23
>    mr 4,11
>    mr 3,10

A bit rusty on the ppc asm but can you not remove the mr completely:
                 rlwimi 10,4,24,16,23
                 rlwimi 11,3,24,16,23
                 mr 4,11
                 mr 3,10
to
                 rlwimi 4,4,24,16,23
                 rlwimi 3,3,24,16,23

 Jocke
Joakim Tjernlund - May 13, 2013, 11:12 a.m.
"Linuxppc-dev" 
<linuxppc-dev-bounces+joakim.tjernlund=transmode.se@lists.ozlabs.org> 
wrote on 2013/05/13 12:51:59:
> 
> "Linuxppc-dev" 
> <linuxppc-dev-bounces+joakim.tjernlund=transmode.se@lists.ozlabs.org> 
> wrote 2013/05/13 12:38:13:
> > 
> > On Mon, 2013-05-13 at 11:33 +0100, David Woodhouse wrote:
> > > 
> > > On Mon, 2013-05-13 at 09:33 +0200, Gabriel Paubert wrote:
> > > > Actually, I'd swap the two mr instructions to never
> > > > have an instruction that uses the result from the
> > > > previous one. 
> > > 
> > > Bad GCC. No biscuit.
> > > 
> > > Should we file a PR? 
> > 
> > Maybe not. If you tell it to tune for an in-order machine like Cell, it
> > swaps them round. Although now I'm confused about which of POWER[567]
> > were in-order:
> >
> > [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32  | grep -B1 mr
> >    rlwimi 11,3,24,16,23
> >    mr 4,11
> >    mr 3,10
> > [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=cell | grep -B1 mr
> >    rlwimi 11,3,24,16,23
> >    mr 3,10
> >    mr 4,11
> > [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power5 | grep -B1 mr
> >    rlwimi 11,3,24,16,23
> >    mr 3,10
> >    mr 4,11
> > [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power6 | grep -B1 mr
> >    rlwimi 11,3,24,16,23
> >    mr 4,11
> >    mr 3,10
> > [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power7 | grep -B1 mr
> >    rlwimi 11,3,24,16,23
> >    mr 4,11
> >    mr 3,10
> 
> A bit rusty on the ppc asm but can you not remove the mr completely:
>                  rlwimi 10,4,24,16,23
>                  rlwimi 11,3,24,16,23
>                  mr 4,11
>                  mr 3,10
> to
>                  rlwimi 4,4,24,16,23
>                  rlwimi 3,3,24,16,23

Oops, that got twisted. Forget my comment.
Gabriel Paubert - May 13, 2013, 1:12 p.m.
On Mon, May 13, 2013 at 11:38:13AM +0100, David Woodhouse wrote:
> On Mon, 2013-05-13 at 11:33 +0100, David Woodhouse wrote:
> > 
> > On Mon, 2013-05-13 at 09:33 +0200, Gabriel Paubert wrote:
> > > Actually, I'd swap the two mr instructions to never
> > > have an instruction that uses the result from the
> > > previous one. 
> > 
> > Bad GCC. No biscuit.
> > 
> > Should we file a PR? 
> 
> Maybe not. If you tell it to tune for an in-order machine like Cell, it
> swaps them round. Although now I'm confused about which of POWER[567]
> were in-order:

It was Power6 IIRC. On this kind of fine point, don't rely too much
on what GCC produces.

> 
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32  | grep -B1 mr
> 	rlwimi 11,3,24,16,23
> 	mr 4,11
> 	mr 3,10
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=cell | grep -B1 mr
> 	rlwimi 11,3,24,16,23
> 	mr 3,10
> 	mr 4,11
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power5 | grep -B1 mr
> 	rlwimi 11,3,24,16,23
> 	mr 3,10
> 	mr 4,11
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power6 | grep -B1 mr
> 	rlwimi 11,3,24,16,23
> 	mr 4,11
> 	mr 3,10
> [dwmw2@i7 ~]$ powerpc64-linux-gnu-gcc -O2 -S -o- bswapdi2.c -m32 -mtune=power7 | grep -B1 mr
> 	rlwimi 11,3,24,16,23
> 	mr 4,11
> 	mr 3,10

I don't know of any processor in which putting the mr 3,10 first can cause stalls, so
even a generic tuning should put it first.

	Gabriel
Segher Boessenkool - May 13, 2013, 4:47 p.m.
>> I did a little digging, and it looks like gcc-4.4 will emit  
>> __bswapdi2
>> calls.  Support in rs6000.md appeared 2009-06-25.
>
> That's where it was seen. I don't have anything that ancient any  
> more so
> it didn't show up in my testing, but Al saw it on a Debian system.

It should never happen on 32-bit -- it is broken into two bswapsi's --
although, old compiler, who knows.  Lack of testing makes some people
nervous though ;-)


Segher
Segher Boessenkool - May 13, 2013, 4:55 p.m.
> 	rlwinm 10,4,8,0xffffffff
> 	rlwinm 11,3,8,0xffffffff
> 	rlwimi 10,4,24,0,7
> 	rlwimi 11,3,24,0,7
> 	rlwimi 10,4,24,16,23
> 	rlwimi 11,3,24,16,23
> 	mr 4,11
> 	mr 3,10

>> Actually, I'd swap the two mr instructions to never
>> have an instruction that uses the result from the
>> previous one.
>
> Bad GCC. No biscuit.
>
> Should we file a PR?

This is scheduled just fine.  Every pair of instructions here can
execute together (on most CPUs, if not all); all instructions after
it are dependent on previous instructions.  There also is no issue
(group) restriction that makes this scheduling suboptimal afaics.


Segher
Stephen Rothwell - May 14, 2013, 1:09 a.m.
Hi Mikey,

On Mon, 13 May 2013 17:09:59 +1000 Michael Neuling <mikey@neuling.org> wrote:
>
> This doesn't work for me but the below does:
> 
> _GLOBAL(__bswapdi2)
> 	rotlwi  r9,r4,8
> 	rotlwi  r10,r3,8
> 	rlwimi  r9,r4,24,0,7
> 	rlwimi  r10,r3,24,0,7
> 	rlwimi  r9,r4,24,16,23
> 	rlwimi  r10,r3,24,16,23
> 	mr      r4,r10
> 	mr      r3,r9
> 	blr
> 
> stolen from GCC -02 output of:
>   unsigned long long __bswapdi2(unsigned long long x)
>   {
> 	 return ((x & 0x00000000000000ffULL) << 56) |
> 		((x & 0x000000000000ff00ULL) << 40) |
> 		((x & 0x0000000000ff0000ULL) << 24) |
> 		((x & 0x00000000ff000000ULL) <<  8) |
> 		((x & 0x000000ff00000000ULL) >>  8) |
> 		((x & 0x0000ff0000000000ULL) >> 24) |
> 		((x & 0x00ff000000000000ULL) >> 40) |
> 		((x & 0xff00000000000000ULL) >> 56);
>   }

So, if we are just stealing the output of gcc, why not just use the C
version (at least for 32 bit)?
Michael Neuling - May 14, 2013, 1:25 a.m.
On 14 May 2013 11:09, "Stephen Rothwell" <sfr@canb.auug.org.au> wrote:
>
> Hi Mikey,
>
> On Mon, 13 May 2013 17:09:59 +1000 Michael Neuling <mikey@neuling.org> wrote:
> >
> > This doesn't work for me but the below does:
> >
> > _GLOBAL(__bswapdi2)
> >       rotlwi  r9,r4,8
> >       rotlwi  r10,r3,8
> >       rlwimi  r9,r4,24,0,7
> >       rlwimi  r10,r3,24,0,7
> >       rlwimi  r9,r4,24,16,23
> >       rlwimi  r10,r3,24,16,23
> >       mr      r4,r10
> >       mr      r3,r9
> >       blr
> >
> > stolen from GCC -02 output of:
> >   unsigned long long __bswapdi2(unsigned long long x)
> >   {
> >        return ((x & 0x00000000000000ffULL) << 56) |
> >               ((x & 0x000000000000ff00ULL) << 40) |
> >               ((x & 0x0000000000ff0000ULL) << 24) |
> >               ((x & 0x00000000ff000000ULL) <<  8) |
> >               ((x & 0x000000ff00000000ULL) >>  8) |
> >               ((x & 0x0000ff0000000000ULL) >> 24) |
> >               ((x & 0x00ff000000000000ULL) >> 40) |
> >               ((x & 0xff00000000000000ULL) >> 56);
> >   }
>
> So, if we are just stealing the output of gcc, why not just use the C
> version (at least for 32 bit)?

Woodhouse: can we just do this?

Mikey
David Woodhouse - May 14, 2013, 6:59 a.m.
On Tue, 2013-05-14 at 11:25 +1000, Michael Neuling wrote:
> 
> > So, if we are just stealing the output of gcc, why not just use the C
> > version (at least for 32 bit)?
> 
> Woodhouse: can we just do this?

Sure, if you don't mind GCC optimising the contents of your C function
by turning it into a call to libgcc's __bswapdi2() :)

OK, you might be able to do some archaeology and determine that the only
compiler that emits calls to __bswapdi2() is GCC 4.4, and furthermore
that the same compiler *doesn't* have the wit to notice that the
contents of the function are a 64-bit byteswap, so it's never going to
happen. But I don't like that approach. I'd feel I have to sacrifice a
goat *anyway*, and I don't have a spare goat.

Although now I come to explicitly explain why I did it that way... it
occurs to me that the libgcc version is just written in C, and the
compiler evidently trusts itself not to optimise that into a recursive
call. Is there a compiler switch which guarantees that, which we could
use without other unwanted side-effects?

Patch

diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 19e096b..f077dc2 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -657,6 +657,17 @@  _GLOBAL(__ucmpdi2)
 	li	r3,2
 	blr
 
+_GLOBAL(__bswapdi2)
+	rlwinm	10,4,8,0xffffffff
+	rlwinm	11,3,8,0xffffffff
+	rlwimi	10,4,24,0,7
+	rlwimi	11,3,24,0,7
+	rlwimi	10,4,24,16,23
+	rlwimi	11,3,24,16,23
+	mr	4,11
+	mr	3,10
+	blr
+
 _GLOBAL(abs)
 	srawi	r4,r3,31
 	xor	r3,r3,r4
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index 5cfa800..3b2e6e8 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -234,6 +234,18 @@  _GLOBAL(__flush_dcache_icache)
 	isync
 	blr
 
+_GLOBAL(__bswapdi2)
+	srdi	8,3,32
+	rlwinm	7,3,8,0xffffffff
+	rlwimi	7,3,24,0,7
+	rlwinm	9,8,8,0xffffffff
+	rlwimi	7,3,24,16,23
+	rlwimi	9,8,24,0,7
+	rlwimi	9,8,24,16,23
+	sldi	7,7,32
+	or	7,7,9
+	mr	3,7
+	blr
 
 #if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_MAPLE)
 /*
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 78b8766..c296665 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -143,7 +143,8 @@  EXPORT_SYMBOL(__lshrdi3);
 int __ucmpdi2(unsigned long long, unsigned long long);
 EXPORT_SYMBOL(__ucmpdi2);
 #endif
-
+long long __bswapdi2(long long);
+EXPORT_SYMBOL(__bswapdi2);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memmove);