
[AArch64] Stop generating BSL for simple integer code

Message ID 1497275080-34732-1-git-send-email-james.greenhalgh@arm.com
State New

Commit Message

James Greenhalgh June 12, 2017, 1:44 p.m. UTC
[Sorry for the re-send. I spotted that the attributes were not right for the
 new pattern I was adding. The change between this and the first version was:

  +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
  +   (set_attr "length" "4,4,4,12")]
]

---

Hi,

In this testcase, all argument registers and the return register
will be general purpose registers:

  long long
  foo (long long a, long long b, long long c)
  {
    return ((a ^ b) & c) ^ b;
  }

However, due to the implementation of aarch64_simd_bsl<mode>_internal,
we'll match that pattern and emit a BSL, forcing all the arguments and
the result to be moved through the Advanced SIMD registers:

	fmov	d2, x0
	fmov	d0, x2
	fmov	d1, x1
	bsl	v0.8b, v2.8b, v1.8b
	fmov	x0, d0

To fix this, we split the DImode case out into a new pattern,
aarch64_simd_bsldi_internal, an insn_and_split that knows how to split
back to individual integer operations if the register allocation falls
that way.
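
With the DImode pattern split this way, the testcase above compiles to
plain integer code; this is the sequence the new no-dimode-bsl.c test
checks for:

	eor	x0, x0, x1
	and	x0, x0, x2
	eor	x0, x0, x1
	ret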

We could have used an unspec, but then we would lose some of the nice
simplifications that come from explicitly spelling out the semantics
of BSL.
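
Note that we don't want to drop the DImode pattern entirely; there are
cases where a DImode BSL is exactly the right code, for example copysign
on double, which is what the new dimode-bsl.c test below checks:

  double
  foo (double a, double b)
  {
    return __builtin_copysign (a, b);
  }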

Bootstrapped on aarch64-none-linux-gnu.

OK?

Thanks,
James

---
gcc/

2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-simd.md
	(aarch64_simd_bsl<mode>_internal): Remove DImode.
	(*aarch64_simd_bsl<mode>_alt): Likewise.
	(aarch64_simd_bsldi_internal): New.

gcc/testsuite/

2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.target/aarch64/no-dimode-bsl.c: New.
	* gcc.target/aarch64/dimode-bsl.c: New.

Comments

James Greenhalgh June 21, 2017, 10:49 a.m. UTC | #1
*ping*

Thanks,
James

On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> [snip: original message and patch quoted in full; see above and the
>  Patch section below]
James Greenhalgh July 3, 2017, 10:47 a.m. UTC | #2
On Wed, Jun 21, 2017 at 11:49:07AM +0100, James Greenhalgh wrote:
> *ping*

*ping*x2

Thanks,
James

> On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> > [snip: original message and patch quoted in full; see above and the
> >  Patch section below]

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5a86ff..7b6b12f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2256,13 +2256,13 @@ 
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_cmp_result> 3)
 	))]
   "TARGET_SIMD"
@@ -2280,14 +2280,14 @@ 
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
+	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:VDQ_I 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2296,6 +2296,45 @@ 
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special; we want to avoid computing, in the vector
+;; registers, operations which are more naturally computed in general
+;; purpose registers.  If we do that, we need to move all three operands
+;; from general purpose registers to vector registers, then back again.
+;; However, we don't want to make this pattern an UNSPEC as we'd lose
+;; scope for optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  */
+  emit_insn (gen_xordi3 (operands[0], operands[2], operands[3]));
+  emit_insn (gen_anddi3 (operands[0], operands[0], operands[1]));
+  emit_insn (gen_xordi3 (operands[0], operands[0], operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_cmp_result> 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
new file mode 100644
index 0000000..4e63511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+   copysign.  */
+
+double
+foo (double a, double b)
+{
+  return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
new file mode 100644
index 0000000..67dfda0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we don't combine to BSL when in DImode, avoiding register
+   moves in the general case.
+
+   We want:
+	eor	x0, x0, x1
+	and	x0, x0, x2
+	eor	x0, x0, x1
+	ret
+
+   Rather than:
+	fmov	d2, x0
+	fmov	d0, x2
+	fmov	d1, x1
+	bsl	v0.8b, v2.8b, v1.8b
+	fmov	x0, d0
+	ret  */
+
+long long
+foo (long long a, long long b, long long c)
+{
+  return ((a ^ b) & c) ^ b;
+}
+
+/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */