| Field | Value |
|---|---|
| Message ID | 1497275080-34732-1-git-send-email-james.greenhalgh@arm.com |
| State | New |
| Headers | show |
*ping* Thanks, James On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote: > [Sorry for the re-send. I spotted that the attributes were not right for the > new pattern I was adding. The change between this and the first version was: > > + [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple") > + (set_attr "length" "4,4,4,12")] > ] > > --- > > Hi, > > In this testcase, all argument registers and the return register > will be general purpose registers: > > long long > foo (long long a, long long b, long long c) > { > return ((a ^ b) & c) ^ b; > } > > However, due to the implementation of aarch64_simd_bsl<mode>_internal > we'll match that pattern and emit a BSL, necessitating moving all those > arguments and results to the Advanced SIMD registers: > > fmov d2, x0 > fmov d0, x2 > fmov d1, x1 > bsl v0.8b, v2.8b, v1.8b > fmov x0, d0 > > To fix this, we turn aarch64_simd_bsldi_internal in to an insn_and_split that > knows to split back to integer operations if the register allocation > falls that way. > > We could have used an unspec, but then we lose some of the nice > simplifications that can be made from explicitly spelling out the semantics > of BSL. > > Bootstrapped on aarch64-none-linux-gnu. > > OK? > > Thanks, > James > > --- > gcc/ > > 2017-06-12 James Greenhalgh <james.greenhalgh@arm.com> > > * config/aarch64/aarch64-simd.md > (aarch64_simd_bsl<mode>_internal): Remove DImode. > (*aarch64_simd_bsl<mode>_alt): Likewise. > (aarch64_simd_bsldi_internal): New. > > gcc/testsuite/ > > 2017-06-12 James Greenhalgh <james.greenhalgh@arm.com> > > * gcc.target/aarch64/no-dimode-bsl.c: New. > * gcc.target/aarch64/dimode-bsl.c: New. > > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > index c5a86ff..7b6b12f 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -2256,13 +2256,13 @@ > ;; in *aarch64_simd_bsl<mode>_alt. 
> > (define_insn "aarch64_simd_bsl<mode>_internal" > - [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w") > - (xor:VSDQ_I_DI > - (and:VSDQ_I_DI > - (xor:VSDQ_I_DI > + [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") > + (xor:VDQ_I > + (and:VDQ_I > + (xor:VDQ_I > (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w") > - (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0")) > - (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w")) > + (match_operand:VDQ_I 2 "register_operand" "w,w,0")) > + (match_operand:VDQ_I 1 "register_operand" "0,w,w")) > (match_dup:<V_cmp_result> 3) > ))] > "TARGET_SIMD" > @@ -2280,14 +2280,14 @@ > ;; permutations of commutative operations, we have to have a separate pattern. > > (define_insn "*aarch64_simd_bsl<mode>_alt" > - [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w") > - (xor:VSDQ_I_DI > - (and:VSDQ_I_DI > - (xor:VSDQ_I_DI > - (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0") > - (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w")) > - (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w")) > - (match_dup:VSDQ_I_DI 2)))] > + [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") > + (xor:VDQ_I > + (and:VDQ_I > + (xor:VDQ_I > + (match_operand:VDQ_I 3 "register_operand" "w,w,0") > + (match_operand:VDQ_I 2 "register_operand" "w,0,w")) > + (match_operand:VDQ_I 1 "register_operand" "0,w,w")) > + (match_dup:VDQ_I 2)))] > "TARGET_SIMD" > "@ > bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype> > @@ -2296,6 +2296,45 @@ > [(set_attr "type" "neon_bsl<q>")] > ) > > +;; DImode is special, we want to avoid computing operations which are > +;; more naturally computed in general purpose registers in the vector > +;; registers. If we do that, we need to move all three operands from general > +;; purpose registers to vector registers, then back again. However, we > +;; don't want to make this pattern an UNSPEC as we'd lose scope for > +;; optimizations based on the component operations of a BSL. 
> +;; > +;; That means we need a splitter back to the individual operations, if they > +;; would be better calculated on the integer side. > + > +(define_insn_and_split "aarch64_simd_bsldi_internal" > + [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r") > + (xor:DI > + (and:DI > + (xor:DI > + (match_operand:DI 3 "register_operand" "w,0,w,r") > + (match_operand:DI 2 "register_operand" "w,w,0,r")) > + (match_operand:DI 1 "register_operand" "0,w,w,r")) > + (match_dup:DI 3) > + ))] > + "TARGET_SIMD" > + "@ > + bsl\\t%0.8b, %2.8b, %3.8b > + bit\\t%0.8b, %2.8b, %1.8b > + bif\\t%0.8b, %3.8b, %1.8b > + #" > + "&& GP_REGNUM_P (REGNO (operands[0]))" > + [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)] > +{ > + /* Split back to individual operations. */ > + emit_insn (gen_xordi3 (operands[0], operands[2], operands[3])); > + emit_insn (gen_anddi3 (operands[0], operands[0], operands[1])); > + emit_insn (gen_xordi3 (operands[0], operands[0], operands[3])); > + DONE; > +} > + [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple") > + (set_attr "length" "4,4,4,12")] > +) > + > (define_expand "aarch64_simd_bsl<mode>" > [(match_operand:VALLDIF 0 "register_operand") > (match_operand:<V_cmp_result> 1 "register_operand") > diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c > new file mode 100644 > index 0000000..4e63511 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > + > +/* Test that we can generate DImode BSL when we are using > + copysign. 
*/ > + > +double > +foo (double a, double b) > +{ > + return __builtin_copysign (a, b); > +} > + > +/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c > new file mode 100644 > index 0000000..67dfda0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c > @@ -0,0 +1,30 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2" } */ > + > +/* Test that we don't combine to BSL when in DImode, avoiding register > + moves in the general case. > + > + We want: > + eor x0, x0, x1 > + and x0, x0, x2 > + eor x0, x0, x1 > + ret > + > + Rather than: > + fmov d2, x0 > + fmov d0, x2 > + fmov d1, x1 > + bsl v0.8b, v2.8b, v1.8b > + fmov x0, d0 > + ret */ > + > +long long > +foo (long long a, long long b, long long c) > +{ > + return ((a ^ b) & c) ^ b; > +} > + > +/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */ > +/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */ > +/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */ > +/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */
On Wed, Jun 21, 2017 at 11:49:07AM +0100, James Greenhalgh wrote: > *ping* *ping*x2 Thanks, James > On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote: > > [Sorry for the re-send. I spotted that the attributes were not right for the > > new pattern I was adding. The change between this and the first version was: > > > > + [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple") > > + (set_attr "length" "4,4,4,12")] > > ] > > > > --- > > > > Hi, > > > > In this testcase, all argument registers and the return register > > will be general purpose registers: > > > > long long > > foo (long long a, long long b, long long c) > > { > > return ((a ^ b) & c) ^ b; > > } > > > > However, due to the implementation of aarch64_simd_bsl<mode>_internal > > we'll match that pattern and emit a BSL, necessitating moving all those > > arguments and results to the Advanced SIMD registers: > > > > fmov d2, x0 > > fmov d0, x2 > > fmov d1, x1 > > bsl v0.8b, v2.8b, v1.8b > > fmov x0, d0 > > > > To fix this, we turn aarch64_simd_bsldi_internal in to an insn_and_split that > > knows to split back to integer operations if the register allocation > > falls that way. > > > > We could have used an unspec, but then we lose some of the nice > > simplifications that can be made from explicitly spelling out the semantics > > of BSL. > > > > Bootstrapped on aarch64-none-linux-gnu. > > > > OK? > > > > Thanks, > > James > > > > --- > > gcc/ > > > > 2017-06-12 James Greenhalgh <james.greenhalgh@arm.com> > > > > * config/aarch64/aarch64-simd.md > > (aarch64_simd_bsl<mode>_internal): Remove DImode. > > (*aarch64_simd_bsl<mode>_alt): Likewise. > > (aarch64_simd_bsldi_internal): New. > > > > gcc/testsuite/ > > > > 2017-06-12 James Greenhalgh <james.greenhalgh@arm.com> > > > > * gcc.target/aarch64/no-dimode-bsl.c: New. > > * gcc.target/aarch64/dimode-bsl.c: New. 
> > > > > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > > index c5a86ff..7b6b12f 100644 > > --- a/gcc/config/aarch64/aarch64-simd.md > > +++ b/gcc/config/aarch64/aarch64-simd.md > > @@ -2256,13 +2256,13 @@ > > ;; in *aarch64_simd_bsl<mode>_alt. > > > > (define_insn "aarch64_simd_bsl<mode>_internal" > > - [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w") > > - (xor:VSDQ_I_DI > > - (and:VSDQ_I_DI > > - (xor:VSDQ_I_DI > > + [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") > > + (xor:VDQ_I > > + (and:VDQ_I > > + (xor:VDQ_I > > (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w") > > - (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0")) > > - (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w")) > > + (match_operand:VDQ_I 2 "register_operand" "w,w,0")) > > + (match_operand:VDQ_I 1 "register_operand" "0,w,w")) > > (match_dup:<V_cmp_result> 3) > > ))] > > "TARGET_SIMD" > > @@ -2280,14 +2280,14 @@ > > ;; permutations of commutative operations, we have to have a separate pattern. 
> > > > (define_insn "*aarch64_simd_bsl<mode>_alt" > > - [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w") > > - (xor:VSDQ_I_DI > > - (and:VSDQ_I_DI > > - (xor:VSDQ_I_DI > > - (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0") > > - (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w")) > > - (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w")) > > - (match_dup:VSDQ_I_DI 2)))] > > + [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") > > + (xor:VDQ_I > > + (and:VDQ_I > > + (xor:VDQ_I > > + (match_operand:VDQ_I 3 "register_operand" "w,w,0") > > + (match_operand:VDQ_I 2 "register_operand" "w,0,w")) > > + (match_operand:VDQ_I 1 "register_operand" "0,w,w")) > > + (match_dup:VDQ_I 2)))] > > "TARGET_SIMD" > > "@ > > bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype> > > @@ -2296,6 +2296,45 @@ > > [(set_attr "type" "neon_bsl<q>")] > > ) > > > > +;; DImode is special, we want to avoid computing operations which are > > +;; more naturally computed in general purpose registers in the vector > > +;; registers. If we do that, we need to move all three operands from general > > +;; purpose registers to vector registers, then back again. However, we > > +;; don't want to make this pattern an UNSPEC as we'd lose scope for > > +;; optimizations based on the component operations of a BSL. > > +;; > > +;; That means we need a splitter back to the individual operations, if they > > +;; would be better calculated on the integer side. 
> > + > > +(define_insn_and_split "aarch64_simd_bsldi_internal" > > + [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r") > > + (xor:DI > > + (and:DI > > + (xor:DI > > + (match_operand:DI 3 "register_operand" "w,0,w,r") > > + (match_operand:DI 2 "register_operand" "w,w,0,r")) > > + (match_operand:DI 1 "register_operand" "0,w,w,r")) > > + (match_dup:DI 3) > > + ))] > > + "TARGET_SIMD" > > + "@ > > + bsl\\t%0.8b, %2.8b, %3.8b > > + bit\\t%0.8b, %2.8b, %1.8b > > + bif\\t%0.8b, %3.8b, %1.8b > > + #" > > + "&& GP_REGNUM_P (REGNO (operands[0]))" > > + [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)] > > +{ > > + /* Split back to individual operations. */ > > + emit_insn (gen_xordi3 (operands[0], operands[2], operands[3])); > > + emit_insn (gen_anddi3 (operands[0], operands[0], operands[1])); > > + emit_insn (gen_xordi3 (operands[0], operands[0], operands[3])); > > + DONE; > > +} > > + [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple") > > + (set_attr "length" "4,4,4,12")] > > +) > > + > > (define_expand "aarch64_simd_bsl<mode>" > > [(match_operand:VALLDIF 0 "register_operand") > > (match_operand:<V_cmp_result> 1 "register_operand") > > diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c > > new file mode 100644 > > index 0000000..4e63511 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c > > @@ -0,0 +1,13 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2" } */ > > + > > +/* Test that we can generate DImode BSL when we are using > > + copysign. 
*/ > > + > > +double > > +foo (double a, double b) > > +{ > > + return __builtin_copysign (a, b); > > +} > > + > > +/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c > > new file mode 100644 > > index 0000000..67dfda0 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c > > @@ -0,0 +1,30 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2" } */ > > + > > +/* Test that we don't combine to BSL when in DImode, avoiding register > > + moves in the general case. > > + > > + We want: > > + eor x0, x0, x1 > > + and x0, x0, x2 > > + eor x0, x0, x1 > > + ret > > + > > + Rather than: > > + fmov d2, x0 > > + fmov d0, x2 > > + fmov d1, x1 > > + bsl v0.8b, v2.8b, v1.8b > > + fmov x0, d0 > > + ret */ > > + > > +long long > > +foo (long long a, long long b, long long c) > > +{ > > + return ((a ^ b) & c) ^ b; > > +} > > + > > +/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */ > > +/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */ > > +/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */ > > +/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */ >
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index c5a86ff..7b6b12f 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -2256,13 +2256,13 @@ ;; in *aarch64_simd_bsl<mode>_alt. (define_insn "aarch64_simd_bsl<mode>_internal" - [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w") - (xor:VSDQ_I_DI - (and:VSDQ_I_DI - (xor:VSDQ_I_DI + [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") + (xor:VDQ_I + (and:VDQ_I + (xor:VDQ_I (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w") - (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0")) - (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w")) + (match_operand:VDQ_I 2 "register_operand" "w,w,0")) + (match_operand:VDQ_I 1 "register_operand" "0,w,w")) (match_dup:<V_cmp_result> 3) ))] "TARGET_SIMD" @@ -2280,14 +2280,14 @@ ;; permutations of commutative operations, we have to have a separate pattern. (define_insn "*aarch64_simd_bsl<mode>_alt" - [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w") - (xor:VSDQ_I_DI - (and:VSDQ_I_DI - (xor:VSDQ_I_DI - (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0") - (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w")) - (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w")) - (match_dup:VSDQ_I_DI 2)))] + [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") + (xor:VDQ_I + (and:VDQ_I + (xor:VDQ_I + (match_operand:VDQ_I 3 "register_operand" "w,w,0") + (match_operand:VDQ_I 2 "register_operand" "w,0,w")) + (match_operand:VDQ_I 1 "register_operand" "0,w,w")) + (match_dup:VDQ_I 2)))] "TARGET_SIMD" "@ bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype> @@ -2296,6 +2296,45 @@ [(set_attr "type" "neon_bsl<q>")] ) +;; DImode is special, we want to avoid computing operations which are +;; more naturally computed in general purpose registers in the vector +;; registers. If we do that, we need to move all three operands from general +;; purpose registers to vector registers, then back again. 
However, we +;; don't want to make this pattern an UNSPEC as we'd lose scope for +;; optimizations based on the component operations of a BSL. +;; +;; That means we need a splitter back to the individual operations, if they +;; would be better calculated on the integer side. + +(define_insn_and_split "aarch64_simd_bsldi_internal" + [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r") + (xor:DI + (and:DI + (xor:DI + (match_operand:DI 3 "register_operand" "w,0,w,r") + (match_operand:DI 2 "register_operand" "w,w,0,r")) + (match_operand:DI 1 "register_operand" "0,w,w,r")) + (match_dup:DI 3) + ))] + "TARGET_SIMD" + "@ + bsl\\t%0.8b, %2.8b, %3.8b + bit\\t%0.8b, %2.8b, %1.8b + bif\\t%0.8b, %3.8b, %1.8b + #" + "&& GP_REGNUM_P (REGNO (operands[0]))" + [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)] +{ + /* Split back to individual operations. */ + emit_insn (gen_xordi3 (operands[0], operands[2], operands[3])); + emit_insn (gen_anddi3 (operands[0], operands[0], operands[1])); + emit_insn (gen_xordi3 (operands[0], operands[0], operands[3])); + DONE; +} + [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple") + (set_attr "length" "4,4,4,12")] +) + (define_expand "aarch64_simd_bsl<mode>" [(match_operand:VALLDIF 0 "register_operand") (match_operand:<V_cmp_result> 1 "register_operand") diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c new file mode 100644 index 0000000..4e63511 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +/* Test that we can generate DImode BSL when we are using + copysign. 
*/ + +double +foo (double a, double b) +{ + return __builtin_copysign (a, b); +} + +/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c new file mode 100644 index 0000000..67dfda0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +/* Test that we don't combine to BSL when in DImode, avoiding register + moves in the general case. + + We want: + eor x0, x0, x1 + and x0, x0, x2 + eor x0, x0, x1 + ret + + Rather than: + fmov d2, x0 + fmov d0, x2 + fmov d1, x1 + bsl v0.8b, v2.8b, v1.8b + fmov x0, d0 + ret */ + +long long +foo (long long a, long long b, long long c) +{ + return ((a ^ b) & c) ^ b; +} + +/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */ +/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */ +/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */ +/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */