Message ID | 20210608213844.6218-1-christophe.lyon@linaro.org |
---|---|
State | New |
Headers | show |
Series | [1/2] arm: Fix vcond_mask expander for MVE (PR target/100757) | expand |
Christophe Lyon <christophe.lyon@linaro.org> writes: > The problem in this PR is that we call VPSEL with a mask of vector > type instead of HImode. This happens because operand 3 in vcond_mask > is the pre-computed vector comparison and has vector type. The fix is > to transfer this value to VPR.P0 by comparing operand 3 with a vector > of constant 1 of the same type as operand 3. The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE and return HImode for MVE. This is how AVX512 handles masks. It might be worth trying that to see how it works. I'm not sure off-hand whether it'll produce worse code or better code. However, using HImode as the mask mode would help when defining other predicated optabs in future. Thanks, Richard > The pr100757*.c testcases are derived from > gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using > different types and return values different from 0 and 1 to avoid > commonalization with boolean masks. > > Reducing the number of iterations in pr100757-3.c from 32 to 8, we > generate the code below: > > float a[32]; > float fn1(int d) { > int c = 4; > for (int b = 0; b < 8; b++) > if (a[b] != 2.0f) > c = 5; > return c; > } > > fn1: > ldr r3, .L4+80 > vpush.64 {d8, d9} > vldrw.32 q3, [r3] // q3=a[0..3] > vldr.64 d8, .L4 // q4=(2.0,2.0,2.0,2.0) > vldr.64 d9, .L4+8 > adds r3, r3, #16 > vcmp.f32 eq, q3, q4 // cmp a[0..3] == (2.0,2.0,2.0,2.0) > vldr.64 d2, .L4+16 // q1=(1,1,1,1) > vldr.64 d3, .L4+24 > vldrw.32 q3, [r3] // q3=a[4..7] > vldr.64 d4, .L4+32 // q2=(0,0,0,0) > vldr.64 d5, .L4+40 > vpsel q0, q1, q2 // q0=select (a[0..3]) > vcmp.f32 eq, q3, q4 // cmp a[4..7] == (2.0,2.0,2.0,2.0) > vldm sp!, {d8-d9} > vpsel q2, q1, q2 // q2=select (a[4..7]) > vand q2, q0, q2 // q2=select (a[0..3]) && select (a[4..7]) > vldr.64 d6, .L4+48 // q3=(4.0,4.0,4.0,4.0) > vldr.64 d7, .L4+56 > vldr.64 d0, .L4+64 // q0=(5.0,5.0,5.0,5.0) > vldr.64 d1, .L4+72 > vcmp.i32 eq, q2, q1 // cmp mask(a[0..7]) == (1,1,1,1) > vpsel q3, q3, q0 // q3= vcond_mask(4.0,5.0) > vmov.32 r3, q3[0] // keep the scalar max > vmov.32 r1, q3[1] > vmov.32 r0, q3[3] > vmov.32 r2, q3[2] > vmov s14, r1 > vmov s15, r3 > vmaxnm.f32 s15, s15, s14 > vmov s14, r2 > vmaxnm.f32 s15, s15, s14 > vmov s14, r0 > vmaxnm.f32 s15, s15, s14 > vmov r0, s15 > bx lr > .L5: > .align 3 > .L4: > .word 1073741824 > .word 1073741824 > .word 1073741824 > .word 1073741824 > .word 1 > .word 1 > .word 1 > .word 1 > .word 0 > .word 0 > .word 0 > .word 0 > .word 1082130432 > .word 1082130432 > .word 1082130432 > .word 1082130432 > .word 1084227584 > .word 1084227584 > .word 1084227584 > .word 1084227584 > > 2021-06-09 Christophe Lyon <christophe.lyon@linaro.org> > > PR target/100757 > gcc/ > * config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix > expansion for MVE. > > gcc/testsuite/ > * gcc.target/arm/simd/pr100757.c: New test. > * gcc.target/arm/simd/pr100757-2.c: New test. > * gcc.target/arm/simd/pr100757-3.c: New test. > --- > gcc/config/arm/vec-common.md | 24 +++++++++++++++++-- > .../gcc.target/arm/simd/pr100757-2.c | 20 ++++++++++++++++ > .../gcc.target/arm/simd/pr100757-3.c | 20 ++++++++++++++++ > gcc/testsuite/gcc.target/arm/simd/pr100757.c | 19 +++++++++++++++ > 4 files changed, 81 insertions(+), 2 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > index 0ffc7a9322c..ccdfaa8321f 100644 > --- a/gcc/config/arm/vec-common.md > +++ b/gcc/config/arm/vec-common.md > @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>" > } > else if (TARGET_HAVE_MVE) > { > - emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], > - operands[1], operands[2], operands[3])); > + /* Convert pre-computed vector comparison into VPR.P0 by comparing > + operand 3 with a vector of '1', then use VPSEL. */ > + machine_mode cmp_mode = GET_MODE (operands[3]); > + rtx vpr_p0 = gen_reg_rtx (HImode); > + rtx one = gen_reg_rtx (cmp_mode); > + emit_move_insn (one, CONST1_RTX (cmp_mode)); > + emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one)); > + > + switch (GET_MODE_CLASS (<MODE>mode)) > + { > + case MODE_VECTOR_INT: > + emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0)); > + break; > + case MODE_VECTOR_FLOAT: > + if (TARGET_HAVE_MVE_FLOAT) > + emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0)); > + else > + gcc_unreachable (); > + break; > + default: > + gcc_unreachable (); > + } > } > else > gcc_unreachable (); > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > new file mode 100644 > index 00000000000..993ce369090 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > @@ -0,0 +1,20 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > +/* { dg-add-options arm_v8_1m_mve_fp } */ > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > + > +float a[32]; > +int fn1(int d) { > + int c = 4; > + for (int b = 0; b < 32; b++) > + if (a[b] != 2.0f) > + c = 5; > + return c; > +} > + > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f. */ > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ > +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c. */ > +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c. */ > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > new file mode 100644 > index 00000000000..b94a73b2d2c > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > @@ -0,0 +1,20 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > +/* { dg-add-options arm_v8_1m_mve_fp } */ > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > +/* Copied from gcc.c-torture/compile/20160205-1.c. */ > + > +float a[32]; > +float fn1(int d) { > + float c = 4; > + for (int b = 0; b < 32; b++) > + if (a[b] != 2.0f) > + c = 5; > + return c; > +} > + > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f. */ > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ > +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c. */ > +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c. */ > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > new file mode 100644 > index 00000000000..e51e716b4ec > --- /dev/null > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ > +/* { dg-add-options arm_v8_1m_mve } */ > +/* { dg-additional-options "-O3" } */ > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > + > +int a[32]; > +int fn1(int d) { > + int c = 2; > + for (int b = 0; b < 32; b++) > + if (a[b]) > + c = 3; > + return c; > +} > + > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ > +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c. */ > +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c. */
Hi, On Wed, 9 Jun 2021 at 17:04, Richard Sandiford <richard.sandiford@arm.com> wrote: > > Christophe Lyon <christophe.lyon@linaro.org> writes: > > The problem in this PR is that we call VPSEL with a mask of vector > > type instead of HImode. This happens because operand 3 in vcond_mask > > is the pre-computed vector comparison and has vector type. The fix is > > to transfer this value to VPR.P0 by comparing operand 3 with a vector > > of constant 1 of the same type as operand 3. > > The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE > and return HImode for MVE. This is how AVX512 handles masks. > > It might be worth trying that to see how it works. I'm not sure > off-hand whether it'll produce worse code or better code. However, > using HImode as the mask mode would help when defining other > predicated optabs in future. > Here is my v2 of this patch, hopefully implementing what you suggested. Sorry it took me so long, but as implementing this hook was of course not sufficient, and it took me a while to figure out I needed to keep the non-HI expanders (vec_cmp ,...). Each time I fixed a bug, I created another one... I shouldn't have added so many tests ;-) I'm not sure how to improve the vectorizer doc, to better describe the vec_cmp/vcond patterns and see which ones the vectorizer is trying to use (to understand which ones I should implement). Then I realized I was about to break Neon support, so I decided it was safer to add Neon tests ;-) Is that version OK? Thanks, Christophe > Thanks, > Richard > > > The pr100757*.c testcases are derived from > > gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using > > different types and return values different from 0 and 1 to avoid > > commonalization with boolean masks. > > > > Reducing the number of iterations in pr100757-3.c from 32 to 8, we > > generate the code below: > > > > float a[32]; > > float fn1(int d) { > > int c = 4; > > for (int b = 0; b < 8; b++) > > if (a[b] != 2.0f) > > c = 5; > > return c; > > } > > > > fn1: > > ldr r3, .L4+80 > > vpush.64 {d8, d9} > > vldrw.32 q3, [r3] // q3=a[0..3] > > vldr.64 d8, .L4 // q4=(2.0,2.0,2.0,2.0) > > vldr.64 d9, .L4+8 > > adds r3, r3, #16 > > vcmp.f32 eq, q3, q4 // cmp a[0..3] == (2.0,2.0,2.0,2.0) > > vldr.64 d2, .L4+16 // q1=(1,1,1,1) > > vldr.64 d3, .L4+24 > > vldrw.32 q3, [r3] // q3=a[4..7] > > vldr.64 d4, .L4+32 // q2=(0,0,0,0) > > vldr.64 d5, .L4+40 > > vpsel q0, q1, q2 // q0=select (a[0..3]) > > vcmp.f32 eq, q3, q4 // cmp a[4..7] == (2.0,2.0,2.0,2.0) > > vldm sp!, {d8-d9} > > vpsel q2, q1, q2 // q2=select (a[4..7]) > > vand q2, q0, q2 // q2=select (a[0..3]) && select (a[4..7]) > > vldr.64 d6, .L4+48 // q3=(4.0,4.0,4.0,4.0) > > vldr.64 d7, .L4+56 > > vldr.64 d0, .L4+64 // q0=(5.0,5.0,5.0,5.0) > > vldr.64 d1, .L4+72 > > vcmp.i32 eq, q2, q1 // cmp mask(a[0..7]) == (1,1,1,1) > > vpsel q3, q3, q0 // q3= vcond_mask(4.0,5.0) > > vmov.32 r3, q3[0] // keep the scalar maxv2-0001-arm-Fix-vcond_mask-expander-for-MVE-PR-target-100.patch > > vmov.32 r1, q3[1] > > vmov.32 r0, q3[3] > > vmov.32 r2, q3[2] > > vmov s14, r1 > > vmov s15, r3 > > vmaxnm.f32 s15, s15, s14 > > vmov s14, r2 > > vmaxnm.f32 s15, s15, s14 > > vmov s14, r0 > > vmaxnm.f32 s15, s15, s14 > > vmov r0, s15 > > bx lr > > .L5: > > .align 3 > > .L4: > > .word 1073741824 > > .word 1073741824 > > .word 1073741824 > > .word 1073741824 > > .word 1 > > .word 1 > > .word 1 > > .word 1 > > .word 0 > > .word 0 > > .word 0 > > .word 0 > > .word 1082130432 > > .word 1082130432 > > .word 1082130432 > > .word 1082130432 > > .word 1084227584 > > .word 1084227584 > > .word 1084227584 > > .word 1084227584 > > > > 2021-06-09 Christophe Lyon <christophe.lyon@linaro.org> > > > > PR target/100757 > > gcc/ > > * config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix > > expansion for MVE. > > > > gcc/testsuite/ > > * gcc.target/arm/simd/pr100757.c: New test. > > * gcc.target/arm/simd/pr100757-2.c: New test. > > * gcc.target/arm/simd/pr100757-3.c: New test. > > --- > > gcc/config/arm/vec-common.md | 24 +++++++++++++++++-- > > .../gcc.target/arm/simd/pr100757-2.c | 20 ++++++++++++++++ > > .../gcc.target/arm/simd/pr100757-3.c | 20 ++++++++++++++++ > > gcc/testsuite/gcc.target/arm/simd/pr100757.c | 19 +++++++++++++++ > > 4 files changed, 81 insertions(+), 2 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > > index 0ffc7a9322c..ccdfaa8321f 100644 > > --- a/gcc/config/arm/vec-common.md > > +++ b/gcc/config/arm/vec-common.md > > @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>" > > } > > else if (TARGET_HAVE_MVE) > > { > > - emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], > > - operands[1], operands[2], operands[3])); > > + /* Convert pre-computed vector comparison into VPR.P0 by comparing > > + operand 3 with a vector of '1', then use VPSEL. */ > > + machine_mode cmp_mode = GET_MODE (operands[3]); > > + rtx vpr_p0 = gen_reg_rtx (HImode); > > + rtx one = gen_reg_rtx (cmp_mode); > > + emit_move_insn (one, CONST1_RTX (cmp_mode)); > > + emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one)); > > + > > + switch (GET_MODE_CLASS (<MODE>mode)) > > + { > > + case MODE_VECTOR_INT: > > + emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0)); > > + break; > > + case MODE_VECTOR_FLOAT: > > + if (TARGET_HAVE_MVE_FLOAT) > > + emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0)); > > + else > > + gcc_unreachable (); > > + break; > > + default: > > + gcc_unreachable (); > > + } > > } > > else > > gcc_unreachable (); > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > new file mode 100644 > > index 00000000000..993ce369090 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > @@ -0,0 +1,20 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > > +/* { dg-add-options arm_v8_1m_mve_fp } */ > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > > + > > +float a[32]; > > +int fn1(int d) { > > + int c = 4; > > + for (int b = 0; b < 32; b++) > > + if (a[b] != 2.0f) > > + c = 5; > > + return c; > > +} > > + > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f. */ > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ > > +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c. */ > > +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c. */ > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > new file mode 100644 > > index 00000000000..b94a73b2d2c > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > @@ -0,0 +1,20 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > > +/* { dg-add-options arm_v8_1m_mve_fp } */ > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > > +/* Copied from gcc.c-torture/compile/20160205-1.c. */ > > + > > +float a[32]; > > +float fn1(int d) { > > + float c = 4; > > + for (int b = 0; b < 32; b++) > > + if (a[b] != 2.0f) > > + c = 5; > > + return c; > > +} > > + > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f. */ > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ > > +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c. */ > > +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c. */ > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > > new file mode 100644 > > index 00000000000..e51e716b4ec > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > > @@ -0,0 +1,19 @@ > > +/* { dg-do compile } */ > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ > > +/* { dg-add-options arm_v8_1m_mve } */ > > +/* { dg-additional-options "-O3" } */ > > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > > + > > +int a[32]; > > +int fn1(int d) { > > + int c = 2; > > + for (int b = 0; b < 32; b++) > > + if (a[b]) > > + c = 3; > > + return c; > > +} > > + > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ > > +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c. */ > > +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c. */
On Fri, 2 Jul 2021 at 10:53, Christophe Lyon <christophe.lyon@linaro.org> wrote: > > Hi, > > On Wed, 9 Jun 2021 at 17:04, Richard Sandiford > <richard.sandiford@arm.com> wrote: > > > > Christophe Lyon <christophe.lyon@linaro.org> writes: > > > The problem in this PR is that we call VPSEL with a mask of vector > > > type instead of HImode. This happens because operand 3 in vcond_mask > > > is the pre-computed vector comparison and has vector type. The fix is > > > to transfer this value to VPR.P0 by comparing operand 3 with a vector > > > of constant 1 of the same type as operand 3. > > > > The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE > > and return HImode for MVE. This is how AVX512 handles masks. > > > > It might be worth trying that to see how it works. I'm not sure > > off-hand whether it'll produce worse code or better code. However, > > using HImode as the mask mode would help when defining other > > predicated optabs in future. > > > > Here is my v2 of this patch, hopefully implementing what you suggested. > > Sorry it took me so long, but as implementing this hook was of course > not sufficient, and it took me a while to figure out I needed to keep the > non-HI expanders (vec_cmp ,...). Each time I fixed a bug, I created > another one... I shouldn't have added so many tests ;-) > > I'm not sure how to improve the vectorizer doc, to better describe the > vec_cmp/vcond patterns and see which ones the vectorizer is trying > to use (to understand which ones I should implement). > > Then I realized I was about to break Neon support, so I decided > it was safer to add Neon tests ;-) > > Is that version OK? > I forgot to mention that I merged the original patch 2/2 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572300.html into this one. > Thanks, > > Christophe > > > > Thanks, > > Richard > > > > > The pr100757*.c testcases are derived from > > > gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using > > > different types and return values different from 0 and 1 to avoid > > > commonalization with boolean masks. > > > > > > Reducing the number of iterations in pr100757-3.c from 32 to 8, we > > > generate the code below: > > > > > > float a[32]; > > > float fn1(int d) { > > > int c = 4; > > > for (int b = 0; b < 8; b++) > > > if (a[b] != 2.0f) > > > c = 5; > > > return c; > > > } > > > > > > fn1: > > > ldr r3, .L4+80 > > > vpush.64 {d8, d9} > > > vldrw.32 q3, [r3] // q3=a[0..3] > > > vldr.64 d8, .L4 // q4=(2.0,2.0,2.0,2.0) > > > vldr.64 d9, .L4+8 > > > adds r3, r3, #16 > > > vcmp.f32 eq, q3, q4 // cmp a[0..3] == (2.0,2.0,2.0,2.0) > > > vldr.64 d2, .L4+16 // q1=(1,1,1,1) > > > vldr.64 d3, .L4+24 > > > vldrw.32 q3, [r3] // q3=a[4..7] > > > vldr.64 d4, .L4+32 // q2=(0,0,0,0) > > > vldr.64 d5, .L4+40 > > > vpsel q0, q1, q2 // q0=select (a[0..3]) > > > vcmp.f32 eq, q3, q4 // cmp a[4..7] == (2.0,2.0,2.0,2.0) > > > vldm sp!, {d8-d9} > > > vpsel q2, q1, q2 // q2=select (a[4..7]) > > > vand q2, q0, q2 // q2=select (a[0..3]) && select (a[4..7]) > > > vldr.64 d6, .L4+48 // q3=(4.0,4.0,4.0,4.0) > > > vldr.64 d7, .L4+56 > > > vldr.64 d0, .L4+64 // q0=(5.0,5.0,5.0,5.0) > > > vldr.64 d1, .L4+72 > > > vcmp.i32 eq, q2, q1 // cmp mask(a[0..7]) == (1,1,1,1) > > > vpsel q3, q3, q0 // q3= vcond_mask(4.0,5.0) > > > vmov.32 r3, q3[0] // keep the scalar maxv2-0001-arm-Fix-vcond_mask-expander-for-MVE-PR-target-100.patch > > > vmov.32 r1, q3[1] > > > vmov.32 r0, q3[3] > > > vmov.32 r2, q3[2] > > > vmov s14, r1 > > > vmov s15, r3 > > > vmaxnm.f32 s15, s15, s14 > > > vmov s14, r2 > > > vmaxnm.f32 s15, s15, s14 > > > vmov s14, r0 > > > vmaxnm.f32 s15, s15, s14 > > > vmov r0, s15 > > > bx lr > > > .L5: > > > .align 3 > > > .L4: > > > .word 1073741824 > > > .word 1073741824 > > > .word 1073741824 > > > .word 1073741824 > > > .word 1 > > > .word 1 > > > .word 1 > > > .word 1 > > > .word 0 > > > .word 0 > > > .word 0 > > > .word 0 > > > .word 1082130432 > > > .word 1082130432 > > > .word 1082130432 > > > .word 1082130432 > > > .word 1084227584 > > > .word 1084227584 > > > .word 1084227584 > > > .word 1084227584 > > > > > > 2021-06-09 Christophe Lyon <christophe.lyon@linaro.org> > > > > > > PR target/100757 > > > gcc/ > > > * config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix > > > expansion for MVE. > > > > > > gcc/testsuite/ > > > * gcc.target/arm/simd/pr100757.c: New test. > > > * gcc.target/arm/simd/pr100757-2.c: New test. > > > * gcc.target/arm/simd/pr100757-3.c: New test. > > > --- > > > gcc/config/arm/vec-common.md | 24 +++++++++++++++++-- > > > .../gcc.target/arm/simd/pr100757-2.c | 20 ++++++++++++++++ > > > .../gcc.target/arm/simd/pr100757-3.c | 20 ++++++++++++++++ > > > gcc/testsuite/gcc.target/arm/simd/pr100757.c | 19 +++++++++++++++ > > > 4 files changed, 81 insertions(+), 2 deletions(-) > > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > > > > diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md > > > index 0ffc7a9322c..ccdfaa8321f 100644 > > > --- a/gcc/config/arm/vec-common.md > > > +++ b/gcc/config/arm/vec-common.md > > > @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>" > > > } > > > else if (TARGET_HAVE_MVE) > > > { > > > - emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], > > > - operands[1], operands[2], operands[3])); > > > + /* Convert pre-computed vector comparison into VPR.P0 by comparing > > > + operand 3 with a vector of '1', then use VPSEL. */ > > > + machine_mode cmp_mode = GET_MODE (operands[3]); > > > + rtx vpr_p0 = gen_reg_rtx (HImode); > > > + rtx one = gen_reg_rtx (cmp_mode); > > > + emit_move_insn (one, CONST1_RTX (cmp_mode)); > > > + emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one)); > > > + > > > + switch (GET_MODE_CLASS (<MODE>mode)) > > > + { > > > + case MODE_VECTOR_INT: > > > + emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0)); > > > + break; > > > + case MODE_VECTOR_FLOAT: > > > + if (TARGET_HAVE_MVE_FLOAT) > > > + emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0)); > > > + else > > > + gcc_unreachable (); > > > + break; > > > + default: > > > + gcc_unreachable (); > > > + } > > > } > > > else > > > gcc_unreachable (); > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > new file mode 100644 > > > index 00000000000..993ce369090 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > @@ -0,0 +1,20 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > > > +/* { dg-add-options arm_v8_1m_mve_fp } */ > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > > > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > > > + > > > +float a[32]; > > > +int fn1(int d) { > > > + int c = 4; > > > + for (int b = 0; b < 32; b++) > > > + if (a[b] != 2.0f) > > > + c = 5; > > > + return c; > > > +} > > > + > > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c. */ > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > new file mode 100644 > > > index 00000000000..b94a73b2d2c > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > @@ -0,0 +1,20 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > > > +/* { dg-add-options arm_v8_1m_mve_fp } */ > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > > > +/* Copied from gcc.c-torture/compile/20160205-1.c. */ > > > + > > > +float a[32]; > > > +float fn1(int d) { > > > + float c = 4; > > > + for (int b = 0; b < 32; b++) > > > + if (a[b] != 2.0f) > > > + c = 5; > > > + return c; > > > +} > > > + > > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c. */ > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > new file mode 100644 > > > index 00000000000..e51e716b4ec > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > @@ -0,0 +1,19 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ > > > +/* { dg-add-options arm_v8_1m_mve } */ > > > +/* { dg-additional-options "-O3" } */ > > > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > > > + > > > +int a[32]; > > > +int fn1(int d) { > > > + int c = 2; > > > + for (int b = 0; b < 32; b++) > > > + if (a[b]) > > > + c = 3; > > > + return c; > > > +} > > > + > > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c. */
Ping? Le ven. 2 juil. 2021 à 10:53, Christophe Lyon <christophe.lyon@linaro.org> a écrit : > Hi, > > On Wed, 9 Jun 2021 at 17:04, Richard Sandiford > <richard.sandiford@arm.com> wrote: > > > > Christophe Lyon <christophe.lyon@linaro.org> writes: > > > The problem in this PR is that we call VPSEL with a mask of vector > > > type instead of HImode. This happens because operand 3 in vcond_mask > > > is the pre-computed vector comparison and has vector type. The fix is > > > to transfer this value to VPR.P0 by comparing operand 3 with a vector > > > of constant 1 of the same type as operand 3. > > > > The alternative is to implement TARGET_VECTORIZE_GET_MASK_MODE > > and return HImode for MVE. This is how AVX512 handles masks. > > > > It might be worth trying that to see how it works. I'm not sure > > off-hand whether it'll produce worse code or better code. However, > > using HImode as the mask mode would help when defining other > > predicated optabs in future. > > > > Here is my v2 of this patch, hopefully implementing what you suggested. > > Sorry it took me so long, but as implementing this hook was of course > not sufficient, and it took me a while to figure out I needed to keep the > non-HI expanders (vec_cmp ,...). Each time I fixed a bug, I created > another one... I shouldn't have added so many tests ;-) > > I'm not sure how to improve the vectorizer doc, to better describe the > vec_cmp/vcond patterns and see which ones the vectorizer is trying > to use (to understand which ones I should implement). > > Then I realized I was about to break Neon support, so I decided > it was safer to add Neon tests ;-) > > Is that version OK? > > Thanks, > > Christophe > > > > Thanks, > > Richard > > > > > The pr100757*.c testcases are derived from > > > gcc.c-torture/compile/20160205-1.c, forcing the use of MVE, and using > > > different types and return values different from 0 and 1 to avoid > > > commonalization with boolean masks. > > > > > > Reducing the number of iterations in pr100757-3.c from 32 to 8, we > > > generate the code below: > > > > > > float a[32]; > > > float fn1(int d) { > > > int c = 4; > > > for (int b = 0; b < 8; b++) > > > if (a[b] != 2.0f) > > > c = 5; > > > return c; > > > } > > > > > > fn1: > > > ldr r3, .L4+80 > > > vpush.64 {d8, d9} > > > vldrw.32 q3, [r3] // q3=a[0..3] > > > vldr.64 d8, .L4 // q4=(2.0,2.0,2.0,2.0) > > > vldr.64 d9, .L4+8 > > > adds r3, r3, #16 > > > vcmp.f32 eq, q3, q4 // cmp a[0..3] == > (2.0,2.0,2.0,2.0) > > > vldr.64 d2, .L4+16 // q1=(1,1,1,1) > > > vldr.64 d3, .L4+24 > > > vldrw.32 q3, [r3] // q3=a[4..7] > > > vldr.64 d4, .L4+32 // q2=(0,0,0,0) > > > vldr.64 d5, .L4+40 > > > vpsel q0, q1, q2 // q0=select (a[0..3]) > > > vcmp.f32 eq, q3, q4 // cmp a[4..7] == > (2.0,2.0,2.0,2.0) > > > vldm sp!, {d8-d9} > > > vpsel q2, q1, q2 // q2=select (a[4..7]) > > > vand q2, q0, q2 // q2=select (a[0..3]) && select > (a[4..7]) > > > vldr.64 d6, .L4+48 // q3=(4.0,4.0,4.0,4.0) > > > vldr.64 d7, .L4+56 > > > vldr.64 d0, .L4+64 // q0=(5.0,5.0,5.0,5.0) > > > vldr.64 d1, .L4+72 > > > vcmp.i32 eq, q2, q1 // cmp mask(a[0..7]) == (1,1,1,1) > > > vpsel q3, q3, q0 // q3= vcond_mask(4.0,5.0) > > > vmov.32 r3, q3[0] // keep the scalar > maxv2-0001-arm-Fix-vcond_mask-expander-for-MVE-PR-target-100.patch > > > vmov.32 r1, q3[1] > > > vmov.32 r0, q3[3] > > > vmov.32 r2, q3[2] > > > vmov s14, r1 > > > vmov s15, r3 > > > vmaxnm.f32 s15, s15, s14 > > > vmov s14, r2 > > > vmaxnm.f32 s15, s15, s14 > > > vmov s14, r0 > > > vmaxnm.f32 s15, s15, s14 > > > vmov r0, s15 > > > bx lr > > > .L5: > > > .align 3 > > > .L4: > > > .word 1073741824 > > > .word 1073741824 > > > .word 1073741824 > > > .word 1073741824 > > > .word 1 > > > .word 1 > > > .word 1 > > > .word 1 > > > .word 0 > > > .word 0 > > > .word 0 > > > .word 0 > > > .word 1082130432 > > > .word 1082130432 > > > .word 1082130432 > > > .word 1082130432 > > > .word 1084227584 > > > .word 1084227584 > > > .word 1084227584 > > > .word 1084227584 > > > > > > 2021-06-09 Christophe Lyon <christophe.lyon@linaro.org> > > > > > > PR target/100757 > > > gcc/ > > > * config/arm/vec-common.md (vcond_mask_<mode><v_cmp_result>): Fix > > > expansion for MVE. > > > > > > gcc/testsuite/ > > > * gcc.target/arm/simd/pr100757.c: New test. > > > * gcc.target/arm/simd/pr100757-2.c: New test. > > > * gcc.target/arm/simd/pr100757-3.c: New test. > > > --- > > > gcc/config/arm/vec-common.md | 24 +++++++++++++++++-- > > > .../gcc.target/arm/simd/pr100757-2.c | 20 ++++++++++++++++ > > > .../gcc.target/arm/simd/pr100757-3.c | 20 ++++++++++++++++ > > > gcc/testsuite/gcc.target/arm/simd/pr100757.c | 19 +++++++++++++++ > > > 4 files changed, 81 insertions(+), 2 deletions(-) > > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > create mode 100644 gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > > > > diff --git a/gcc/config/arm/vec-common.md > b/gcc/config/arm/vec-common.md > > > index 0ffc7a9322c..ccdfaa8321f 100644 > > > --- a/gcc/config/arm/vec-common.md > > > +++ b/gcc/config/arm/vec-common.md > > > @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>" > > > } > > > else if (TARGET_HAVE_MVE) > > > { > > > - emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], > > > - operands[1], operands[2], > operands[3])); > > > + /* Convert pre-computed vector comparison into VPR.P0 by > comparing > > > + operand 3 with a vector of '1', then use VPSEL. */ > > > + machine_mode cmp_mode = GET_MODE (operands[3]); > > > + rtx vpr_p0 = gen_reg_rtx (HImode); > > > + rtx one = gen_reg_rtx (cmp_mode); > > > + emit_move_insn (one, CONST1_RTX (cmp_mode)); > > > + emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], > one)); > > > + > > > + switch (GET_MODE_CLASS (<MODE>mode)) > > > + { > > > + case MODE_VECTOR_INT: > > > + emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, > operands[0], operands[1], operands[2], vpr_p0)); > > > + break; > > > + case MODE_VECTOR_FLOAT: > > > + if (TARGET_HAVE_MVE_FLOAT) > > > + emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], > operands[1], operands[2], vpr_p0)); > > > + else > > > + gcc_unreachable (); > > > + break; > > > + default: > > > + gcc_unreachable (); > > > + } > > > } > > > else > > > gcc_unreachable (); > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > new file mode 100644 > > > index 00000000000..993ce369090 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c > > > @@ -0,0 +1,20 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > > > +/* { dg-add-options arm_v8_1m_mve_fp } */ > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > > > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > > > + > > > +float a[32]; > > > +int fn1(int d) { > > > + int c = 4; > > > + for (int b = 0; b < 32; b++) > > > + if (a[b] != 2.0f) > > > + c = 5; > > > + return c; > > > +} > > > + > > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ > /* Constant 2.0f. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* > 'true' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* > 'false' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* > Initial value for c. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* > Possible value for c. */ > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > new file mode 100644 > > > index 00000000000..b94a73b2d2c > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c > > > @@ -0,0 +1,20 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ > > > +/* { dg-add-options arm_v8_1m_mve_fp } */ > > > +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ > > > +/* Copied from gcc.c-torture/compile/20160205-1.c. */ > > > + > > > +float a[32]; > > > +float fn1(int d) { > > > + float c = 4; > > > + for (int b = 0; b < 32; b++) > > > + if (a[b] != 2.0f) > > > + c = 5; > > > + return c; > > > +} > > > + > > > +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ > /* Constant 2.0f. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* > 'true' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* > 'false' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ > /* Initial value for c. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ > /* Possible value for c. */ > > > diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c > b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > new file mode 100644 > > > index 00000000000..e51e716b4ec > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c > > > @@ -0,0 +1,19 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ > > > +/* { dg-add-options arm_v8_1m_mve } */ > > > +/* { dg-additional-options "-O3" } */ > > > +/* Derived from gcc.c-torture/compile/20160205-1.c. */ > > > + > > > +int a[32]; > > > +int fn1(int d) { > > > + int c = 2; > > > + for (int b = 0; b < 32; b++) > > > + if (a[b]) > > > + c = 3; > > > + return c; > > > +} > > > + > > > +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* > 'true' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* > 'false' mask. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* > Initial value for c. */ > > > +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* > Possible value for c. */ >
Hi, Sorry for the slow review. I'd initially held off from reviewing because it sounded like you were trying to treat predicates as MODE_VECTOR_BOOL instead. Is that right? If so, how did that go? It does feel like the right long-term direction. Treating masks as integers for AVX seems to make some things more difficult than they should be. Also, RTL like: > +(define_expand "vec_cmp<mode>hi" > + [(set (match_operand:HI 0 "s_register_operand") > + (match_operator:HI 1 "comparison_operator" > + [(match_operand:MVE_VLD_ST 2 "s_register_operand") > + (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))] > + "TARGET_HAVE_MVE > + && (!<Is_float_mode> || flag_unsafe_math_optimizations)" > +{ > + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), > + operands[2], operands[3], false, false); > + DONE; > +}) seems kind-of suspect, since (as I think you said in a PR), (eq:HI X Y) would normally be a single boolean. Having MODE_VECTOR_BOOL means that we can represent the comparisons “properly”, even in define_insns. But since this usage is confined to define_expands, I guess it doesn't matter much for the purposes of this patch. Any switch to MODE_VECTOR_BOOL would leave most of the patch in tact (contrary to what I'd initially assumed). > @@ -31061,13 +31065,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, > > /* If we are not expanding a vcond, build the result here. */ > if (!vcond_mve) > - { > - rtx zero = gen_reg_rtx (cmp_result_mode); > - rtx one = gen_reg_rtx (cmp_result_mode); > - emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); > - emit_move_insn (one, CONST1_RTX (cmp_result_mode)); > - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); > - } > + emit_move_insn (target, vpr_p0); The code above this is: if (vcond_mve) vpr_p0 = target; else vpr_p0 = gen_reg_rtx (HImode); so couldn't we simply use vpr_p0 = target unconditionally (or drop vpr_p0 altogether)? Same for the other cases. > @@ -31178,20 +31164,21 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) > mask, operands[1], operands[2])); > else > { > - machine_mode cmp_mode = GET_MODE (operands[4]); > + machine_mode cmp_mode = GET_MODE (operands[0]); > rtx vpr_p0 = mask; > - rtx zero = gen_reg_rtx (cmp_mode); > - rtx one = gen_reg_rtx (cmp_mode); > - emit_move_insn (zero, CONST0_RTX (cmp_mode)); > - emit_move_insn (one, CONST1_RTX (cmp_mode)); > + > switch (GET_MODE_CLASS (cmp_mode)) > { > case MODE_VECTOR_INT: > - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0)); > + emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_mode, operands[0], > + operands[1], operands[2], vpr_p0)); > break; > case MODE_VECTOR_FLOAT: > if (TARGET_HAVE_MVE_FLOAT) > - emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0)); > + emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], > + operands[1], operands[2], vpr_p0)); > + else > + gcc_unreachable (); > break; > default: > gcc_unreachable (); Here too vpr_p0 feels a bit redundant now. > diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md > index e393518ea88..a9840408bdd 100644 > --- a/gcc/config/arm/mve.md > +++ b/gcc/config/arm/mve.md > @@ -10516,3 +10516,58 @@ (define_insn "*movmisalign<mode>_mve_load" > "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1" > [(set_attr "type" "mve_load")] > ) > + > +;; Expanders for vec_cmp and vcond > + > +(define_expand "vec_cmp<mode>hi" > + [(set (match_operand:HI 0 "s_register_operand") > + (match_operator:HI 1 "comparison_operator" > + [(match_operand:MVE_VLD_ST 2 "s_register_operand") > + (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))] > + "TARGET_HAVE_MVE > + && (!<Is_float_mode> || flag_unsafe_math_optimizations)" Is flag_unsafe_math_optimizations needed for MVE? For Neon we had it because of flush to zero (at least, I think that was the only reason). > +{ > + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), > + operands[2], operands[3], false, false); > + DONE; > +}) The (snipped) tests look good, but if we do support !flag_unsafe_math_optimizations, it would be good to have some tests for unordered comparisons too. E.g.: #define ordered(A, B) (!__builtin_isunordered (A, B)) #define unordered(A, B) (__builtin_isunordered (A, B)) #define ueq(A, B) (!__builtin_islessgreater (A, B)) #define ult(A, B) (__builtin_isless (A, B)) #define ule(A, B) (__builtin_islessequal (A, B)) #define uge(A, B) (__builtin_isgreaterequal (A, B)) #define ugt(A, B) (__builtin_isgreater (A, B)) #define nueq(A, B) (__builtin_islessgreater (A, B)) #define nult(A, B) (!__builtin_isless (A, B)) #define nule(A, B) (!__builtin_islessequal (A, B)) #define nuge(A, B) (!__builtin_isgreaterequal (A, B)) #define nugt(A, B) (!__builtin_isgreater (A, B)) The main thing is just to check that they don't ICE. It might be difficult to check the assembly for correctness. Thanks, Richard
On 16/07/2021 16:06, Richard Sandiford via Gcc-patches wrote: > Hi, > > Sorry for the slow review. I'd initially held off from reviewing > because it sounded like you were trying to treat predicates as > MODE_VECTOR_BOOL instead. Is that right? If so, how did that go? Yes, that's part of PR 101325. It's still WIP as I wrote in the PR, I'm not sure I got it right yet. At the moment it seems it would imply a lot of changes, I'll have to look at AArch64's implementation in more details. I hoped this fix could be merged before switching to MODE_VECTOR_BOOL. > It does feel like the right long-term direction. Treating masks as > integers for AVX seems to make some things more difficult than they > should be. Also, RTL like: OK, I see, good to know. > >> +(define_expand "vec_cmp<mode>hi" >> + [(set (match_operand:HI 0 "s_register_operand") >> + (match_operator:HI 1 "comparison_operator" >> + [(match_operand:MVE_VLD_ST 2 "s_register_operand") >> + (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))] >> + "TARGET_HAVE_MVE >> + && (!<Is_float_mode> || flag_unsafe_math_optimizations)" >> +{ >> + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), >> + operands[2], operands[3], false, false); >> + DONE; >> +}) > seems kind-of suspect, since (as I think you said in a PR), > (eq:HI X Y) would normally be a single boolean. Having MODE_VECTOR_BOOL > means that we can represent the comparisons “properly”, even in > define_insns. > > But since this usage is confined to define_expands, I guess it > doesn't matter much for the purposes of this patch. Any switch > to MODE_VECTOR_BOOL would leave most of the patch in tact (contrary > to what I'd initially assumed). > >> @@ -31061,13 +31065,7 @@ arm_expand_vector_compare (rtx target, rtx_code code, rtx op0, rtx op1, >> >> /* If we are not expanding a vcond, build the result here. */ >> if (!vcond_mve) >> - { >> - rtx zero = gen_reg_rtx (cmp_result_mode); >> - rtx one = gen_reg_rtx (cmp_result_mode); >> - emit_move_insn (zero, CONST0_RTX (cmp_result_mode)); >> - emit_move_insn (one, CONST1_RTX (cmp_result_mode)); >> - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, target, one, zero, vpr_p0)); >> - } >> + emit_move_insn (target, vpr_p0); > The code above this is: > > if (vcond_mve) > vpr_p0 = target; > else > vpr_p0 = gen_reg_rtx (HImode); > > so couldn't we simply use vpr_p0 = target unconditionally (or drop > vpr_p0 altogether)? Same for the other cases. Probably, I'll check that. >> @@ -31178,20 +31164,21 @@ arm_expand_vcond (rtx *operands, machine_mode cmp_result_mode) >> mask, operands[1], operands[2])); >> else >> { >> - machine_mode cmp_mode = GET_MODE (operands[4]); >> + machine_mode cmp_mode = GET_MODE (operands[0]); >> rtx vpr_p0 = mask; >> - rtx zero = gen_reg_rtx (cmp_mode); >> - rtx one = gen_reg_rtx (cmp_mode); >> - emit_move_insn (zero, CONST0_RTX (cmp_mode)); >> - emit_move_insn (one, CONST1_RTX (cmp_mode)); >> + >> switch (GET_MODE_CLASS (cmp_mode)) >> { >> case MODE_VECTOR_INT: >> - emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_result_mode, operands[0], one, zero, vpr_p0)); >> + emit_insn (gen_mve_vpselq (VPSELQ_S, cmp_mode, operands[0], >> + operands[1], operands[2], vpr_p0)); >> break; >> case MODE_VECTOR_FLOAT: >> if (TARGET_HAVE_MVE_FLOAT) >> - emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], one, zero, vpr_p0)); >> + emit_insn (gen_mve_vpselq_f (cmp_mode, operands[0], >> + operands[1], operands[2], vpr_p0)); >> + else >> + gcc_unreachable (); >> break; >> default: >> gcc_unreachable (); > Here too vpr_p0 feels a bit redundant now. Indeed, but it seemed clearer to me :-) >> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md >> index e393518ea88..a9840408bdd 100644 >> --- a/gcc/config/arm/mve.md >> +++ b/gcc/config/arm/mve.md >> @@ -10516,3 +10516,58 @@ (define_insn "*movmisalign<mode>_mve_load" >> "vldr<V_sz_elem1>.<V_sz_elem>\t%q0, %E1" >> [(set_attr "type" "mve_load")] >> ) >> + >> +;; Expanders for vec_cmp and vcond >> + >> +(define_expand "vec_cmp<mode>hi" >> + [(set (match_operand:HI 0 "s_register_operand") >> + (match_operator:HI 1 "comparison_operator" >> + [(match_operand:MVE_VLD_ST 2 "s_register_operand") >> + (match_operand:MVE_VLD_ST 3 "reg_or_zero_operand")]))] >> + "TARGET_HAVE_MVE >> + && (!<Is_float_mode> || flag_unsafe_math_optimizations)" > Is flag_unsafe_math_optimizations needed for MVE? For Neon we had > it because of flush to zero (at least, I think that was the only reason). Right, I inherited this from the vec_cmp in neon.md. I do not have the ARM ARM at hand right now to check. However, your question makes me wonder about the other vec_cmp pattern in vec-common.md, which is common to Neon and MVE. It might need to be adjusted too. >> +{ >> + arm_expand_vector_compare (operands[0], GET_CODE (operands[1]), >> + operands[2], operands[3], false, false); >> + DONE; >> +}) > The (snipped) tests look good, but if we do support > !flag_unsafe_math_optimizations, it would be good to have some tests > for unordered comparisons too. E.g.: > > #define ordered(A, B) (!__builtin_isunordered (A, B)) > #define unordered(A, B) (__builtin_isunordered (A, B)) > #define ueq(A, B) (!__builtin_islessgreater (A, B)) > #define ult(A, B) (__builtin_isless (A, B)) > #define ule(A, B) (__builtin_islessequal (A, B)) > #define uge(A, B) (__builtin_isgreaterequal (A, B)) > #define ugt(A, B) (__builtin_isgreater (A, B)) > #define nueq(A, B) (__builtin_islessgreater (A, B)) > #define nult(A, B) (!__builtin_isless (A, B)) > #define nule(A, B) (!__builtin_islessequal (A, B)) > #define nuge(A, B) (!__builtin_isgreaterequal (A, B)) > #define nugt(A, B) (!__builtin_isgreater (A, B)) > > The main thing is just to check that they don't ICE. It might be > difficult to check the assembly for correctness. Thanks I'll check that in August, when I'm back from holidays. Christophe > > Thanks, > Richard
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index 0ffc7a9322c..ccdfaa8321f 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -478,8 +478,28 @@ (define_expand "vcond_mask_<mode><v_cmp_result>" } else if (TARGET_HAVE_MVE) { - emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], - operands[1], operands[2], operands[3])); + /* Convert pre-computed vector comparison into VPR.P0 by comparing + operand 3 with a vector of '1', then use VPSEL. */ + machine_mode cmp_mode = GET_MODE (operands[3]); + rtx vpr_p0 = gen_reg_rtx (HImode); + rtx one = gen_reg_rtx (cmp_mode); + emit_move_insn (one, CONST1_RTX (cmp_mode)); + emit_insn (gen_mve_vcmpq (EQ, cmp_mode, vpr_p0, operands[3], one)); + + switch (GET_MODE_CLASS (<MODE>mode)) + { + case MODE_VECTOR_INT: + emit_insn (gen_mve_vpselq (VPSELQ_S, <MODE>mode, operands[0], operands[1], operands[2], vpr_p0)); + break; + case MODE_VECTOR_FLOAT: + if (TARGET_HAVE_MVE_FLOAT) + emit_insn (gen_mve_vpselq_f (<MODE>mode, operands[0], operands[1], operands[2], vpr_p0)); + else + gcc_unreachable (); + break; + default: + gcc_unreachable (); + } } else gcc_unreachable (); diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c new file mode 100644 index 00000000000..993ce369090 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ +/* { dg-add-options arm_v8_1m_mve_fp } */ +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ +/* Derived from gcc.c-torture/compile/20160205-1.c. */ + +float a[32]; +int fn1(int d) { + int c = 4; + for (int b = 0; b < 32; b++) + if (a[b] != 2.0f) + c = 5; + return c; +} + +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f. */ +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ +/* { dg-final { scan-assembler-times {\t.word\t4\n} 4 } } */ /* Initial value for c. */ +/* { dg-final { scan-assembler-times {\t.word\t5\n} 4 } } */ /* Possible value for c. */ diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c new file mode 100644 index 00000000000..b94a73b2d2c --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757-3.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ +/* { dg-add-options arm_v8_1m_mve_fp } */ +/* { dg-additional-options "-O3 -funsafe-math-optimizations" } */ +/* Copied from gcc.c-torture/compile/20160205-1.c. */ + +float a[32]; +float fn1(int d) { + float c = 4; + for (int b = 0; b < 32; b++) + if (a[b] != 2.0f) + c = 5; + return c; +} + +/* { dg-final { scan-assembler-times {\t.word\t1073741824\n} 4 } } */ /* Constant 2.0f. */ +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ +/* { dg-final { scan-assembler-times {\t.word\t1084227584\n} 4 } } */ /* Initial value for c. */ +/* { dg-final { scan-assembler-times {\t.word\t1082130432\n} 4 } } */ /* Possible value for c. */ diff --git a/gcc/testsuite/gcc.target/arm/simd/pr100757.c b/gcc/testsuite/gcc.target/arm/simd/pr100757.c new file mode 100644 index 00000000000..e51e716b4ec --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/pr100757.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_ok } */ +/* { dg-add-options arm_v8_1m_mve } */ +/* { dg-additional-options "-O3" } */ +/* Derived from gcc.c-torture/compile/20160205-1.c. */ + +int a[32]; +int fn1(int d) { + int c = 2; + for (int b = 0; b < 32; b++) + if (a[b]) + c = 3; + return c; +} + +/* { dg-final { scan-assembler-times {\t.word\t1\n} 4 } } */ /* 'true' mask. */ +/* { dg-final { scan-assembler-times {\t.word\t0\n} 4 } } */ /* 'false' mask. */ +/* { dg-final { scan-assembler-times {\t.word\t2\n} 4 } } */ /* Initial value for c. */ +/* { dg-final { scan-assembler-times {\t.word\t3\n} 4 } } */ /* Possible value for c. */