Message ID | 20210727081138.GG2380545@tucnak |
---|---|
State | New |
Headers | show |
Series | i386: Improve AVX2 expansion of vector >> vector DImode arithm. shifts [PR101611] | expand |
On Tue, Jul 27, 2021 at 4:11 PM Jakub Jelinek <jakub@redhat.com> wrote: > > Hi! > > AVX2 introduced vector >> vector shifts, but unfortunately for V{2,4}DImode > it only supports logical and not arithmetic shifts, only AVX512F for > V8DImode or AVX512VL for V{2,4}DImode fixed that omission. > Earlier in GCC12 cycle I've committed vector >> scalar arithmetic shift > emulation using various sequences, this patch handles the vector >> vector > case. No need to adjust costs, the previous cost adjustment actually > covers even the vector by vector shifts. > The patch emits the right arithmetic V{2,4}DImode shifts using 2 logical right > V{2,4}DImode shifts (once of the original operands, once of sign mask > constant by the vector shift count), xor and subtraction, on each element > (long long) x >> y is done as > (((unsigned long long) x >> y) ^ (0x8000000000000000ULL >> y)) > - (0x8000000000000000ULL >> y) I'm wondering when y > 64, would the transformation still be proper. Guess since it's UD, compiler can do anything. > i.e. if x doesn't have in some element the MSB set, it is just the logical > shift, if it does, then the xor and subtraction cause also all higher bits > to be set. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2021-07-27 Jakub Jelinek <jakub@redhat.com> > > PR target/101611 > * config/i386/sse.md (vashr<mode>3): Split into vashrv8di3 expander > and vashrv4di3 expander, where the latter requires just TARGET_AVX2 > and has special !TARGET_AVX512VL expansion. > (vashrv2di3<mask_name>): Rename to ... > (vashrv2di3): ... this. Change condition to TARGET_XOP || TARGET_AVX2 > and add special !TARGET_XOP && !TARGET_AVX512VL expansion. > > * gcc.target/i386/avx2-pr101611-1.c: New test. > * gcc.target/i386/avx2-pr101611-2.c: New test. > > --- gcc/config/i386/sse.md.jj 2021-07-22 12:37:20.439532859 +0200 > +++ gcc/config/i386/sse.md 2021-07-24 18:03:07.328126900 +0200 > @@ -20499,13 +20499,34 @@ (define_expand "vlshr<mode>3" > (match_operand:VI48_256 2 "nonimmediate_operand")))] > "TARGET_AVX2") > > -(define_expand "vashr<mode>3" > - [(set (match_operand:VI8_256_512 0 "register_operand") > - (ashiftrt:VI8_256_512 > - (match_operand:VI8_256_512 1 "register_operand") > - (match_operand:VI8_256_512 2 "nonimmediate_operand")))] > +(define_expand "vashrv8di3" > + [(set (match_operand:V8DI 0 "register_operand") > + (ashiftrt:V8DI > + (match_operand:V8DI 1 "register_operand") > + (match_operand:V8DI 2 "nonimmediate_operand")))] > "TARGET_AVX512F") > > +(define_expand "vashrv4di3" > + [(set (match_operand:V4DI 0 "register_operand") > + (ashiftrt:V4DI > + (match_operand:V4DI 1 "register_operand") > + (match_operand:V4DI 2 "nonimmediate_operand")))] > + "TARGET_AVX2" > +{ > + if (!TARGET_AVX512VL) > + { > + rtx mask = ix86_build_signbit_mask (V4DImode, 1, 0); > + rtx t1 = gen_reg_rtx (V4DImode); > + rtx t2 = gen_reg_rtx (V4DImode); > + rtx t3 = gen_reg_rtx (V4DImode); > + emit_insn (gen_vlshrv4di3 (t1, operands[1], operands[2])); > + emit_insn (gen_vlshrv4di3 (t2, mask, operands[2])); > + emit_insn (gen_xorv4di3 (t3, t1, t2)); > + emit_insn (gen_subv4di3 (operands[0], t3, t2)); > + DONE; > + } > +}) > + > (define_expand "vashr<mode>3" > [(set (match_operand:VI12_128 0 "register_operand") > (ashiftrt:VI12_128 > @@ -20527,12 +20548,12 @@ (define_expand "vashr<mode>3" > } > }) > > -(define_expand "vashrv2di3<mask_name>" > +(define_expand "vashrv2di3" > [(set (match_operand:V2DI 0 "register_operand") > (ashiftrt:V2DI > (match_operand:V2DI 1 "register_operand") > (match_operand:V2DI 2 "nonimmediate_operand")))] > - "TARGET_XOP || TARGET_AVX512VL" > + "TARGET_XOP || TARGET_AVX2" > { > if (TARGET_XOP) > { > @@ -20541,6 +20562,18 @@ (define_expand "vashrv2di3<mask_name>" > emit_insn (gen_xop_shav2di3 (operands[0], operands[1], neg)); > DONE; > } > + if (!TARGET_AVX512VL) > + { > + rtx mask = ix86_build_signbit_mask (V2DImode, 1, 0); > + rtx t1 = gen_reg_rtx (V2DImode); > + rtx t2 = gen_reg_rtx (V2DImode); > + rtx t3 = gen_reg_rtx (V2DImode); > + emit_insn (gen_vlshrv2di3 (t1, operands[1], operands[2])); > + emit_insn (gen_vlshrv2di3 (t2, mask, operands[2])); > + emit_insn (gen_xorv2di3 (t3, t1, t2)); > + emit_insn (gen_subv2di3 (operands[0], t3, t2)); > + DONE; > + } > }) > > (define_expand "vashrv4si3" > --- gcc/testsuite/gcc.target/i386/avx2-pr101611-1.c.jj 2021-07-26 14:22:35.341226231 +0200 > +++ gcc/testsuite/gcc.target/i386/avx2-pr101611-1.c 2021-07-26 14:21:29.806083664 +0200 > @@ -0,0 +1,12 @@ > +/* PR target/101611 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ > +/* { dg-final { scan-assembler-times {\mvpsrlvq\M} 4 } } */ > +/* { dg-final { scan-assembler-times {\mvpxor\M} 2 } } */ > +/* { dg-final { scan-assembler-times {\mvpsubq\M} 2 } } */ > + > +typedef long long V __attribute__((vector_size(32))); > +typedef long long W __attribute__((vector_size(16))); > + > +V foo (V a, V b) { return a >> b; } > +W bar (W a, W b) { return a >> b; } > --- gcc/testsuite/gcc.target/i386/avx2-pr101611-2.c.jj 2021-07-26 14:22:39.962165772 +0200 > +++ gcc/testsuite/gcc.target/i386/avx2-pr101611-2.c 2021-07-26 14:38:38.904497928 +0200 > @@ -0,0 +1,41 @@ > +/* PR target/101611 */ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ > +/* { dg-require-effective-target avx2 } */ > + > +#include "avx2-check.h" > + > +typedef long long V __attribute__((vector_size(32))); > +typedef long long W __attribute__((vector_size(16))); > + > +__attribute__((noipa)) V > +foo (V a, V b) > +{ > + return a >> b; > +} > + > +__attribute__((noipa)) W > +bar (W a, W b) > +{ > + return a >> b; > +} > + > +static void > +avx2_test (void) > +{ > + V a = { 0x7f123456789abcdeLL, -0x30edcba987654322LL, > + -0x30edcba987654322LL, 0x7f123456789abcdeLL }; > + V b = { 17, 11, 23, 0 }; > + V c = foo (a, b); > + if (c[0] != 0x3f891a2b3c4dLL > + || c[1] != -0x61db97530eca9LL > + || c[2] != -0x61db97530fLL > + || c[3] != 0x7f123456789abcdeLL) > + abort (); > + W d = { 0x7f123456789abcdeLL, -0x30edcba987654322LL }; > + W e = { 45, 27 }; > + W f = bar (d, e); > + if (f[0] != 0x3f891LL > + || f[1] != -0x61db97531LL) > + abort (); > +} > > Jakub >
On Tue, Jul 27, 2021 at 06:33:24PM +0800, Hongtao Liu wrote: > > AVX2 introduced vector >> vector shifts, but unfortunately for V{2,4}DImode > > it only supports logical and not arithmetic shifts, only AVX512F for > > V8DImode or AVX512VL for V{2,4}DImode fixed that omission. > > Earlier in GCC12 cycle I've committed vector >> scalar arithmetic shift > > emulation using various sequences, this patch handles the vector >> vector > > case. No need to adjust costs, the previous cost adjustment actually > > covers even the vector by vector shifts. > > The patch emits the right arithmetic V{2,4}DImode shifts using 2 logical right > > V{2,4}DImode shifts (once of the original operands, once of sign mask > > constant by the vector shift count), xor and subtraction, on each element > > (long long) x >> y is done as > > (((unsigned long long) x >> y) ^ (0x8000000000000000ULL >> y)) > > - (0x8000000000000000ULL >> y) > I'm wondering when y > 64, would the transformation still be proper. > Guess since it's UD, compiler can do anything. The patch is changing optabs, not something from target builtins where the intrinsics might make it well defined. In the optabs out of bound shifts (including y == 64) are UB - i386.h doesn't define SHIFT_COUNTS_TRUNCATED. Jakub
On Tue, Jul 27, 2021 at 6:39 PM Jakub Jelinek <jakub@redhat.com> wrote: > > On Tue, Jul 27, 2021 at 06:33:24PM +0800, Hongtao Liu wrote: > > > AVX2 introduced vector >> vector shifts, but unfortunately for V{2,4}DImode > > > it only supports logical and not arithmetic shifts, only AVX512F for > > > V8DImode or AVX512VL for V{2,4}DImode fixed that omission. > > > Earlier in GCC12 cycle I've committed vector >> scalar arithmetic shift > > > emulation using various sequences, this patch handles the vector >> vector > > > case. No need to adjust costs, the previous cost adjustment actually > > > covers even the vector by vector shifts. > > > The patch emits the right arithmetic V{2,4}DImode shifts using 2 logical right > > > V{2,4}DImode shifts (once of the original operands, once of sign mask > > > constant by the vector shift count), xor and subtraction, on each element > > > (long long) x >> y is done as > > > (((unsigned long long) x >> y) ^ (0x8000000000000000ULL >> y)) > > > - (0x8000000000000000ULL >> y) > > I'm wondering when y > 64, would the transformation still be proper. > > Guess since it's UD, compiler can do anything. > > The patch is changing optabs, not something from target builtins where the > intrinsics might make it well defined. > In the optabs out of bound shifts (including y == 64) are UB - i386.h > doesn't define SHIFT_COUNTS_TRUNCATED. Thanks for the explanation, patch LGTM. > > Jakub >
--- gcc/config/i386/sse.md.jj 2021-07-22 12:37:20.439532859 +0200 +++ gcc/config/i386/sse.md 2021-07-24 18:03:07.328126900 +0200 @@ -20499,13 +20499,34 @@ (define_expand "vlshr<mode>3" (match_operand:VI48_256 2 "nonimmediate_operand")))] "TARGET_AVX2") -(define_expand "vashr<mode>3" - [(set (match_operand:VI8_256_512 0 "register_operand") - (ashiftrt:VI8_256_512 - (match_operand:VI8_256_512 1 "register_operand") - (match_operand:VI8_256_512 2 "nonimmediate_operand")))] +(define_expand "vashrv8di3" + [(set (match_operand:V8DI 0 "register_operand") + (ashiftrt:V8DI + (match_operand:V8DI 1 "register_operand") + (match_operand:V8DI 2 "nonimmediate_operand")))] "TARGET_AVX512F") +(define_expand "vashrv4di3" + [(set (match_operand:V4DI 0 "register_operand") + (ashiftrt:V4DI + (match_operand:V4DI 1 "register_operand") + (match_operand:V4DI 2 "nonimmediate_operand")))] + "TARGET_AVX2" +{ + if (!TARGET_AVX512VL) + { + rtx mask = ix86_build_signbit_mask (V4DImode, 1, 0); + rtx t1 = gen_reg_rtx (V4DImode); + rtx t2 = gen_reg_rtx (V4DImode); + rtx t3 = gen_reg_rtx (V4DImode); + emit_insn (gen_vlshrv4di3 (t1, operands[1], operands[2])); + emit_insn (gen_vlshrv4di3 (t2, mask, operands[2])); + emit_insn (gen_xorv4di3 (t3, t1, t2)); + emit_insn (gen_subv4di3 (operands[0], t3, t2)); + DONE; + } +}) + (define_expand "vashr<mode>3" [(set (match_operand:VI12_128 0 "register_operand") (ashiftrt:VI12_128 @@ -20527,12 +20548,12 @@ (define_expand "vashr<mode>3" } }) -(define_expand "vashrv2di3<mask_name>" +(define_expand "vashrv2di3" [(set (match_operand:V2DI 0 "register_operand") (ashiftrt:V2DI (match_operand:V2DI 1 "register_operand") (match_operand:V2DI 2 "nonimmediate_operand")))] - "TARGET_XOP || TARGET_AVX512VL" + "TARGET_XOP || TARGET_AVX2" { if (TARGET_XOP) { @@ -20541,6 +20562,18 @@ (define_expand "vashrv2di3<mask_name>" emit_insn (gen_xop_shav2di3 (operands[0], operands[1], neg)); DONE; } + if (!TARGET_AVX512VL) + { + rtx mask = ix86_build_signbit_mask (V2DImode, 1, 0); + rtx t1 = gen_reg_rtx (V2DImode); + rtx t2 = gen_reg_rtx (V2DImode); + rtx t3 = gen_reg_rtx (V2DImode); + emit_insn (gen_vlshrv2di3 (t1, operands[1], operands[2])); + emit_insn (gen_vlshrv2di3 (t2, mask, operands[2])); + emit_insn (gen_xorv2di3 (t3, t1, t2)); + emit_insn (gen_subv2di3 (operands[0], t3, t2)); + DONE; + } }) (define_expand "vashrv4si3" --- gcc/testsuite/gcc.target/i386/avx2-pr101611-1.c.jj 2021-07-26 14:22:35.341226231 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-pr101611-1.c 2021-07-26 14:21:29.806083664 +0200 @@ -0,0 +1,12 @@ +/* PR target/101611 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ +/* { dg-final { scan-assembler-times {\mvpsrlvq\M} 4 } } */ +/* { dg-final { scan-assembler-times {\mvpxor\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mvpsubq\M} 2 } } */ + +typedef long long V __attribute__((vector_size(32))); +typedef long long W __attribute__((vector_size(16))); + +V foo (V a, V b) { return a >> b; } +W bar (W a, W b) { return a >> b; } --- gcc/testsuite/gcc.target/i386/avx2-pr101611-2.c.jj 2021-07-26 14:22:39.962165772 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-pr101611-2.c 2021-07-26 14:38:38.904497928 +0200 @@ -0,0 +1,41 @@ +/* PR target/101611 */ +/* { dg-do run } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ +/* { dg-require-effective-target avx2 } */ + +#include "avx2-check.h" + +typedef long long V __attribute__((vector_size(32))); +typedef long long W __attribute__((vector_size(16))); + +__attribute__((noipa)) V +foo (V a, V b) +{ + return a >> b; +} + +__attribute__((noipa)) W +bar (W a, W b) +{ + return a >> b; +} + +static void +avx2_test (void) +{ + V a = { 0x7f123456789abcdeLL, -0x30edcba987654322LL, + -0x30edcba987654322LL, 0x7f123456789abcdeLL }; + V b = { 17, 11, 23, 0 }; + V c = foo (a, b); + if (c[0] != 0x3f891a2b3c4dLL + || c[1] != -0x61db97530eca9LL + || c[2] != -0x61db97530fLL + || c[3] != 0x7f123456789abcdeLL) + abort (); + W d = { 0x7f123456789abcdeLL, -0x30edcba987654322LL }; + W e = { 45, 27 }; + W f = bar (d, e); + if (f[0] != 0x3f891LL + || f[1] != -0x61db97531LL) + abort (); +}