Message ID | 20180430191929.GH8577@tucnak |
---|---|
State | New |
Headers | show |
Series | Implement absv2di2 and absv4di2 expanders for pre-avx512vl (PR target/85572) | expand |
On Mon, Apr 30, 2018 at 9:19 PM, Jakub Jelinek <jakub@redhat.com> wrote: > Hi! > > Before avx512vl we don't have a single instruction to do V2DImode and > V4DImode abs, but that isn't much different from say V4SImode before SSE3 > where we also just emit a short sequence that is better than elementwise > expansion. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for > trunk? > > 2018-04-30 Jakub Jelinek <jakub@redhat.com> > > PR target/85572 > * config/i386/i386.c (ix86_expand_sse2_abs): Handle E_V2DImode and > E_V4DImode. > * config/i386/sse.md (abs<mode>2): Use VI_AVX2 iterator instead of > VI1248_AVX512VL_AVX512BW. Handle V2DImode and V4DImode if not > TARGET_AVX512VL using ix86_expand_sse2_abs. Formatting fixes. > > * g++.dg/other/sse2-pr85572-1.C: New test. > * g++.dg/other/sse2-pr85572-2.C: New test. > * g++.dg/other/sse4-pr85572-1.C: New test. > * g++.dg/other/avx2-pr85572-1.C: New test. LGTM. Thanks, Uros. > --- gcc/config/i386/i386.c.jj 2018-04-25 15:09:29.895453703 +0200 > +++ gcc/config/i386/i386.c 2018-04-30 18:31:56.027101932 +0200 > @@ -49806,39 +49806,74 @@ ix86_expand_sse2_abs (rtx target, rtx in > > switch (mode) > { > + case E_V2DImode: > + case E_V4DImode: > + /* For 64-bit signed integer X, with SSE4.2 use > + pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. > + Otherwise handle it similarly to V4SImode, except use 64 as W instead of > + 32 and use logical instead of arithmetic right shift (which is > + unimplemented) and subtract. 
*/ > + if (TARGET_SSE4_2) > + { > + tmp0 = gen_reg_rtx (mode); > + tmp1 = gen_reg_rtx (mode); > + emit_move_insn (tmp1, CONST0_RTX (mode)); > + if (mode == E_V2DImode) > + emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input)); > + else > + emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input)); > + > + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, > + NULL, 0, OPTAB_DIRECT); > + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, > + target, 0, OPTAB_DIRECT); > + break; > + } > + > + tmp0 = expand_simple_binop (mode, LSHIFTRT, input, > + GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), > + NULL, 0, OPTAB_DIRECT); > + tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false); > + > + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, > + NULL, 0, OPTAB_DIRECT); > + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, > + target, 0, OPTAB_DIRECT); > + break; > + > + case E_V4SImode: > /* For 32-bit signed integer X, the best way to calculate the absolute > value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ > - case E_V4SImode: > - tmp0 = expand_simple_binop (mode, ASHIFTRT, input, > - GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), > - NULL, 0, OPTAB_DIRECT); > - tmp1 = expand_simple_binop (mode, XOR, tmp0, input, > - NULL, 0, OPTAB_DIRECT); > - x = expand_simple_binop (mode, MINUS, tmp1, tmp0, > - target, 0, OPTAB_DIRECT); > - break; > + tmp0 = expand_simple_binop (mode, ASHIFTRT, input, > + GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), > + NULL, 0, OPTAB_DIRECT); > + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, > + NULL, 0, OPTAB_DIRECT); > + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, > + target, 0, OPTAB_DIRECT); > + break; > > + case E_V8HImode: > /* For 16-bit signed integer X, the best way to calculate the absolute > value of X is max (X, -X), as SSE2 provides the PMAXSW insn. 
*/ > - case E_V8HImode: > - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); > + tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); > > - x = expand_simple_binop (mode, SMAX, tmp0, input, > - target, 0, OPTAB_DIRECT); > - break; > + x = expand_simple_binop (mode, SMAX, tmp0, input, > + target, 0, OPTAB_DIRECT); > + break; > > + case E_V16QImode: > /* For 8-bit signed integer X, the best way to calculate the absolute > value of X is min ((unsigned char) X, (unsigned char) (-X)), > as SSE2 provides the PMINUB insn. */ > - case E_V16QImode: > - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); > + tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); > > - x = expand_simple_binop (V16QImode, UMIN, tmp0, input, > - target, 0, OPTAB_DIRECT); > - break; > + x = expand_simple_binop (V16QImode, UMIN, tmp0, input, > + target, 0, OPTAB_DIRECT); > + break; > > - default: > - gcc_unreachable (); > + default: > + gcc_unreachable (); > } > > if (x != target) > --- gcc/config/i386/sse.md.jj 2018-04-26 22:03:29.000000000 +0200 > +++ gcc/config/i386/sse.md 2018-04-30 19:09:18.740838508 +0200 > @@ -15174,12 +15174,14 @@ (define_insn "abs<mode>2_mask" > (set_attr "mode" "<sseinsnmode>")]) > > (define_expand "abs<mode>2" > - [(set (match_operand:VI1248_AVX512VL_AVX512BW 0 "register_operand") > - (abs:VI1248_AVX512VL_AVX512BW > - (match_operand:VI1248_AVX512VL_AVX512BW 1 "vector_operand")))] > + [(set (match_operand:VI_AVX2 0 "register_operand") > + (abs:VI_AVX2 > + (match_operand:VI_AVX2 1 "vector_operand")))] > "TARGET_SSE2" > { > - if (!TARGET_SSSE3) > + if (!TARGET_SSSE3 > + || ((<MODE>mode == V2DImode || <MODE>mode == V4DImode) > + && !TARGET_AVX512VL)) > { > ix86_expand_sse2_abs (operands[0], operands[1]); > DONE; > --- gcc/testsuite/g++.dg/other/sse2-pr85572-1.C.jj 2018-04-30 18:40:29.566619137 +0200 > +++ gcc/testsuite/g++.dg/other/sse2-pr85572-1.C 2018-04-30 19:04:04.111419300 +0200 > @@ -0,0 +1,14 @@ > +// PR target/85572 > +// { dg-do compile { 
target i?86-*-* x86_64-*-* } } > +// { dg-options "-O2 -msse2 -mno-sse3" } > +// { dg-final { scan-assembler-times {\mpxor\M} 2 } } > +// { dg-final { scan-assembler-times {\mpsubq\M} 2 } } > +// { dg-final { scan-assembler-times {\mpsrlq\M} 1 } } > + > +typedef long long V __attribute__((vector_size (16))); > + > +V > +foo (V x) > +{ > + return x < 0 ? -x : x; > +} > --- gcc/testsuite/g++.dg/other/sse2-pr85572-2.C.jj 2018-04-30 18:43:59.057879751 +0200 > +++ gcc/testsuite/g++.dg/other/sse2-pr85572-2.C 2018-04-30 19:04:55.717493565 +0200 > @@ -0,0 +1,45 @@ > +// PR target/85572 > +// { dg-do run { target i?86-*-* x86_64-*-* } } > +// { dg-options "-O2 -msse2" } > +// { dg-require-effective-target sse2_runtime } > + > +typedef long long V __attribute__((vector_size (16))); > +typedef long long W __attribute__((vector_size (32))); > + > +__attribute__((noipa)) V > +foo (V x) > +{ > + return x < 0 ? -x : x; > +} > + > +__attribute__((noipa)) void > +bar (W *x, W *y) > +{ > + *y = *x < 0 ? -*x : *x; > +} > + > +int > +main () > +{ > + V a = { 11LL, -15LL }; > + V b = foo (a); > + if (b[0] != 11LL || b[1] != 15LL) > + __builtin_abort (); > + V c = { -123456789123456LL, 654321654321654LL }; > + V d = foo (c); > + if (d[0] != 123456789123456LL || d[1] != 654321654321654LL) > + __builtin_abort (); > + V e = { 0, 1 }; > + V f = foo (e); > + if (f[0] != 0 || f[1] != 1) > + __builtin_abort (); > + W g = { 17LL, -32LL, -123456789123456LL, 654321654321654LL }, h; > + bar (&g, &h); > + if (h[0] != 17LL || h[1] != 32LL > + || h[2] != 123456789123456LL || h[3] != 654321654321654LL) > + __builtin_abort (); > + W i = { 0, 1, -1, 0 }, j; > + bar (&i, &j); > + if (j[0] != 0 || j[1] != 1 || j[2] != 1 || j[3] != 0) > + __builtin_abort (); > +} > --- gcc/testsuite/g++.dg/other/sse4-pr85572-1.C.jj 2018-04-30 18:50:56.569420770 +0200 > +++ gcc/testsuite/g++.dg/other/sse4-pr85572-1.C 2018-04-30 19:03:02.687330914 +0200 > @@ -0,0 +1,14 @@ > +// PR target/85572 > +// { dg-do compile { target 
i?86-*-* x86_64-*-* } } > +// { dg-options "-O2 -msse4 -mno-avx" } > +// { dg-final { scan-assembler-times {\mpxor\M} 2 } } > +// { dg-final { scan-assembler-times {\mpcmpgtq\M} 1 } } > +// { dg-final { scan-assembler-times {\mpsubq\M} 1 } } > + > +typedef long long V __attribute__((vector_size (16))); > + > +V > +foo (V x) > +{ > + return x < 0 ? -x : x; > +} > --- gcc/testsuite/g++.dg/other/avx2-pr85572-1.C.jj 2018-04-30 18:53:14.020598885 +0200 > +++ gcc/testsuite/g++.dg/other/avx2-pr85572-1.C 2018-04-30 19:02:47.316308798 +0200 > @@ -0,0 +1,21 @@ > +// PR target/85572 > +// { dg-do compile { target i?86-*-* x86_64-*-* } } > +// { dg-options "-O2 -mavx2 -mno-avx512f" } > +// { dg-final { scan-assembler-times {\mvpxor\M} 4 } } > +// { dg-final { scan-assembler-times {\mvpcmpgtq\M} 2 } } > +// { dg-final { scan-assembler-times {\mvpsubq\M} 2 } } > + > +typedef long long V __attribute__((vector_size (16))); > +typedef long long W __attribute__((vector_size (32))); > + > +V > +foo (V x) > +{ > + return x < 0 ? -x : x; > +} > + > +W > +bar (W x) > +{ > + return x < 0 ? -x : x; > +} > > Jakub
On Tue, May 8, 2018 at 11:11 AM, Uros Bizjak <ubizjak@gmail.com> wrote: > On Mon, Apr 30, 2018 at 9:19 PM, Jakub Jelinek <jakub@redhat.com> wrote: >> Hi! >> >> Before avx512vl we don't have a single instruction to do V2DImode and >> V4DImode abs, but that isn't much different from say V4SImode before SSE3 >> where we also just emit a short sequence that is better than elementwise >> expansion. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for >> trunk? >> >> 2018-04-30 Jakub Jelinek <jakub@redhat.com> >> >> PR target/85572 >> * config/i386/i386.c (ix86_expand_sse2_abs): Handle E_V2DImode and >> E_V4DImode. >> * config/i386/sse.md (abs<mode>2): Use VI_AVX2 iterator instead of >> VI1248_AVX512VL_AVX512BW. Handle V2DImode and V4DImode if not >> TARGET_AVX512VL using ix86_expand_sse2_abs. Formatting fixes. >> >> * g++.dg/other/sse2-pr85572-1.C: New test. >> * g++.dg/other/sse2-pr85572-2.C: New test. >> * g++.dg/other/sse4-pr85572-1.C: New test. >> * g++.dg/other/avx2-pr85572-1.C: New test. > > LGTM. > > Thanks, > Uros. > >> --- gcc/config/i386/i386.c.jj 2018-04-25 15:09:29.895453703 +0200 >> +++ gcc/config/i386/i386.c 2018-04-30 18:31:56.027101932 +0200 >> @@ -49806,39 +49806,74 @@ ix86_expand_sse2_abs (rtx target, rtx in >> >> switch (mode) >> { >> + case E_V2DImode: >> + case E_V4DImode: >> + /* For 64-bit signed integer X, with SSE4.2 use >> + pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. >> + Otherwise handle it similarly to V4SImode, except use 64 as W instead of >> + 32 and use logical instead of arithmetic right shift (which is >> + unimplemented) and subtract. 
*/
>> + if (TARGET_SSE4_2)
>> + {
>> + tmp0 = gen_reg_rtx (mode);
>> + tmp1 = gen_reg_rtx (mode);
>> + emit_move_insn (tmp1, CONST0_RTX (mode));
>> + if (mode == E_V2DImode)
>> + emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
>> + else
>> + emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));

}
else
{

>> + tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
>> + GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
>> + NULL, 0, OPTAB_DIRECT);
>> + tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);

}

>> + tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
>> + NULL, 0, OPTAB_DIRECT);
>> + x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
>> + target, 0, OPTAB_DIRECT);
>> + break;

You could merge parts of the above code.

Uros.
--- gcc/config/i386/i386.c.jj 2018-04-25 15:09:29.895453703 +0200 +++ gcc/config/i386/i386.c 2018-04-30 18:31:56.027101932 +0200 @@ -49806,39 +49806,74 @@ ix86_expand_sse2_abs (rtx target, rtx in switch (mode) { + case E_V2DImode: + case E_V4DImode: + /* For 64-bit signed integer X, with SSE4.2 use + pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. + Otherwise handle it similarly to V4SImode, except use 64 as W instead of + 32 and use logical instead of arithmetic right shift (which is + unimplemented) and subtract. */ + if (TARGET_SSE4_2) + { + tmp0 = gen_reg_rtx (mode); + tmp1 = gen_reg_rtx (mode); + emit_move_insn (tmp1, CONST0_RTX (mode)); + if (mode == E_V2DImode) + emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input)); + else + emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input)); + + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, + NULL, 0, OPTAB_DIRECT); + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, + target, 0, OPTAB_DIRECT); + break; + } + + tmp0 = expand_simple_binop (mode, LSHIFTRT, input, + GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), + NULL, 0, OPTAB_DIRECT); + tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false); + + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, + NULL, 0, OPTAB_DIRECT); + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, + target, 0, OPTAB_DIRECT); + break; + + case E_V4SImode: /* For 32-bit signed integer X, the best way to calculate the absolute value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). 
*/ - case E_V4SImode: - tmp0 = expand_simple_binop (mode, ASHIFTRT, input, - GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), - NULL, 0, OPTAB_DIRECT); - tmp1 = expand_simple_binop (mode, XOR, tmp0, input, - NULL, 0, OPTAB_DIRECT); - x = expand_simple_binop (mode, MINUS, tmp1, tmp0, - target, 0, OPTAB_DIRECT); - break; + tmp0 = expand_simple_binop (mode, ASHIFTRT, input, + GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), + NULL, 0, OPTAB_DIRECT); + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, + NULL, 0, OPTAB_DIRECT); + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, + target, 0, OPTAB_DIRECT); + break; + case E_V8HImode: /* For 16-bit signed integer X, the best way to calculate the absolute value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ - case E_V8HImode: - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); + tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); - x = expand_simple_binop (mode, SMAX, tmp0, input, - target, 0, OPTAB_DIRECT); - break; + x = expand_simple_binop (mode, SMAX, tmp0, input, + target, 0, OPTAB_DIRECT); + break; + case E_V16QImode: /* For 8-bit signed integer X, the best way to calculate the absolute value of X is min ((unsigned char) X, (unsigned char) (-X)), as SSE2 provides the PMINUB insn. 
*/ - case E_V16QImode: - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); + tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); - x = expand_simple_binop (V16QImode, UMIN, tmp0, input, - target, 0, OPTAB_DIRECT); - break; + x = expand_simple_binop (V16QImode, UMIN, tmp0, input, + target, 0, OPTAB_DIRECT); + break; - default: - gcc_unreachable (); + default: + gcc_unreachable (); } if (x != target) --- gcc/config/i386/sse.md.jj 2018-04-26 22:03:29.000000000 +0200 +++ gcc/config/i386/sse.md 2018-04-30 19:09:18.740838508 +0200 @@ -15174,12 +15174,14 @@ (define_insn "abs<mode>2_mask" (set_attr "mode" "<sseinsnmode>")]) (define_expand "abs<mode>2" - [(set (match_operand:VI1248_AVX512VL_AVX512BW 0 "register_operand") - (abs:VI1248_AVX512VL_AVX512BW - (match_operand:VI1248_AVX512VL_AVX512BW 1 "vector_operand")))] + [(set (match_operand:VI_AVX2 0 "register_operand") + (abs:VI_AVX2 + (match_operand:VI_AVX2 1 "vector_operand")))] "TARGET_SSE2" { - if (!TARGET_SSSE3) + if (!TARGET_SSSE3 + || ((<MODE>mode == V2DImode || <MODE>mode == V4DImode) + && !TARGET_AVX512VL)) { ix86_expand_sse2_abs (operands[0], operands[1]); DONE; --- gcc/testsuite/g++.dg/other/sse2-pr85572-1.C.jj 2018-04-30 18:40:29.566619137 +0200 +++ gcc/testsuite/g++.dg/other/sse2-pr85572-1.C 2018-04-30 19:04:04.111419300 +0200 @@ -0,0 +1,14 @@ +// PR target/85572 +// { dg-do compile { target i?86-*-* x86_64-*-* } } +// { dg-options "-O2 -msse2 -mno-sse3" } +// { dg-final { scan-assembler-times {\mpxor\M} 2 } } +// { dg-final { scan-assembler-times {\mpsubq\M} 2 } } +// { dg-final { scan-assembler-times {\mpsrlq\M} 1 } } + +typedef long long V __attribute__((vector_size (16))); + +V +foo (V x) +{ + return x < 0 ? 
-x : x; +} --- gcc/testsuite/g++.dg/other/sse2-pr85572-2.C.jj 2018-04-30 18:43:59.057879751 +0200 +++ gcc/testsuite/g++.dg/other/sse2-pr85572-2.C 2018-04-30 19:04:55.717493565 +0200 @@ -0,0 +1,45 @@ +// PR target/85572 +// { dg-do run { target i?86-*-* x86_64-*-* } } +// { dg-options "-O2 -msse2" } +// { dg-require-effective-target sse2_runtime } + +typedef long long V __attribute__((vector_size (16))); +typedef long long W __attribute__((vector_size (32))); + +__attribute__((noipa)) V +foo (V x) +{ + return x < 0 ? -x : x; +} + +__attribute__((noipa)) void +bar (W *x, W *y) +{ + *y = *x < 0 ? -*x : *x; +} + +int +main () +{ + V a = { 11LL, -15LL }; + V b = foo (a); + if (b[0] != 11LL || b[1] != 15LL) + __builtin_abort (); + V c = { -123456789123456LL, 654321654321654LL }; + V d = foo (c); + if (d[0] != 123456789123456LL || d[1] != 654321654321654LL) + __builtin_abort (); + V e = { 0, 1 }; + V f = foo (e); + if (f[0] != 0 || f[1] != 1) + __builtin_abort (); + W g = { 17LL, -32LL, -123456789123456LL, 654321654321654LL }, h; + bar (&g, &h); + if (h[0] != 17LL || h[1] != 32LL + || h[2] != 123456789123456LL || h[3] != 654321654321654LL) + __builtin_abort (); + W i = { 0, 1, -1, 0 }, j; + bar (&i, &j); + if (j[0] != 0 || j[1] != 1 || j[2] != 1 || j[3] != 0) + __builtin_abort (); +} --- gcc/testsuite/g++.dg/other/sse4-pr85572-1.C.jj 2018-04-30 18:50:56.569420770 +0200 +++ gcc/testsuite/g++.dg/other/sse4-pr85572-1.C 2018-04-30 19:03:02.687330914 +0200 @@ -0,0 +1,14 @@ +// PR target/85572 +// { dg-do compile { target i?86-*-* x86_64-*-* } } +// { dg-options "-O2 -msse4 -mno-avx" } +// { dg-final { scan-assembler-times {\mpxor\M} 2 } } +// { dg-final { scan-assembler-times {\mpcmpgtq\M} 1 } } +// { dg-final { scan-assembler-times {\mpsubq\M} 1 } } + +typedef long long V __attribute__((vector_size (16))); + +V +foo (V x) +{ + return x < 0 ? 
-x : x; +} --- gcc/testsuite/g++.dg/other/avx2-pr85572-1.C.jj 2018-04-30 18:53:14.020598885 +0200 +++ gcc/testsuite/g++.dg/other/avx2-pr85572-1.C 2018-04-30 19:02:47.316308798 +0200 @@ -0,0 +1,21 @@ +// PR target/85572 +// { dg-do compile { target i?86-*-* x86_64-*-* } } +// { dg-options "-O2 -mavx2 -mno-avx512f" } +// { dg-final { scan-assembler-times {\mvpxor\M} 4 } } +// { dg-final { scan-assembler-times {\mvpcmpgtq\M} 2 } } +// { dg-final { scan-assembler-times {\mvpsubq\M} 2 } } + +typedef long long V __attribute__((vector_size (16))); +typedef long long W __attribute__((vector_size (32))); + +V +foo (V x) +{ + return x < 0 ? -x : x; +} + +W +bar (W x) +{ + return x < 0 ? -x : x; +}