Message ID | 20130426155035.GS28963@tucnak.redhat.com |
---|---|
State | New |
Headers | show |
On Fri, Apr 26, 2013 at 5:50 PM, Jakub Jelinek <jakub@redhat.com> wrote: > This patch fixes two wrong-code bugs with -mxop. > One is that vpmacsdqh instruction can be only used for vec_widen_smult_odd_v4si > but not vec_widen_umult_odd_v4si. Consider we have > unsigned V4SImode h* with arguments > { 3, 3, 3, 3 } h* { 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab } > (but not known at compile time). If we use vpmacsdqh, it sign-extends > the numbers and thus computes (3 * 0xffffffffaaaaaaabULL) >> 32, > i.e. 0xffffffff, while we want (3 * 0xaaaaaaabULL) >> 32, i.e. 2. > > The second bug is in wrong shift count for immediate xop_rotr. > We want element bitsize - immediate to transform the r>> immediate > into r<< immediate, but (<ssescalarnum> * 8) is correct for that only > for V4SImode - 32. For V2DImode it is 16 instead of the desired > 64, for V8HImode it is 64 instead of the desired 16 and for V16QImode > it is 128 instead of the desired 8. > > Bootstrapped/regtested on x86_64-linux, configured --with-arch=bdver2, > fixes: > > -FAIL: gcc.c-torture/execute/pr51581-1.c execution, -O3 -fomit-frame-pointer > -FAIL: gcc.c-torture/execute/pr51581-1.c execution, -O3 -fomit-frame-pointer -funroll-loops > -FAIL: gcc.c-torture/execute/pr51581-1.c execution, -O3 -fomit-frame-pointer -funroll-all-loops -finline-functions > -FAIL: gcc.c-torture/execute/pr51581-1.c execution, -O3 -g > -FAIL: gcc.c-torture/execute/pr51581-2.c execution, -O3 -fomit-frame-pointer > -FAIL: gcc.c-torture/execute/pr51581-2.c execution, -O3 -fomit-frame-pointer -funroll-loops > -FAIL: gcc.c-torture/execute/pr51581-2.c execution, -O3 -fomit-frame-pointer -funroll-all-loops -finline-functions > -FAIL: gcc.c-torture/execute/pr51581-2.c execution, -O3 -g > -FAIL: gcc.c-torture/execute/pr53645.c execution, -O1 > -FAIL: gcc.c-torture/execute/pr53645.c execution, -O2 > -FAIL: gcc.c-torture/execute/pr53645.c execution, -O3 -fomit-frame-pointer > -FAIL: gcc.c-torture/execute/pr53645.c execution, -O3 -fomit-frame-pointer -funroll-loops > -FAIL: gcc.c-torture/execute/pr53645.c execution, -O3 -fomit-frame-pointer -funroll-all-loops -finline-functions > -FAIL: gcc.c-torture/execute/pr53645.c execution, -O3 -g > -FAIL: gcc.c-torture/execute/pr53645.c execution, -Os > -FAIL: gcc.c-torture/execute/pr53645.c execution, -Og -g > -FAIL: gcc.c-torture/execute/pr53645.c execution, -O2 -flto -fno-use-linker-plugin -flto-partition=none > -FAIL: gcc.c-torture/execute/pr53645.c execution, -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects > -FAIL: gcc.c-torture/execute/pr56866.c execution, -O3 -fomit-frame-pointer > -FAIL: gcc.c-torture/execute/pr56866.c execution, -O3 -fomit-frame-pointer -funroll-loops > -FAIL: gcc.c-torture/execute/pr56866.c execution, -O3 -fomit-frame-pointer -funroll-all-loops -finline-functions > -FAIL: gcc.c-torture/execute/pr56866.c execution, -O3 -g > -FAIL: gcc.dg/vect/pr51581-1.c execution test > -FAIL: gcc.dg/vect/pr51581-2.c execution test > -FAIL: gcc.dg/vect/pr51581-3.c execution test > -FAIL: gcc.dg/vect/pr51581-1.c -flto execution test > -FAIL: gcc.dg/vect/pr51581-2.c -flto execution test > -FAIL: gcc.dg/vect/pr51581-3.c -flto execution test > -FAIL: gcc.target/i386/avx-mul-1.c execution test > -FAIL: gcc.target/i386/avx-pr51581-1.c execution test > -FAIL: gcc.target/i386/avx-pr51581-2.c execution test > -FAIL: gcc.target/i386/pr56866.c execution test > -FAIL: gcc.target/i386/sse2-mul-1.c execution test > -FAIL: gcc.target/i386/sse4_1-mul-1.c execution test > -FAIL: gcc.target/i386/xop-mul-1.c execution test > > failures that appear with stock gcc just with the testsuite/ > part of the patch applied. Ok for trunk/4.8 and partly for 4.7 > (the i386.c bug has been introduced in 2012-06-25 but the sse.md > bug existed in 4.7 already)? > > 2013-04-26 Jakub Jelinek <jakub@redhat.com> > > PR target/56866 > * config/i386/i386.c (ix86_expand_mul_widen_evenodd): Don't > use xop_pmacsdqh if uns_p. > * config/i386/sse.md (xop_rotr<mode>3): Fix up computation of > the immediate rotate count. > > * gcc.c-torture/execute/pr56866.c: New test. > * gcc.target/i386/pr56866.c: New test. > > --- gcc/config/i386/i386.c.jj 2013-04-22 10:26:22.000000000 +0200 > +++ gcc/config/i386/i386.c 2013-04-26 10:28:51.793534370 +0200 > @@ -40841,7 +40841,7 @@ ix86_expand_mul_widen_evenodd (rtx dest, > the even slots. For some cpus this is faster than a PSHUFD. */ > if (odd_p) > { > - if (TARGET_XOP && mode == V4SImode) > + if (TARGET_XOP && mode == V4SImode && !uns_p) Please add a small comment on why !uns_p is needed here. OK everywhere with the above addition. Thanks, Uros.
--- gcc/config/i386/i386.c.jj 2013-04-22 10:26:22.000000000 +0200 +++ gcc/config/i386/i386.c 2013-04-26 10:28:51.793534370 +0200 @@ -40841,7 +40841,7 @@ ix86_expand_mul_widen_evenodd (rtx dest, the even slots. For some cpus this is faster than a PSHUFD. */ if (odd_p) { - if (TARGET_XOP && mode == V4SImode) + if (TARGET_XOP && mode == V4SImode && !uns_p) { x = force_reg (wmode, CONST0_RTX (wmode)); emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); --- gcc/config/i386/sse.md.jj 2013-04-02 20:24:37.000000000 +0200 +++ gcc/config/i386/sse.md 2013-04-26 13:25:32.729590863 +0200 @@ -9924,7 +9924,8 @@ (define_insn "xop_rotr<mode>3" (match_operand:SI 2 "const_0_to_<sserotatemax>_operand" "n")))] "TARGET_XOP" { - operands[3] = GEN_INT ((<ssescalarnum> * 8) - INTVAL (operands[2])); + operands[3] + = GEN_INT (GET_MODE_BITSIZE (<ssescalarmode>mode) - INTVAL (operands[2])); return \"vprot<ssemodesuffix>\t{%3, %1, %0|%0, %1, %3}\"; } [(set_attr "type" "sseishft") --- gcc/testsuite/gcc.c-torture/execute/pr56866.c.jj 2013-04-26 13:48:21.490810656 +0200 +++ gcc/testsuite/gcc.c-torture/execute/pr56866.c 2013-04-26 13:46:10.000000000 +0200 @@ -0,0 +1,45 @@ +/* PR target/56866 */ + +int +main () +{ +#if __CHAR_BIT__ == 8 && __SIZEOF_LONG_LONG__ == 8 && __SIZEOF_INT__ == 4 && __SIZEOF_SHORT__ == 2 + unsigned long long wq[256], rq[256]; + unsigned int wi[256], ri[256]; + unsigned short ws[256], rs[256]; + unsigned char wc[256], rc[256]; + int t; + + __builtin_memset (wq, 0, sizeof wq); + __builtin_memset (wi, 0, sizeof wi); + __builtin_memset (ws, 0, sizeof ws); + __builtin_memset (wc, 0, sizeof wc); + wq[0] = 0x0123456789abcdefULL; + wi[0] = 0x01234567; + ws[0] = 0x4567; + wc[0] = 0x73; + + asm volatile ("" : : "g" (wq), "g" (wi), "g" (ws), "g" (wc) : "memory"); + + for (t = 0; t < 256; ++t) + rq[t] = (wq[t] >> 8) | (wq[t] << (sizeof (wq[0]) * __CHAR_BIT__ - 8)); + for (t = 0; t < 256; ++t) + ri[t] = (wi[t] >> 8) | (wi[t] << (sizeof (wi[0]) * __CHAR_BIT__ - 8)); + for (t = 0; t < 256; ++t) + rs[t] = (ws[t] >> 9) | (ws[t] << (sizeof (ws[0]) * __CHAR_BIT__ - 9)); + for (t = 0; t < 256; ++t) + rc[t] = (wc[t] >> 5) | (wc[t] << (sizeof (wc[0]) * __CHAR_BIT__ - 5)); + + asm volatile ("" : : "g" (rq), "g" (ri), "g" (rs), "g" (rc) : "memory"); + + if (rq[0] != 0xef0123456789abcdULL || rq[1]) + __builtin_abort (); + if (ri[0] != 0x67012345 || ri[1]) + __builtin_abort (); + if (rs[0] != 0xb3a2 || rs[1]) + __builtin_abort (); + if (rc[0] != 0x9b || rc[1]) + __builtin_abort (); +#endif + return 0; +} --- gcc/testsuite/gcc.target/i386/pr56866.c.jj 2013-04-26 13:50:18.798199429 +0200 +++ gcc/testsuite/gcc.target/i386/pr56866.c 2013-04-26 13:52:13.428603267 +0200 @@ -0,0 +1,16 @@ +/* PR target/56866 */ +/* { dg-do run } */ +/* { dg-require-effective-target xop } */ +/* { dg-options "-O3 -mxop" } */ + +#define main xop_test_main +#include "../../gcc.c-torture/execute/pr56866.c" +#undef main + +#include "xop-check.h" + +static void +xop_test (void) +{ + xop_test_main (); +}