diff mbox series

i386: Fix emit_reduc_half on V{64Q,32H}Imode [PR94500]

Message ID 20200406225118.GX2212@tucnak
State New
Headers show
Series i386: Fix emit_reduc_half on V{64Q,32H}Imode [PR94500] | expand

Commit Message

Li, Pan2 via Gcc-patches April 6, 2020, 10:51 p.m. UTC
Hi!

The following testcase is miscompiled in 8.x, because emit_reduc_half is
prepared to handle for 512-bit modes only i equal to 512, 256, 128 and 64.
V32HImode also needs i equal to 32 and V64QImode i equal to 32 and 16,
but emit_reduc_half in that case performs a redundant permutation exactly
like i == 64.  In 9+ the testcase works because Richard in r9-3393
changed the reduc_* expanders so that they actually don't call
ix86_expand_reduc on 512-bit modes, but only 128-bit ones.

The patch fixes emit_reduc_half to handle also i of 32 and 16 similarly to
how V32QImode/V16HImode are handled for AVX2.  I think it shouldn't hurt
to fix the function even on the trunk and 9 branch even when nothing uses
it ATM.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk/9 and
primarily for 8.5 (obviously in that case s/i386-expand/i386/)?

2020-04-06  Jakub Jelinek  <jakub@redhat.com>

	PR target/94500
	* config/i386/i386-expand.c (emit_reduc_half): For V{64QI,32HI}mode
	handle i < 64 using avx512bw_lshrv4ti3.  Formatting fixes.

	* gcc.target/i386/avx512bw-pr94500.c: New test.


	Jakub

Comments

Li, Pan2 via Gcc-patches April 7, 2020, 6:10 a.m. UTC | #1
On Tue, Apr 7, 2020 at 12:51 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The following testcase is miscompiled in 8.x, because emit_reduc_half is
> prepared to handle for 512-bit modes only i equal to 512, 256, 128 and 64.
> V32HImode also needs i equal to 32 and V64QImode i equal to 32 and 16,
> but emit_reduc_half in that case performs a redundant permutation exactly
> like i == 64.  In 9+ the testcase works because Richard in r9-3393
> changed the reduc_* expanders so that they actually don't call
> ix86_expand_reduc on 512-bit modes, but only 128-bit ones.
>
> The patch fixes emit_reduc_half to handle also i of 32 and 16 similarly to
> how V32QImode/V16HImode are handled for AVX2.  I think it shouldn't hurt
> to fix the function even on the trunk and 9 branch even when nothing uses
> it ATM.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk/9 and
> primarily for 8.5 (obviously in that case s/i386-expand/i386/)?
>
> 2020-04-06  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/94500
>         * config/i386/i386-expand.c (emit_reduc_half): For V{64QI,32HI}mode
>         handle i < 64 using avx512bw_lshrv4ti3.  Formatting fixes.
>
>         * gcc.target/i386/avx512bw-pr94500.c: New test.

OK everywhere.

Thanks,
Uros.

> --- gcc/config/i386/i386-expand.c.jj    2020-03-29 19:26:31.748561262 +0200
> +++ gcc/config/i386/i386-expand.c       2020-04-06 17:18:44.906242980 +0200
> @@ -14891,43 +14891,51 @@ emit_reduc_half (rtx dest, rtx src, int
>        break;
>      case E_V64QImode:
>      case E_V32HImode:
> +      if (i < 64)
> +       {
> +         d = gen_reg_rtx (V4TImode);
> +         tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
> +                                       GEN_INT (i / 2));
> +         break;
> +       }
> +      /* FALLTHRU */
>      case E_V16SImode:
>      case E_V16SFmode:
>      case E_V8DImode:
>      case E_V8DFmode:
>        if (i > 128)
>         tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
> -                                     gen_lowpart (V16SImode, src),
> -                                     gen_lowpart (V16SImode, src),
> -                                     GEN_INT (0x4 + (i == 512 ? 4 : 0)),
> -                                     GEN_INT (0x5 + (i == 512 ? 4 : 0)),
> -                                     GEN_INT (0x6 + (i == 512 ? 4 : 0)),
> -                                     GEN_INT (0x7 + (i == 512 ? 4 : 0)),
> -                                     GEN_INT (0xC), GEN_INT (0xD),
> -                                     GEN_INT (0xE), GEN_INT (0xF),
> -                                     GEN_INT (0x10), GEN_INT (0x11),
> -                                     GEN_INT (0x12), GEN_INT (0x13),
> -                                     GEN_INT (0x14), GEN_INT (0x15),
> -                                     GEN_INT (0x16), GEN_INT (0x17));
> +                                       gen_lowpart (V16SImode, src),
> +                                       gen_lowpart (V16SImode, src),
> +                                       GEN_INT (0x4 + (i == 512 ? 4 : 0)),
> +                                       GEN_INT (0x5 + (i == 512 ? 4 : 0)),
> +                                       GEN_INT (0x6 + (i == 512 ? 4 : 0)),
> +                                       GEN_INT (0x7 + (i == 512 ? 4 : 0)),
> +                                       GEN_INT (0xC), GEN_INT (0xD),
> +                                       GEN_INT (0xE), GEN_INT (0xF),
> +                                       GEN_INT (0x10), GEN_INT (0x11),
> +                                       GEN_INT (0x12), GEN_INT (0x13),
> +                                       GEN_INT (0x14), GEN_INT (0x15),
> +                                       GEN_INT (0x16), GEN_INT (0x17));
>        else
>         tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
> -                                  gen_lowpart (V16SImode, src),
> -                                  GEN_INT (i == 128 ? 0x2 : 0x1),
> -                                  GEN_INT (0x3),
> -                                  GEN_INT (0x3),
> -                                  GEN_INT (0x3),
> -                                  GEN_INT (i == 128 ? 0x6 : 0x5),
> -                                  GEN_INT (0x7),
> -                                  GEN_INT (0x7),
> -                                  GEN_INT (0x7),
> -                                  GEN_INT (i == 128 ? 0xA : 0x9),
> -                                  GEN_INT (0xB),
> -                                  GEN_INT (0xB),
> -                                  GEN_INT (0xB),
> -                                  GEN_INT (i == 128 ? 0xE : 0xD),
> -                                  GEN_INT (0xF),
> -                                  GEN_INT (0xF),
> -                                  GEN_INT (0xF));
> +                                   gen_lowpart (V16SImode, src),
> +                                   GEN_INT (i == 128 ? 0x2 : 0x1),
> +                                   GEN_INT (0x3),
> +                                   GEN_INT (0x3),
> +                                   GEN_INT (0x3),
> +                                   GEN_INT (i == 128 ? 0x6 : 0x5),
> +                                   GEN_INT (0x7),
> +                                   GEN_INT (0x7),
> +                                   GEN_INT (0x7),
> +                                   GEN_INT (i == 128 ? 0xA : 0x9),
> +                                   GEN_INT (0xB),
> +                                   GEN_INT (0xB),
> +                                   GEN_INT (0xB),
> +                                   GEN_INT (i == 128 ? 0xE : 0xD),
> +                                   GEN_INT (0xF),
> +                                   GEN_INT (0xF),
> +                                   GEN_INT (0xF));
>        break;
>      default:
>        gcc_unreachable ();
> --- gcc/testsuite/gcc.target/i386/avx512bw-pr94500.c.jj 2020-04-06 17:24:42.246904934 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512bw-pr94500.c    2020-04-06 17:26:03.721687840 +0200
> @@ -0,0 +1,28 @@
> +/* PR target/94500 */
> +/* { dg-do run { target avx512bw } } */
> +/* { dg-options "-O3 -mavx512bw -mprefer-vector-width=512" } */
> +
> +#define AVX512BW
> +#include "avx512f-helper.h"
> +
> +__attribute__((noipa)) signed char
> +foo (signed char *p)
> +{
> +  signed char r = 0;
> +  int i;
> +  for (i = 0; i < 256; i++)
> +    if (p[i] > r) r = p[i];
> +  return r;
> +}
> +
> +signed char buf[256];
> +
> +static void
> +TEST (void)
> +{
> +  int i;
> +  for (i = 0; i < 256; i++)
> +    buf[i] = i - 128;
> +  if (foo (buf) != 127)
> +    abort ();
> +}
>
>         Jakub
>
diff mbox series

Patch

--- gcc/config/i386/i386-expand.c.jj	2020-03-29 19:26:31.748561262 +0200
+++ gcc/config/i386/i386-expand.c	2020-04-06 17:18:44.906242980 +0200
@@ -14891,43 +14891,51 @@  emit_reduc_half (rtx dest, rtx src, int
       break;
     case E_V64QImode:
     case E_V32HImode:
+      if (i < 64)
+	{
+	  d = gen_reg_rtx (V4TImode);
+	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
+					GEN_INT (i / 2));
+	  break;
+	}
+      /* FALLTHRU */
     case E_V16SImode:
     case E_V16SFmode:
     case E_V8DImode:
     case E_V8DFmode:
       if (i > 128)
 	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
-				      gen_lowpart (V16SImode, src),
-				      gen_lowpart (V16SImode, src),
-				      GEN_INT (0x4 + (i == 512 ? 4 : 0)),
-				      GEN_INT (0x5 + (i == 512 ? 4 : 0)),
-				      GEN_INT (0x6 + (i == 512 ? 4 : 0)),
-				      GEN_INT (0x7 + (i == 512 ? 4 : 0)),
-				      GEN_INT (0xC), GEN_INT (0xD),
-				      GEN_INT (0xE), GEN_INT (0xF),
-				      GEN_INT (0x10), GEN_INT (0x11),
-				      GEN_INT (0x12), GEN_INT (0x13),
-				      GEN_INT (0x14), GEN_INT (0x15),
-				      GEN_INT (0x16), GEN_INT (0x17));
+					gen_lowpart (V16SImode, src),
+					gen_lowpart (V16SImode, src),
+					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
+					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
+					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
+					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
+					GEN_INT (0xC), GEN_INT (0xD),
+					GEN_INT (0xE), GEN_INT (0xF),
+					GEN_INT (0x10), GEN_INT (0x11),
+					GEN_INT (0x12), GEN_INT (0x13),
+					GEN_INT (0x14), GEN_INT (0x15),
+					GEN_INT (0x16), GEN_INT (0x17));
       else
 	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
-				   gen_lowpart (V16SImode, src),
-				   GEN_INT (i == 128 ? 0x2 : 0x1),
-				   GEN_INT (0x3),
-				   GEN_INT (0x3),
-				   GEN_INT (0x3),
-				   GEN_INT (i == 128 ? 0x6 : 0x5),
-				   GEN_INT (0x7),
-				   GEN_INT (0x7),
-				   GEN_INT (0x7),
-				   GEN_INT (i == 128 ? 0xA : 0x9),
-				   GEN_INT (0xB),
-				   GEN_INT (0xB),
-				   GEN_INT (0xB),
-				   GEN_INT (i == 128 ? 0xE : 0xD),
-				   GEN_INT (0xF),
-				   GEN_INT (0xF),
-				   GEN_INT (0xF));
+				    gen_lowpart (V16SImode, src),
+				    GEN_INT (i == 128 ? 0x2 : 0x1),
+				    GEN_INT (0x3),
+				    GEN_INT (0x3),
+				    GEN_INT (0x3),
+				    GEN_INT (i == 128 ? 0x6 : 0x5),
+				    GEN_INT (0x7),
+				    GEN_INT (0x7),
+				    GEN_INT (0x7),
+				    GEN_INT (i == 128 ? 0xA : 0x9),
+				    GEN_INT (0xB),
+				    GEN_INT (0xB),
+				    GEN_INT (0xB),
+				    GEN_INT (i == 128 ? 0xE : 0xD),
+				    GEN_INT (0xF),
+				    GEN_INT (0xF),
+				    GEN_INT (0xF));
       break;
     default:
       gcc_unreachable ();
--- gcc/testsuite/gcc.target/i386/avx512bw-pr94500.c.jj	2020-04-06 17:24:42.246904934 +0200
+++ gcc/testsuite/gcc.target/i386/avx512bw-pr94500.c	2020-04-06 17:26:03.721687840 +0200
@@ -0,0 +1,28 @@ 
+/* PR target/94500 */
+/* { dg-do run { target avx512bw } } */
+/* { dg-options "-O3 -mavx512bw -mprefer-vector-width=512" } */
+
+#define AVX512BW
+#include "avx512f-helper.h"
+
+__attribute__((noipa)) signed char
+foo (signed char *p)
+{
+  signed char r = 0;
+  int i;
+  for (i = 0; i < 256; i++)
+    if (p[i] > r) r = p[i];
+  return r;
+}
+
+signed char buf[256];
+
+static void
+TEST (void)
+{
+  int i;
+  for (i = 0; i < 256; i++)
+    buf[i] = i - 128;
+  if (foo (buf) != 127)
+    abort ();
+}