diff mbox series

Limit simplify_merge_mask optimization to cases where it can't trap (PR rtl-optimization/89445)

Message ID 20190223002746.GM7611@tucnak
State New
Headers show
Series Limit simplify_merge_mask optimization to cases where it can't trap (PR rtl-optimization/89445) | expand

Commit Message

Jakub Jelinek Feb. 23, 2019, 12:27 a.m. UTC
Hi!

The following testcase is miscompiled on x86_64.  The problem is that
simplify_merge_mask optimization throws away an inner VEC_MERGE when there
is an outer one with the same mask.  This can be done only if the change
doesn't have observable side-effects.  The code already uses side_effects_p
tests in various spots, that is needed, but as this testcase shows, not
sufficient.  Another issue is if there is a MEM load or store and not
MEM_NOTRAP_P, as the testcase shows.  And another problem can be vector
integer division by zero (I think only mips has such insn), or various
floating point operations if we care about floating point exceptions.

While fixing this, I've found that may_trap_p_1 doesn't really support
vector operations very much, vector floating point arithmetics can cause
exceptions like scalar floating point arithmetics; on the other side, the
4 VEC_* codes can't trap themselves, though their operands could.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-02-23  Jakub Jelinek  <jakub@redhat.com>

	PR rtl-optimization/89445
	* simplify-rtx.c (simplify_ternary_operation): Don't use
	simplify_merge_mask on operands that may trap.
	* rtlanal.c (may_trap_p_1): Use FLOAT_MODE_P instead of
	SCALAR_FLOAT_MODE_P checks.  For integral division by zero, if
	second operand is CONST_VECTOR, check if any element could be zero.
	Don't expect traps for VEC_{MERGE,SELECT,CONCAT,DUPLICATE} unless
	their operands can trap.

	* gcc.target/i386/avx512f-pr89445.c: New test.


	Jakub

Comments

Richard Biener Feb. 24, 2019, 6:53 p.m. UTC | #1
On February 23, 2019 1:27:46 AM GMT+01:00, Jakub Jelinek <jakub@redhat.com> wrote:
>Hi!
>
>The following testcase is miscompiled on x86_64.  The problem is that
>simplify_merge_mask optimization throws away an inner VEC_MERGE when
>there
>is an outer one with the same mask.  This can be done only if the
>change
>doesn't have observable side-effects.  The code already uses
>side_effects_p
>tests in various spots, that is needed, but as this testcase shows, not
>sufficient.  Another issue is if there is a MEM load or store and not
>MEM_NOTRAP_P, as the testcase shows.  And another problem can be vector
>integer division by zero (I think only mips has such insn), or various
>floating point operations if we care about floating point exceptions.
>
>While fixing this, I've found that may_trap_p_1 doesn't really support
>vector operations very much, vector floating point arithmetics can
>cause
>exceptions like scalar floating point arithmetics; on the other side,
>the
>4 VEC_* codes can't trap themselves, though their operands could.
>
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK. 

Richard. 

>2019-02-23  Jakub Jelinek  <jakub@redhat.com>
>
>	PR rtl-optimization/89445
>	* simplify-rtx.c (simplify_ternary_operation): Don't use
>	simplify_merge_mask on operands that may trap.
>	* rtlanal.c (may_trap_p_1): Use FLOAT_MODE_P instead of
>	SCALAR_FLOAT_MODE_P checks.  For integral division by zero, if
>	second operand is CONST_VECTOR, check if any element could be zero.
>	Don't expect traps for VEC_{MERGE,SELECT,CONCAT,DUPLICATE} unless
>	their operands can trap.
>
>	* gcc.target/i386/avx512f-pr89445.c: New test.
>
>--- gcc/simplify-rtx.c.jj	2019-01-10 11:43:14.390377646 +0100
>+++ gcc/simplify-rtx.c	2019-02-22 19:01:08.977661098 +0100
>@@ -6073,8 +6073,10 @@ simplify_ternary_operation (enum rtx_cod
> 
>       if (!side_effects_p (op2))
> 	{
>-	  rtx top0 = simplify_merge_mask (op0, op2, 0);
>-	  rtx top1 = simplify_merge_mask (op1, op2, 1);
>+	  rtx top0
>+	    = may_trap_p (op0) ? NULL_RTX : simplify_merge_mask (op0, op2,
>0);
>+	  rtx top1
>+	    = may_trap_p (op1) ? NULL_RTX : simplify_merge_mask (op1, op2,
>1);
> 	  if (top0 || top1)
> 	    return simplify_gen_ternary (code, mode, mode,
> 					 top0 ? top0 : op0,
>--- gcc/rtlanal.c.jj	2019-02-20 10:00:49.279492877 +0100
>+++ gcc/rtlanal.c	2019-02-22 19:03:02.478790634 +0100
>@@ -2846,10 +2846,28 @@ may_trap_p_1 (const_rtx x, unsigned flag
>     case UMOD:
>       if (HONOR_SNANS (x))
> 	return 1;
>-      if (SCALAR_FLOAT_MODE_P (GET_MODE (x)))
>+      if (FLOAT_MODE_P (GET_MODE (x)))
> 	return flag_trapping_math;
>       if (!CONSTANT_P (XEXP (x, 1)) || (XEXP (x, 1) == const0_rtx))
> 	return 1;
>+      if (GET_CODE (XEXP (x, 1)) == CONST_VECTOR)
>+	{
>+	  /* For CONST_VECTOR, return 1 if any element is or might be zero. 
>*/
>+	  unsigned int n_elts;
>+	  rtx op = XEXP (x, 1);
>+	  if (!GET_MODE_NUNITS (GET_MODE (op)).is_constant (&n_elts))
>+	    {
>+	      if (!CONST_VECTOR_DUPLICATE_P (op))
>+		return 1;
>+	      for (unsigned i = 0; i < (unsigned int) XVECLEN (op, 0); i++)
>+		if (CONST_VECTOR_ENCODED_ELT (op, i) == const0_rtx)
>+		  return 1;
>+	    }
>+	  else
>+	    for (unsigned i = 0; i < n_elts; i++)
>+	      if (CONST_VECTOR_ELT (op, i) == const0_rtx)
>+		return 1;
>+	}
>       break;
> 
>     case EXPR_LIST:
>@@ -2898,12 +2916,16 @@ may_trap_p_1 (const_rtx x, unsigned flag
>     case NEG:
>     case ABS:
>     case SUBREG:
>+    case VEC_MERGE:
>+    case VEC_SELECT:
>+    case VEC_CONCAT:
>+    case VEC_DUPLICATE:
>       /* These operations don't trap even with floating point.  */
>       break;
> 
>     default:
>       /* Any floating arithmetic may trap.  */
>-      if (SCALAR_FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math)
>+      if (FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math)
> 	return 1;
>     }
> 
>--- gcc/testsuite/gcc.target/i386/avx512f-pr89445.c.jj	2019-02-22
>19:19:17.709898754 +0100
>+++ gcc/testsuite/gcc.target/i386/avx512f-pr89445.c	2019-02-22
>19:18:58.115216531 +0100
>@@ -0,0 +1,54 @@
>+/* PR rtl-optimization/89445 */
>+/* { dg-do run { target { avx512f && mmap } } } */
>+/* { dg-options "-O2 -mavx512f" } */
>+
>+#include "avx512f-check.h"
>+
>+#include <sys/mman.h>
>+#ifndef MAP_ANONYMOUS
>+#define MAP_ANONYMOUS MAP_ANON
>+#endif
>+#ifndef MAP_ANON
>+#define MAP_ANON 0
>+#endif
>+#ifndef MAP_FAILED
>+#define MAP_FAILED ((void *)-1)
>+#endif
>+
>+__attribute__ ((noipa))
>+void daxpy (unsigned long n, double a, double const *__restrict x,
>+	    double *__restrict y)
>+{
>+  const __m512d v_a = _mm512_broadcastsd_pd (_mm_set_sd (a));
>+  const __mmask16 final = (1U << (n % 8u)) - 1;
>+  __mmask16 mask = 65535u;
>+  unsigned long i;
>+  for (i = 0; i < n * sizeof (double); i += 8 * sizeof (double))
>+    {
>+      if (i + 8 * sizeof (double) > n * sizeof (double))
>+	mask = final;
>+      __m512d v_x = _mm512_maskz_loadu_pd (mask, (char const *) x +
>i);
>+      __m512d v_y = _mm512_maskz_loadu_pd (mask, (char const *) y +
>i);
>+      __m512d tmp = _mm512_fmadd_pd (v_x, v_a, v_y);
>+      _mm512_mask_storeu_pd ((char *) y + i, mask, tmp);
>+    }
>+}
>+
>+static const double x[] = { 1, 2, 3, 4 };
>+
>+static void
>+avx512f_test (void)
>+{
>+  char *ptr
>+    = (char *) mmap (NULL, 2 * 4096, PROT_READ | PROT_WRITE,
>+		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
>+  if (ptr == MAP_FAILED)
>+    return;
>+
>+  munmap (ptr + 4096, 4096);
>+  double *y = (double *) (ptr + 4096 - sizeof (x));
>+  __builtin_memcpy (y, x, sizeof (x));
>+  daxpy (sizeof (x) / sizeof (x[0]), 1.0, x, y);
>+  if (y[0] != 2.0 || y[1] != 4.0 || y[2] != 6.0 || y[3] != 8.0)
>+    abort ();
>+}
>
>	Jakub
diff mbox series

Patch

--- gcc/simplify-rtx.c.jj	2019-01-10 11:43:14.390377646 +0100
+++ gcc/simplify-rtx.c	2019-02-22 19:01:08.977661098 +0100
@@ -6073,8 +6073,10 @@  simplify_ternary_operation (enum rtx_cod
 
       if (!side_effects_p (op2))
 	{
-	  rtx top0 = simplify_merge_mask (op0, op2, 0);
-	  rtx top1 = simplify_merge_mask (op1, op2, 1);
+	  rtx top0
+	    = may_trap_p (op0) ? NULL_RTX : simplify_merge_mask (op0, op2, 0);
+	  rtx top1
+	    = may_trap_p (op1) ? NULL_RTX : simplify_merge_mask (op1, op2, 1);
 	  if (top0 || top1)
 	    return simplify_gen_ternary (code, mode, mode,
 					 top0 ? top0 : op0,
--- gcc/rtlanal.c.jj	2019-02-20 10:00:49.279492877 +0100
+++ gcc/rtlanal.c	2019-02-22 19:03:02.478790634 +0100
@@ -2846,10 +2846,28 @@  may_trap_p_1 (const_rtx x, unsigned flag
     case UMOD:
       if (HONOR_SNANS (x))
 	return 1;
-      if (SCALAR_FLOAT_MODE_P (GET_MODE (x)))
+      if (FLOAT_MODE_P (GET_MODE (x)))
 	return flag_trapping_math;
       if (!CONSTANT_P (XEXP (x, 1)) || (XEXP (x, 1) == const0_rtx))
 	return 1;
+      if (GET_CODE (XEXP (x, 1)) == CONST_VECTOR)
+	{
+	  /* For CONST_VECTOR, return 1 if any element is or might be zero.  */
+	  unsigned int n_elts;
+	  rtx op = XEXP (x, 1);
+	  if (!GET_MODE_NUNITS (GET_MODE (op)).is_constant (&n_elts))
+	    {
+	      if (!CONST_VECTOR_DUPLICATE_P (op))
+		return 1;
+	      for (unsigned i = 0; i < (unsigned int) XVECLEN (op, 0); i++)
+		if (CONST_VECTOR_ENCODED_ELT (op, i) == const0_rtx)
+		  return 1;
+	    }
+	  else
+	    for (unsigned i = 0; i < n_elts; i++)
+	      if (CONST_VECTOR_ELT (op, i) == const0_rtx)
+		return 1;
+	}
       break;
 
     case EXPR_LIST:
@@ -2898,12 +2916,16 @@  may_trap_p_1 (const_rtx x, unsigned flag
     case NEG:
     case ABS:
     case SUBREG:
+    case VEC_MERGE:
+    case VEC_SELECT:
+    case VEC_CONCAT:
+    case VEC_DUPLICATE:
       /* These operations don't trap even with floating point.  */
       break;
 
     default:
       /* Any floating arithmetic may trap.  */
-      if (SCALAR_FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math)
+      if (FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math)
 	return 1;
     }
 
--- gcc/testsuite/gcc.target/i386/avx512f-pr89445.c.jj	2019-02-22 19:19:17.709898754 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr89445.c	2019-02-22 19:18:58.115216531 +0100
@@ -0,0 +1,54 @@ 
+/* PR rtl-optimization/89445 */
+/* { dg-do run { target { avx512f && mmap } } } */
+/* { dg-options "-O2 -mavx512f" } */
+
+#include "avx512f-check.h"
+
+#include <sys/mman.h>
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#ifndef MAP_ANON
+#define MAP_ANON 0
+#endif
+#ifndef MAP_FAILED
+#define MAP_FAILED ((void *)-1)
+#endif
+
+__attribute__ ((noipa))
+void daxpy (unsigned long n, double a, double const *__restrict x,
+	    double *__restrict y)
+{
+  const __m512d v_a = _mm512_broadcastsd_pd (_mm_set_sd (a));
+  const __mmask16 final = (1U << (n % 8u)) - 1;
+  __mmask16 mask = 65535u;
+  unsigned long i;
+  for (i = 0; i < n * sizeof (double); i += 8 * sizeof (double))
+    {
+      if (i + 8 * sizeof (double) > n * sizeof (double))
+	mask = final;
+      __m512d v_x = _mm512_maskz_loadu_pd (mask, (char const *) x + i);
+      __m512d v_y = _mm512_maskz_loadu_pd (mask, (char const *) y + i);
+      __m512d tmp = _mm512_fmadd_pd (v_x, v_a, v_y);
+      _mm512_mask_storeu_pd ((char *) y + i, mask, tmp);
+    }
+}
+
+static const double x[] = { 1, 2, 3, 4 };
+
+static void
+avx512f_test (void)
+{
+  char *ptr
+    = (char *) mmap (NULL, 2 * 4096, PROT_READ | PROT_WRITE,
+		     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (ptr == MAP_FAILED)
+    return;
+
+  munmap (ptr + 4096, 4096);
+  double *y = (double *) (ptr + 4096 - sizeof (x));
+  __builtin_memcpy (y, x, sizeof (x));
+  daxpy (sizeof (x) / sizeof (x[0]), 1.0, x, y);
+  if (y[0] != 2.0 || y[1] != 4.0 || y[2] != 6.0 || y[3] != 8.0)
+    abort ();
+}