Patchwork Loop distribution improvements

login
register
mail settings
Submitter Jakub Jelinek
Date April 4, 2013, 6:17 p.m.
Message ID <20130404181758.GV4201@tucnak.redhat.com>
Download mbox | patch
Permalink /patch/233893/
State New
Headers show

Comments

Jakub Jelinek - April 4, 2013, 6:17 p.m.
Hi!

As discussed on IRC, this patch allows us to recognize more patterns as
memset, see the testcase for what it can do.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2013-04-04  Jakub Jelinek  <jakub@redhat.com>

	* tree-loop-distribution.c (generate_memset_builtin): Only handle
	integer_all_onesp as -1 val if TYPE_PRECISION is equal to mode bitsize.
	Use native_encode_expr if possible to compute val.
	(classify_partition): Verify CONSTRUCTOR doesn't have any elts.
	For QImode integers don't require anything about precision.  Use
	native_encode_expr to find out if the constant doesn't have repeated
	bytes in it.

	* gcc.dg/pr56837.c: New test.


	Jakub
Richard Guenther - April 4, 2013, 6:37 p.m.
Jakub Jelinek <jakub@redhat.com> wrote:

>Hi!
>
>As discussed on IRC, this patch allows us to recognize more patterns as
>memset, see the testcase for what it can do.
>
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Can you factor out a function that returns
a proper QImode value if possible, or NULL, and
use it in both places?

Thanks,
Richard.

>2013-04-04  Jakub Jelinek  <jakub@redhat.com>
>
>	* tree-loop-distribution.c (generate_memset_builtin): Only handle
>	integer_all_onesp as -1 val if TYPE_PRECISION is equal to mode bitsize.
>	Use native_encode_expr if possible to compute val.
>	(classify_partition): Verify CONSTRUCTOR doesn't have any elts.
>	For QImode integers don't require anything about precision.  Use
>	native_encode_expr to find out if the constant doesn't have repeated
>	bytes in it.
>
>	* gcc.dg/pr56837.c: New test.
>
>--- gcc/tree-loop-distribution.c.jj	2013-04-04 15:03:28.000000000 +0200
>+++ gcc/tree-loop-distribution.c	2013-04-04 16:52:40.139875453 +0200
>@@ -331,11 +331,21 @@ generate_memset_builtin (struct loop *lo
>       || real_zerop (val)
>       || TREE_CODE (val) == CONSTRUCTOR)
>     val = integer_zero_node;
>-  else if (integer_all_onesp (val))
>+  else if (integer_all_onesp (val)
>+	   && (TYPE_PRECISION (TREE_TYPE (val))
>+	       == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (val)))))
>     val = build_int_cst (integer_type_node, -1);
>   else
>     {
>-      if (TREE_CODE (val) == INTEGER_CST)
>+      /* Handle constants like 0x15151515 and similarly
>+	 floating point constants etc. where all bytes are the same.  */
>+      unsigned char buf[64];
>+      int len;
>+      if (CHAR_BIT == 8
>+	  && BITS_PER_UNIT == 8
>+	  && (len = native_encode_expr (val, buf, sizeof (buf))) != 0)
>+	val = build_int_cst (integer_type_node, buf[0]);
>+      else if (TREE_CODE (val) == INTEGER_CST)
> 	val = fold_convert (integer_type_node, val);
>       else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
> 	{
>@@ -944,16 +954,28 @@ classify_partition (loop_p loop, struct
>       if (!(integer_zerop (rhs)
> 	    || real_zerop (rhs)
> 	    || (TREE_CODE (rhs) == CONSTRUCTOR
>-		&& !TREE_CLOBBER_P (rhs))
>-	    || ((integer_all_onesp (rhs)
>-		 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
>-		     && (TYPE_MODE (TREE_TYPE (rhs))
>-			 == TYPE_MODE (unsigned_char_type_node))))
>-		/* For stores of a non-zero value require that the precision
>-		   of the value matches its actual size.  */
>+		&& !TREE_CLOBBER_P (rhs)
>+		&& CONSTRUCTOR_NELTS (rhs) == 0)
>+	    || (integer_all_onesp (rhs)
> 		&& (TYPE_PRECISION (TREE_TYPE (rhs))
>-		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))))
>-	return;
>+		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))
>+	    || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
>+		&& (TYPE_MODE (TREE_TYPE (rhs))
>+		    == TYPE_MODE (unsigned_char_type_node)))))
>+	{
>+	  /* Handle constants like 0x15151515 and similarly
>+	     floating point constants etc. where all bytes are the same.  */
>+	  unsigned char buf[64];
>+	  int i, len;
>+	  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
>+	    return;
>+	  len = native_encode_expr (rhs, buf, sizeof (buf));
>+	  if (len == 0)
>+	    return;
>+	  for (i = 1; i < len; i++)
>+	    if (buf[i] != buf[0])
>+	      return;
>+	}
>       if (TREE_CODE (rhs) == SSA_NAME
> 	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
> 	  && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
>--- gcc/testsuite/gcc.dg/pr56837.c.jj	2013-04-04 17:37:58.458675152 +0200
>+++ gcc/testsuite/gcc.dg/pr56837.c	2013-04-04 17:36:40.000000000 +0200
>@@ -0,0 +1,67 @@
>+/* Limit this test to selected targets with IEEE double, 8-byte long long,
>+   supported 4x int vectors, 4-byte int.  */
>+/* { dg-do compile { target { i?86-*-* x86_64-*-* powerpc*-*-* } } } */
>+/* { dg-options "-O3 -fdump-tree-optimized" } */
>+/* { dg-additional-options "-msse2" { target ia32 } } */
>+/* { dg-additional-options "-mvsx -maltivec" { target powerpc*-*-* } } */
>+
>+typedef int V __attribute__((__vector_size__ (16)));
>+#define N 1024
>+double d[N];
>+long long int l[N];
>+_Bool b[N];
>+_Complex double c[N];
>+V v[N];
>+
>+void
>+fd (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    d[i] = 747708026454360457216.0;
>+}
>+
>+void
>+fl (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    l[i] = 0x7c7c7c7c7c7c7c7cULL;
>+}
>+
>+void
>+fb (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    b[i] = 1;
>+}
>+
>+void
>+fc (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    c[i] = 747708026454360457216.0 + 747708026454360457216.0i;
>+}
>+
>+void
>+fv (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    v[i] = (V) { 0x12121212, 0x12121212, 0x12121212, 0x12121212 };
>+}
>+
>+/* Look for
>+  __builtin_memset (&d, 68, 8192);
>+  __builtin_memset (&l, 124, 8192);
>+  __builtin_memset (&b, 1, 1024);
>+  __builtin_memset (&c, 68, 16384);
>+  __builtin_memset (&v, 18, 16384); */
>+/* { dg-final { scan-tree-dump-times "memset ..d, 68, 8192.;" 1 "optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..l, 124, 8192.;" 1 "optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..b, 1, 1024.;" 1 "optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..c, 68, 16384.;" 1 "optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..v, 18, 16384.;" 1 "optimized" } } */
>+/* { dg-final { cleanup-tree-dump "optimized" } } */
>
>	Jakub

Patch

--- gcc/tree-loop-distribution.c.jj	2013-04-04 15:03:28.000000000 +0200
+++ gcc/tree-loop-distribution.c	2013-04-04 16:52:40.139875453 +0200
@@ -331,11 +331,21 @@  generate_memset_builtin (struct loop *lo
       || real_zerop (val)
       || TREE_CODE (val) == CONSTRUCTOR)
     val = integer_zero_node;
-  else if (integer_all_onesp (val))
+  else if (integer_all_onesp (val)
+	   && (TYPE_PRECISION (TREE_TYPE (val))
+	       == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (val)))))
     val = build_int_cst (integer_type_node, -1);
   else
     {
-      if (TREE_CODE (val) == INTEGER_CST)
+      /* Handle constants like 0x15151515 and similarly
+	 floating point constants etc. where all bytes are the same.  */
+      unsigned char buf[64];
+      int len;
+      if (CHAR_BIT == 8
+	  && BITS_PER_UNIT == 8
+	  && (len = native_encode_expr (val, buf, sizeof (buf))) != 0)
+	val = build_int_cst (integer_type_node, buf[0]);
+      else if (TREE_CODE (val) == INTEGER_CST)
 	val = fold_convert (integer_type_node, val);
       else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
 	{
@@ -944,16 +954,28 @@  classify_partition (loop_p loop, struct
       if (!(integer_zerop (rhs)
 	    || real_zerop (rhs)
 	    || (TREE_CODE (rhs) == CONSTRUCTOR
-		&& !TREE_CLOBBER_P (rhs))
-	    || ((integer_all_onesp (rhs)
-		 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
-		     && (TYPE_MODE (TREE_TYPE (rhs))
-			 == TYPE_MODE (unsigned_char_type_node))))
-		/* For stores of a non-zero value require that the precision
-		   of the value matches its actual size.  */
+		&& !TREE_CLOBBER_P (rhs)
+		&& CONSTRUCTOR_NELTS (rhs) == 0)
+	    || (integer_all_onesp (rhs)
 		&& (TYPE_PRECISION (TREE_TYPE (rhs))
-		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))))
-	return;
+		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))
+	    || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
+		&& (TYPE_MODE (TREE_TYPE (rhs))
+		    == TYPE_MODE (unsigned_char_type_node)))))
+	{
+	  /* Handle constants like 0x15151515 and similarly
+	     floating point constants etc. where all bytes are the same.  */
+	  unsigned char buf[64];
+	  int i, len;
+	  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
+	    return;
+	  len = native_encode_expr (rhs, buf, sizeof (buf));
+	  if (len == 0)
+	    return;
+	  for (i = 1; i < len; i++)
+	    if (buf[i] != buf[0])
+	      return;
+	}
       if (TREE_CODE (rhs) == SSA_NAME
 	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
 	  && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
--- gcc/testsuite/gcc.dg/pr56837.c.jj	2013-04-04 17:37:58.458675152 +0200
+++ gcc/testsuite/gcc.dg/pr56837.c	2013-04-04 17:36:40.000000000 +0200
@@ -0,0 +1,67 @@ 
+/* Limit this test to selected targets with IEEE double, 8-byte long long,
+   supported 4x int vectors, 4-byte int.  */
+/* { dg-do compile { target { i?86-*-* x86_64-*-* powerpc*-*-* } } } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-additional-options "-msse2" { target ia32 } } */
+/* { dg-additional-options "-mvsx -maltivec" { target powerpc*-*-* } } */
+
+typedef int V __attribute__((__vector_size__ (16)));
+#define N 1024
+double d[N];
+long long int l[N];
+_Bool b[N];
+_Complex double c[N];
+V v[N];
+
+void
+fd (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    d[i] = 747708026454360457216.0;
+}
+
+void
+fl (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    l[i] = 0x7c7c7c7c7c7c7c7cULL;
+}
+
+void
+fb (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    b[i] = 1;
+}
+
+void
+fc (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    c[i] = 747708026454360457216.0 + 747708026454360457216.0i;
+}
+
+void
+fv (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    v[i] = (V) { 0x12121212, 0x12121212, 0x12121212, 0x12121212 };
+}
+
+/* Look for
+  __builtin_memset (&d, 68, 8192);
+  __builtin_memset (&l, 124, 8192);
+  __builtin_memset (&b, 1, 1024);
+  __builtin_memset (&c, 68, 16384);
+  __builtin_memset (&v, 18, 16384); */
+/* { dg-final { scan-tree-dump-times "memset ..d, 68, 8192.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..l, 124, 8192.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..b, 1, 1024.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..c, 68, 16384.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..v, 18, 16384.;" 1 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */