Patchwork Loop distribution improvements

login
register
mail settings
Submitter Jakub Jelinek
Date April 5, 2013, 1:38 p.m.
Message ID <20130405133844.GF20334@tucnak.redhat.com>
Download mbox | patch
Permalink /patch/234140/
State New
Headers show

Comments

Jakub Jelinek - April 5, 2013, 1:38 p.m.
On Fri, Apr 05, 2013 at 12:46:48PM +0200, Richard Biener wrote:
> >BTW, the integer_all_onesp stuff is broken for this from what I can
> >see, for complex numbers it returns true for -1 + 0i where not all
> >bytes are 0xff, so we need to rule out COMPLEX_CSTs (or do
> >integer_all_onesp on each part instead).
> >And TYPE_PRECISION on VECTOR_CSTs won't be what we are looking for.
> 
> Hmm, indeed.  Or remove the -1 special casing altogether.

Ok, the zero/CONSTRUCTOR checks are moved into the new function and the
all_onesp handling is removed (so stores of all-ones values will now be
optimized only on CHAR_BIT == 8 hosts with BITS_PER_UNIT == 8 targets).
Ok for trunk?

> Marc is probably right with his note as well.

I'll defer that to Marc ;)

2013-04-05  Jakub Jelinek  <jakub@redhat.com>

	* tree-loop-distribution.c (const_with_all_bytes_same): New function.
	(generate_memset_builtin): Remove the special handling of zero,
	CONSTRUCTOR and integer_all_onesp values.  Use
	const_with_all_bytes_same if possible to compute val.
	(classify_partition): Verify CONSTRUCTOR doesn't have any elts.
	For QImode integers don't require anything about precision.  Use
	const_with_all_bytes_same to find out if the constant has the same
	value in all its bytes.

	* gcc.dg/pr56837.c: New test.



	Jakub
Richard Guenther - April 5, 2013, 5:57 p.m.
Jakub Jelinek <jakub@redhat.com> wrote:

>On Fri, Apr 05, 2013 at 12:46:48PM +0200, Richard Biener wrote:
>> >BTW, the integer_all_onesp stuff is broken for this from what I can
>> >see, for complex numbers it returns true for -1 + 0i where not all
>> >bytes are 0xff, so we need to rule out COMPLEX_CSTs (or do
>> >integer_all_onesp on each part instead).
>> >And TYPE_PRECISION on VECTOR_CSTs won't be what we are looking for.
>> 
>> Hmm, indeed.  Or remove the -1 special casing altogether.
>
>Ok, zero/CONSTRUCTOR moved into the function, all_onesp handling removed
>(so only on the CHAR_BIT == 8 hosts and BITS_PER_UNIT == 8 targets it will
>be optimized).  Ok for trunk?


Ok.

Thanks,
Richard.

>> Marc is probably right with his note as well.
>
>I'll defer that to Marc ;)
>
>2013-04-05  Jakub Jelinek  <jakub@redhat.com>
>
>	* tree-loop-distribution.c (const_with_all_bytes_same): New function.
>	(generate_memset_builtin): Only handle integer_all_onesp as -1 val if
>	TYPE_PRECISION is equal to mode bitsize.  Use const_with_all_bytes_same
>	if possible to compute val.
>	(classify_partition): Verify CONSTRUCTOR doesn't have any elts.
>	For QImode integers don't require anything about precision.  Use
>	const_with_all_bytes_same to find out if the constant doesn't have
>	repeated bytes in it.
>
>	* gcc.dg/pr56837.c: New test.
>
>--- gcc/tree-loop-distribution.c.jj	2013-04-04 15:03:28.000000000 +0200
>+++ gcc/tree-loop-distribution.c	2013-04-05 15:21:10.641668895 +0200
>@@ -297,6 +297,36 @@ build_addr_arg_loc (location_t loc, data
>   return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
> }
> 
>+/* If VAL memory representation contains the same value in all bytes,
>+   return that value, otherwise return -1.
>+   E.g. for 0x24242424 return 0x24, for IEEE double
>+   747708026454360457216.0 return 0x44, etc.  */
>+
>+static int
>+const_with_all_bytes_same (tree val)
>+{
>+  unsigned char buf[64];
>+  int i, len;
>+
>+  if (integer_zerop (val)
>+      || real_zerop (val)
>+      || (TREE_CODE (val) == CONSTRUCTOR
>+          && !TREE_CLOBBER_P (val)
>+          && CONSTRUCTOR_NELTS (val) == 0))
>+    return 0;
>+
>+  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
>+    return -1;
>+
>+  len = native_encode_expr (val, buf, sizeof (buf));
>+  if (len == 0)
>+    return -1;
>+  for (i = 1; i < len; i++)
>+    if (buf[i] != buf[0])
>+      return -1;
>+  return buf[0];
>+}
>+
> /* Generate a call to memset for PARTITION in LOOP.  */
> 
> static void
>@@ -327,24 +357,20 @@ generate_memset_builtin (struct loop *lo
> 
>   /* This exactly matches the pattern recognition in classify_partition.  */
>   val = gimple_assign_rhs1 (stmt);
>-  if (integer_zerop (val)
>-      || real_zerop (val)
>-      || TREE_CODE (val) == CONSTRUCTOR)
>-    val = integer_zero_node;
>-  else if (integer_all_onesp (val))
>-    val = build_int_cst (integer_type_node, -1);
>-  else
>-    {
>-      if (TREE_CODE (val) == INTEGER_CST)
>-	val = fold_convert (integer_type_node, val);
>-      else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
>-	{
>-	  gimple cstmt;
>-	  tree tem = make_ssa_name (integer_type_node, NULL);
>-	  cstmt = gimple_build_assign_with_ops (NOP_EXPR, tem, val, NULL_TREE);
>-	  gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
>-	  val = tem;
>-	}
>+  /* Handle constants like 0x15151515 and similarly
>+     floating point constants etc. where all bytes are the same.  */
>+  int bytev = const_with_all_bytes_same (val);
>+  if (bytev != -1)
>+    val = build_int_cst (integer_type_node, bytev);
>+  else if (TREE_CODE (val) == INTEGER_CST)
>+    val = fold_convert (integer_type_node, val);
>+  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
>+    {
>+      gimple cstmt;
>+      tree tem = make_ssa_name (integer_type_node, NULL);
>+      cstmt = gimple_build_assign_with_ops (NOP_EXPR, tem, val, NULL_TREE);
>+      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
>+      val = tem;
>     }
> 
>   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
>@@ -354,10 +380,8 @@ generate_memset_builtin (struct loop *lo
>   if (dump_file && (dump_flags & TDF_DETAILS))
>     {
>       fprintf (dump_file, "generated memset");
>-      if (integer_zerop (val))
>+      if (bytev == 0)
> 	fprintf (dump_file, " zero\n");
>-      else if (integer_all_onesp (val))
>-	fprintf (dump_file, " minus one\n");
>       else
> 	fprintf (dump_file, "\n");
>     }
>@@ -941,18 +965,10 @@ classify_partition (loop_p loop, struct
>     {
>       gimple stmt = DR_STMT (single_store);
>       tree rhs = gimple_assign_rhs1 (stmt);
>-      if (!(integer_zerop (rhs)
>-	    || real_zerop (rhs)
>-	    || (TREE_CODE (rhs) == CONSTRUCTOR
>-		&& !TREE_CLOBBER_P (rhs))
>-	    || ((integer_all_onesp (rhs)
>-		 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
>-		     && (TYPE_MODE (TREE_TYPE (rhs))
>-			 == TYPE_MODE (unsigned_char_type_node))))
>-		/* For stores of a non-zero value require that the precision
>-		   of the value matches its actual size.  */
>-		&& (TYPE_PRECISION (TREE_TYPE (rhs))
>-		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))))
>+      if (const_with_all_bytes_same (rhs) == -1
>+	  && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
>+	      || (TYPE_MODE (TREE_TYPE (rhs))
>+		  != TYPE_MODE (unsigned_char_type_node))))
> 	return;
>       if (TREE_CODE (rhs) == SSA_NAME
> 	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
>--- gcc/testsuite/gcc.dg/pr56837.c.jj	2013-04-04 17:37:58.458675152 +0200
>+++ gcc/testsuite/gcc.dg/pr56837.c	2013-04-04 17:36:40.000000000 +0200
>@@ -0,0 +1,67 @@
>+/* Limit this test to selected targets with IEEE double, 8-byte long long,
>+   supported 4x int vectors, 4-byte int.  */
>+/* { dg-do compile { target { i?86-*-* x86_64-*-* powerpc*-*-* } } } */
>+/* { dg-options "-O3 -fdump-tree-optimized" } */
>+/* { dg-additional-options "-msse2" { target ia32 } } */
>+/* { dg-additional-options "-mvsx -maltivec" { target powerpc*-*-* } } */
>+
>+typedef int V __attribute__((__vector_size__ (16)));
>+#define N 1024
>+double d[N];
>+long long int l[N];
>+_Bool b[N];
>+_Complex double c[N];
>+V v[N];
>+
>+void
>+fd (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    d[i] = 747708026454360457216.0;
>+}
>+
>+void
>+fl (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    l[i] = 0x7c7c7c7c7c7c7c7cULL;
>+}
>+
>+void
>+fb (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    b[i] = 1;
>+}
>+
>+void
>+fc (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    c[i] = 747708026454360457216.0 + 747708026454360457216.0i;
>+}
>+
>+void
>+fv (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    v[i] = (V) { 0x12121212, 0x12121212, 0x12121212, 0x12121212 };
>+}
>+
>+/* Look for
>+  __builtin_memset (&d, 68, 8192);
>+  __builtin_memset (&l, 124, 8192);
>+  __builtin_memset (&b, 1, 1024);
>+  __builtin_memset (&c, 68, 16384);
>+  __builtin_memset (&v, 18, 16384); */
>+/* { dg-final { scan-tree-dump-times "memset ..d, 68, 8192.;" 1 "optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..l, 124, 8192.;" 1 "optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..b, 1, 1024.;" 1 "optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..c, 68, 16384.;" 1 "optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..v, 18, 16384.;" 1 "optimized" } } */
>+/* { dg-final { cleanup-tree-dump "optimized" } } */
>
>
>	Jakub

Patch

--- gcc/tree-loop-distribution.c.jj	2013-04-04 15:03:28.000000000 +0200
+++ gcc/tree-loop-distribution.c	2013-04-05 15:21:10.641668895 +0200
@@ -297,6 +297,36 @@  build_addr_arg_loc (location_t loc, data
   return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
 }
 
+/* If VAL memory representation contains the same value in all bytes,
+   return that value, otherwise return -1.
+   E.g. for 0x24242424 return 0x24, for IEEE double
+   747708026454360457216.0 return 0x44, etc.  */
+
+static int
+const_with_all_bytes_same (tree val)
+{
+  unsigned char buf[64];
+  int i, len;
+
+  if (integer_zerop (val)
+      || real_zerop (val)
+      || (TREE_CODE (val) == CONSTRUCTOR
+          && !TREE_CLOBBER_P (val)
+          && CONSTRUCTOR_NELTS (val) == 0))
+    return 0;
+
+  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
+    return -1;
+
+  len = native_encode_expr (val, buf, sizeof (buf));
+  if (len == 0)
+    return -1;
+  for (i = 1; i < len; i++)
+    if (buf[i] != buf[0])
+      return -1;
+  return buf[0];
+}
+
 /* Generate a call to memset for PARTITION in LOOP.  */
 
 static void
@@ -327,24 +357,20 @@  generate_memset_builtin (struct loop *lo
 
   /* This exactly matches the pattern recognition in classify_partition.  */
   val = gimple_assign_rhs1 (stmt);
-  if (integer_zerop (val)
-      || real_zerop (val)
-      || TREE_CODE (val) == CONSTRUCTOR)
-    val = integer_zero_node;
-  else if (integer_all_onesp (val))
-    val = build_int_cst (integer_type_node, -1);
-  else
-    {
-      if (TREE_CODE (val) == INTEGER_CST)
-	val = fold_convert (integer_type_node, val);
-      else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
-	{
-	  gimple cstmt;
-	  tree tem = make_ssa_name (integer_type_node, NULL);
-	  cstmt = gimple_build_assign_with_ops (NOP_EXPR, tem, val, NULL_TREE);
-	  gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
-	  val = tem;
-	}
+  /* Handle constants like 0x15151515 and similarly
+     floating point constants etc. where all bytes are the same.  */
+  int bytev = const_with_all_bytes_same (val);
+  if (bytev != -1)
+    val = build_int_cst (integer_type_node, bytev);
+  else if (TREE_CODE (val) == INTEGER_CST)
+    val = fold_convert (integer_type_node, val);
+  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
+    {
+      gimple cstmt;
+      tree tem = make_ssa_name (integer_type_node, NULL);
+      cstmt = gimple_build_assign_with_ops (NOP_EXPR, tem, val, NULL_TREE);
+      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
+      val = tem;
     }
 
   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
@@ -354,10 +380,8 @@  generate_memset_builtin (struct loop *lo
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
       fprintf (dump_file, "generated memset");
-      if (integer_zerop (val))
+      if (bytev == 0)
 	fprintf (dump_file, " zero\n");
-      else if (integer_all_onesp (val))
-	fprintf (dump_file, " minus one\n");
       else
 	fprintf (dump_file, "\n");
     }
@@ -941,18 +965,10 @@  classify_partition (loop_p loop, struct
     {
       gimple stmt = DR_STMT (single_store);
       tree rhs = gimple_assign_rhs1 (stmt);
-      if (!(integer_zerop (rhs)
-	    || real_zerop (rhs)
-	    || (TREE_CODE (rhs) == CONSTRUCTOR
-		&& !TREE_CLOBBER_P (rhs))
-	    || ((integer_all_onesp (rhs)
-		 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
-		     && (TYPE_MODE (TREE_TYPE (rhs))
-			 == TYPE_MODE (unsigned_char_type_node))))
-		/* For stores of a non-zero value require that the precision
-		   of the value matches its actual size.  */
-		&& (TYPE_PRECISION (TREE_TYPE (rhs))
-		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))))
+      if (const_with_all_bytes_same (rhs) == -1
+	  && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
+	      || (TYPE_MODE (TREE_TYPE (rhs))
+		  != TYPE_MODE (unsigned_char_type_node))))
 	return;
       if (TREE_CODE (rhs) == SSA_NAME
 	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
--- gcc/testsuite/gcc.dg/pr56837.c.jj	2013-04-04 17:37:58.458675152 +0200
+++ gcc/testsuite/gcc.dg/pr56837.c	2013-04-04 17:36:40.000000000 +0200
@@ -0,0 +1,67 @@ 
+/* Limit this test to selected targets with IEEE double, 8-byte long long,
+   supported 4x int vectors, 4-byte int.  */
+/* { dg-do compile { target { i?86-*-* x86_64-*-* powerpc*-*-* } } } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-additional-options "-msse2" { target ia32 } } */
+/* { dg-additional-options "-mvsx -maltivec" { target powerpc*-*-* } } */
+
+typedef int V __attribute__((__vector_size__ (16)));
+#define N 1024
+double d[N];
+long long int l[N];
+_Bool b[N];
+_Complex double c[N];
+V v[N];
+
+void
+fd (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    d[i] = 747708026454360457216.0;
+}
+
+void
+fl (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    l[i] = 0x7c7c7c7c7c7c7c7cULL;
+}
+
+void
+fb (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    b[i] = 1;
+}
+
+void
+fc (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    c[i] = 747708026454360457216.0 + 747708026454360457216.0i;
+}
+
+void
+fv (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    v[i] = (V) { 0x12121212, 0x12121212, 0x12121212, 0x12121212 };
+}
+
+/* Look for
+  __builtin_memset (&d, 68, 8192);
+  __builtin_memset (&l, 124, 8192);
+  __builtin_memset (&b, 1, 1024);
+  __builtin_memset (&c, 68, 16384);
+  __builtin_memset (&v, 18, 16384); */
+/* { dg-final { scan-tree-dump-times "memset ..d, 68, 8192.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..l, 124, 8192.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..b, 1, 1024.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..c, 68, 16384.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..v, 18, 16384.;" 1 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */