Patchwork Loop distribution improvements

login
register
mail settings
Submitter Jakub Jelinek
Date April 4, 2013, 6:57 p.m.
Message ID <20130404185713.GW4201@tucnak.redhat.com>
Download mbox | patch
Permalink /patch/233898/
State New
Headers show

Comments

Jakub Jelinek - April 4, 2013, 6:57 p.m.
On Thu, Apr 04, 2013 at 08:37:47PM +0200, Richard Biener wrote:
> Can you factor out a function that returns
> A proper qimode value if possible or null and
> Use it in both places?

Like this?

2013-04-04  Jakub Jelinek  <jakub@redhat.com>

	* tree-loop-distribution.c (const_with_all_bytes_same): New function.
	(generate_memset_builtin): Only handle integer_all_onesp as -1 val if
	TYPE_PRECISION is equal to mode bitsize.  Use const_with_all_bytes_same
	if possible to compute val.
	(classify_partition): Verify CONSTRUCTOR doesn't have any elts.
	For QImode integers don't require anything about precision.  Use
	const_with_all_bytes_same to find out if the constant doesn't have
	repeated bytes in it.

	* gcc.dg/pr56837.c: New test.



	Jakub
Richard Guenther - April 5, 2013, 7:21 a.m.
Jakub Jelinek <jakub@redhat.com> wrote:

>On Thu, Apr 04, 2013 at 08:37:47PM +0200, Richard Biener wrote:
>> Can you factor out a function that returns
>> A proper qimode value if possible or null and
>> Use it in both places?
>
>Like this?

You should be able to remove zero, minus one and constructor special casing, no? Ok, maybe not constructor handling, but at least move handling of it to the function.

Richard.

>2013-04-04  Jakub Jelinek  <jakub@redhat.com>
>
>	* tree-loop-distribution.c (const_with_all_bytes_same): New function.
>	(generate_memset_builtin): Only handle integer_all_onesp as -1 val if
>	TYPE_PRECISION is equal to mode bitsize.  Use
>const_with_all_bytes_same
>	if possible to compute val.
>	(classify_partition): Verify CONSTRUCTOR doesn't have any elts.
>	For QImode integers don't require anything about precision.  Use
>	const_with_all_bytes_same to find out if the constant doesn't have
>	repeated bytes in it.
>
>	* gcc.dg/pr56837.c: New test.
>
>--- gcc/tree-loop-distribution.c.jj	2013-04-04 15:03:28.000000000 +0200
>+++ gcc/tree-loop-distribution.c	2013-04-04 20:49:14.295546543 +0200
>@@ -297,6 +297,27 @@ build_addr_arg_loc (location_t loc, data
>return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr),
>addr_base);
> }
> 
>+/* If VAL memory representation contains the same value in all bytes,
>+   return that value, otherwise return -1.
>+   E.g. for 0x24242424 return 0x24, for IEEE double
>+   747708026454360457216.0 return 0x44, etc.  */
>+
>+static int
>+const_with_all_bytes_same (tree val)
>+{
>+  unsigned char buf[64];
>+  int i, len;
>+  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
>+    return -1;
>+  len = native_encode_expr (val, buf, sizeof (buf));
>+  if (len == 0)
>+    return -1;
>+  for (i = 1; i < len; i++)
>+    if (buf[i] != buf[0])
>+      return -1;
>+  return buf[0];
>+}
>+
> /* Generate a call to memset for PARTITION in LOOP.  */
> 
> static void
>@@ -331,11 +352,18 @@ generate_memset_builtin (struct loop *lo
>       || real_zerop (val)
>       || TREE_CODE (val) == CONSTRUCTOR)
>     val = integer_zero_node;
>-  else if (integer_all_onesp (val))
>+  else if (integer_all_onesp (val)
>+	   && TYPE_PRECISION (TREE_TYPE (val))
>+	      == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (val))))
>     val = build_int_cst (integer_type_node, -1);
>   else
>     {
>-      if (TREE_CODE (val) == INTEGER_CST)
>+      /* Handle constants like 0x15151515 and similarly
>+	 floating point constants etc. where all bytes are the same.  */
>+      int bytev = const_with_all_bytes_same (val);
>+      if (bytev != -1)
>+	val = build_int_cst (integer_type_node, bytev);
>+      else if (TREE_CODE (val) == INTEGER_CST)
> 	val = fold_convert (integer_type_node, val);
>else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE
>(val)))
> 	{
>@@ -944,15 +972,15 @@ classify_partition (loop_p loop, struct
>       if (!(integer_zerop (rhs)
> 	    || real_zerop (rhs)
> 	    || (TREE_CODE (rhs) == CONSTRUCTOR
>-		&& !TREE_CLOBBER_P (rhs))
>-	    || ((integer_all_onesp (rhs)
>-		 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
>-		     && (TYPE_MODE (TREE_TYPE (rhs))
>-			 == TYPE_MODE (unsigned_char_type_node))))
>-		/* For stores of a non-zero value require that the precision
>-		   of the value matches its actual size.  */
>+		&& !TREE_CLOBBER_P (rhs)
>+		&& CONSTRUCTOR_NELTS (rhs) == 0)
>+	    || (integer_all_onesp (rhs)
> 		&& (TYPE_PRECISION (TREE_TYPE (rhs))
>-		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))))
>+		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))
>+	    || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
>+		&& (TYPE_MODE (TREE_TYPE (rhs))
>+		    == TYPE_MODE (unsigned_char_type_node)))
>+	    || const_with_all_bytes_same (rhs) != -1))
> 	return;
>       if (TREE_CODE (rhs) == SSA_NAME
> 	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
>--- gcc/testsuite/gcc.dg/pr56837.c.jj	2013-04-04 17:37:58.458675152
>+0200
>+++ gcc/testsuite/gcc.dg/pr56837.c	2013-04-04 17:36:40.000000000 +0200
>@@ -0,0 +1,67 @@
>+/* Limit this test to selected targets with IEEE double, 8-byte long
>long,
>+   supported 4x int vectors, 4-byte int.  */
>+/* { dg-do compile { target { i?86-*-* x86_64-*-* powerpc*-*-* } } }
>*/
>+/* { dg-options "-O3 -fdump-tree-optimized" } */
>+/* { dg-additional-options "-msse2" { target ia32 } } */
>+/* { dg-additional-options "-mvsx -maltivec" { target powerpc*-*-* } }
>*/
>+
>+typedef int V __attribute__((__vector_size__ (16)));
>+#define N 1024
>+double d[N];
>+long long int l[N];
>+_Bool b[N];
>+_Complex double c[N];
>+V v[N];
>+
>+void
>+fd (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    d[i] = 747708026454360457216.0;
>+}
>+
>+void
>+fl (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    l[i] = 0x7c7c7c7c7c7c7c7cULL;
>+}
>+
>+void
>+fb (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    b[i] = 1;
>+}
>+
>+void
>+fc (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    c[i] = 747708026454360457216.0 + 747708026454360457216.0i;
>+}
>+
>+void
>+fv (void)
>+{
>+  int i;
>+  for (i = 0; i < N; i++)
>+    v[i] = (V) { 0x12121212, 0x12121212, 0x12121212, 0x12121212 };
>+}
>+
>+/* Look for
>+  __builtin_memset (&d, 68, 8192);
>+  __builtin_memset (&l, 124, 8192);
>+  __builtin_memset (&b, 1, 1024);
>+  __builtin_memset (&c, 68, 16384);
>+  __builtin_memset (&v, 18, 16384); */
>+/* { dg-final { scan-tree-dump-times "memset ..d, 68, 8192.;" 1
>"optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..l, 124, 8192.;" 1
>"optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..b, 1, 1024.;" 1
>"optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..c, 68, 16384.;" 1
>"optimized" } } */
>+/* { dg-final { scan-tree-dump-times "memset ..v, 18, 16384.;" 1
>"optimized" } } */
>+/* { dg-final { cleanup-tree-dump "optimized" } } */
>
>
>	Jakub
Jakub Jelinek - April 5, 2013, 7:44 a.m.
On Fri, Apr 05, 2013 at 09:21:16AM +0200, Richard Biener wrote:
> Jakub Jelinek <jakub@redhat.com> wrote:
> 
> >On Thu, Apr 04, 2013 at 08:37:47PM +0200, Richard Biener wrote:
> >> Can you factor out a function that returns
> >> A proper qimode value if possible or null and
> >> Use it in both places?
> >
> >Like this?
> 
> You should be able to remove zero, minus one and constructor special
> casing, no?  Ok, maybe not constructor handling, but at least move

No, because the function is only handling BITS_PER_UNIT == 8 && CHAR_BIT == 8,
plus is unnecessarily expensive for the common case of storing 0.

But if you want, I can move all that integer_zerop / real_zerop /
CONSTRUCTOR / integer_all_onesp handling into the function.

BTW, the integer_all_onesp stuff is broken for this from what I can see, for complex
numbers it returns true for -1 + 0i where all bytes aren't 0xff, so we need
to rule out COMPLEX_CSTs (or do integer_all_onesp on each part instead).
And TYPE_PRECISION on VECTOR_CSTs won't be what we are looking for.

	Jakub
Marc Glisse - April 5, 2013, 8:12 a.m.
On Fri, 5 Apr 2013, Jakub Jelinek wrote:

> On Fri, Apr 05, 2013 at 09:21:16AM +0200, Richard Biener wrote:
>> Jakub Jelinek <jakub@redhat.com> wrote:
>>
>>> On Thu, Apr 04, 2013 at 08:37:47PM +0200, Richard Biener wrote:
>>>> Can you factor out a function that returns
>>>> A proper qimode value if possible or null and
>>>> Use it in both places?
>>>
>>> Like this?
>>
>> You should be able to remove zero, minus one and constructor special
>> casing, no?  Ok, maybe not constructor handling, but at least move
>
> No, because the function is only handling BITS_PER_UNIT == 8 && CHAR_BIT == 8,
> plus is unnecessarily expensive for the common case of storing 0.
>
> But if you want, I can move all that integer_zerop / real_zerop /
> CONSTRUCTOR / integer_all_onesp handling into the function.
>
> BTW, the integer_all_onesp stuff is broken for this from what I can see, for complex
> numbers it returns true for -1 + 0i where all bytes aren't 0xff, so we need
> to rule out COMPLEX_CSTs (or do integer_all_onesp on each part instead).
> And TYPE_PRECISION on VECTOR_CSTs won't be what we are looking for.

Shouldn't we change integer_all_onesp to do what its name says and create 
a separate integer_minus_onep for the single place I could find where it 
would break, the folding of x * -1 ?
Richard Guenther - April 5, 2013, 10:46 a.m.
Jakub Jelinek <jakub@redhat.com> wrote:

>On Fri, Apr 05, 2013 at 09:21:16AM +0200, Richard Biener wrote:
>> Jakub Jelinek <jakub@redhat.com> wrote:
>> 
>> >On Thu, Apr 04, 2013 at 08:37:47PM +0200, Richard Biener wrote:
>> >> Can you factor out a function that returns
>> >> A proper qimode value if possible or null and
>> >> Use it in both places?
>> >
>> >Like this?
>> 
>> You should be able to remove zero, minus one and constructor special
>> casing, no?  Ok, maybe not constructor handling, but at least move
>
>No, because the function is only handling BITS_PER_UNIT == 8 &&
>CHAR_BIT == 8,
>plus is unnecessarily expensive for the common case of storing 0.
>
>But if you want, I can move all that integer_zerop / real_zerop /
>CONSTRUCTOR / integer_all_onesp handling into the function.

Please.

>BTW, the integer_all_onesp stuff is broken for this from what I can
>see, for complex
>numbers it returns true for -1 + 0i where all bytes aren't 0xff, so we
>need
>to rule out COMPLEX_CSTs (or do integer_all_onesp on each part
>instead).
>And TYPE_PRECISION on VECTOR_CSTs won't be what we are looking for.

Hmm, indeed.  Or remove the -1 special casing altogether. Marc is probably right with his note as well.

Richard.

>	Jakub

Patch

--- gcc/tree-loop-distribution.c.jj	2013-04-04 15:03:28.000000000 +0200
+++ gcc/tree-loop-distribution.c	2013-04-04 20:49:14.295546543 +0200
@@ -297,6 +297,27 @@  build_addr_arg_loc (location_t loc, data
   return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
 }
 
+/* If VAL memory representation contains the same value in all bytes,
+   return that value, otherwise return -1.
+   E.g. for 0x24242424 return 0x24, for IEEE double
+   747708026454360457216.0 return 0x44, etc.  */
+
+static int
+const_with_all_bytes_same (tree val)
+{
+  unsigned char buf[64];
+  int i, len;
+  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
+    return -1;
+  len = native_encode_expr (val, buf, sizeof (buf));
+  if (len == 0)
+    return -1;
+  for (i = 1; i < len; i++)
+    if (buf[i] != buf[0])
+      return -1;
+  return buf[0];
+}
+
 /* Generate a call to memset for PARTITION in LOOP.  */
 
 static void
@@ -331,11 +352,18 @@  generate_memset_builtin (struct loop *lo
       || real_zerop (val)
       || TREE_CODE (val) == CONSTRUCTOR)
     val = integer_zero_node;
-  else if (integer_all_onesp (val))
+  else if (integer_all_onesp (val)
+	   && TYPE_PRECISION (TREE_TYPE (val))
+	      == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (val))))
     val = build_int_cst (integer_type_node, -1);
   else
     {
-      if (TREE_CODE (val) == INTEGER_CST)
+      /* Handle constants like 0x15151515 and similarly
+	 floating point constants etc. where all bytes are the same.  */
+      int bytev = const_with_all_bytes_same (val);
+      if (bytev != -1)
+	val = build_int_cst (integer_type_node, bytev);
+      else if (TREE_CODE (val) == INTEGER_CST)
 	val = fold_convert (integer_type_node, val);
       else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
 	{
@@ -944,15 +972,15 @@  classify_partition (loop_p loop, struct
       if (!(integer_zerop (rhs)
 	    || real_zerop (rhs)
 	    || (TREE_CODE (rhs) == CONSTRUCTOR
-		&& !TREE_CLOBBER_P (rhs))
-	    || ((integer_all_onesp (rhs)
-		 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
-		     && (TYPE_MODE (TREE_TYPE (rhs))
-			 == TYPE_MODE (unsigned_char_type_node))))
-		/* For stores of a non-zero value require that the precision
-		   of the value matches its actual size.  */
+		&& !TREE_CLOBBER_P (rhs)
+		&& CONSTRUCTOR_NELTS (rhs) == 0)
+	    || (integer_all_onesp (rhs)
 		&& (TYPE_PRECISION (TREE_TYPE (rhs))
-		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))))
+		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))
+	    || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
+		&& (TYPE_MODE (TREE_TYPE (rhs))
+		    == TYPE_MODE (unsigned_char_type_node)))
+	    || const_with_all_bytes_same (rhs) != -1))
 	return;
       if (TREE_CODE (rhs) == SSA_NAME
 	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
--- gcc/testsuite/gcc.dg/pr56837.c.jj	2013-04-04 17:37:58.458675152 +0200
+++ gcc/testsuite/gcc.dg/pr56837.c	2013-04-04 17:36:40.000000000 +0200
@@ -0,0 +1,67 @@ 
+/* Limit this test to selected targets with IEEE double, 8-byte long long,
+   supported 4x int vectors, 4-byte int.  */
+/* { dg-do compile { target { i?86-*-* x86_64-*-* powerpc*-*-* } } } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-additional-options "-msse2" { target ia32 } } */
+/* { dg-additional-options "-mvsx -maltivec" { target powerpc*-*-* } } */
+
+typedef int V __attribute__((__vector_size__ (16)));
+#define N 1024
+double d[N];
+long long int l[N];
+_Bool b[N];
+_Complex double c[N];
+V v[N];
+
+void
+fd (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    d[i] = 747708026454360457216.0;
+}
+
+void
+fl (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    l[i] = 0x7c7c7c7c7c7c7c7cULL;
+}
+
+void
+fb (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    b[i] = 1;
+}
+
+void
+fc (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    c[i] = 747708026454360457216.0 + 747708026454360457216.0i;
+}
+
+void
+fv (void)
+{
+  int i;
+  for (i = 0; i < N; i++)
+    v[i] = (V) { 0x12121212, 0x12121212, 0x12121212, 0x12121212 };
+}
+
+/* Look for
+  __builtin_memset (&d, 68, 8192);
+  __builtin_memset (&l, 124, 8192);
+  __builtin_memset (&b, 1, 1024);
+  __builtin_memset (&c, 68, 16384);
+  __builtin_memset (&v, 18, 16384); */
+/* { dg-final { scan-tree-dump-times "memset ..d, 68, 8192.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..l, 124, 8192.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..b, 1, 1024.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..c, 68, 16384.;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "memset ..v, 18, 16384.;" 1 "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */