Message ID | 20130404185713.GW4201@tucnak.redhat.com |
---|---|
State | New |
Headers | show |
Jakub Jelinek <jakub@redhat.com> wrote: >On Thu, Apr 04, 2013 at 08:37:47PM +0200, Richard Biener wrote: >> Can you factor out a function that returns >> A proper qimode value if possible or null and >> Use it in both places? > >Like this? You should be able to remove zero, minus one and constructor special casing, no? Ok, maybe not constructor handling, but at least move handling of it to the function. Richard. >2013-04-04 Jakub Jelinek <jakub@redhat.com> > > * tree-loop-distribution.c (const_with_all_bytes_same): New function. > (generate_memset_builtin): Only handle integer_all_onesp as -1 val if > TYPE_PRECISION is equal to mode bitsize. Use >const_with_all_bytes_same > if possible to compute val. > (classify_partition): Verify CONSTRUCTOR doesn't have any elts. > For QImode integers don't require anything about precision. Use > const_with_all_bytes_same to find out if the constant doesn't have > repeated bytes in it. > > * gcc.dg/pr56837.c: New test. > >--- gcc/tree-loop-distribution.c.jj 2013-04-04 15:03:28.000000000 +0200 >+++ gcc/tree-loop-distribution.c 2013-04-04 20:49:14.295546543 +0200 >@@ -297,6 +297,27 @@ build_addr_arg_loc (location_t loc, data >return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), >addr_base); > } > >+/* If VAL memory representation contains the same value in all bytes, >+ return that value, otherwise return -1. >+ E.g. for 0x24242424 return 0x24, for IEEE double >+ 747708026454360457216.0 return 0x44, etc. */ >+ >+static int >+const_with_all_bytes_same (tree val) >+{ >+ unsigned char buf[64]; >+ int i, len; >+ if (CHAR_BIT != 8 || BITS_PER_UNIT != 8) >+ return -1; >+ len = native_encode_expr (val, buf, sizeof (buf)); >+ if (len == 0) >+ return -1; >+ for (i = 1; i < len; i++) >+ if (buf[i] != buf[0]) >+ return -1; >+ return buf[0]; >+} >+ > /* Generate a call to memset for PARTITION in LOOP. */ > > static void >@@ -331,11 +352,18 @@ generate_memset_builtin (struct loop *lo > || real_zerop (val) > || TREE_CODE (val) == CONSTRUCTOR) > val = integer_zero_node; >- else if (integer_all_onesp (val)) >+ else if (integer_all_onesp (val) >+ && TYPE_PRECISION (TREE_TYPE (val)) >+ == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (val)))) > val = build_int_cst (integer_type_node, -1); > else > { >- if (TREE_CODE (val) == INTEGER_CST) >+ /* Handle constants like 0x15151515 and similarly >+ floating point constants etc. where all bytes are the same. */ >+ int bytev = const_with_all_bytes_same (val); >+ if (bytev != -1) >+ val = build_int_cst (integer_type_node, bytev); >+ else if (TREE_CODE (val) == INTEGER_CST) > val = fold_convert (integer_type_node, val); >else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE >(val))) > { >@@ -944,15 +972,15 @@ classify_partition (loop_p loop, struct > if (!(integer_zerop (rhs) > || real_zerop (rhs) > || (TREE_CODE (rhs) == CONSTRUCTOR >- && !TREE_CLOBBER_P (rhs)) >- || ((integer_all_onesp (rhs) >- || (INTEGRAL_TYPE_P (TREE_TYPE (rhs)) >- && (TYPE_MODE (TREE_TYPE (rhs)) >- == TYPE_MODE (unsigned_char_type_node)))) >- /* For stores of a non-zero value require that the precision >- of the value matches its actual size. */ >+ && !TREE_CLOBBER_P (rhs) >+ && CONSTRUCTOR_NELTS (rhs) == 0) >+ || (integer_all_onesp (rhs) > && (TYPE_PRECISION (TREE_TYPE (rhs)) >- == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs))))))) >+ == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs))))) >+ || (INTEGRAL_TYPE_P (TREE_TYPE (rhs)) >+ && (TYPE_MODE (TREE_TYPE (rhs)) >+ == TYPE_MODE (unsigned_char_type_node))) >+ || const_with_all_bytes_same (rhs) != -1)) > return; > if (TREE_CODE (rhs) == SSA_NAME > && !SSA_NAME_IS_DEFAULT_DEF (rhs) >--- gcc/testsuite/gcc.dg/pr56837.c.jj 2013-04-04 17:37:58.458675152 >+0200 >+++ gcc/testsuite/gcc.dg/pr56837.c 2013-04-04 17:36:40.000000000 +0200 >@@ -0,0 +1,67 @@ >+/* Limit this test to selected targets with IEEE double, 8-byte long >long, >+ supported 4x int vectors, 4-byte int. */ >+/* { dg-do compile { target { i?86-*-* x86_64-*-* powerpc*-*-* } } } >*/ >+/* { dg-options "-O3 -fdump-tree-optimized" } */ >+/* { dg-additional-options "-msse2" { target ia32 } } */ >+/* { dg-additional-options "-mvsx -maltivec" { target powerpc*-*-* } } >*/ >+ >+typedef int V __attribute__((__vector_size__ (16))); >+#define N 1024 >+double d[N]; >+long long int l[N]; >+_Bool b[N]; >+_Complex double c[N]; >+V v[N]; >+ >+void >+fd (void) >+{ >+ int i; >+ for (i = 0; i < N; i++) >+ d[i] = 747708026454360457216.0; >+} >+ >+void >+fl (void) >+{ >+ int i; >+ for (i = 0; i < N; i++) >+ l[i] = 0x7c7c7c7c7c7c7c7cULL; >+} >+ >+void >+fb (void) >+{ >+ int i; >+ for (i = 0; i < N; i++) >+ b[i] = 1; >+} >+ >+void >+fc (void) >+{ >+ int i; >+ for (i = 0; i < N; i++) >+ c[i] = 747708026454360457216.0 + 747708026454360457216.0i; >+} >+ >+void >+fv (void) >+{ >+ int i; >+ for (i = 0; i < N; i++) >+ v[i] = (V) { 0x12121212, 0x12121212, 0x12121212, 0x12121212 }; >+} >+ >+/* Look for >+ __builtin_memset (&d, 68, 8192); >+ __builtin_memset (&l, 124, 8192); >+ __builtin_memset (&b, 1, 1024); >+ __builtin_memset (&c, 68, 16384); >+ __builtin_memset (&v, 18, 16384); */ >+/* { dg-final { scan-tree-dump-times "memset ..d, 68, 8192.;" 1 >"optimized" } } */ >+/* { dg-final { scan-tree-dump-times "memset ..l, 124, 8192.;" 1 >"optimized" } } */ >+/* { dg-final { scan-tree-dump-times "memset ..b, 1, 1024.;" 1 >"optimized" } } */ >+/* { dg-final { scan-tree-dump-times "memset ..c, 68, 16384.;" 1 >"optimized" } } */ >+/* { dg-final { scan-tree-dump-times "memset ..v, 18, 16384.;" 1 >"optimized" } } */ >+/* { dg-final { cleanup-tree-dump "optimized" } } */ > > > Jakub
On Fri, Apr 05, 2013 at 09:21:16AM +0200, Richard Biener wrote: > Jakub Jelinek <jakub@redhat.com> wrote: > > >On Thu, Apr 04, 2013 at 08:37:47PM +0200, Richard Biener wrote: > >> Can you factor out a function that returns > >> A proper qimode value if possible or null and > >> Use it in both places? > > > >Like this? > > You should be able to remove zero, minus one and constructor special > casing, no? Ok, maybe not constructor handling, but at least move No, because the function is only handling BITS_PER_UNIT == 8 && CHAR_BIT == 8, plus is unnecessarily expensive for the common case of storing 0. But if you want, I can move all that integer_zerop / real_zerop / CONSTRUCTOR / integer_all_onesp handling into the function. BTW, the integer_all_onesp stuff is broken for this from what I can see, for complex numbers it returns true for -1 + 0i where all bytes aren't 0xff, so we need to rule out COMPLEX_CSTs (or do integer_all_onesp on each part instead). And TYPE_PRECISION on VECTOR_CSTs won't be what we are looking for. Jakub
On Fri, 5 Apr 2013, Jakub Jelinek wrote: > On Fri, Apr 05, 2013 at 09:21:16AM +0200, Richard Biener wrote: >> Jakub Jelinek <jakub@redhat.com> wrote: >> >>> On Thu, Apr 04, 2013 at 08:37:47PM +0200, Richard Biener wrote: >>>> Can you factor out a function that returns >>>> A proper qimode value if possible or null and >>>> Use it in both places? >>> >>> Like this? >> >> You should be able to remove zero, minus one and constructor special >> casing, no? Ok, maybe not constructor handling, but at least move > > No, because the function is only handling BITS_PER_UNIT == 8 && CHAR_BIT == 8, > plus is unnecessarily expensive for the common case of storing 0. > > But if you want, I can move all that integer_zerop / real_zerop / > CONSTRUCTOR / integer_all_onesp handling into the function. > > BTW, the integer_all_onesp stuff is broken for this from what I can see, for complex > numbers it returns true for -1 + 0i where all bytes aren't 0xff, so we need > to rule out COMPLEX_CSTs (or do integer_all_onesp on each part instead). > And TYPE_PRECISION on VECTOR_CSTs won't be what we are looking for. Shouldn't we change integer_all_onesp to do what its name says and create a separate integer_minus_onep for the single place I could find where it would break, the folding of x * -1 ?
Jakub Jelinek <jakub@redhat.com> wrote: >On Fri, Apr 05, 2013 at 09:21:16AM +0200, Richard Biener wrote: >> Jakub Jelinek <jakub@redhat.com> wrote: >> >> >On Thu, Apr 04, 2013 at 08:37:47PM +0200, Richard Biener wrote: >> >> Can you factor out a function that returns >> >> A proper qimode value if possible or null and >> >> Use it in both places? >> > >> >Like this? >> >> You should be able to remove zero, minus one and constructor special >> casing, no? Ok, maybe not constructor handling, but at least move > >No, because the function is only handling BITS_PER_UNIT == 8 && >CHAR_BIT == 8, >plus is unnecessarily expensive for the common case of storing 0. > >But if you want, I can move all that integer_zerop / real_zerop / >CONSTRUCTOR / integer_all_onesp handling into the function. Please. >BTW, the integer_all_onesp stuff is broken for this from what I can >see, for complex >numbers it returns true for -1 + 0i where all bytes aren't 0xff, so we >need >to rule out COMPLEX_CSTs (or do integer_all_onesp on each part >instead). >And TYPE_PRECISION on VECTOR_CSTs won't be what we are looking for. Hmm, indeed. Or remove the -1 special casing altogether. Marc is probably right with his note as well. Richard. > Jakub
--- gcc/tree-loop-distribution.c.jj 2013-04-04 15:03:28.000000000 +0200 +++ gcc/tree-loop-distribution.c 2013-04-04 20:49:14.295546543 +0200 @@ -297,6 +297,27 @@ build_addr_arg_loc (location_t loc, data return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base); } +/* If VAL memory representation contains the same value in all bytes, + return that value, otherwise return -1. + E.g. for 0x24242424 return 0x24, for IEEE double + 747708026454360457216.0 return 0x44, etc. */ + +static int +const_with_all_bytes_same (tree val) +{ + unsigned char buf[64]; + int i, len; + if (CHAR_BIT != 8 || BITS_PER_UNIT != 8) + return -1; + len = native_encode_expr (val, buf, sizeof (buf)); + if (len == 0) + return -1; + for (i = 1; i < len; i++) + if (buf[i] != buf[0]) + return -1; + return buf[0]; +} + /* Generate a call to memset for PARTITION in LOOP. */ static void @@ -331,11 +352,18 @@ generate_memset_builtin (struct loop *lo || real_zerop (val) || TREE_CODE (val) == CONSTRUCTOR) val = integer_zero_node; - else if (integer_all_onesp (val)) + else if (integer_all_onesp (val) + && TYPE_PRECISION (TREE_TYPE (val)) + == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (val)))) val = build_int_cst (integer_type_node, -1); else { - if (TREE_CODE (val) == INTEGER_CST) + /* Handle constants like 0x15151515 and similarly + floating point constants etc. where all bytes are the same. */ + int bytev = const_with_all_bytes_same (val); + if (bytev != -1) + val = build_int_cst (integer_type_node, bytev); + else if (TREE_CODE (val) == INTEGER_CST) val = fold_convert (integer_type_node, val); else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val))) { @@ -944,15 +972,15 @@ classify_partition (loop_p loop, struct if (!(integer_zerop (rhs) || real_zerop (rhs) || (TREE_CODE (rhs) == CONSTRUCTOR - && !TREE_CLOBBER_P (rhs)) - || ((integer_all_onesp (rhs) - || (INTEGRAL_TYPE_P (TREE_TYPE (rhs)) - && (TYPE_MODE (TREE_TYPE (rhs)) - == TYPE_MODE (unsigned_char_type_node)))) - /* For stores of a non-zero value require that the precision - of the value matches its actual size. */ + && !TREE_CLOBBER_P (rhs) + && CONSTRUCTOR_NELTS (rhs) == 0) + || (integer_all_onesp (rhs) && (TYPE_PRECISION (TREE_TYPE (rhs)) - == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs))))))) + == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs))))) + || (INTEGRAL_TYPE_P (TREE_TYPE (rhs)) + && (TYPE_MODE (TREE_TYPE (rhs)) + == TYPE_MODE (unsigned_char_type_node))) + || const_with_all_bytes_same (rhs) != -1)) return; if (TREE_CODE (rhs) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (rhs) --- gcc/testsuite/gcc.dg/pr56837.c.jj 2013-04-04 17:37:58.458675152 +0200 +++ gcc/testsuite/gcc.dg/pr56837.c 2013-04-04 17:36:40.000000000 +0200 @@ -0,0 +1,67 @@ +/* Limit this test to selected targets with IEEE double, 8-byte long long, + supported 4x int vectors, 4-byte int. */ +/* { dg-do compile { target { i?86-*-* x86_64-*-* powerpc*-*-* } } } */ +/* { dg-options "-O3 -fdump-tree-optimized" } */ +/* { dg-additional-options "-msse2" { target ia32 } } */ +/* { dg-additional-options "-mvsx -maltivec" { target powerpc*-*-* } } */ + +typedef int V __attribute__((__vector_size__ (16))); +#define N 1024 +double d[N]; +long long int l[N]; +_Bool b[N]; +_Complex double c[N]; +V v[N]; + +void +fd (void) +{ + int i; + for (i = 0; i < N; i++) + d[i] = 747708026454360457216.0; +} + +void +fl (void) +{ + int i; + for (i = 0; i < N; i++) + l[i] = 0x7c7c7c7c7c7c7c7cULL; +} + +void +fb (void) +{ + int i; + for (i = 0; i < N; i++) + b[i] = 1; +} + +void +fc (void) +{ + int i; + for (i = 0; i < N; i++) + c[i] = 747708026454360457216.0 + 747708026454360457216.0i; +} + +void +fv (void) +{ + int i; + for (i = 0; i < N; i++) + v[i] = (V) { 0x12121212, 0x12121212, 0x12121212, 0x12121212 }; +} + +/* Look for + __builtin_memset (&d, 68, 8192); + __builtin_memset (&l, 124, 8192); + __builtin_memset (&b, 1, 1024); + __builtin_memset (&c, 68, 16384); + __builtin_memset (&v, 18, 16384); */ +/* { dg-final { scan-tree-dump-times "memset ..d, 68, 8192.;" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "memset ..l, 124, 8192.;" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "memset ..b, 1, 1024.;" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "memset ..c, 68, 16384.;" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "memset ..v, 18, 16384.;" 1 "optimized" } } */ +/* { dg-final { cleanup-tree-dump "optimized" } } */