fold strings as long as the array they're stored in (PR 86428)

Message ID 812effdd-d102-4266-0cee-362ac7cae19f@gmail.com
State New
Headers show
Series
  • fold strings as long as the array they're stored in (PR 86428)
Related show

Commit Message

Martin Sebor July 7, 2018, 9:47 p.m.
While working on other string folding improvements (PR 77357)
I came across another distinct case where GCC could do better:
it doesn't consider as candidates strings that have as many
elements as the size of the array they are stored in, even if
their length is within the bounds of the array.  For instance,
only the first strlen() call below is folded even though
the arguments to both are valid NUL-terminated strings.

    const char a[4] = "123";
    int f (void)
    {
      return strlen (a);   // folded
    }

    const char b[4] = "123\0";
    int g (void)
    {
      return strlen (b);   // not folded
    }

The attached tiny patch adjusts the the string_constant() function
to recognize as valid string constants all those whose length (as
returned by strlen()) is less than the size of the array they are
stored in.

Tested on x86_64-linux.

Testing of an earlier version of the patch exposed what I think
is a deficiency in the (very) early strlen folding: c_strlen()
folds expressions of the form strlen(A + I) to sizeof(A) - I when
A is an array of known size and I a non-const variable.  This not
only prevents -Warray-bounds from warning on invalid indices but
also defeats other, possibly more profitable optimizations based
on the range of the result of the strlen() call.  The logs show
that the code dates at least as far back as 1992.  With VRP and
other data flow based optimizations I think we can do better
today.  I opened bug 86434 to improve things.

Martin

Comments

Richard Biener July 9, 2018, 2:34 p.m. | #1
On Sat, Jul 7, 2018 at 11:47 PM Martin Sebor <msebor@gmail.com> wrote:
>
> While working on other string folding improvements (PR 77357)
> I came across another distinct case where GCC could do better:
> it doesn't consider as candidates strings that have as many
> elements as the size of the array they are stored in, even if
> their length is within the bounds of the array.  For instance,
> only the first strlen() call below is folded even though
> the arguments to both are valid NUL-terminated strings.
>
>     const char a[4] = "123";
>     int f (void)
>     {
>       return strlen (a);   // folded
>     }
>
>     const char b[4] = "123\0";
>     int g (void)
>     {
>       return strlen (b);   // not folded
>     }
>
> The attached tiny patch adjusts the the string_constant() function
> to recognize as valid string constants all those whose length (as
> returned by strlen()) is less than the size of the array they are
> stored in.
>
> Tested on x86_64-linux.
>
> Testing of an earlier version of the patch exposed what I think
> is a deficiency in the (very) early strlen folding: c_strlen()
> folds expressions of the form strlen(A + I) to sizeof(A) - I when
> A is an array of known size and I a non-const variable.  This not
> only prevents -Warray-bounds from warning on invalid indices but
> also defeats other, possibly more profitable optimizations based
> on the range of the result of the strlen() call.  The logs show
> that the code dates at least as far back as 1992.  With VRP and
> other data flow based optimizations I think we can do better
> today.  I opened bug 86434 to improve things.

I think that

+      unsigned HOST_WIDE_INT length = strlen (TREE_STRING_POINTER (init));

is dangerous and you should use strnlen (TREE_STRING_POINTER (init),
TREE_STRING_LENGTH (init))
instead.

OK with that change.

Richard.

> Martin
>

Patch

PR tree-optimization/86428 - strlen of const array initialized with a string of the same length not folded

gcc/ChangeLog:

	PR tree-optimization/86428
	* expr.c (string_constant): Handle string literals of
	length up to the size of the array they are stored in.

gcc/testsuite/ChangeLog:

	PR tree-optimization/86428
	* gcc.dg/strlenopt-49.c: New test.
	* gcc.c-torture/execute/builtins/strlen-3.c: Adjust.

Index: gcc/expr.c
===================================================================
--- gcc/expr.c	(revision 262437)
+++ gcc/expr.c	(working copy)
@@ -11358,7 +11358,6 @@  string_constant (tree arg, tree *ptr_offset)
     }
   else if (VAR_P (array) || TREE_CODE (array) == CONST_DECL)
     {
-      int length;
       tree init = ctor_for_folding (array);
 
       /* Variables initialized to string literals can be handled too.  */
@@ -11367,22 +11366,25 @@  string_constant (tree arg, tree *ptr_offset)
 	  || TREE_CODE (init) != STRING_CST)
 	return 0;
 
-      /* Avoid const char foo[4] = "abcde";  */
-      if (DECL_SIZE_UNIT (array) == NULL_TREE
-	  || TREE_CODE (DECL_SIZE_UNIT (array)) != INTEGER_CST
-	  || (length = TREE_STRING_LENGTH (init)) <= 0
-	  || compare_tree_int (DECL_SIZE_UNIT (array), length) < 0)
-	return 0;
+      tree array_size = DECL_SIZE_UNIT (array);
+      if (!array_size || TREE_CODE (array_size) != INTEGER_CST)
+	return NULL_TREE;
 
-      /* If variable is bigger than the string literal, OFFSET must be constant
-	 and inside of the bounds of the string literal.  */
-      offset = fold_convert (sizetype, offset);
-      if (compare_tree_int (DECL_SIZE_UNIT (array), length) > 0
-	  && (! tree_fits_uhwi_p (offset)
-	      || compare_tree_int (offset, length) >= 0))
-	return 0;
+      /* Avoid returning a string that doesn't fit in the array
+	 it is stored in, like
+	   const char a[4] = "abcde";
+	 but do handle those that fit even if they have excess
+	 initializers, such as in
+	   const char a[4] = "abc\000\000";
+	 The excess elements contribute to TREE_STRING_LENGTH()
+	 but not to strlen().  */
+      unsigned HOST_WIDE_INT length = strlen (TREE_STRING_POINTER (init));
+      if (compare_tree_int (array_size, length + 1) < 0)
+	return NULL_TREE;
 
-      *ptr_offset = offset;
+      /* Callers should verify that OFFSET is within the bounds
+	 of the array and warn for out-of-bounds offsets.  */
+      *ptr_offset = fold_convert (sizetype, offset);
       return init;
     }
 
Index: gcc/testsuite/gcc.c-torture/execute/builtins/strlen-3.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/builtins/strlen-3.c	(revision 262437)
+++ gcc/testsuite/gcc.c-torture/execute/builtins/strlen-3.c	(working copy)
@@ -2,8 +2,11 @@ 
 
    Test strlen on const variables initialized to string literals.
 
-   Written by Jakub Jelinek, 9/14/2004.  */
+   Written by Jakub Jelinek, 9/14/2004.
 
+   { dg-do compile }
+   { dg-options "-O2 -Wall -fdump-tree-optimized" } */
+
 extern void abort (void);
 extern __SIZE_TYPE__ strlen (const char *);
 extern char *strcpy (char *, const char *);
@@ -10,7 +13,6 @@  extern char *strcpy (char *, const char *);
 static const char bar[] = "Hello, World!";
 static const char baz[] = "hello, world?";
 static const char larger[20] = "short string";
-extern int inside_main;
 
 int l1 = 1;
 int x = 6;
@@ -59,12 +61,10 @@  main_test(void)
   if (strlen (&larger[10]) != 2)
     abort ();
 
-  inside_main = 0;
-  /* This will result in strlen call, because larger
-     array is bigger than its initializer.  */
   if (strlen (larger + (x++ & 7)) != 5)
     abort ();
   if (x != 8)
     abort ();
-  inside_main = 1;
 }
+
+/* { dg-final { scan-tree-dump-not "strlen" "optimized" } } */
Index: gcc/testsuite/gcc.dg/strlenopt-49.c
===================================================================
--- gcc/testsuite/gcc.dg/strlenopt-49.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/strlenopt-49.c	(working copy)
@@ -0,0 +1,53 @@ 
+/* PR tree-optimization/86428 - strlen of const array initialized with
+   a string of the same length not folded
+   { dg-do compile }
+   { dg-options "-O0 -Wall -fdump-tree-gimple" } */
+
+#include "strlenopt.h"
+
+const char a1[1] = "\0";
+const char a2[2] = "1\0";
+const char a3[3] = "12\0";
+const char a8[8] = "1234567\0";
+const char a9[9] = "12345678\0";
+
+const char ax[9] = "12345678\0\0\0\0";   /* { dg-warning "initializer-string for array of chars is too long" } */
+const char ay[9] = "\00012345678\0\0\0\0";   /* { dg-warning "initializer-string for array of chars is too long" } */
+
+
+int len1 (void)
+{
+  size_t len0 = strlen (a1);
+  return len0;
+}
+
+int len (void)
+{
+  size_t len = strlen (a2) + strlen (a3) + strlen (a8) + strlen (a9);
+  return len;
+}
+
+int lenx (void)
+{
+  size_t lenx = strlen (ax);
+  return lenx;
+}
+
+int leny (void)
+{
+  size_t leny = strlen (ay);
+  return leny;
+}
+
+int cmp88 (void)
+{
+  int cmp88 = memcmp (a8, "1234567\0", sizeof a8);
+  return cmp88;
+}
+
+/* { dg-final { scan-tree-dump-times "strlen" 0 "gimple" } }
+   { dg-final { scan-tree-dump-times "len0 = 0;" 1 "gimple" } }
+   { dg-final { scan-tree-dump-times "len = 18;" 1 "gimple" } }
+   { dg-final { scan-tree-dump-times "lenx = 8;" 1 "gimple" } }
+   { dg-final { scan-tree-dump-times "leny = 0;" 1 "gimple" } }
+   { dg-final { scan-tree-dump-times "cmp88 = 0;" 1 "gimple" } } */