Patchwork Optimize memcpy + memset (PR fortran/45636)

login
register
mail settings
Submitter Jakub Jelinek
Date Oct. 11, 2010, 8:46 p.m.
Message ID <20101011204617.GJ2082@tyan-ft48-01.lab.bos.redhat.com>
Download mbox | patch
Permalink /patch/67481/
State New
Headers show

Comments

Jakub Jelinek - Oct. 11, 2010, 8:46 p.m.
Hi!

This patch optimizes
memcpy (x, "abcd", 4);
memset (x + 4, ' ', 4);
sequences into a single
memcpy (x, "abcd    ", 8);
if it is known (or expected) that the memcpy will be implemented using
store_by_pieces (otherwise we'd risk enlarging .rodata too much, e.g.
if there are only few longer string literals in memcpy originally and
adding too many different memset variants after them would mean the string
literals couldn't be shared anymore).
The patch also handles
memcpy (x, "a", 1);
memset (x + 1, 'b', 11);
where the memcpy has been optimized into a store.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2010-10-11  Jakub Jelinek  <jakub@redhat.com>

	PR fortran/45636
	* tree-ssa-forwprop.c: Include expr.h.
	(constant_pointer_difference, simplify_builtin_call): New functions.
	(tree_ssa_forward_propagate_single_use_vars): Call
	simplify_builtin_call on builtin calls.

	* gcc.c-torture/execute/pr45636.c: New test.
	* gfortran.dg/pr45636.f90: New test.


	Jakub
Richard Guenther - Oct. 12, 2010, 9:18 a.m.
On Mon, 11 Oct 2010, Jakub Jelinek wrote:

> Hi!
> 
> This patch optimizes
> memcpy (x, "abcd", 4);
> memset (x + 4, ' ', 4);
> sequences into a single
> memcpy (x, "abcd    ", 8);
> if it is known (or expected) that the memcpy will be implemented using
> store_by_pieces (otherwise we'd risk enlarging .rodata too much, e.g.
> if there are only few longer string literals in memcpy originally and
> adding too many different memset variants after them would mean the string
> literals couldn't be shared anymore).
> The patch also handles
> memcpy (x, "a", 1);
> memset (x + 1, 'b', 11);
> where the memcpy has been optimized into a store.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2010-10-11  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR fortran/45636
> 	* tree-ssa-forwprop.c: Include expr.h.
> 	(constant_pointer_difference, simplify_builtin_call): New functions.
> 	(tree_ssa_forward_propagate_single_use_vars): Call
> 	simplify_builtin_call on builtin calls.
> 
> 	* gcc.c-torture/execute/pr45636.c: New test.
> 	* gfortran.dg/pr45636.f90: New test.
> 
> --- gcc/tree-ssa-forwprop.c.jj	2010-10-07 19:44:48.000000000 +0200
> +++ gcc/tree-ssa-forwprop.c	2010-10-11 16:43:03.000000000 +0200
> @@ -33,6 +33,7 @@ along with GCC; see the file COPYING3.  
>  #include "langhooks.h"
>  #include "flags.h"
>  #include "gimple.h"
> +#include "expr.h"
>  
>  /* This pass propagates the RHS of assignment statements into use
>     sites of the LHS of the assignment.  It's basically a specialized
> @@ -1317,6 +1318,296 @@ simplify_gimple_switch (gimple stmt)
>      }
>  }
>  
> +/* For pointers p2 and p1 return p2 - p1 if the
> +   difference is known and constant, otherwise return NULL.  */
> +
> +static tree
> +constant_pointer_difference (tree p1, tree p2)
> +{
> +  int i, j;
> +  tree exps[2][5];
> +  tree offs[2][5];
> +  int cnt[2];
> +
> +  for (i = 0; i < 2; i++)
> +    {
> +      tree p = i ? p1 : p2;
> +      tree off = size_zero_node;
> +      gimple stmt;
> +      enum tree_code code;
> +
> +      j = 0;
> +      do
> +	{
> +	  if (!POINTER_TYPE_P (TREE_TYPE (p)))
> +	    break;
> +	  if (TREE_CODE (p) == ADDR_EXPR)
> +	    {
> +	      tree q = TREE_OPERAND (p, 0);
> +	      if (handled_component_p (q)
> +		  || TREE_CODE (q) == MEM_REF
> +		  || TREE_CODE (q) == TARGET_MEM_REF)
> +		{
> +		  HOST_WIDE_INT offset, size, maxsize;
> +		  tree base
> +		    = get_ref_base_and_extent (q, &offset,
> +					       &size, &maxsize);

Please use get_addr_base_and_unit_offset here, that simplifies
the following checks

> +		  if (size == maxsize
> +		      && offset >= 0
> +		      && (offset % BITS_PER_UNIT) == 0)
> +		    {
> +		      q = base;
> +		      off = size_binop (PLUS_EXPR, off,
> +					size_int (offset / BITS_PER_UNIT));
> +		    }
> +		}
> +	      if (TREE_CODE (q) == MEM_REF
> +		  && TREE_CODE (TREE_OPERAND (q, 0)) == SSA_NAME)
> +		{
> +		  p = TREE_OPERAND (q, 0);
> +		  off = size_binop (PLUS_EXPR, off,
> +				    fold_convert (sizetype,
> +						  TREE_OPERAND (q, 1)));

Use double_int_to_tree (sizetype, mem_ref_offset (q)) which properly
sign-extends the pointer offset.

> +		}
> +	      else
> +		{
> +		  exps[i][j] = q;
> +		  offs[i][j++] = off;
> +		  break;
> +		}
> +	    }
> +	  if (TREE_CODE (p) != SSA_NAME)
> +	    break;
> +	  exps[i][j] = p;
> +	  offs[i][j++] = off;
> +	  if (j == 5)
> +	    break;
> +	  stmt = SSA_NAME_DEF_STMT (p);
> +	  if (!is_gimple_assign (stmt) || gimple_assign_lhs (stmt) != p)
> +	    break;
> +	  code = gimple_assign_rhs_code (stmt);
> +	  if (code == POINTER_PLUS_EXPR)
> +	    {
> +	      if (TREE_CODE (gimple_assign_rhs2 (stmt)) != INTEGER_CST)
> +		break;
> +	      off = size_binop (PLUS_EXPR, off, gimple_assign_rhs2 (stmt));
> +	      p = gimple_assign_rhs1 (stmt);
> +	    }
> +	  else if (code == ADDR_EXPR || code == NOP_EXPR)
> +	    p = gimple_assign_rhs1 (stmt);
> +	  else
> +	    break;
> +	}
> +      while (1);
> +      cnt[i] = j;
> +    }

I wonder why the forward-propagation of POINTER_PLUS_EXPR and ADDR_EXPR
doesn't make the above redundant?

> +  for (i = 0; i < cnt[0]; i++)
> +    for (j = 0; j < cnt[1]; j++)
> +      if (exps[0][i] == exps[1][j])
> +	return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]);
> +
> +  return NULL_TREE;
> +}
> +
> +/* *GSI_P is a GIMPLE_CALL to a builtin function.
> +   Optimize
> +   memcpy (p, "abcd", 4);
> +   memset (p + 4, ' ', 3);
> +   into
> +   memcpy (p, "abcd   ", 7);
> +   call if the latter can be stored by pieces during expansion.  */
> +
> +static bool
> +simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
> +{
> +  gimple stmt1, stmt2 = gsi_stmt (*gsi_p);
> +  tree vuse = gimple_vuse (stmt2);
> +  if (vuse == NULL)
> +    return false;
> +  stmt1 = SSA_NAME_DEF_STMT (vuse);
> +
> +  switch (DECL_FUNCTION_CODE (callee2))
> +    {
> +    case BUILT_IN_MEMSET:
> +      if (gimple_call_num_args (stmt2) != 3
> +	  || gimple_call_lhs (stmt2)
> +	  || CHAR_BIT != 8
> +	  || BITS_PER_UNIT != 8)
> +	break;
> +      else
> +	{
> +	  tree callee1;
> +	  tree ptr1, src1, str1, off1, len1, lhs1;
> +	  tree ptr2 = gimple_call_arg (stmt2, 0);
> +	  tree val2 = gimple_call_arg (stmt2, 1);
> +	  tree len2 = gimple_call_arg (stmt2, 2);
> +	  tree diff, vdef, new_str_cst;
> +	  unsigned int ptr1_align;
> +	  unsigned HOST_WIDE_INT src_len;
> +	  char *src_buf;
> +	  imm_use_iterator imm_iter;
> +	  use_operand_p use_p;
> +	  bool do_fail;
> +
> +	  if (!host_integerp (val2, 0)
> +	      || !host_integerp (len2, 1))
> +	    break;
> +	  if (is_gimple_call (stmt1))
> +	    {
> +	      callee1 = gimple_call_fndecl (stmt1);
> +	      if (callee1 == NULL_TREE
> +		  || DECL_BUILT_IN_CLASS (callee1) != BUILT_IN_NORMAL
> +		  || gimple_call_num_args (stmt1) != 3)
> +		break;
> +	      if (DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMCPY
> +		  && DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMPCPY)
> +		break;
> +	      ptr1 = gimple_call_arg (stmt1, 0);
> +	      src1 = gimple_call_arg (stmt1, 1);
> +	      len1 = gimple_call_arg (stmt1, 2);
> +	      lhs1 = gimple_call_lhs (stmt1);
> +	      if (!host_integerp (len1, 1))
> +		break;
> +	      str1 = string_constant (src1, &off1);
> +	      if (str1 == NULL_TREE)
> +		break;
> +	      if (!host_integerp (off1, 1)
> +		  || compare_tree_int (off1, TREE_STRING_LENGTH (str1) - 1) > 0
> +		  || compare_tree_int (len1, TREE_STRING_LENGTH (str1)
> +					     - tree_low_cst (off1, 1)) > 0
> +		  || TREE_CODE (TREE_TYPE (str1)) != ARRAY_TYPE
> +		  || TYPE_MODE (TREE_TYPE (TREE_TYPE (str1)))
> +		     != TYPE_MODE (char_type_node))
> +		break;
> +	    }
> +	  else if (gimple_assign_single_p (stmt1))
> +	    {
> +    	      ptr1 = gimple_assign_lhs (stmt1);
> +	      src1 = gimple_assign_rhs1 (stmt1);
> +	      if (TREE_CODE (ptr1) != MEM_REF
> +		  || TYPE_MODE (TREE_TYPE (ptr1)) != TYPE_MODE (char_type_node)
> +		  || !host_integerp (src1, 0))
> +		break;
> +	      ptr1 = build_fold_addr_expr (ptr1);
> +	      callee1 = NULL_TREE;
> +	      len1 = size_one_node;
> +	      lhs1 = NULL_TREE;
> +	      off1 = size_zero_node;
> +	      str1 = NULL_TREE;
> +	    }
> +	  else
> +	    break;
> +
> +	  diff = constant_pointer_difference (ptr1, ptr2);
> +	  if (diff == NULL && lhs1 != NULL)
> +	    {
> +	      diff = constant_pointer_difference (lhs1, ptr2);
> +	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY
> +		  && diff != NULL)
> +		diff = size_binop (PLUS_EXPR, diff,
> +				   fold_convert (sizetype, len1));
> +	    }
> +	  if (diff == NULL
> +	      || !host_integerp (diff, 1)
> +	      || tree_int_cst_lt (len1, diff))
> +	    break;
> +
> +	  src_len = tree_low_cst (diff, 1);
> +	  src_len += tree_low_cst (len2, 1);
> +	  if (src_len < (unsigned HOST_WIDE_INT) tree_low_cst (len1, 1))
> +	    src_len = tree_low_cst (len1, 1);
> +	  if (src_len > 1024)
> +	    break;
> +
> +	  do_fail = false;
> +	  if (lhs1 != NULL
> +	      && DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY)
> +	    {
> +	      if (TREE_CODE (lhs1) != SSA_NAME)
> +		break;
> +	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs1)
> +		{
> +		  if (USE_STMT (use_p) != stmt2)
> +		    {
> +		      do_fail = true;
> +		      break;
> +		    }
> +		}

That looks like

  if (TREE_CODE (lhs1) != SSA_NAME
      || !single_imm_use (lhs1, &use_p, &use_stmt)
      || use_stmt != stmt2)
    break;

and your loop gets confused by DEBUG_STMTs at least (and possibly
generates different code -g vs. -g0)

> +	      if (do_fail)
> +		break;
> +	    }
> +	  vdef = gimple_vdef (stmt1);
> +	  if (vdef != NULL)
> +	    {
> +	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, vdef)
> +		{
> +		  if (USE_STMT (use_p) != stmt2)
> +		    {
> +		      do_fail = true;
> +		      break;
> +		    }
> +		}

Likewise.  Why do we care about other uses of the VDEF?

> +	      if (do_fail)
> +		break;
> +	    }
> +
> +	  ptr1_align = get_pointer_alignment (ptr1, BIGGEST_ALIGNMENT);
> +	  src_buf = XALLOCAVEC (char, src_len + 1);
> +	  if (callee1)
> +	    memcpy (src_buf,
> +		    TREE_STRING_POINTER (str1) + tree_low_cst (off1, 1),
> +		    tree_low_cst (len1, 1));
> +	  else
> +	    src_buf[0] = tree_low_cst (src1, 0);
> +	  memset (src_buf + tree_low_cst (diff, 1),
> +		  tree_low_cst (val2, 1), tree_low_cst (len2, 1));
> +	  src_buf[src_len] = '\0';
> +	  if (strlen (src_buf) != src_len)
> +	    break;
> +	  rtl_profile_for_bb (gimple_bb (stmt2));
> +	  if (!can_store_by_pieces (src_len,
> +				    builtin_strncpy_read_str,
> +				    src_buf, ptr1_align, false))
> +	    break;
> +
> +	  new_str_cst = build_string_literal (src_len, src_buf);
> +	  if (callee1)
> +	    {
> +	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY)
> +		gimple_call_set_lhs (stmt1, NULL_TREE);
> +	      gimple_call_set_arg (stmt1, 1, new_str_cst);
> +	      gimple_call_set_arg (stmt1, 2,
> +				   build_int_cst (TREE_TYPE (len1), src_len));
> +	      update_stmt (stmt1);
> +	      unlink_stmt_vdef (stmt2);
> +	      gsi_remove (gsi_p, true);
> +	      release_defs (stmt2);
> +	      return true;
> +	    }
> +	  else
> +	    {
> +	      gimple_stmt_iterator gsi = gsi_for_stmt (stmt1);
> +
> +	      gimple_call_set_fndecl (stmt2, built_in_decls [BUILT_IN_MEMCPY]);
> +	      gimple_call_set_arg (stmt2, 0, ptr1);
> +	      gimple_call_set_arg (stmt2, 1, new_str_cst);
> +	      gimple_call_set_arg (stmt2, 2,
> +				   build_int_cst (TREE_TYPE (len2), src_len));
> +	      unlink_stmt_vdef (stmt1);
> +	      gsi_remove (&gsi, true);
> +	      release_defs (stmt1);
> +	      update_stmt (stmt2);
> +	      return false;
> +	    }
> +	}
> +      break;
> +    default:
> +      break;
> +    }
> +  return false;
> +}

The code above could use some comments, like those you gave about
can_store_by_pieces in the mail.

Otherwise the patch looks good, but I'm really curious as of why
constant_pointer_difference is as complex as it is.

Thanks,
Richard.

>  /* Run bitwise and assignments throug the folder.  If the first argument is an
>     ssa name that is itself a result of a typecast of an ADDR_EXPR to an
>     integer, feed the ADDR_EXPR to the folder rather than the ssa name.
> @@ -1784,6 +2075,14 @@ tree_ssa_forward_propagate_single_use_va
>  					      WARN_STRICT_OVERFLOW_CONDITIONAL);
>  	      gsi_next (&gsi);
>  	    }
> +	  else if (is_gimple_call (stmt))
> +	    {
> +	      tree callee = gimple_call_fndecl (stmt);
> +	      if (callee == NULL_TREE
> +		  || DECL_BUILT_IN_CLASS (callee) != BUILT_IN_NORMAL
> +		  || !simplify_builtin_call (&gsi, callee))
> +		gsi_next (&gsi);
> +	    }
>  	  else
>  	    gsi_next (&gsi);
>  	}
> --- gcc/Makefile.in.jj	2010-10-11 07:51:09.000000000 +0200
> +++ gcc/Makefile.in	2010-10-11 09:15:25.000000000 +0200
> @@ -2404,7 +2404,7 @@ tree-ssa-dse.o : tree-ssa-dse.c $(CONFIG
>  tree-ssa-forwprop.o : tree-ssa-forwprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
> -   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h
> +   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h $(EXPR_H)
>  tree-ssa-phiprop.o : tree-ssa-phiprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
> --- gcc/testsuite/gcc.c-torture/execute/pr45636.c.jj	2010-10-11 09:15:25.000000000 +0200
> +++ gcc/testsuite/gcc.c-torture/execute/pr45636.c	2010-10-11 09:15:25.000000000 +0200
> @@ -0,0 +1,75 @@
> +/* PR fortran/45636 */
> +
> +typedef __SIZE_TYPE__ size_t;
> +void *memcpy (void *__restrict__, const void *__restrict__, size_t);
> +void *mempcpy (void *__restrict__, const void *__restrict__, size_t);
> +void *memset (void *, int, size_t);
> +int memcmp (const void *, const void *, size_t);
> +extern void abort (void);
> +
> +struct A { int i; char c[32]; } a[2];
> +
> +__attribute__((noinline, noclone)) int
> +f1 (char *p, int q, int z)
> +{
> +  memcpy (p, "abcd", 4);
> +  if (q)
> +    z = z + 123;
> +  else
> +    z *= 114;
> +  memset (p + 4, ' ', 2);
> +  return z;
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f2 (void)
> +{
> +  char *p = mempcpy (&a[0].c[13], "123456", 4);
> +  memset (p, '7', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f3 (struct A *p)
> +{
> +  p++;
> +  char *q = &p->c[10];
> +  memcpy (q + 4, "__1234567" + 2, 7);
> +  memset (&p->c[21], '9', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +f4 (void)
> +{
> +  memcpy (&a[0].c[10], "0123456789", 10);
> +  memset (&a[0].c[13], ' ', 3);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +check (const char *p, const char *str, size_t size)
> +{
> +  const char *q;
> +  for (q = (const char *) &a; q < p; q++)
> +    if (*q)
> +      abort ();
> +  if (memcmp (p, str, size) != 0)
> +    abort ();
> +  for (q = p + size; q < (const char *) (&a[0] + 2); q++)
> +    if (*q)
> +      abort ();
> +  memset (&a, '\0', sizeof a);
> +}
> +
> +int
> +main (void)
> +{
> +  if (f1 (&a[0].c[7], 1, 2) != 125)
> +    abort ();
> +  check (&a[0].c[7], "abcd  ", 6);
> +  f2 ();
> +  check (&a[0].c[13], "1234777", 7);
> +  f3 (&a[0]);
> +  check (&a[1].c[14], "1234567999", 10);
> +  f4 ();
> +  check (&a[0].c[10], "012   6789", 10);
> +  return 0;
> +}
> --- gcc/testsuite/gfortran.dg/pr45636.f90.jj	2010-10-11 13:19:41.000000000 +0200
> +++ gcc/testsuite/gfortran.dg/pr45636.f90	2010-10-11 16:49:19.000000000 +0200
> @@ -0,0 +1,14 @@
> +! PR fortran/45636
> +! { dg-do compile }
> +! { dg-options "-O2 -fdump-tree-forwprop2" }
> +! PR 45636 - make sure no memset is needed for a short right-hand side.
> +program main
> +  character(len=2), parameter :: x='a '
> +  character(len=1), parameter :: y='b'
> +  character(len=4) :: a, b
> +  a = x
> +  b = y
> +  call sub(a, b)
> +end program main
> +! { dg-final { scan-tree-dump-times "memset" 0 "forwprop2" } }
> +! { dg-final { cleanup-tree-dump "forwprop2" } }
> 
> 	Jakub
> 
>
H.J. Lu - Nov. 13, 2010, 3:08 p.m.
On Mon, Oct 11, 2010 at 1:46 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> This patch optimizes
> memcpy (x, "abcd", 4);
> memset (x + 4, ' ', 4);
> sequences into a single
> memcpy (x, "abcd    ", 8);
> if it is known (or expected) that the memcpy will be implemented using
> store_by_pieces (otherwise we'd risk enlarging .rodata too much, e.g.
> if there are only few longer string literals in memcpy originally and
> adding too many different memset variants after them would mean the string
> literals couldn't be shared anymore).
> The patch also handles
> memcpy (x, "a", 1);
> memset (x + 1, 'b', 11);
> where the memcpy has been optimized into a store.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2010-10-11  Jakub Jelinek  <jakub@redhat.com>
>
>        PR fortran/45636
>        * tree-ssa-forwprop.c: Include expr.h.
>        (constant_pointer_difference, simplify_builtin_call): New functions.
>        (tree_ssa_forward_propagate_single_use_vars): Call
>        simplify_builtin_call on builtin calls.
>

This caused:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46461

Patch

--- gcc/tree-ssa-forwprop.c.jj	2010-10-07 19:44:48.000000000 +0200
+++ gcc/tree-ssa-forwprop.c	2010-10-11 16:43:03.000000000 +0200
@@ -33,6 +33,7 @@  along with GCC; see the file COPYING3.  
 #include "langhooks.h"
 #include "flags.h"
 #include "gimple.h"
+#include "expr.h"
 
 /* This pass propagates the RHS of assignment statements into use
    sites of the LHS of the assignment.  It's basically a specialized
@@ -1317,6 +1318,296 @@  simplify_gimple_switch (gimple stmt)
     }
 }
 
+/* For pointers p2 and p1 return p2 - p1 if the
+   difference is known and constant, otherwise return NULL.  */
+
+static tree
+constant_pointer_difference (tree p1, tree p2)
+{
+  int i, j;
+  tree exps[2][5];
+  tree offs[2][5];
+  int cnt[2];
+
+  for (i = 0; i < 2; i++)
+    {
+      tree p = i ? p1 : p2;
+      tree off = size_zero_node;
+      gimple stmt;
+      enum tree_code code;
+
+      j = 0;
+      do
+	{
+	  if (!POINTER_TYPE_P (TREE_TYPE (p)))
+	    break;
+	  if (TREE_CODE (p) == ADDR_EXPR)
+	    {
+	      tree q = TREE_OPERAND (p, 0);
+	      if (handled_component_p (q)
+		  || TREE_CODE (q) == MEM_REF
+		  || TREE_CODE (q) == TARGET_MEM_REF)
+		{
+		  HOST_WIDE_INT offset, size, maxsize;
+		  tree base
+		    = get_ref_base_and_extent (q, &offset,
+					       &size, &maxsize);
+		  if (size == maxsize
+		      && offset >= 0
+		      && (offset % BITS_PER_UNIT) == 0)
+		    {
+		      q = base;
+		      off = size_binop (PLUS_EXPR, off,
+					size_int (offset / BITS_PER_UNIT));
+		    }
+		}
+	      if (TREE_CODE (q) == MEM_REF
+		  && TREE_CODE (TREE_OPERAND (q, 0)) == SSA_NAME)
+		{
+		  p = TREE_OPERAND (q, 0);
+		  off = size_binop (PLUS_EXPR, off,
+				    fold_convert (sizetype,
+						  TREE_OPERAND (q, 1)));
+		}
+	      else
+		{
+		  exps[i][j] = q;
+		  offs[i][j++] = off;
+		  break;
+		}
+	    }
+	  if (TREE_CODE (p) != SSA_NAME)
+	    break;
+	  exps[i][j] = p;
+	  offs[i][j++] = off;
+	  if (j == 5)
+	    break;
+	  stmt = SSA_NAME_DEF_STMT (p);
+	  if (!is_gimple_assign (stmt) || gimple_assign_lhs (stmt) != p)
+	    break;
+	  code = gimple_assign_rhs_code (stmt);
+	  if (code == POINTER_PLUS_EXPR)
+	    {
+	      if (TREE_CODE (gimple_assign_rhs2 (stmt)) != INTEGER_CST)
+		break;
+	      off = size_binop (PLUS_EXPR, off, gimple_assign_rhs2 (stmt));
+	      p = gimple_assign_rhs1 (stmt);
+	    }
+	  else if (code == ADDR_EXPR || code == NOP_EXPR)
+	    p = gimple_assign_rhs1 (stmt);
+	  else
+	    break;
+	}
+      while (1);
+      cnt[i] = j;
+    }
+
+  for (i = 0; i < cnt[0]; i++)
+    for (j = 0; j < cnt[1]; j++)
+      if (exps[0][i] == exps[1][j])
+	return size_binop (MINUS_EXPR, offs[0][i], offs[1][j]);
+
+  return NULL_TREE;
+}
+
+/* *GSI_P is a GIMPLE_CALL to a builtin function.
+   Optimize
+   memcpy (p, "abcd", 4);
+   memset (p + 4, ' ', 3);
+   into
+   memcpy (p, "abcd   ", 7);
+   call if the latter can be stored by pieces during expansion.  */
+
+static bool
+simplify_builtin_call (gimple_stmt_iterator *gsi_p, tree callee2)
+{
+  gimple stmt1, stmt2 = gsi_stmt (*gsi_p);
+  tree vuse = gimple_vuse (stmt2);
+  if (vuse == NULL)
+    return false;
+  stmt1 = SSA_NAME_DEF_STMT (vuse);
+
+  switch (DECL_FUNCTION_CODE (callee2))
+    {
+    case BUILT_IN_MEMSET:
+      if (gimple_call_num_args (stmt2) != 3
+	  || gimple_call_lhs (stmt2)
+	  || CHAR_BIT != 8
+	  || BITS_PER_UNIT != 8)
+	break;
+      else
+	{
+	  tree callee1;
+	  tree ptr1, src1, str1, off1, len1, lhs1;
+	  tree ptr2 = gimple_call_arg (stmt2, 0);
+	  tree val2 = gimple_call_arg (stmt2, 1);
+	  tree len2 = gimple_call_arg (stmt2, 2);
+	  tree diff, vdef, new_str_cst;
+	  unsigned int ptr1_align;
+	  unsigned HOST_WIDE_INT src_len;
+	  char *src_buf;
+	  imm_use_iterator imm_iter;
+	  use_operand_p use_p;
+	  bool do_fail;
+
+	  if (!host_integerp (val2, 0)
+	      || !host_integerp (len2, 1))
+	    break;
+	  if (is_gimple_call (stmt1))
+	    {
+	      callee1 = gimple_call_fndecl (stmt1);
+	      if (callee1 == NULL_TREE
+		  || DECL_BUILT_IN_CLASS (callee1) != BUILT_IN_NORMAL
+		  || gimple_call_num_args (stmt1) != 3)
+		break;
+	      if (DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMCPY
+		  && DECL_FUNCTION_CODE (callee1) != BUILT_IN_MEMPCPY)
+		break;
+	      ptr1 = gimple_call_arg (stmt1, 0);
+	      src1 = gimple_call_arg (stmt1, 1);
+	      len1 = gimple_call_arg (stmt1, 2);
+	      lhs1 = gimple_call_lhs (stmt1);
+	      if (!host_integerp (len1, 1))
+		break;
+	      str1 = string_constant (src1, &off1);
+	      if (str1 == NULL_TREE)
+		break;
+	      if (!host_integerp (off1, 1)
+		  || compare_tree_int (off1, TREE_STRING_LENGTH (str1) - 1) > 0
+		  || compare_tree_int (len1, TREE_STRING_LENGTH (str1)
+					     - tree_low_cst (off1, 1)) > 0
+		  || TREE_CODE (TREE_TYPE (str1)) != ARRAY_TYPE
+		  || TYPE_MODE (TREE_TYPE (TREE_TYPE (str1)))
+		     != TYPE_MODE (char_type_node))
+		break;
+	    }
+	  else if (gimple_assign_single_p (stmt1))
+	    {
+    	      ptr1 = gimple_assign_lhs (stmt1);
+	      src1 = gimple_assign_rhs1 (stmt1);
+	      if (TREE_CODE (ptr1) != MEM_REF
+		  || TYPE_MODE (TREE_TYPE (ptr1)) != TYPE_MODE (char_type_node)
+		  || !host_integerp (src1, 0))
+		break;
+	      ptr1 = build_fold_addr_expr (ptr1);
+	      callee1 = NULL_TREE;
+	      len1 = size_one_node;
+	      lhs1 = NULL_TREE;
+	      off1 = size_zero_node;
+	      str1 = NULL_TREE;
+	    }
+	  else
+	    break;
+
+	  diff = constant_pointer_difference (ptr1, ptr2);
+	  if (diff == NULL && lhs1 != NULL)
+	    {
+	      diff = constant_pointer_difference (lhs1, ptr2);
+	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY
+		  && diff != NULL)
+		diff = size_binop (PLUS_EXPR, diff,
+				   fold_convert (sizetype, len1));
+	    }
+	  if (diff == NULL
+	      || !host_integerp (diff, 1)
+	      || tree_int_cst_lt (len1, diff))
+	    break;
+
+	  src_len = tree_low_cst (diff, 1);
+	  src_len += tree_low_cst (len2, 1);
+	  if (src_len < (unsigned HOST_WIDE_INT) tree_low_cst (len1, 1))
+	    src_len = tree_low_cst (len1, 1);
+	  if (src_len > 1024)
+	    break;
+
+	  do_fail = false;
+	  if (lhs1 != NULL
+	      && DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY)
+	    {
+	      if (TREE_CODE (lhs1) != SSA_NAME)
+		break;
+	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs1)
+		{
+		  if (USE_STMT (use_p) != stmt2)
+		    {
+		      do_fail = true;
+		      break;
+		    }
+		}
+	      if (do_fail)
+		break;
+	    }
+	  vdef = gimple_vdef (stmt1);
+	  if (vdef != NULL)
+	    {
+	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, vdef)
+		{
+		  if (USE_STMT (use_p) != stmt2)
+		    {
+		      do_fail = true;
+		      break;
+		    }
+		}
+	      if (do_fail)
+		break;
+	    }
+
+	  ptr1_align = get_pointer_alignment (ptr1, BIGGEST_ALIGNMENT);
+	  src_buf = XALLOCAVEC (char, src_len + 1);
+	  if (callee1)
+	    memcpy (src_buf,
+		    TREE_STRING_POINTER (str1) + tree_low_cst (off1, 1),
+		    tree_low_cst (len1, 1));
+	  else
+	    src_buf[0] = tree_low_cst (src1, 0);
+	  memset (src_buf + tree_low_cst (diff, 1),
+		  tree_low_cst (val2, 1), tree_low_cst (len2, 1));
+	  src_buf[src_len] = '\0';
+	  if (strlen (src_buf) != src_len)
+	    break;
+	  rtl_profile_for_bb (gimple_bb (stmt2));
+	  if (!can_store_by_pieces (src_len,
+				    builtin_strncpy_read_str,
+				    src_buf, ptr1_align, false))
+	    break;
+
+	  new_str_cst = build_string_literal (src_len, src_buf);
+	  if (callee1)
+	    {
+	      if (DECL_FUNCTION_CODE (callee1) == BUILT_IN_MEMPCPY)
+		gimple_call_set_lhs (stmt1, NULL_TREE);
+	      gimple_call_set_arg (stmt1, 1, new_str_cst);
+	      gimple_call_set_arg (stmt1, 2,
+				   build_int_cst (TREE_TYPE (len1), src_len));
+	      update_stmt (stmt1);
+	      unlink_stmt_vdef (stmt2);
+	      gsi_remove (gsi_p, true);
+	      release_defs (stmt2);
+	      return true;
+	    }
+	  else
+	    {
+	      gimple_stmt_iterator gsi = gsi_for_stmt (stmt1);
+
+	      gimple_call_set_fndecl (stmt2, built_in_decls [BUILT_IN_MEMCPY]);
+	      gimple_call_set_arg (stmt2, 0, ptr1);
+	      gimple_call_set_arg (stmt2, 1, new_str_cst);
+	      gimple_call_set_arg (stmt2, 2,
+				   build_int_cst (TREE_TYPE (len2), src_len));
+	      unlink_stmt_vdef (stmt1);
+	      gsi_remove (&gsi, true);
+	      release_defs (stmt1);
+	      update_stmt (stmt2);
+	      return false;
+	    }
+	}
+      break;
+    default:
+      break;
+    }
+  return false;
+}
+
 /* Run bitwise and assignments throug the folder.  If the first argument is an
    ssa name that is itself a result of a typecast of an ADDR_EXPR to an
    integer, feed the ADDR_EXPR to the folder rather than the ssa name.
@@ -1784,6 +2075,14 @@  tree_ssa_forward_propagate_single_use_va
 					      WARN_STRICT_OVERFLOW_CONDITIONAL);
 	      gsi_next (&gsi);
 	    }
+	  else if (is_gimple_call (stmt))
+	    {
+	      tree callee = gimple_call_fndecl (stmt);
+	      if (callee == NULL_TREE
+		  || DECL_BUILT_IN_CLASS (callee) != BUILT_IN_NORMAL
+		  || !simplify_builtin_call (&gsi, callee))
+		gsi_next (&gsi);
+	    }
 	  else
 	    gsi_next (&gsi);
 	}
--- gcc/Makefile.in.jj	2010-10-11 07:51:09.000000000 +0200
+++ gcc/Makefile.in	2010-10-11 09:15:25.000000000 +0200
@@ -2404,7 +2404,7 @@  tree-ssa-dse.o : tree-ssa-dse.c $(CONFIG
 tree-ssa-forwprop.o : tree-ssa-forwprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
-   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h
+   langhooks.h $(FLAGS_H) $(GIMPLE_H) tree-pretty-print.h $(EXPR_H)
 tree-ssa-phiprop.o : tree-ssa-phiprop.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(TM_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) $(DIAGNOSTIC_H) $(TIMEVAR_H) \
--- gcc/testsuite/gcc.c-torture/execute/pr45636.c.jj	2010-10-11 09:15:25.000000000 +0200
+++ gcc/testsuite/gcc.c-torture/execute/pr45636.c	2010-10-11 09:15:25.000000000 +0200
@@ -0,0 +1,75 @@ 
+/* PR fortran/45636 */
+
+typedef __SIZE_TYPE__ size_t;
+void *memcpy (void *__restrict__, const void *__restrict__, size_t);
+void *mempcpy (void *__restrict__, const void *__restrict__, size_t);
+void *memset (void *, int, size_t);
+int memcmp (const void *, const void *, size_t);
+extern void abort (void);
+
+struct A { int i; char c[32]; } a[2];
+
+__attribute__((noinline, noclone)) int
+f1 (char *p, int q, int z)
+{
+  memcpy (p, "abcd", 4);
+  if (q)
+    z = z + 123;
+  else
+    z *= 114;
+  memset (p + 4, ' ', 2);
+  return z;
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  char *p = mempcpy (&a[0].c[13], "123456", 4);
+  memset (p, '7', 3);
+}
+
+__attribute__((noinline, noclone)) void
+f3 (struct A *p)
+{
+  p++;
+  char *q = &p->c[10];
+  memcpy (q + 4, "__1234567" + 2, 7);
+  memset (&p->c[21], '9', 3);
+}
+
+__attribute__((noinline, noclone)) void
+f4 (void)
+{
+  memcpy (&a[0].c[10], "0123456789", 10);
+  memset (&a[0].c[13], ' ', 3);
+}
+
+__attribute__((noinline, noclone)) void
+check (const char *p, const char *str, size_t size)
+{
+  const char *q;
+  for (q = (const char *) &a; q < p; q++)
+    if (*q)
+      abort ();
+  if (memcmp (p, str, size) != 0)
+    abort ();
+  for (q = p + size; q < (const char *) (&a[0] + 2); q++)
+    if (*q)
+      abort ();
+  memset (&a, '\0', sizeof a);
+}
+
+int
+main (void)
+{
+  if (f1 (&a[0].c[7], 1, 2) != 125)
+    abort ();
+  check (&a[0].c[7], "abcd  ", 6);
+  f2 ();
+  check (&a[0].c[13], "1234777", 7);
+  f3 (&a[0]);
+  check (&a[1].c[14], "1234567999", 10);
+  f4 ();
+  check (&a[0].c[10], "012   6789", 10);
+  return 0;
+}
--- gcc/testsuite/gfortran.dg/pr45636.f90.jj	2010-10-11 13:19:41.000000000 +0200
+++ gcc/testsuite/gfortran.dg/pr45636.f90	2010-10-11 16:49:19.000000000 +0200
@@ -0,0 +1,14 @@ 
+! PR fortran/45636
+! { dg-do compile }
+! { dg-options "-O2 -fdump-tree-forwprop2" }
+! PR 45636 - make sure no memset is needed for a short right-hand side.
+program main
+  character(len=2), parameter :: x='a '
+  character(len=1), parameter :: y='b'
+  character(len=4) :: a, b
+  a = x
+  b = y
+  call sub(a, b)
+end program main
+! { dg-final { scan-tree-dump-times "memset" 0 "forwprop2" } }
+! { dg-final { cleanup-tree-dump "forwprop2" } }