diff mbox series

[v2] Re: Introduce -finline-stringops

Message ID orled0dnm0.fsf_-_@lxoliva.fsfla.org
State New
Headers show
Series [v2] Re: Introduce -finline-stringops | expand

Commit Message

Alexandre Oliva Sept. 21, 2023, 6:52 a.m. UTC
On Sep 15, 2023, Alexandre Oliva <oliva@adacore.com> wrote:

> On Jun 22, 2023, Alexandre Oliva <oliva@adacore.com> wrote:
>> On Jun  2, 2023, Alexandre Oliva <oliva@adacore.com> wrote:
>>> Introduce -finline-stringops

>> Ping?  https://gcc.gnu.org/pipermail/gcc-patches/2023-June/620472.html

> Ping?

Here's a refreshed and improved patch, that improves the memcmp
expansion a little, dropping redundant barriers and conditional
branches.

Regstrapped on x86_64-linux-gnu, and also bootstrap with
-finline-stringops enabled by default.  Ok to install?


Introduce -finline-stringops

try_store_by_multiple_pieces was added not long ago, enabling
variable-sized memset to be expanded inline when the worst-case
in-range constant length would, using conditional blocks with powers
of two to cover all possibilities of length and alignment.

This patch introduces -finline-stringops[=fn] to request expansions to
start with a loop, so as to still take advantage of known alignment
even with long lengths, but without necessarily adding store blocks
for every power of two.

This makes it possible for the supported stringops (memset, memcpy,
memmove, memset) to be expanded, even if storing a single byte per
iteration.  Surely efficient implementations can run faster, with a
pre-loop to increase alignment, but that would likely be excessive for
inline expansions.

Still, in some cases, such as in freestanding environments, users
prefer to inline such stringops, especially those that the compiler
may introduce itself, even if the expansion is not as performant as a
highly optimized C library implementation could be, to avoid
depending on a C runtime library.


for  gcc/ChangeLog

	* expr.cc (emit_block_move_hints): Take ctz of len.  Obey
	-finline-stringops.  Use oriented or sized loop.
	(emit_block_move): Take ctz of len, and pass it on.
	(emit_block_move_via_sized_loop): New.
	(emit_block_move_via_oriented_loop): New.
	(emit_block_move_via_loop): Take incr.  Move an incr-sized
	block per iteration.
	(emit_block_cmp_via_cmpmem): Take ctz of len.  Obey
	-finline-stringops.
	(emit_block_cmp_via_loop): New.
	* expr.h (emit_block_move): Add ctz of len defaulting to zero.
	(emit_block_move_hints): Likewise.
	(emit_block_cmp_hints): Likewise.
	* builtins.cc (expand_builtin_memory_copy_args): Pass ctz of
	len to emit_block_move_hints.
	(try_store_by_multiple_pieces): Support starting with a loop.
	(expand_builtin_memcmp): Pass ctz of len to
	emit_block_cmp_hints.
	(expand_builtin): Allow inline expansion of memset, memcpy,
	memmove and memcmp if requested.
	* common.opt (finline-stringops): New.
	(ilsop_fn): New enum.
	* flag-types.h (enum ilsop_fn): New.
	* doc/invoke.texi (-finline-stringops): Add.

for  gcc/testsuite/ChangeLog

	* gcc.dg/torture/inline-mem-cmp-1.c: New.
	* gcc.dg/torture/inline-mem-cpy-1.c: New.
	* gcc.dg/torture/inline-mem-cpy-cmp-1.c: New.
	* gcc.dg/torture/inline-mem-move-1.c: New.
	* gcc.dg/torture/inline-mem-set-1.c: New.
---
 gcc/builtins.cc                                    |  114 +++++-
 gcc/common.opt                                     |   34 ++
 gcc/doc/invoke.texi                                |   15 +
 gcc/expr.cc                                        |  396 +++++++++++++++++++-
 gcc/expr.h                                         |    9 
 gcc/flag-types.h                                   |   11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c    |    7 
 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c    |    8 
 .../gcc.dg/torture/inline-mem-cpy-cmp-1.c          |   11 +
 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c   |    9 
 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c    |   84 ++++
 11 files changed, 668 insertions(+), 30 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
 create mode 100644 gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
 create mode 100644 gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
 create mode 100644 gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
diff mbox series

Patch

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 3b453b3ec8c35..23646da09f8f1 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -3771,7 +3771,7 @@  expand_builtin_memory_copy_args (tree dest, tree src, tree len,
 				     expected_align, expected_size,
 				     min_size, max_size, probable_max_size,
 				     use_mempcpy_call, &is_move_done,
-				     might_overlap);
+				     might_overlap, tree_ctz (len));
 
   /* Bail out when a mempcpy call would be expanded as libcall and when
      we have a target that provides a fast implementation
@@ -4337,6 +4337,10 @@  try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   int tst_bits = (max_bits != min_bits ? max_bits
 		  : floor_log2 (max_len ^ min_len));
 
+  /* Save the pre-blksize values.  */
+  int orig_max_bits = max_bits;
+  int orig_tst_bits = tst_bits;
+
   /* Check whether it's profitable to start by storing a fixed BLKSIZE
      bytes, to lower max_bits.  In the unlikely case of a constant LEN
      (implied by identical MAX_LEN and MIN_LEN), we want to issue a
@@ -4376,9 +4380,70 @@  try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
   if (max_bits >= 0)
     xlenest += ((HOST_WIDE_INT_1U << max_bits) * 2
 		- (HOST_WIDE_INT_1U << ctz_len));
-  if (!can_store_by_pieces (xlenest, builtin_memset_read_str,
-			    &valc, align, true))
-    return false;
+  bool max_loop = false;
+  /* Skip the test in case of overflow in xlenest.  It shouldn't
+     happen because of the way max_bits and blksize are related, but
+     it doesn't hurt to test.  */
+  if (blksize > xlenest
+      || !can_store_by_pieces (xlenest, builtin_memset_read_str,
+			       &valc, align, true))
+    {
+      if (!(flag_inline_stringops & ILSOP_MEMSET))
+	return false;
+
+      for (max_bits = orig_max_bits;
+	   max_bits >= sctz_len;
+	   --max_bits)
+	{
+	  xlenest = ((HOST_WIDE_INT_1U << max_bits) * 2
+		     - (HOST_WIDE_INT_1U << ctz_len));
+	  /* Check that blksize plus the bits to be stored as blocks
+	     sized at powers of two can be stored by pieces.  This is
+	     like the test above, but with smaller max_bits.  Skip
+	     orig_max_bits (it would be redundant).  Also skip in case
+	     of overflow.  */
+	  if (max_bits < orig_max_bits
+	      && xlenest + blksize >= xlenest
+	      && can_store_by_pieces (xlenest + blksize,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_loop = true;
+	      break;
+	    }
+	  if (blksize
+	      && can_store_by_pieces (xlenest,
+				      builtin_memset_read_str,
+				      &valc, align, true))
+	    {
+	      max_len += blksize;
+	      min_len += blksize;
+	      tst_bits = orig_tst_bits;
+	      blksize = 0;
+	      max_loop = true;
+	      break;
+	    }
+	  if (max_bits == sctz_len)
+	    {
+	      --sctz_len;
+	      --ctz_len;
+	    }
+	}
+      if (!max_loop)
+	return false;
+      /* If the boundaries are such that min and max may run a
+	 different number of trips in the initial loop, the remainder
+	 needs not be between the moduli, so set tst_bits to cover all
+	 bits.  Otherwise, if the trip counts are the same, max_len
+	 has the common prefix, and the previously-computed tst_bits
+	 is usable.  */
+      if (max_len >> max_bits > min_len >> max_bits)
+	tst_bits = max_bits;
+    }
+  /* ??? Do we have to check that all powers of two lengths from
+     max_bits down to ctz_len pass can_store_by_pieces?  As in, could
+     it possibly be that xlenest passes while smaller power-of-two
+     sizes don't?  */
 
   by_pieces_constfn constfun;
   void *constfundata;
@@ -4420,7 +4485,9 @@  try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
      the least significant bit possibly set in the length.  */
   for (int i = max_bits; i >= sctz_len; i--)
     {
+      rtx_code_label *loop_label = NULL;
       rtx_code_label *label = NULL;
+
       blksize = HOST_WIDE_INT_1U << i;
 
       /* If we're past the bits shared between min_ and max_len, expand
@@ -4434,18 +4501,31 @@  try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 				   profile_probability::even ());
 	}
       /* If we are at a bit that is in the prefix shared by min_ and
-	 max_len, skip this BLKSIZE if the bit is clear.  */
-      else if ((max_len & blksize) == 0)
+	 max_len, skip the current BLKSIZE if the bit is clear, but do
+	 not skip the loop, even if it doesn't require
+	 prechecking.  */
+      else if ((max_len & blksize) == 0
+	       && !(max_loop && i == max_bits))
 	continue;
 
+      if (max_loop && i == max_bits)
+	{
+	  loop_label = gen_label_rtx ();
+	  emit_label (loop_label);
+	  /* Since we may run this multiple times, don't assume we
+	     know anything about the offset.  */
+	  clear_mem_offset (to);
+	}
+
       /* Issue a store of BLKSIZE bytes.  */
+      bool update_needed = i != sctz_len || loop_label;
       to = store_by_pieces (to, blksize,
 			    constfun, constfundata,
 			    align, true,
-			    i != sctz_len ? RETURN_END : RETURN_BEGIN);
+			    update_needed ? RETURN_END : RETURN_BEGIN);
 
       /* Adjust REM and PTR, unless this is the last iteration.  */
-      if (i != sctz_len)
+      if (update_needed)
 	{
 	  emit_move_insn (ptr, force_operand (XEXP (to, 0), NULL_RTX));
 	  to = replace_equiv_address (to, ptr);
@@ -4453,6 +4533,11 @@  try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
 	  emit_move_insn (rem, force_operand (rem_minus_blksize, NULL_RTX));
 	}
 
+      if (loop_label)
+	emit_cmp_and_jump_insns (rem, GEN_INT (blksize), GE, NULL,
+				 ptr_mode, 1, loop_label,
+				 profile_probability::likely ());
+
       if (label)
 	{
 	  emit_label (label);
@@ -4739,7 +4824,8 @@  expand_builtin_memcmp (tree exp, rtx target, bool result_eq)
   result = emit_block_cmp_hints (arg1_rtx, arg2_rtx, len_rtx,
 				 TREE_TYPE (len), target,
 				 result_eq, constfn,
-				 CONST_CAST (char *, rep));
+				 CONST_CAST (char *, rep),
+				 tree_ctz (len));
 
   if (result)
     {
@@ -7382,7 +7468,15 @@  expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode,
       && fcode != BUILT_IN_EXECVE
       && fcode != BUILT_IN_CLEAR_CACHE
       && !ALLOCA_FUNCTION_CODE_P (fcode)
-      && fcode != BUILT_IN_FREE)
+      && fcode != BUILT_IN_FREE
+      && (fcode != BUILT_IN_MEMSET
+	  || !(flag_inline_stringops & ILSOP_MEMSET))
+      && (fcode != BUILT_IN_MEMCPY
+	  || !(flag_inline_stringops & ILSOP_MEMCPY))
+      && (fcode != BUILT_IN_MEMMOVE
+	  || !(flag_inline_stringops & ILSOP_MEMMOVE))
+      && (fcode != BUILT_IN_MEMCMP
+	  || !(flag_inline_stringops & ILSOP_MEMCMP)))
     return expand_call (exp, target, ignore);
 
   /* The built-in function expanders test for target == const0_rtx
diff --git a/gcc/common.opt b/gcc/common.opt
index f137a1f81ac8a..1e3f8efbc97d0 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1908,6 +1908,40 @@  finline-atomics
 Common Var(flag_inline_atomics) Init(1) Optimization
 Inline __atomic operations when a lock free instruction sequence is available.
 
+finline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_ALL) Enum(ilsop_fn) Init(ILSOP_NONE) Optimization Undocumented
+
+fno-inline-stringops
+Common RejectNegative Enum(ilsop_fn) Var(flag_inline_stringops, ILSOP_NONE) Enum(ilsop_fn) Optimization Undocumented
+
+finline-stringops=
+Common Joined Var(flag_inline_stringops) EnumSet Enum(ilsop_fn) Optimization
+-finline-stringops[=memcmp|memcpy|memmove|memset]
+Expand supported mem/str operations inline, even if against optimization.
+
+Enum
+Name(ilsop_fn) Type(enum ilsop_fn) UnknownError(unavailable stringop for inlining %qs)
+
+; This is not part of any set.
+; EnumValue
+; Enum(ilsop_fn) String(none) Value(ILSOP_NONE)
+
+EnumValue
+Enum(ilsop_fn) String(memcmp) Value(ILSOP_MEMCMP) Set(1)
+
+EnumValue
+Enum(ilsop_fn) String(memcpy) Value(ILSOP_MEMCPY) Set(2)
+
+EnumValue
+Enum(ilsop_fn) String(memmove) Value(ILSOP_MEMMOVE) Set(3)
+
+EnumValue
+Enum(ilsop_fn) String(memset) Value(ILSOP_MEMSET) Set(4)
+
+; This is not part of any set either.
+; EnumValue
+; Enum(ilsop_fn) String(all) Value(ILSOP_ALL)
+
 fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 03d93e6b1850c..b0649712792fa 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -556,6 +556,7 @@  Objective-C and Objective-C++ Dialects}.
 -fgcse  -fgcse-after-reload  -fgcse-las  -fgcse-lm  -fgraphite-identity
 -fgcse-sm  -fhoist-adjacent-loads  -fif-conversion
 -fif-conversion2  -findirect-inlining
+-finline-stringops[=@var{fn}]
 -finline-functions  -finline-functions-called-once  -finline-limit=@var{n}
 -finline-small-functions -fipa-modref -fipa-cp  -fipa-cp-clone
 -fipa-bit-cp  -fipa-vrp  -fipa-pta  -fipa-profile  -fipa-pure-const
@@ -12228,6 +12229,20 @@  their @code{_FORTIFY_SOURCE} counterparts into faster alternatives.
 
 Enabled at levels @option{-O2}, @option{-O3}.
 
+@opindex finline-stringops
+@item -finline-stringops[=@var{fn}]
+Expand memory and string operations (for now, only @code{memset})
+inline, even when the length is variable or big enough as to require
+looping.  This is most useful along with @option{-ffreestanding} and
+@option{-fno-builtin}.
+
+In some circumstances, it enables the compiler to generate code that
+takes advantage of known alignment and length multipliers, but even then
+it may be less efficient than optimized runtime implementations, and
+grow code size so much that even a less performant but shared
+implementation runs faster due to better use of code caches.  This
+option is disabled by default.
+
 @opindex fno-inline
 @opindex finline
 @item -fno-inline
diff --git a/gcc/expr.cc b/gcc/expr.cc
index d5b6494b4fc1d..c6842996b9876 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -80,7 +80,11 @@  static bool emit_block_move_via_pattern (rtx, rtx, rtx, unsigned, unsigned,
 					 HOST_WIDE_INT, unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT,
 					 unsigned HOST_WIDE_INT, bool);
-static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned);
+static void emit_block_move_via_loop (rtx, rtx, rtx, unsigned, int);
+static void emit_block_move_via_sized_loop (rtx, rtx, rtx, unsigned, unsigned);
+static void emit_block_move_via_oriented_loop (rtx, rtx, rtx, unsigned, unsigned);
+static rtx emit_block_cmp_via_loop (rtx, rtx, rtx, tree, rtx, bool,
+				    unsigned, unsigned);
 static void clear_by_pieces (rtx, unsigned HOST_WIDE_INT, unsigned int);
 static rtx_insn *compress_float_constant (rtx, rtx);
 static rtx get_subtarget (rtx);
@@ -1955,6 +1959,8 @@  compare_by_pieces (rtx arg0, rtx arg1, unsigned HOST_WIDE_INT len,
    MIN_SIZE is the minimal size of block to move
    MAX_SIZE is the maximal size of block to move, if it cannot be represented
    in unsigned HOST_WIDE_INT, than it is mask of all ones.
+   CTZ_SIZE is the trailing-zeros count of SIZE; even a nonconstant SIZE is
+   known to be a multiple of 1<<CTZ_SIZE.
 
    Return the address of the new block, if memcpy is called and returns it,
    0 otherwise.  */
@@ -1966,7 +1972,7 @@  emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 		       unsigned HOST_WIDE_INT max_size,
 		       unsigned HOST_WIDE_INT probable_max_size,
 		       bool bail_out_libcall, bool *is_move_done,
-		       bool might_overlap)
+		       bool might_overlap, unsigned ctz_size)
 {
   int may_use_call;
   rtx retval = 0;
@@ -2052,6 +2058,14 @@  emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 	}
     }
 
+  bool dynamic_direction = false;
+  if (!pattern_ok && !pieces_ok && may_use_call
+      && (flag_inline_stringops & (might_overlap ? ILSOP_MEMMOVE : ILSOP_MEMCPY)))
+    {
+      may_use_call = 0;
+      dynamic_direction = might_overlap;
+    }
+
   if (pattern_ok)
     ;
   else if (pieces_ok)
@@ -2073,10 +2087,12 @@  emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
       retval = emit_block_copy_via_libcall (x, y, size,
 					    method == BLOCK_OP_TAILCALL);
     }
+  else if (dynamic_direction)
+    emit_block_move_via_oriented_loop (x, y, size, align, ctz_size);
   else if (might_overlap)
     *is_move_done = false;
   else
-    emit_block_move_via_loop (x, y, size, align);
+    emit_block_move_via_sized_loop (x, y, size, align, ctz_size);
 
   if (method == BLOCK_OP_CALL_PARM)
     OK_DEFER_POP;
@@ -2085,7 +2101,8 @@  emit_block_move_hints (rtx x, rtx y, rtx size, enum block_op_methods method,
 }
 
 rtx
-emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
+emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method,
+		 unsigned int ctz_size)
 {
   unsigned HOST_WIDE_INT max, min = 0;
   if (GET_CODE (size) == CONST_INT)
@@ -2093,7 +2110,8 @@  emit_block_move (rtx x, rtx y, rtx size, enum block_op_methods method)
   else
     max = GET_MODE_MASK (GET_MODE (size));
   return emit_block_move_hints (x, y, size, method, 0, -1,
-				min, max, max);
+				min, max, max,
+				false, NULL, false, ctz_size);
 }
 
 /* A subroutine of emit_block_move.  Returns true if calling the
@@ -2255,13 +2273,117 @@  emit_block_move_via_pattern (rtx x, rtx y, rtx size, unsigned int align,
   return false;
 }
 
+/* Like emit_block_move_via_loop, but choose a suitable INCR based on
+   ALIGN and CTZ_SIZE.  */
+
+static void
+emit_block_move_via_sized_loop (rtx x, rtx y, rtx size,
+				unsigned int align,
+				unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !can_move_by_pieces (incr, align))
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  return emit_block_move_via_loop (x, y, size, align, incr);
+}
+
+/* Like emit_block_move_via_sized_loop, but besides choosing INCR so
+   as to ensure safe moves even in case of overlap, output dynamic
+   tests to choose between two loops, one moving downwards, another
+   moving upwards.  */
+
+static void
+emit_block_move_via_oriented_loop (rtx x, rtx y, rtx size,
+				   unsigned int align,
+				   unsigned int ctz_size)
+{
+  int incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (size))
+    ctz_size = MAX (ctz_size, (unsigned) wi::ctz (UINTVAL (size)));
+
+  if (HOST_WIDE_INT_1U << ctz_size < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_size;
+
+  while (incr > 1 && !int_mode_for_size (incr, 0).exists ())
+    incr >>= 1;
+
+  gcc_checking_assert (incr);
+
+  rtx_code_label *upw_label, *end_label;
+  upw_label = gen_label_rtx ();
+  end_label = gen_label_rtx ();
+
+  rtx x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  rtx y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  machine_mode mode = GET_MODE (x_addr);
+  if (mode != GET_MODE (y_addr))
+    {
+      scalar_int_mode xmode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE (mode));
+      scalar_int_mode ymode
+	= smallest_int_mode_for_size (GET_MODE_BITSIZE
+				      (GET_MODE (y_addr)));
+      if (GET_MODE_BITSIZE (xmode) < GET_MODE_BITSIZE (ymode))
+	mode = ymode;
+      else
+	mode = xmode;
+
+#ifndef POINTERS_EXTEND_UNSIGNED
+      const int POINTERS_EXTEND_UNSIGNED = 1;
+#endif
+      x_addr = convert_modes (mode, GET_MODE (x_addr), x_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+      y_addr = convert_modes (mode, GET_MODE (y_addr), y_addr,
+			      POINTERS_EXTEND_UNSIGNED);
+    }
+
+  /* Test for overlap: if (x >= y || x + size <= y) goto upw_label.  */
+  emit_cmp_and_jump_insns (x_addr, y_addr, GEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (5, 10));
+  rtx tmp = convert_modes (GET_MODE (x_addr), GET_MODE (size), size, true);
+  tmp = simplify_gen_binary (PLUS, GET_MODE (x_addr), x_addr, tmp);
+
+  emit_cmp_and_jump_insns (tmp, y_addr, LEU, NULL_RTX, mode,
+			   true, upw_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (8, 10));
+
+  emit_block_move_via_loop (x, y, size, align, -incr);
+
+  emit_jump (end_label);
+  emit_label (upw_label);
+
+  emit_block_move_via_loop (x, y, size, align, incr);
+
+  emit_label (end_label);
+}
+
 /* A subroutine of emit_block_move.  Copy the data via an explicit
-   loop.  This is used only when libcalls are forbidden.  */
-/* ??? It'd be nice to copy in hunks larger than QImode.  */
+   loop.  This is used only when libcalls are forbidden, or when
+   inlining is required.  INCR is the block size to be copied in each
+   loop iteration.  If it is negative, the absolute value is used, and
+   the block is copied backwards.  INCR must be a power of two, an
+   exact divisor for SIZE and ALIGN, and imply a mode that can be
+   safely copied per iteration assuming no overlap.  */
 
 static void
 emit_block_move_via_loop (rtx x, rtx y, rtx size,
-			  unsigned int align ATTRIBUTE_UNUSED)
+			  unsigned int align, int incr)
 {
   rtx_code_label *cmp_label, *top_label;
   rtx iter, x_addr, y_addr, tmp;
@@ -2277,7 +2399,38 @@  emit_block_move_via_loop (rtx x, rtx y, rtx size,
   cmp_label = gen_label_rtx ();
   iter = gen_reg_rtx (iter_mode);
 
-  emit_move_insn (iter, const0_rtx);
+  bool downwards = incr < 0;
+  rtx iter_init;
+  rtx_code iter_cond;
+  rtx iter_limit;
+  rtx iter_incr;
+  machine_mode move_mode;
+  if (downwards)
+    {
+      incr = -incr;
+      iter_init = size;
+      iter_cond = GEU;
+      iter_limit = const0_rtx;
+      iter_incr = GEN_INT (incr);
+    }
+  else
+    {
+      iter_init = const0_rtx;
+      iter_cond = LTU;
+      iter_limit = size;
+      iter_incr = GEN_INT (incr);
+    }
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_move_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_move_mode) != incr * BITS_PER_UNIT)
+    {
+      move_mode = BLKmode;
+      gcc_checking_assert (can_move_by_pieces (incr, align));
+    }
+  else
+    move_mode = int_move_mode;
 
   x_addr = force_operand (XEXP (x, 0), NULL_RTX);
   y_addr = force_operand (XEXP (y, 0), NULL_RTX);
@@ -2293,19 +2446,32 @@  emit_block_move_via_loop (rtx x, rtx y, rtx size,
     tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
   y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
 
-  x = change_address (x, QImode, x_addr);
-  y = change_address (y, QImode, y_addr);
+  x = change_address (x, move_mode, x_addr);
+  y = change_address (y, move_mode, y_addr);
+
+  if (move_mode == BLKmode)
+    {
+      bool done;
+      emit_block_move_hints (x, y, iter_incr, BLOCK_OP_NO_LIBCALL,
+			     align, incr, incr, incr, incr,
+			     false, &done, false);
+      gcc_checking_assert (done);
+    }
+  else
+    emit_move_insn (x, y);
 
-  emit_move_insn (x, y);
+  if (downwards)
+    emit_label (cmp_label);
 
-  tmp = expand_simple_binop (iter_mode, PLUS, iter, const1_rtx, iter,
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
 			     true, OPTAB_LIB_WIDEN);
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_label (cmp_label);
+  if (!downwards)
+    emit_label (cmp_label);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+  emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
 			   true, top_label,
 			   profile_probability::guessed_always ()
 				.apply_scale (9, 10));
@@ -2405,7 +2571,8 @@  emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 
    Both X and Y must be MEM rtx's.  LEN is an rtx that says how long
    they are.  LEN_TYPE is the type of the expression that was used to
-   calculate it.
+   calculate it, and CTZ_LEN is the known trailing-zeros count of LEN,
+   so LEN must be a multiple of 1<<CTZ_LEN even if it's not constant.
 
    If EQUALITY_ONLY is true, it means we don't have to return the tri-state
    value of a normal memcmp call, instead we can just compare for equality.
@@ -2421,7 +2588,7 @@  emit_block_cmp_via_cmpmem (rtx x, rtx y, rtx len, tree len_type, rtx target,
 rtx
 emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
 		      bool equality_only, by_pieces_constfn y_cfn,
-		      void *y_cfndata)
+		      void *y_cfndata, unsigned ctz_len)
 {
   rtx result = 0;
 
@@ -2443,8 +2610,203 @@  emit_block_cmp_hints (rtx x, rtx y, rtx len, tree len_type, rtx target,
   else
     result = emit_block_cmp_via_cmpmem (x, y, len, len_type, target, align);
 
+  if (!result && (flag_inline_stringops & ILSOP_MEMCMP))
+    result = emit_block_cmp_via_loop (x, y, len, len_type,
+				      target, equality_only,
+				      align, ctz_len);
+
   return result;
 }
+
+/* Like emit_block_cmp_hints, but with known alignment and no support
+   for constats.  Always expand to a loop with iterations that compare
+   blocks of the largest compare-by-pieces size that divides both len
+   and align, and then, if !EQUALITY_ONLY, identify the word and then
+   the unit that first differs to return the result.  */
+
+rtx
+emit_block_cmp_via_loop (rtx x, rtx y, rtx len, tree len_type, rtx target,
+			 bool equality_only, unsigned align, unsigned ctz_len)
+{
+  unsigned incr = align / BITS_PER_UNIT;
+
+  if (CONST_INT_P (len))
+    ctz_len = MAX (ctz_len, (unsigned) wi::ctz (UINTVAL (len)));
+
+  if (HOST_WIDE_INT_1U << ctz_len < (unsigned HOST_WIDE_INT) incr)
+    incr = HOST_WIDE_INT_1U << ctz_len;
+
+  while (incr > 1
+	 && !can_do_by_pieces (incr, align, COMPARE_BY_PIECES))
+    incr >>= 1;
+
+  rtx_code_label *cmp_label, *top_label, *ne_label, *res_label;
+  rtx iter, x_addr, y_addr, tmp;
+  machine_mode x_addr_mode = get_address_mode (x);
+  machine_mode y_addr_mode = get_address_mode (y);
+  machine_mode iter_mode;
+
+  iter_mode = GET_MODE (len);
+  if (iter_mode == VOIDmode)
+    iter_mode = word_mode;
+
+  rtx iter_init = const0_rtx;
+  rtx_code iter_cond = LTU;
+  rtx_code entry_cond = GEU;
+  rtx iter_limit = len;
+  rtx iter_incr = GEN_INT (incr);
+  machine_mode cmp_mode;
+
+  /* We can drop the loop back edge if we know there's exactly one
+     iteration.  */
+  top_label = (!rtx_equal_p (len, iter_incr)
+	       ? gen_label_rtx ()
+	       : NULL);
+  /* We need not test before entering the loop if len is known
+     nonzero.  ??? This could be even stricter, testing whether a
+     nonconstant LEN could possibly be zero.  */
+  cmp_label = (!CONSTANT_P (len) || rtx_equal_p (len, iter_init)
+	       ? gen_label_rtx ()
+	       : NULL);
+  ne_label = gen_label_rtx ();
+  res_label = gen_label_rtx ();
+
+  iter = gen_reg_rtx (iter_mode);
+  emit_move_insn (iter, iter_init);
+
+  scalar_int_mode int_cmp_mode
+    = smallest_int_mode_for_size (incr * BITS_PER_UNIT);
+  if (GET_MODE_BITSIZE (int_cmp_mode) != incr * BITS_PER_UNIT
+      || !can_compare_p (NE, int_cmp_mode, ccp_jump))
+    {
+      cmp_mode = BLKmode;
+      gcc_checking_assert (incr != 1);
+    }
+  else
+    cmp_mode = int_cmp_mode;
+
+  /* Save the base addresses.  */
+  x_addr = force_operand (XEXP (x, 0), NULL_RTX);
+  y_addr = force_operand (XEXP (y, 0), NULL_RTX);
+  do_pending_stack_adjust ();
+
+  if (cmp_label)
+    {
+      if (top_label)
+	emit_jump (cmp_label);
+      else
+	emit_cmp_and_jump_insns (iter, iter_limit, entry_cond,
+				 NULL_RTX, iter_mode,
+				 true, cmp_label,
+				 profile_probability::guessed_always ()
+				 .apply_scale (1, 10));
+    }
+  if (top_label)
+    emit_label (top_label);
+
+  /* Offset the base addresses by ITER.  */
+  tmp = convert_modes (x_addr_mode, iter_mode, iter, true);
+  x_addr = simplify_gen_binary (PLUS, x_addr_mode, x_addr, tmp);
+
+  if (x_addr_mode != y_addr_mode)
+    tmp = convert_modes (y_addr_mode, iter_mode, iter, true);
+  y_addr = simplify_gen_binary (PLUS, y_addr_mode, y_addr, tmp);
+
+  x = change_address (x, cmp_mode, x_addr);
+  y = change_address (y, cmp_mode, y_addr);
+
+  /* Compare one block.  */
+  rtx part_res;
+  if (cmp_mode == BLKmode)
+    part_res = compare_by_pieces (x, y, incr, target, align, 0, 0);
+  else
+    part_res = expand_binop (cmp_mode, sub_optab, x, y, NULL_RTX,
+			     true, OPTAB_LIB_WIDEN);
+
+  /* Stop if we found a difference.  */
+  emit_cmp_and_jump_insns (part_res, GEN_INT (0), NE, NULL_RTX,
+			   GET_MODE (part_res), true, ne_label,
+			   profile_probability::guessed_always ()
+				.apply_scale (1, 10));
+
+  /* Increment ITER.  */
+  tmp = expand_simple_binop (iter_mode, PLUS, iter, iter_incr, iter,
+			     true, OPTAB_LIB_WIDEN);
+  if (tmp != iter)
+    emit_move_insn (iter, tmp);
+
+  if (cmp_label)
+    emit_label (cmp_label);
+  /* Loop until we reach the limit.  */
+
+  if (top_label)
+    emit_cmp_and_jump_insns (iter, iter_limit, iter_cond, NULL_RTX, iter_mode,
+			     true, top_label,
+			     profile_probability::guessed_always ()
+			     .apply_scale (9, 10));
+
+  /* We got to the end without differences, so the result is zero.  */
+  if (target == NULL_RTX
+      || !REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
+    target = gen_reg_rtx (TYPE_MODE (integer_type_node));
+
+  emit_move_insn (target, const0_rtx);
+  emit_jump (res_label);
+
+  emit_label (ne_label);
+
+  /* Return nonzero, or pinpoint the difference to return the expected
+     result for non-equality tests.  */
+  if (equality_only)
+    emit_move_insn (target, const1_rtx);
+  else
+    {
+      if (incr > UNITS_PER_WORD)
+	/* ??? Re-compare the block found to be different one word at a
+	   time.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_WORD, 0);
+      else if (incr > 1)
+	/* ??? Re-compare the block found to be different one byte at a
+	   time.  We could do better using part_res, and being careful
+	   about endianness.  */
+	part_res = emit_block_cmp_via_loop (x, y, GEN_INT (incr), len_type,
+					    target, equality_only,
+					    BITS_PER_UNIT, 0);
+      else if (known_gt (GET_MODE_BITSIZE (GET_MODE (target)),
+			 GET_MODE_BITSIZE (cmp_mode)))
+	part_res = expand_binop (GET_MODE (target), sub_optab, x, y, target,
+				 true, OPTAB_LIB_WIDEN);
+      else
+	{
+	  /* In the odd chance target is QImode, we can't count on
+	     widening subtract to capture the result of the unsigned
+	     compares.  */
+	  rtx_code_label *ltu_label;
+	  ltu_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (x, y, LTU, NULL_RTX,
+				   cmp_mode, true, ltu_label,
+				   profile_probability::guessed_always ()
+				   .apply_scale (5, 10));
+
+	  emit_move_insn (target, const1_rtx);
+	  emit_jump (res_label);
+
+	  emit_label (ltu_label);
+	  emit_move_insn (target, constm1_rtx);
+	  part_res = target;
+	}
+
+      if (target != part_res)
+	convert_move (target, part_res, false);
+    }
+
+  emit_label (res_label);
+
+  return target;
+}
+
 
 /* Copy all or part of a value X into registers starting at REGNO.
    The number of registers to be filled is NREGS.  */
diff --git a/gcc/expr.h b/gcc/expr.h
index 11bff53186206..988c783e45033 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -126,7 +126,8 @@  struct by_pieces_prev
   fixed_size_mode mode;
 };
 
-extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
+extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods,
+			    unsigned ctz_size = 0);
 extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 			          unsigned int, HOST_WIDE_INT,
 				  unsigned HOST_WIDE_INT,
@@ -134,9 +135,11 @@  extern rtx emit_block_move_hints (rtx, rtx, rtx, enum block_op_methods,
 				  unsigned HOST_WIDE_INT,
 				  bool bail_out_libcall = false,
 				  bool *is_move_done = NULL,
-				  bool might_overlap = false);
+				  bool might_overlap = false,
+				  unsigned ctz_size = 0);
 extern rtx emit_block_cmp_hints (rtx, rtx, rtx, tree, rtx, bool,
-				 by_pieces_constfn, void *);
+				 by_pieces_constfn, void *,
+				 unsigned ctz_len = 0);
 extern bool emit_storent_insn (rtx to, rtx from);
 
 /* Copy all or part of a value X into registers starting at REGNO.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index 7466c1106f2ba..7a20cfd5de3de 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -437,6 +437,17 @@  enum gfc_convert
 };
 
 
+/* Inline String Operations functions.  */
+enum ilsop_fn
+{
+  ILSOP_NONE = 0,
+  ILSOP_MEMSET = 1 << 0,
+  ILSOP_MEMCPY = 1 << 1,
+  ILSOP_MEMMOVE = 1 << 2,
+  ILSOP_MEMCMP = 1 << 3,
+  ILSOP_ALL = -1
+};
+
 /* Control-Flow Protection values.  */
 enum cf_protection_level
 {
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
new file mode 100644
index 0000000000000..a368f0741129d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cmp-1.c
@@ -0,0 +1,7 @@ 
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcmp -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+
+/* Check that no memcmp calls remain, but allow for lib_memcmp calls.  */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
new file mode 100644
index 0000000000000..c98e903c1f169
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-1.c
@@ -0,0 +1,8 @@ 
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memcpy -save-temps -g0 -fno-lto" } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
new file mode 100644
index 0000000000000..2cd2057a9b597
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-cpy-cmp-1.c
@@ -0,0 +1,11 @@ 
+/* { dg-do run } */
+/* { dg-options "-finline-stringops -save-temps -g0 -fno-lto" } */
+/* { dg-require-effective-target ptr32plus } */
+/* { dg-timeout-factor 2 } */
+
+#include "../memcmp-1.c"
+/* Yeah, this memcmp test exercises plenty of memcpy, more than any of the
+   memcpy tests.  */
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {(^|\*)\mmemcmp\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
new file mode 100644
index 0000000000000..f1d77b50a4cd4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-move-1.c
@@ -0,0 +1,9 @@ 
+/* { dg-do run } */
+/* { dg-options "-finline-stringops=memmove -save-temps -g0 -fno-lto" } */
+
+#include "../../gcc.c-torture/execute/builtins/memmove.c"
+
+#include "../../gcc.c-torture/execute/builtins/lib/main.c"
+
+/* { dg-final { scan-assembler-not {\mmemcpy\M} } } */
+/* { dg-final { scan-assembler-not {\mmemmove\M} } } */
diff --git a/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
new file mode 100644
index 0000000000000..bdcf9bf5b61d8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/inline-mem-set-1.c
@@ -0,0 +1,84 @@ 
+/* { dg-do compile } */
+/* { dg-options "-finline-stringops -fno-lto" } */
+
+void *zero (unsigned long long (*p)[32], int n)
+{
+  return __builtin_memset (p, 0, n * sizeof (*p));
+}
+
+void *ones (char (*p)[128], int n)
+{
+  return __builtin_memset (p, -1, n * sizeof (*p));
+}
+
+void *opt2 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 2) * sizeof (*p));
+}
+
+void *opt8 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 8) * sizeof (*p));
+}
+
+void *opt32 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 32) * sizeof (*p));
+}
+
+void *opt128 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 128) * sizeof (*p));
+}
+
+void *opt512 (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 1024 : 512) * sizeof (*p));
+}
+
+void *opt_primes (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 509 : 7) * sizeof (*p));
+}
+
+void *opt_primes_blk (int *p, int i)
+{
+  return __builtin_memset (p, 0, (i ? 521 : 9) * sizeof (*p));
+}
+
+void *huge (long (*p)[16384])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1 (long (*p)[16384+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep4 (long (*p)[16384+4])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep16 (long (*p)[16384+16])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep64 (long (*p)[16384+64])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep256 (long (*p)[16384+256])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+void *hugep1024p256p64p16p4p1 (long (*p)[16384+1024+64+16+4+1])
+{
+  return __builtin_memset (p, 0, sizeof (*p));
+}
+
+/* { dg-final { scan-assembler-not {\mmemset\M} } } */