diff mbox series

[RS6000] Add movmemsi pattern for inline expansion of memmove()

Message ID cce00464-f26b-1aaf-22d2-798447d0cc95@linux.ibm.com
State New
Headers show
Series [RS6000] Add movmemsi pattern for inline expansion of memmove() | expand

Commit Message

Aaron Sawdey Sept. 30, 2019, 4:36 p.m. UTC
This patch uses the support added in the patch I posted last week for actually doing
inline expansion of memmove(). I've added a might_overlap parameter to expand_block_move()
to tell it when it must make sure to handle overlapping moves. I changed the code to
save up the generated rtx for both loads and stores instead of just stores. In the
might_overlap==true case, if we get to MAX_MOVE_REG and the move is not done yet, then
we bail out and return false. So what this can now do is inline expand any memmove()
that can be done in 4 loads followed by 4 stores. It will use lxv/stxv if size/alignment
allows, otherwise it will use unaligned integer loads/stores. So it can expand most
memmove() up to 32 bytes, and some that are 33-64 bytes if the arguments are 16 byte
aligned.

I've also removed the code from expand_block_move() for dealing with
mode==BLKmode because I don't believe that can happen. The big if construct that
figures out which size we are going to use has a plain else on it, and every
clause in it sets mode to something other than BLKmode. So I removed that code
to simplify things and just left a gcc_assert(mode != BLKmode).

Regtest in progress on ppc64le (power9), if tests are ok, is this ok for trunk
after the movmem optab patch posted last week is approved?

Thanks!
   Aaron

2019-09-30  Aaron Sawdey <acsawdey@linux.ibm.com>

	* config/rs6000/rs6000-protos.h (expand_block_move): Change prototype.
	* config/rs6000/rs6000-string.c (expand_block_move): Add might_overlap parm.
	* config/rs6000/rs6000.md (movmemsi): Add new pattern.
	(cpymemsi): Add might_overlap parm to expand_block_move() call.

Comments

Segher Boessenkool Oct. 1, 2019, 1:39 a.m. UTC | #1
Hi!

On Mon, Sep 30, 2019 at 11:36:31AM -0500, Aaron Sawdey wrote:
> This patch uses the support added in the patch I posted last week for actually doing
> inline expansion of memmove().

> I've also removed the code from expand_block_move() for dealing with
> mode==BLKmode because I don't believe that can happen. The big if construct that
> figures out which size we are going to use has a plain else on it, and every
> clause in it sets mode to something other than BLKmode. So I removed that code
> to simplify things and just left a gcc_assert(mode != BLKmode).

That looks fine, with that assert.

> 2019-09-30  Aaron Sawdey <acsawdey@linux.ibm.com>
> 
> 	* config/rs6000/rs6000-protos.h (expand_block_move): Change prototype.
> 	* config/rs6000/rs6000-string.c (expand_block_move): Add might_overlap parm.

This line is too long...  80 columns max.

> 	* config/rs6000/rs6000.md (movmemsi): Add new pattern.
> 	(cpymemsi): Add might_overlap parm to expand_block_move() call.


Okay for trunk.  Thanks!


Segher
diff mbox series

Patch

Index: gcc/config/rs6000/rs6000-protos.h
===================================================================
--- gcc/config/rs6000/rs6000-protos.h	(revision 276131)
+++ gcc/config/rs6000/rs6000-protos.h	(working copy)
@@ -69,7 +69,7 @@ 
 extern void rs6000_generate_float2_double_code (rtx, rtx, rtx);
 extern void rs6000_generate_vsigned2_code (bool, rtx, rtx, rtx);
 extern int expand_block_clear (rtx[]);
-extern int expand_block_move (rtx[]);
+extern int expand_block_move (rtx[], bool);
 extern bool expand_block_compare (rtx[]);
 extern bool expand_strn_compare (rtx[], int);
 extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode);
Index: gcc/config/rs6000/rs6000-string.c
===================================================================
--- gcc/config/rs6000/rs6000-string.c	(revision 276131)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -2719,7 +2719,7 @@ 
 #define MAX_MOVE_REG 4

 int
-expand_block_move (rtx operands[])
+expand_block_move (rtx operands[], bool might_overlap)
 {
   rtx orig_dest = operands[0];
   rtx orig_src	= operands[1];
@@ -2730,6 +2730,7 @@ 
   int bytes;
   int offset;
   int move_bytes;
+  rtx loads[MAX_MOVE_REG];
   rtx stores[MAX_MOVE_REG];
   int num_reg = 0;

@@ -2817,47 +2818,35 @@ 
 	  gen_func.mov = gen_movqi;
 	}

+      /* Mode is always set to something other than BLKmode by one of the
+	 cases of the if statement above.  */
+      gcc_assert (mode != BLKmode);
+
       src = adjust_address (orig_src, mode, offset);
       dest = adjust_address (orig_dest, mode, offset);

-      if (mode != BLKmode)
-	{
-	  rtx tmp_reg = gen_reg_rtx (mode);
+      rtx tmp_reg = gen_reg_rtx (mode);
+
+      loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
+      stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);

-	  emit_insn ((*gen_func.mov) (tmp_reg, src));
-	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
-	}
+      /* If we didn't succeed in doing it in one pass, we can't do it in the
+	 might_overlap case.  Bail out and return failure.  */
+      if (might_overlap && num_reg >= MAX_MOVE_REG
+	  && bytes > move_bytes)
+	return 0;

-      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
+      /* Emit loads and stores saved up.  */
+      if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
 	{
 	  int i;
 	  for (i = 0; i < num_reg; i++)
+	    emit_insn (loads[i]);
+	  for (i = 0; i < num_reg; i++)
 	    emit_insn (stores[i]);
 	  num_reg = 0;
 	}
-
-      if (mode == BLKmode)
-	{
-	  /* Move the address into scratch registers.  The movmemsi
-	     patterns require zero offset.  */
-	  if (!REG_P (XEXP (src, 0)))
-	    {
-	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
-	      src = replace_equiv_address (src, src_reg);
-	    }
-	  set_mem_size (src, move_bytes);
-
-	  if (!REG_P (XEXP (dest, 0)))
-	    {
-	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
-	      dest = replace_equiv_address (dest, dest_reg);
-	    }
-	  set_mem_size (dest, move_bytes);
-
-	  emit_insn ((*gen_func.movmemsi) (dest, src,
-					   GEN_INT (move_bytes & 31),
-					   align_rtx));
-	}
+	
     }

   return 1;
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md	(revision 276131)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -9057,7 +9057,7 @@ 
     FAIL;
 })

-;; String/block move insn.
+;; String/block copy insn (source and destination must not overlap).
 ;; Argument 0 is the destination
 ;; Argument 1 is the source
 ;; Argument 2 is the length
@@ -9070,11 +9070,31 @@ 
 	      (use (match_operand:SI 3 ""))])]
   ""
 {
-  if (expand_block_move (operands))
+  if (expand_block_move (operands, false))
     DONE;
   else
     FAIL;
 })
+
+;; String/block move insn (source and destination may overlap).
+;; Argument 0 is the destination
+;; Argument 1 is the source
+;; Argument 2 is the length
+;; Argument 3 is the alignment
+
+(define_expand "movmemsi"
+  [(parallel [(set (match_operand:BLK 0 "")
+		   (match_operand:BLK 1 ""))
+	      (use (match_operand:SI 2 ""))
+	      (use (match_operand:SI 3 ""))])]
+  ""
+{
+  if (expand_block_move (operands, true))
+    DONE;
+  else
+    FAIL;
+})
+
 
 ;; Define insns that do load or store with update.  Some of these we can
 ;; get by using pre-decrement or pre-increment, but the hardware can also