Patchwork [SH] inline builtin_memset

login
register
mail settings
Submitter Christian Bruel
Date March 26, 2014, 10:22 a.m.
Message ID <5332AA5E.1090701@st.com>
Download mbox | patch
Permalink /patch/333810/
State New
Headers show

Comments

Christian Bruel - March 26, 2014, 10:22 a.m.
Hello,

This patch inlines builtin_memset whose size is a constant with 15 < size <
128. Small sizes are better unrolled with mov_insn sequences. Big sizes
(or non-constant sizes) are better handled with a libc implementation that
does cache line aligned copying and unrolling or prefetching.

Tested on sh-none-elf and sh-linux-elf with no new regressions.

OK for trunk ?

many thanks,
Christian Bruel - March 26, 2014, 10:27 a.m.
On 03/26/2014 11:22 AM, Christian Bruel wrote:
> Hello,
>
> This patch inlines builtin_memset whose size is a constant with 15 < size <
> 128. Small sizes are better unrolled with mov_insn sequences. Big sizes
> (or non-constant sizes) are better handled with a libc implementation that
> does cache line aligned copying and unrolling or prefetching.

Correction, it's memcpy that can do that, but nevertheless, a
specialized implementation in the glibc is better for big sizes (and
absorbs the cost of the jump).


>
> No new regressions for sh-none-elf and sh-linux-elf without new errors.
>
> OK for trunk ?
>
> many thanks,
>
>
>
>
>
>
>
>
Kaz Kojima - March 26, 2014, 11:53 a.m.
Christian Bruel <christian.bruel@st.com> wrote:
> This patch inlines builtin_memset whose size is a constant with 15 < size <
> 128. Small sizes are better unrolled with mov_insn sequences. Big sizes
> (or non-constant sizes) are better handled with a libc implementation that
> does cache line aligned copying and unrolling or prefetching.
> 
> No new regressions for sh-none-elf and sh-linux-elf without new errors.
> 
> OK for trunk ?

OK for trunk when it returns to stage 1 or 2.

Regards,
	kaz

Patch

2014-03-20  Christian Bruel  <christian.bruel@st.com>

	* config/sh/sh.md (setmemqi): New expand pattern.
	* config/sh/sh.h (CLEAR_RATIO): Define.
	* config/sh/sh-mem.cc (sh_expand_setmem): Define.
	* config/sh/sh-protos.h (sh_expand_setmem): Declare.

2014-01-20  Christian Bruel  <christian.bruel@st.com>

	* gcc.target/sh/memset.c: New test.

Index: gcc/config/sh/sh-mem.cc
===================================================================
--- gcc/config/sh/sh-mem.cc	(revision 208745)
+++ gcc/config/sh/sh-mem.cc	(working copy)
@@ -608,3 +608,106 @@  sh_expand_strlen (rtx *operands)
 
   return true;
 }
+
+/* Emit code to perform a memset.  A fill value of 0 or -1 with a size
+   above 8 uses SImode stores plus trailing byte stores, else a byte loop.
+   OPERANDS[0] is the destination.
+   OPERANDS[1] is the size; only a CONST_INT is expanded.
+   OPERANDS[2] is the value to store.
+   OPERANDS[3] is the alignment.  */
+void
+sh_expand_setmem (rtx *operands)
+{
+  rtx L_loop_byte = gen_label_rtx ();
+  rtx L_loop_word = gen_label_rtx ();
+  rtx L_return = gen_label_rtx ();
+  rtx jump;
+  rtx dest = copy_rtx (operands[0]);
+  rtx dest_addr = copy_addr_to_reg (XEXP (dest, 0));
+  rtx val = force_reg (SImode, operands[2]);
+  int align = INTVAL (operands[3]);
+  int count = 0;
+  rtx len = force_reg (SImode, operands[1]);
+
+  if (! CONST_INT_P (operands[1]))
+    return;
+
+  count = INTVAL (operands[1]);
+
+  if (CONST_INT_P (operands[2])
+      && (INTVAL (operands[2]) == 0 || INTVAL (operands[2]) == -1) && count > 8)
+    {
+      rtx lenw = gen_reg_rtx (SImode);
+
+      if (align < 4)
+        {
+          emit_insn (gen_tstsi_t (GEN_INT (3), dest_addr));
+          jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+          add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+        }
+
+      /* Word count (LEN / 4); nonzero here because COUNT > 8.  */
+      emit_insn (gen_lshrsi3 (lenw, len, GEN_INT (2)));
+
+      dest = adjust_automodify_address (dest, SImode, dest_addr, 0);
+
+      /* Start of the word loop: one SImode store per iteration.  */
+      emit_label (L_loop_word);
+
+      if (TARGET_SH2)
+        emit_insn (gen_dect (lenw, lenw));
+      else
+        {
+          emit_insn (gen_addsi3 (lenw, lenw, GEN_INT (-1)));
+          emit_insn (gen_tstsi_t (lenw, lenw));
+        }
+
+      emit_move_insn (dest, val);
+      emit_move_insn (dest_addr, plus_constant (Pmode, dest_addr,
+                                                GET_MODE_SIZE (SImode)));
+
+
+      jump = emit_jump_insn (gen_branch_false (L_loop_word));
+      add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+      count = count % 4;  /* Bytes left after the word stores.  */
+
+      dest = adjust_address (dest, QImode, 0);
+
+      val = gen_lowpart (QImode, val);
+
+      while (count--)  /* Unrolled at expand time.  */
+        {
+          emit_move_insn (dest, val);
+          emit_move_insn (dest_addr, plus_constant (Pmode, dest_addr,
+                                                    GET_MODE_SIZE (QImode)));
+        }
+
+      jump = emit_jump_insn (gen_jump_compact (L_return));
+      emit_barrier_after (jump);
+    }
+
+  dest = adjust_automodify_address (dest, QImode, dest_addr, 0);
+
+  /* Start of the byte loop: one QImode store per iteration.  */
+  emit_label (L_loop_byte);
+
+  if (TARGET_SH2)
+    emit_insn (gen_dect (len, len));
+  else
+    {
+      emit_insn (gen_addsi3 (len, len, GEN_INT (-1)));
+      emit_insn (gen_tstsi_t (len, len));
+    }
+
+  val = gen_lowpart (QImode, val);
+  emit_move_insn (dest, val);
+  emit_move_insn (dest_addr, plus_constant (Pmode, dest_addr,
+                                            GET_MODE_SIZE (QImode)));
+
+  jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  emit_label (L_return);
+
+  return;
+}
Index: gcc/config/sh/sh-protos.h
===================================================================
--- gcc/config/sh/sh-protos.h	(revision 208745)
+++ gcc/config/sh/sh-protos.h	(working copy)
@@ -119,6 +119,7 @@  extern void prepare_move_operands (rtx[], enum mac
 extern bool sh_expand_cmpstr (rtx *);
 extern bool sh_expand_cmpnstr (rtx *);
 extern bool sh_expand_strlen  (rtx *);
+extern void sh_expand_setmem (rtx *);
 extern enum rtx_code prepare_cbranch_operands (rtx *, enum machine_mode mode,
 					       enum rtx_code comparison);
 extern void expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int);
Index: gcc/config/sh/sh.h
===================================================================
--- gcc/config/sh/sh.h	(revision 208745)
+++ gcc/config/sh/sh.h	(working copy)
@@ -1594,6 +1594,11 @@  struct sh_args {
 
 #define SET_BY_PIECES_P(SIZE, ALIGN) STORE_BY_PIECES_P(SIZE, ALIGN)
 
+/* If a memory clear move would take CLEAR_RATIO or more simple
+   move-instruction pairs, we will do a setmem instead.  */
+
+#define CLEAR_RATIO(speed) ((speed) ? 15 : 3)
+
 /* Macros to check register numbers against specific register classes.  */
 
 /* These assume that REGNO is a hard or pseudo reg number.
Index: gcc/config/sh/sh.md
===================================================================
--- gcc/config/sh/sh.md	(revision 208745)
+++ gcc/config/sh/sh.md	(working copy)
@@ -12089,6 +12089,20 @@  label:
    FAIL;
 })
 
+(define_expand "setmemqi"
+  [(parallel [(set (match_operand:BLK 0 "memory_operand")
+                   (match_operand 2 "const_int_operand"))
+              (use (match_operand:QI 1 "const_int_operand"))
+              (use (match_operand:QI 3 "const_int_operand"))])]
+  "TARGET_SH1 && optimize"
+  {
+    if (optimize_insn_for_size_p ())
+       FAIL;
+
+    sh_expand_setmem (operands);
+    DONE;
+  })
+
 
 ;; -------------------------------------------------------------------------
 ;; Floating point instructions.
Index: gcc/testsuite/gcc.target/sh/memset.c
===================================================================
--- gcc/testsuite/gcc.target/sh/memset.c	(revision 0)
+++ gcc/testsuite/gcc.target/sh/memset.c	(working copy)
@@ -0,0 +1,13 @@ 
+/* Check that the __builtin_memset function is inlined when
+   optimizing for speed.  */
+/* { dg-do compile }  */
+/* { dg-options "-O2" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "-m5*" } { "" } } */
+/* { dg-final { scan-assembler-not "jmp" } } */
+
+void
+test00(char *dstb)
+{
+  __builtin_memset (dstb, 0, 15);
+}
+