diff mbox

[SH] Add support for inlined builtin_strncmp

Message ID 526A7DE6.9030304@st.com
State New
Headers show

Commit Message

Christian Bruel Oct. 25, 2013, 2:19 p.m. UTC
Hello,

This patch implements the cmpstrnsi pattern to support the strncmp
builtin for constant lengths. The cmp/str instruction is used for sizes
>= 8 bytes; smaller sizes fall back to the byte-at-a-time check to favor
small strings.

I now also handle the cases where the alignment is known for both cmpstr
and cmpstrn, so we can avoid the pointer check, and added a scheduling
improvement that speculates the extu.b  r1,r1 instruction into the delay
slot, saving an additional instruction (we know that r1 is 0) when the
end of string is reached. The byte-at-a-time loop becomes:

        mov.b   @r4+,r1
        tst     r1,r1
        bt/s    .L4
        mov.b   @r3+,r0
        cmp/eq  r1,r0
        bt/s    .L9
        extu.b  r1,r1
.L4:
        extu.b  r0,r0
        rts
        sub     r1,r0

Enabled the existing execute/builtins/strncmp-2.c for functional check
and added 2 new target specific tests.

No regressions for -m2 and -m4 for sh-elf.
OK for trunk ?

Many thanks,

Christian

Comments

Christian Bruel Oct. 25, 2013, 2:23 p.m. UTC | #1
In the ChangeLog,  the entry

* gcc/config/sh/sh-mem.cc (sh_expand_cmpnstr): Moved here.

is instead

 * gcc/config/sh/sh-mem.cc (sh_expand_cmpnstr): New function.

Sorry for this,

Christian
Kaz Kojima Oct. 25, 2013, 11:16 p.m. UTC | #2
Christian Bruel <christian.bruel@st.com> wrote:
> No regressions for -m2 and -m4 for sh-elf.
> OK for trunk ?

OK with the ChangeLog entry change suggested in your other mail.
Thanks!

Regards,
	kaz
diff mbox

Patch

2013-10-27  Christian Bruel  <christian.bruel@st.com>

	* gcc/config/sh/sh-mem.cc (sh_expand_cmpnstr): Moved here.
	(sh_expand_cmpstr): Handle known align and schedule improvements.
	* gcc/config/sh/sh-protos.h (sh_expand_cmpnstr): Declare.
	* gcc/config/sh/sh.md (cmpstrnsi): New pattern.

	* gcc.c-torture/execute/builtins/strncmp-2.c: Enable for SH.
	* gcc.target/sh/cmpstr.c: New test.
	* gcc.target/sh/cmpstrn.c: New test.

Index: config/sh/sh-mem.cc
===================================================================
--- config/sh/sh-mem.cc	(revision 204013)
+++ config/sh/sh-mem.cc	(working copy)
@@ -200,22 +200,25 @@  sh_expand_cmpstr (rtx *operands)
   rtx L_return = gen_label_rtx ();
   rtx L_loop_byte = gen_label_rtx ();
   rtx L_end_loop_byte = gen_label_rtx ();
-  rtx L_loop_long = gen_label_rtx ();
-  rtx L_end_loop_long = gen_label_rtx ();
 
   rtx jump, addr1, addr2;
   int prob_unlikely = REG_BR_PROB_BASE / 10;
   int prob_likely = REG_BR_PROB_BASE / 4;
 
-  emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
-  emit_move_insn (tmp0, GEN_INT (3));
+  rtx L_loop_long = gen_label_rtx ();
+  rtx L_end_loop_long = gen_label_rtx ();
 
-  emit_insn (gen_tstsi_t (tmp0, tmp1));
+  int align = INTVAL (operands[3]);
 
   emit_move_insn (tmp0, const0_rtx);
 
-  jump = emit_jump_insn (gen_branch_false (L_loop_byte));
-  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+  if (align < 4)
+    {
+      emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
+      emit_insn (gen_tstsi_t (GEN_INT (3), tmp1));
+      jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+      add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+    }
 
   addr1 = adjust_automodify_address (s1, SImode, s1_addr, 0);
   addr2 = adjust_automodify_address (s2, SImode, s2_addr, 0);
@@ -250,7 +253,7 @@  sh_expand_cmpstr (rtx *operands)
   add_int_reg_note (jump, REG_BR_PROB, prob_likely);
   /* end loop.  */
 
-  /* Fallthu, check if one of the word is greater.  */
+  /* Fallthru, diff the results.  */
   if (TARGET_LITTLE_ENDIAN)
     {
       rtx low_1 = gen_lowpart (HImode, tmp1);
@@ -267,15 +270,15 @@  sh_expand_cmpstr (rtx *operands)
   jump = emit_jump_insn (gen_jump_compact (L_return));
   emit_barrier_after (jump);
 
-  /* start byte loop.  */
-  addr1 = adjust_automodify_address (s1, QImode, s1_addr, 0);
-  addr2 = adjust_automodify_address (s2, QImode, s2_addr, 0);
-
   emit_label (L_end_loop_long);
 
   emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, -4));
   emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, -4));
 
+  /* start byte loop.  */
+  addr1 = adjust_automodify_address (s1, QImode, s1_addr, 0);
+  addr2 = adjust_automodify_address (s2, QImode, s2_addr, 0);
+
   emit_label (L_loop_byte);
 
   emit_insn (gen_extendqisi2 (tmp2, addr2));
@@ -289,13 +292,16 @@  sh_expand_cmpstr (rtx *operands)
   add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
 
   emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
-  emit_jump_insn (gen_branch_true (L_loop_byte));
+  if (flag_delayed_branch)
+    emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+  jump = emit_jump_insn (gen_branch_true (L_loop_byte));
   add_int_reg_note (jump, REG_BR_PROB, prob_likely);
   /* end loop.  */
 
   emit_label (L_end_loop_byte);
 
-  emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+  if (! flag_delayed_branch)
+    emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
   emit_insn (gen_zero_extendqisi2 (tmp1, gen_lowpart (QImode, tmp1)));
 
   emit_label (L_return);
@@ -305,3 +311,166 @@  sh_expand_cmpstr (rtx *operands)
   return true;
 }
 
+/* Emit code to perform a strncmp.  Return false when the length is not a
+   compile-time constant, true once the comparison has been expanded.
+   OPERANDS[0] is the destination.
+   OPERANDS[1] is the first string.
+   OPERANDS[2] is the second string.
+   OPERANDS[3] is the length.
+   OPERANDS[4] is the align (below 4, the word loop gets an address check).  */
+bool
+sh_expand_cmpnstr (rtx *operands)
+{
+  rtx s1 = copy_rtx (operands[1]);
+  rtx s2 = copy_rtx (operands[2]);
+
+  rtx s1_addr = copy_addr_to_reg (XEXP (s1, 0));
+  rtx s2_addr = copy_addr_to_reg (XEXP (s2, 0));
+  rtx tmp0 = gen_reg_rtx (SImode);
+  rtx tmp1 = gen_reg_rtx (SImode);
+  rtx tmp2 = gen_reg_rtx (SImode);
+
+  rtx L_return = gen_label_rtx ();
+  rtx L_loop_byte = gen_label_rtx ();
+  rtx L_end_loop_byte = gen_label_rtx ();
+
+  rtx jump, addr1, addr2;
+  int prob_unlikely = REG_BR_PROB_BASE / 10;
+  int prob_likely = REG_BR_PROB_BASE / 4;
+
+  rtx len = force_reg (SImode, operands[3]);
+  int constp = (CONST_INT_P (operands[3]));
+  int bytes = (constp ? INTVAL (operands[3]) : 0);
+  int witers = bytes / 4;  /* Number of whole 4-byte words in the length.  */
+
+  /* A length held in a register could still be looped on, but that case
+     has not been found convincing enough to optimize yet.  */
+  if (! constp)
+    return false;
+
+  if (witers > 1)  /* 8 bytes or more: compare word by word first.  */
+    {
+      rtx L_loop_long = gen_label_rtx ();
+      rtx L_end_loop_long = gen_label_rtx ();
+      rtx tmp3 = gen_reg_rtx (SImode);
+      rtx lenw = gen_reg_rtx (SImode);
+      int align = INTVAL (operands[4]);
+
+      emit_move_insn (tmp0, const0_rtx);
+
+      if (align < 4)  /* Test the low bits of both addresses at run time.  */
+	{
+	  emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
+	  emit_insn (gen_tstsi_t (GEN_INT (3), tmp1));
+	  jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+	  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+	}
+
+      addr1 = adjust_automodify_address (s1, SImode, s1_addr, 0);
+      addr2 = adjust_automodify_address (s2, SImode, s2_addr, 0);
+
+      /* Word count: the length divided by 4 bounds the long loop.  */
+      emit_insn (gen_lshrsi3 (lenw, len, GEN_INT (2)));
+
+      /* Start of the long (word at a time) loop.  */
+      emit_label (L_loop_long);
+
+      /* s2_addr is aligned, OK to load a word into tmp2.  */
+      emit_move_insn (tmp2, addr2);
+      emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 4));
+
+      /* s1_addr is aligned, OK to load a word into tmp1.  */
+      emit_move_insn (tmp1, addr1);
+      emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 4));
+
+      /* Is there a 0 byte in the AND of both words (tmp0 holds 0) ?  */
+      emit_insn (gen_andsi3 (tmp3, tmp2, tmp1));
+
+      emit_insn (gen_cmpstr_t (tmp0, tmp3));
+      jump = emit_jump_insn (gen_branch_true (L_end_loop_long));
+      add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+      emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+      jump = emit_jump_insn (gen_branch_false (L_end_loop_long));
+      add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+      if (TARGET_SH2)
+	emit_insn (gen_dect (lenw, lenw));
+      else
+	{
+	  emit_insn (gen_addsi3 (lenw, lenw, GEN_INT (-1)));
+	  emit_insn (gen_tstsi_t (lenw, lenw));
+	}
+      jump = emit_jump_insn (gen_branch_false (L_loop_long));
+      add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+      /* End of the long loop.  Reached the maximum word iteration count.  */
+      if (bytes % 4 == 0)
+	{
+	  /* Done, no tail bytes.  The last words compared equal.  */
+	  jump = emit_jump_insn (gen_jump_compact (L_return));
+	  emit_barrier_after (jump);
+	}
+      else
+	{
+	  /* Remaining tail bytes (fewer than 4) go through the byte loop.  */
+	  emit_move_insn (len, GEN_INT (bytes % 4));
+	  jump = emit_jump_insn (gen_jump_compact (L_loop_byte));
+	  emit_barrier_after (jump);
+	}
+
+      emit_label (L_end_loop_long);
+
+      /* Only the 4 bytes of the word just loaded remain to be read.  */
+      emit_move_insn (len, GEN_INT (4));
+
+      /* Found the last word.  Rewind both pointers, redo it byte per byte.  */
+      emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, -4));
+      emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, -4));
+    }
+
+    addr1 = adjust_automodify_address (s1, QImode, s1_addr, 0);
+    addr2 = adjust_automodify_address (s2, QImode, s2_addr, 0);
+
+    emit_label (L_loop_byte);
+
+    emit_insn (gen_extendqisi2 (tmp2, addr2));
+    emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 1));
+
+    emit_insn (gen_extendqisi2 (tmp1, addr1));
+    emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 1));
+
+    emit_insn (gen_cmpeqsi_t (tmp2, const0_rtx));
+    jump = emit_jump_insn (gen_branch_true (L_end_loop_byte));
+    add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+    emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+    if (flag_delayed_branch)  /* Emitted early so it can fill a delay slot.  */
+      emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+    jump = emit_jump_insn (gen_branch_false (L_end_loop_byte));
+    add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+    if (TARGET_SH2)
+      emit_insn (gen_dect (len, len));
+    else
+      {
+	emit_insn (gen_addsi3 (len, len, GEN_INT (-1)));
+	emit_insn (gen_tstsi_t (len, len));
+      }
+
+    jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+    add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+    /* End of the byte loop.  */
+
+    emit_label (L_end_loop_byte);
+
+    if (! flag_delayed_branch)
+      emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+    emit_insn (gen_zero_extendqisi2 (tmp1, gen_lowpart (QImode, tmp1)));
+
+    emit_label (L_return);
+
+    emit_insn (gen_subsi3 (operands[0], tmp1, tmp2));
+
+    return true;
+}
Index: config/sh/sh-protos.h
===================================================================
--- config/sh/sh-protos.h	(revision 204013)
+++ config/sh/sh-protos.h	(working copy)
@@ -117,6 +117,7 @@  extern void output_pic_addr_const (FILE *, rtx);
 extern bool expand_block_move (rtx *);
 extern void prepare_move_operands (rtx[], enum machine_mode mode);
 extern bool sh_expand_cmpstr (rtx *);
+extern bool sh_expand_cmpnstr (rtx *);
 extern enum rtx_code prepare_cbranch_operands (rtx *, enum machine_mode mode,
 					       enum rtx_code comparison);
 extern void expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int);
Index: config/sh/sh.md
===================================================================
--- config/sh/sh.md	(revision 204013)
+++ config/sh/sh.md	(working copy)
@@ -11943,6 +11943,19 @@  label:
    else FAIL;
 })
 
+(define_expand "cmpstrnsi"
+  [(set (match_operand:SI 0 "register_operand")
+	(compare:SI (match_operand:BLK 1 "memory_operand")
+		    (match_operand:BLK 2 "memory_operand")))
+   (use (match_operand:SI 3 "immediate_operand"))
+   (use (match_operand:SI 4 "immediate_operand"))]
+  "TARGET_SH1"
+{
+ if (! optimize_insn_for_size_p () && sh_expand_cmpnstr (operands))
+    DONE;
+  else FAIL;
+})
+
 
 ;; -------------------------------------------------------------------------
 ;; Floating point instructions.
Index: testsuite/gcc.c-torture/execute/builtins/strncmp-2.c
===================================================================
--- testsuite/gcc.c-torture/execute/builtins/strncmp-2.c	(revision 204013)
+++ testsuite/gcc.c-torture/execute/builtins/strncmp-2.c	(working copy)
@@ -12,7 +12,7 @@  extern int strncmp (const char *, const char *, si
 void
 main_test (void)
 {
-#if !defined(__OPTIMIZE__) || ((defined(__i386__) || defined (__x86_64__)) && !defined(__OPTIMIZE_SIZE__))
+#if !defined(__OPTIMIZE__) || ((defined(__sh__) || defined(__i386__) || defined (__x86_64__)) && !defined(__OPTIMIZE_SIZE__))
   /* These tests work on platforms which support cmpstrsi.  We test it
      at -O0 on all platforms to ensure the strncmp logic is correct.  */
   const char *const s1 = "hello world";
Index: testsuite/gcc.target/sh/cmpstr.c
===================================================================
--- testsuite/gcc.target/sh/cmpstr.c	(revision 0)
+++ testsuite/gcc.target/sh/cmpstr.c	(working copy)
@@ -0,0 +1,27 @@ 
+/* Check that the __builtin_strcmp function is inlined 
+   when optimizing for speed.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O2" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "-m5*" } { "" } } */
+/* { dg-final { scan-assembler-not "jmp" } } */
+/* { dg-final { scan-assembler-times "cmp/str" 3 } } */
+/* { dg-final { scan-assembler-times "tst\t#3" 2 } } */
+
+test00 (const char *s1, const char *s2)
+{
+  return __builtin_strcmp (s1, s2);
+}
+
+/* NB: This might change as further optimisation might detect the
+   max length and fallback to cmpstrn.  */
+test01(const char *s2)
+{
+  return __builtin_strcmp ("abc", s2);
+}
+
+/* Check that no test for alignment is needed.  */
+test03(const char *s1, const char *s2)
+{
+  return __builtin_strcmp (__builtin_assume_aligned (s1, 4),
+			   __builtin_assume_aligned (s2, 4));
+}
Index: testsuite/gcc.target/sh/cmpstrn.c
===================================================================
--- testsuite/gcc.target/sh/cmpstrn.c	(revision 0)
+++ testsuite/gcc.target/sh/cmpstrn.c	(working copy)
@@ -0,0 +1,21 @@ 
+/* Check that the __builtin_strncmp function is inlined 
+   when optimizing for speed.  */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O2" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "-m5*" } { "" } } */
+/* { dg-final { scan-assembler-not "jmp" } } */
+/* { dg-final { scan-assembler-times "cmp/str" 1 } } */
+
+/* Test that the cmp/str instruction is not used.  */
+test01(const char *s1, const char *s2, int n)
+{
+  return __builtin_strncmp (s1, "abcde", 3);
+}
+
+/* Test that the cmp/str instruction is used.  */
+test02(const char *s1, const char *s2, int n)
+{
+  return __builtin_strncmp (s1, "abcdefghi", 8);
+}
+
+