2013-10-27 Christian Bruel <christian.bruel@st.com>
* gcc/config/sh/sh-mem.cc (sh_expand_cmpnstr): Moved here.
(sh_expand_cmpstr): Handle known align and schedule improvements.
* gcc/config/sh/sh-protos.h (sh_expand_cmpnstr): Declare.
* gcc/config/sh/sh.md (cmpstrnsi): New pattern.
* gcc.c-torture/execute/builtins/strncmp-2.c: Enable for SH.
* gcc.target/sh/cmpstr.c: New test.
* gcc.target/sh/cmpstrn.c: New test.
===================================================================
@@ -200,22 +200,25 @@ sh_expand_cmpstr (rtx *operands)
rtx L_return = gen_label_rtx ();
rtx L_loop_byte = gen_label_rtx ();
rtx L_end_loop_byte = gen_label_rtx ();
- rtx L_loop_long = gen_label_rtx ();
- rtx L_end_loop_long = gen_label_rtx ();
rtx jump, addr1, addr2;
int prob_unlikely = REG_BR_PROB_BASE / 10;
int prob_likely = REG_BR_PROB_BASE / 4;
- emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
- emit_move_insn (tmp0, GEN_INT (3));
+ rtx L_loop_long = gen_label_rtx ();
+ rtx L_end_loop_long = gen_label_rtx ();
- emit_insn (gen_tstsi_t (tmp0, tmp1));
+ int align = INTVAL (operands[3]);
emit_move_insn (tmp0, const0_rtx);
- jump = emit_jump_insn (gen_branch_false (L_loop_byte));
- add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+ if (align < 4)
+ {
+ emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
+ emit_insn (gen_tstsi_t (GEN_INT (3), tmp1));
+ jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+ }
addr1 = adjust_automodify_address (s1, SImode, s1_addr, 0);
addr2 = adjust_automodify_address (s2, SImode, s2_addr, 0);
@@ -250,7 +253,7 @@ sh_expand_cmpstr (rtx *operands)
add_int_reg_note (jump, REG_BR_PROB, prob_likely);
/* end loop. */
- /* Fallthu, check if one of the word is greater. */
+ /* Fallthru, diff the results. */
if (TARGET_LITTLE_ENDIAN)
{
rtx low_1 = gen_lowpart (HImode, tmp1);
@@ -267,15 +270,15 @@ sh_expand_cmpstr (rtx *operands)
jump = emit_jump_insn (gen_jump_compact (L_return));
emit_barrier_after (jump);
- /* start byte loop. */
- addr1 = adjust_automodify_address (s1, QImode, s1_addr, 0);
- addr2 = adjust_automodify_address (s2, QImode, s2_addr, 0);
-
emit_label (L_end_loop_long);
emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, -4));
emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, -4));
+ /* start byte loop. */
+ addr1 = adjust_automodify_address (s1, QImode, s1_addr, 0);
+ addr2 = adjust_automodify_address (s2, QImode, s2_addr, 0);
+
emit_label (L_loop_byte);
emit_insn (gen_extendqisi2 (tmp2, addr2));
@@ -289,13 +292,16 @@ sh_expand_cmpstr (rtx *operands)
add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
- emit_jump_insn (gen_branch_true (L_loop_byte));
+ if (flag_delayed_branch)
+ emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+ jump = emit_jump_insn (gen_branch_true (L_loop_byte));
add_int_reg_note (jump, REG_BR_PROB, prob_likely);
/* end loop. */
emit_label (L_end_loop_byte);
- emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+ if (! flag_delayed_branch)
+ emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
emit_insn (gen_zero_extendqisi2 (tmp1, gen_lowpart (QImode, tmp1)));
emit_label (L_return);
@@ -305,3 +311,166 @@ sh_expand_cmpstr (rtx *operands)
return true;
}
+/* Emit code to perform a strncmp.
+
+ OPERANDS[0] is the destination.
+ OPERANDS[1] is the first string.
+ OPERANDS[2] is the second string.
+ OPERANDS[3] is the length.
+ OPERANDS[4] is the align. */
+bool
+sh_expand_cmpnstr (rtx *operands)
+{
+ rtx s1 = copy_rtx (operands[1]);
+ rtx s2 = copy_rtx (operands[2]);
+
+ rtx s1_addr = copy_addr_to_reg (XEXP (s1, 0));
+ rtx s2_addr = copy_addr_to_reg (XEXP (s2, 0));
+ rtx tmp0 = gen_reg_rtx (SImode);
+ rtx tmp1 = gen_reg_rtx (SImode);
+ rtx tmp2 = gen_reg_rtx (SImode);
+
+ rtx L_return = gen_label_rtx ();
+ rtx L_loop_byte = gen_label_rtx ();
+ rtx L_end_loop_byte = gen_label_rtx ();
+
+ rtx jump, addr1, addr2;
+ int prob_unlikely = REG_BR_PROB_BASE / 10;
+ int prob_likely = REG_BR_PROB_BASE / 4;
+
+ rtx len = force_reg (SImode, operands[3]);
+ int constp = (CONST_INT_P (operands[3]));
+ int bytes = (constp ? INTVAL (operands[3]) : 0);
+ int witers = bytes / 4;
+
+ /* We could still loop on a register count; it was not found
+ worthwhile to optimize yet. */
+ if (! constp)
+ return false;
+
+ if (witers > 1)
+ {
+ rtx L_loop_long = gen_label_rtx ();
+ rtx L_end_loop_long = gen_label_rtx ();
+ rtx tmp3 = gen_reg_rtx (SImode);
+ rtx lenw = gen_reg_rtx (SImode);
+ int align = INTVAL (operands[4]);
+
+ emit_move_insn (tmp0, const0_rtx);
+
+ if (align < 4)
+ {
+ emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
+ emit_insn (gen_tstsi_t (GEN_INT (3), tmp1));
+ jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+ }
+
+ addr1 = adjust_automodify_address (s1, SImode, s1_addr, 0);
+ addr2 = adjust_automodify_address (s2, SImode, s2_addr, 0);
+
+ /* word count. Do we have iterations ? */
+ emit_insn (gen_lshrsi3 (lenw, len, GEN_INT (2)));
+
+ /* start long loop. */
+ emit_label (L_loop_long);
+
+ /* tmp2 is aligned, OK to load. */
+ emit_move_insn (tmp2, addr2);
+ emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 4));
+
+ /* tmp1 is aligned, OK to load. */
+ emit_move_insn (tmp1, addr1);
+ emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 4));
+
+ /* Is there a 0 byte ? */
+ emit_insn (gen_andsi3 (tmp3, tmp2, tmp1));
+
+ emit_insn (gen_cmpstr_t (tmp0, tmp3));
+ jump = emit_jump_insn (gen_branch_true (L_end_loop_long));
+ add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+ emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+ jump = emit_jump_insn (gen_branch_false (L_end_loop_long));
+ add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+ if (TARGET_SH2)
+ emit_insn (gen_dect (lenw, lenw));
+ else
+ {
+ emit_insn (gen_addsi3 (lenw, lenw, GEN_INT (-1)));
+ emit_insn (gen_tstsi_t (lenw, lenw));
+ }
+ jump = emit_jump_insn (gen_branch_false (L_loop_long));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+ /* end loop. Reached max iteration. */
+ if (bytes % 4 == 0)
+ {
+ /* Done. */
+ jump = emit_jump_insn (gen_jump_compact (L_return));
+ emit_barrier_after (jump);
+ }
+ else
+ {
+ /* Remaining bytes to read. */
+ emit_move_insn (len, GEN_INT (bytes % 4));
+ jump = emit_jump_insn (gen_jump_compact (L_loop_byte));
+ emit_barrier_after (jump);
+ }
+
+ emit_label (L_end_loop_long);
+
+ /* Remaining bytes to read. */
+ emit_move_insn (len, GEN_INT (4));
+
+ /* Found last word. Restart it byte per byte. */
+ emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, -4));
+ emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, -4));
+ }
+
+ addr1 = adjust_automodify_address (s1, QImode, s1_addr, 0);
+ addr2 = adjust_automodify_address (s2, QImode, s2_addr, 0);
+
+ emit_label (L_loop_byte);
+
+ emit_insn (gen_extendqisi2 (tmp2, addr2));
+ emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 1));
+
+ emit_insn (gen_extendqisi2 (tmp1, addr1));
+ emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 1));
+
+ emit_insn (gen_cmpeqsi_t (tmp2, const0_rtx));
+ jump = emit_jump_insn (gen_branch_true (L_end_loop_byte));
+ add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+ emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+ if (flag_delayed_branch)
+ emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+ jump = emit_jump_insn (gen_branch_false (L_end_loop_byte));
+ add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+ if (TARGET_SH2)
+ emit_insn (gen_dect (len, len));
+ else
+ {
+ emit_insn (gen_addsi3 (len, len, GEN_INT (-1)));
+ emit_insn (gen_tstsi_t (len, len));
+ }
+
+ jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+ /* end byte loop. */
+
+ emit_label (L_end_loop_byte);
+
+ if (! flag_delayed_branch)
+ emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+ emit_insn (gen_zero_extendqisi2 (tmp1, gen_lowpart (QImode, tmp1)));
+
+ emit_label (L_return);
+
+ emit_insn (gen_subsi3 (operands[0], tmp1, tmp2));
+
+ return true;
+}
===================================================================
@@ -117,6 +117,7 @@ extern void output_pic_addr_const (FILE *, rtx);
extern bool expand_block_move (rtx *);
extern void prepare_move_operands (rtx[], enum machine_mode mode);
extern bool sh_expand_cmpstr (rtx *);
+extern bool sh_expand_cmpnstr (rtx *);
extern enum rtx_code prepare_cbranch_operands (rtx *, enum machine_mode mode,
enum rtx_code comparison);
extern void expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int);
===================================================================
@@ -11943,6 +11943,19 @@ label:
else FAIL;
})
+(define_expand "cmpstrnsi"
+ [(set (match_operand:SI 0 "register_operand")
+ (compare:SI (match_operand:BLK 1 "memory_operand")
+ (match_operand:BLK 2 "memory_operand")))
+ (use (match_operand:SI 3 "immediate_operand"))
+ (use (match_operand:SI 4 "immediate_operand"))]
+ "TARGET_SH1"
+{
+ if (! optimize_insn_for_size_p () && sh_expand_cmpnstr (operands))
+ DONE;
+ else FAIL;
+})
+
;; -------------------------------------------------------------------------
;; Floating point instructions.
===================================================================
@@ -12,7 +12,7 @@ extern int strncmp (const char *, const char *, si
void
main_test (void)
{
-#if !defined(__OPTIMIZE__) || ((defined(__i386__) || defined (__x86_64__)) && !defined(__OPTIMIZE_SIZE__))
+#if !defined(__OPTIMIZE__) || ((defined(__sh__) || defined(__i386__) || defined (__x86_64__)) && !defined(__OPTIMIZE_SIZE__))
/* These tests work on platforms which support cmpstrsi. We test it
at -O0 on all platforms to ensure the strncmp logic is correct. */
const char *const s1 = "hello world";
===================================================================
@@ -0,0 +1,27 @@
+/* Check that the __builtin_strcmp function is inlined
+ when optimizing for speed. */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O2" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "-m5*" } { "" } } */
+/* { dg-final { scan-assembler-not "jmp" } } */
+/* { dg-final { scan-assembler-times "cmp/str" 3 } } */
+/* { dg-final { scan-assembler-times "tst\t#3" 2 } } */
+
+test00 (const char *s1, const char *s2)
+{
+ return __builtin_strcmp (s1, s2);
+}
+
+/* NB: This might change as further optimisation might detect the
+ max length and fallback to cmpstrn. */
+test01(const char *s2)
+{
+ return __builtin_strcmp ("abc", s2);
+}
+
+/* Check that no test for alignment is needed. */
+test03(const char *s1, const char *s2)
+{
+ return __builtin_strcmp (__builtin_assume_aligned (s1, 4),
+ __builtin_assume_aligned (s2, 4));
+}
===================================================================
@@ -0,0 +1,21 @@
+/* Check that the __builtin_strncmp function is inlined
+ when optimizing for speed. */
+/* { dg-do compile { target "sh*-*-*" } } */
+/* { dg-options "-O2" } */
+/* { dg-skip-if "" { "sh*-*-*" } { "-m5*" } { "" } } */
+/* { dg-final { scan-assembler-not "jmp" } } */
+/* { dg-final { scan-assembler-times "cmp/str" 1 } } */
+
+/* Test that the cmp/str instruction is not used. */
+test01(const char *s1, const char *s2, int n)
+{
+ return __builtin_strncmp (s1, "abcde", 3);
+}
+
+/* Test that the cmp/str instruction is used. */
+test02(const char *s1, const char *s2, int n)
+{
+ return __builtin_strncmp (s1, "abcdefghi", 8);
+}
+
+
Hello, This patch implements the cmpstrnsi pattern to support the strncmp builtin for constant lengths. The cmp/str instruction is used for size >= 8 bytes, else we fall back to the byte-at-a-time check to favor small strings. I now also handle the cases where the alignment is known for both cmpstr and cmpstrn, so we can avoid the pointer check, and added a scheduling improvement to speculate the extu.b r1,r1 instruction into the delay slot, winning an additional instruction (we know that r1 is 0) when the end of string is reached. The byte-at-a-time loop becomes: mov.b @r4+,r1 tst r1,r1 bt/s .L4 mov.b @r3+,r0 cmp/eq r1,r0 bt/s .L9 extu.b r1,r1 .L4: extu.b r0,r0 rts sub r1,r0 Enabled the existing execute/builtins/strncmp-2.c for functional checking and added 2 new target-specific tests. No regressions for -m2 and -m4 for sh-elf. OK for trunk ? Many thanks, Christian