Patchwork [PATH,SH] Small builtin_strlen improvement

login
register
mail settings
Submitter Christian Bruel
Date March 26, 2014, 7:58 a.m.
Message ID <533288C1.1080306@st.com>
Download mbox | patch
Permalink /patch/333797/
State New
Headers show

Comments

Christian Bruel - March 26, 2014, 7:58 a.m.
Hello,

This patches adds a few instructions to the inlined builtin_strlen to
unroll the remaining bytes for word-at-a-time loop. This enables to have
2 distinct execution paths (no fall-thru in the byte-at-a-time loop),
allowing block alignment assignation. This partially improves the
problem reported with by Oleg. in [Bug target/0539] New: [SH] builtin
string functions ignore loop and label alignment

whereas the test now expands (-O2 -m4) as
        mov     r4,r0
        tst     #3,r0
        mov     r4,r2
        bf/s    .L12
        mov     r4,r3
        mov     #0,r2
.L4:
        mov.l   @r4+,r1
        cmp/str r2,r1
        bf      .L4
        add     #-4,r4
        mov.b   @r4,r1
        tst     r1,r1
        bt      .L2
        add     #1,r4
        mov.b   @r4,r1
        tst     r1,r1
        bt      .L2
        add     #1,r4
        mov.b   @r4,r1
        tst     r1,r1
        mov     #-1,r1
        negc    r1,r1
        add     r1,r4
.L2:
        mov     r4,r0
        rts
        sub     r3,r0
        .align 1
.L12:
        mov.b   @r4+,r1
        tst     r1,r1
        bf/s    .L12
        mov     r2,r3
        add     #1,r3
        mov     r4,r0
        rts
        sub     r3,r0


Best tuning compared to the "compact" version I got on is ~1% for c++
regular expression benchmark, but well, code looks best this way.

regtested tested for -m2, -m4

OK for trunk ?
Kaz Kojima - March 26, 2014, 11:52 a.m.
Christian Bruel <christian.bruel@st.com> wrote:
> This patches adds a few instructions to the inlined builtin_strlen to
> unroll the remaining bytes for word-at-a-time loop. This enables to have
> 2 distinct execution paths (no fall-thru in the byte-at-a-time loop),
> allowing block alignment assignation. This partially improves the
> problem reported with by Oleg. in [Bug target/0539] New: [SH] builtin
> string functions ignore loop and label alignment
[snip]
> Best tuning compared to the "compact" version I got on is ~1% for c++
> regular expression benchmark, but well, code looks best this way.
> 
> regtested tested for -m2, -m4
> 
> OK for trunk ?

OK for trunk when it returns to stage 1 or 2.

Regards,
	kaz
Oleg Endo - March 30, 2014, 9:02 p.m.
Hi,

On Wed, 2014-03-26 at 08:58 +0100, Christian Bruel wrote:

> This patches adds a few instructions to the inlined builtin_strlen to
> unroll the remaining bytes for word-at-a-time loop. This enables to have
> 2 distinct execution paths (no fall-thru in the byte-at-a-time loop),
> allowing block alignment assignation. This partially improves the
> problem reported with by Oleg. in [Bug target/0539] New: [SH] builtin
> string functions ignore loop and label alignment

Actually, my original concern was the (mis)alignment of the 4 byte inner
loop.  AFAIR it's better for the SH pipeline if the first insn of a loop
is 4 byte aligned.

> 
> whereas the test now expands (-O2 -m4) as
>         mov     r4,r0
>         tst     #3,r0
>         mov     r4,r2
>         bf/s    .L12
>         mov     r4,r3
>         mov     #0,r2
> .L4:
>         mov.l   @r4+,r1
>         cmp/str r2,r1
>         bf      .L4
>         add     #-4,r4
>         mov.b   @r4,r1
>         tst     r1,r1
>         bt      .L2
>         add     #1,r4
>         mov.b   @r4,r1
>         tst     r1,r1
>         bt      .L2
>         add     #1,r4
>         mov.b   @r4,r1
>         tst     r1,r1
>         mov     #-1,r1
>         negc    r1,r1
>         add     r1,r4
> .L2:
>         mov     r4,r0
>         rts
>         sub     r3,r0
>         .align 1
> .L12:
>         mov.b   @r4+,r1
>         tst     r1,r1
>         bf/s    .L12
>         mov     r2,r3
>         add     #1,r3
>         mov     r4,r0
>         rts
>         sub     r3,r0
> 
> 
> Best tuning compared to the "compact" version I got on is ~1% for c++
> regular expression benchmark, but well, code looks best this way.

I haven't done any measurements but doesn't this introduce some
performance regressions here and there due to the increased code size?
Maybe the byte unrolling should not be done at -O2 but at -O3?

Moreover, post-inc addressing on the bytes could be used.  Ideally we'd
get something like this:

        mov     r4,r0
        tst     #3,r0
        bf/s    .L12
        mov     r4,r3
        mov     #0,r2
.L4:
        mov.l   @r4+,r1
        cmp/str r2,r1
        bf      .L4

        add     #-4,r4

        mov.b   @r4+,r1
        tst     r1,r1
        bt      .L2

        mov.b   @r4+,r1
        tst     r1,r1
        bt      .L2

        mov.b   @r4+,r1
        tst     r1,r1
        mov     #-1,r1
        subc    r1,r4
        sett
.L2:
        mov     r4,r0
        rts
        subc    r3,r0
        .align 1
.L12:
        mov.b   @r4+,r1
        tst     r1,r1
        bf     .L12

        mov     r4,r0
        rts
        subc    r3,r0


I'll have a look at the missed 'subc' cases.

Cheers,
Oleg
Christian Bruel - March 31, 2014, 7:44 a.m.
On 03/30/2014 11:02 PM, Oleg Endo wrote:
> Hi,
>
> On Wed, 2014-03-26 at 08:58 +0100, Christian Bruel wrote:
>
>> This patches adds a few instructions to the inlined builtin_strlen to
>> unroll the remaining bytes for word-at-a-time loop. This enables to have
>> 2 distinct execution paths (no fall-thru in the byte-at-a-time loop),
>> allowing block alignment assignation. This partially improves the
>> problem reported with by Oleg. in [Bug target/0539] New: [SH] builtin
>> string functions ignore loop and label alignment
> Actually, my original concern was the (mis)alignment of the 4 byte inner
> loop.  AFAIR it's better for the SH pipeline if the first insn of a loop
> is 4 byte aligned.

yes, this is why I haven't closed the PR. IMHO the problem is with the
non-aligned loop stems from to the generic alignment code in final.c.
changing branch frequencies is quite impacting to BB reordering as well.
Further tuning of static branch estimations, or tuning of the LOOP_ALIGN
macro is needed. Note that my branch estimations in this code is very
empirical, a dynamic profiling benchmarking would be nice as well.
My point was just that forcing a local .align in this code is a
workaround, as we should be able to rely on generic reordering/align
code  for this. So the tuning of loop alignment is more global (and well
exhibited here indeed)

>
>> whereas the test now expands (-O2 -m4) as
>>         mov     r4,r0
>>         tst     #3,r0
>>         mov     r4,r2
>>         bf/s    .L12
>>         mov     r4,r3
>>         mov     #0,r2
>> .L4:
>>         mov.l   @r4+,r1
>>         cmp/str r2,r1
>>         bf      .L4
>>         add     #-4,r4
>>         mov.b   @r4,r1
>>         tst     r1,r1
>>         bt      .L2
>>         add     #1,r4
>>         mov.b   @r4,r1
>>         tst     r1,r1
>>         bt      .L2
>>         add     #1,r4
>>         mov.b   @r4,r1
>>         tst     r1,r1
>>         mov     #-1,r1
>>         negc    r1,r1
>>         add     r1,r4
>> .L2:
>>         mov     r4,r0
>>         rts
>>         sub     r3,r0
>>         .align 1
>> .L12:
>>         mov.b   @r4+,r1
>>         tst     r1,r1
>>         bf/s    .L12
>>         mov     r2,r3
>>         add     #1,r3
>>         mov     r4,r0
>>         rts
>>         sub     r3,r0
>>
>>
>> Best tuning compared to the "compact" version I got on is ~1% for c++
>> regular expression benchmark, but well, code looks best this way.
> I haven't done any measurements but doesn't this introduce some
> performance regressions here and there due to the increased code size?
> Maybe the byte unrolling should not be done at -O2 but at -O3?

Maybe, Actually from my I said I have seen only small improvements and
no regressions on my test base which is already very large.  This is
tuned for a 32 byte cache line so the exit code fits.

>
> Moreover, post-inc addressing on the bytes could be used.  Ideally we'd
> get something like this:

Maybe, from what I measured (I tried this) it was slightly worse (but
small peanuts). I preferred to avoid the last pointer reparation, and
since we are on a very small number of unrolled loops (3), it counts.
This is a proposed implementation,I there are always alternatives and I
might have missed some coding patterns, I'd be happy to benchmarks other
coding if you have a patch to try.

Cheers,

>
>         mov     r4,r0
>         tst     #3,r0
>         bf/s    .L12
>         mov     r4,r3
>         mov     #0,r2
> .L4:
>         mov.l   @r4+,r1
>         cmp/str r2,r1
>         bf      .L4
>
>         add     #-4,r4
>
>         mov.b   @r4+,r1
>         tst     r1,r1
>         bt      .L2
>
>         mov.b   @r4+,r1
>         tst     r1,r1
>         bt      .L2
>
>         mov.b   @r4+,r1
>         tst     r1,r1
>         mov     #-1,r1
>         subc    r1,r4
>         sett
> .L2:
>         mov     r4,r0
>         rts
>         subc    r3,r0
>         .align 1
> .L12:
>         mov.b   @r4+,r1
>         tst     r1,r1
>         bf     .L12
>
>         mov     r4,r0
>         rts
>         subc    r3,r0
>
>
> I'll have a look at the missed 'subc' cases.
> Cheers,
> Oleg
>
Oleg Endo - April 18, 2014, 12:41 p.m.
Sorry for the delayed reply.

On Mon, 2014-03-31 at 09:44 +0200, Christian Bruel wrote:
> On 03/30/2014 11:02 PM, Oleg Endo wrote:
> > Hi,
> >
> > On Wed, 2014-03-26 at 08:58 +0100, Christian Bruel wrote:
> >
> >> This patches adds a few instructions to the inlined builtin_strlen to
> >> unroll the remaining bytes for word-at-a-time loop. This enables to have
> >> 2 distinct execution paths (no fall-thru in the byte-at-a-time loop),
> >> allowing block alignment assignation. This partially improves the
> >> problem reported with by Oleg. in [Bug target/0539] New: [SH] builtin
> >> string functions ignore loop and label alignment
> > Actually, my original concern was the (mis)alignment of the 4 byte inner
> > loop.  AFAIR it's better for the SH pipeline if the first insn of a loop
> > is 4 byte aligned.
> 
> yes, this is why I haven't closed the PR. IMHO the problem is with the
> non-aligned loop stems from to the generic alignment code in final.c.
> changing branch frequencies is quite impacting to BB reordering as well.
> Further tuning of static branch estimations, or tuning of the LOOP_ALIGN
> macro is needed. 

OK, I've updated PR 60539 accordingly.

> Note that my branch estimations in this code is very
> empirical, a dynamic profiling benchmarking would be nice as well.
> My point was just that forcing a local .align in this code is a
> workaround, as we should be able to rely on generic reordering/align
> code  for this. So the tuning of loop alignment is more global (and well
> exhibited here indeed)

I think that those two are separate issues.  I've opened a new PR 60884
for this.  Let's continue the discussions and experiments there.

Cheers,
Oleg

Patch

2014-03-20  Christian Bruel  <christian.bruel@st.com>

	* config/sh/sh-mem.cc (sh_expand_strlen): Unroll last word.

Index: gcc/config/sh/sh-mem.cc
===================================================================
--- gcc/config/sh/sh-mem.cc	(revision 208745)
+++ gcc/config/sh/sh-mem.cc	(working copy)
@@ -586,9 +586,35 @@  sh_expand_strlen (rtx *operands)
 
   emit_move_insn (current_addr, plus_constant (Pmode, current_addr, -4));
 
-  /* start byte loop.  */
   addr1 = adjust_address (addr1, QImode, 0);
 
+  /* unroll remaining bytes.  */
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_return));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_return));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_return));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  jump = emit_jump_insn (gen_jump_compact (L_return));
+  emit_barrier_after (jump);
+
+  /* start byte loop.  */
   emit_label (L_loop_byte);
 
   emit_insn (gen_extendqisi2 (tmp1, addr1));
@@ -600,11 +626,12 @@  sh_expand_strlen (rtx *operands)
 
   /* end loop.  */
 
+  emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
+
   emit_label (L_return);
 
-  emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
-
   emit_insn (gen_subsi3 (operands[0], current_addr, start_addr));
 
   return true;
 }
+