Patchwork [RFC] Doloop pattern for ARM

login
register
mail settings
Submitter Revital1 Eres
Date Jan. 25, 2011, 9:22 a.m.
Message ID <OF1AFBE97D.0758D9DF-ONC225781A.0041394C-C2257823.003387EB@il.ibm.com>
Download mbox | patch
Permalink /patch/80339/
State New
Headers show

Comments

Revital1 Eres - Jan. 25, 2011, 9:22 a.m.
Hello,

The attached patch models doloop pattern for ARM.
I followed Chung-Lin Tang's suggestion to use
subs+bne sequence.
The patch is also fixed according to comments I
received when posting an initial version of it
to gcc@.

Bootstrap and regtest on ppc64-redhat-linux with SMS flags.
On SPU and ARM regtested on c,c++ and fortran.

On EEMBC/telecom/autcor benchmark I see 19% improvements
running on cortex-a9 with the following flags:
 -O3 -mcpu=cortex-a9  -mtune=cortex-a9  -funsafe-loop-optimizations
-fmodulo-sched

and 27% improvements when running with:
-O3 -mcpu=cortex-a9  -mtune=cortex-a9  -funsafe-loop-optimizations  -mthumb
-fmodulo-sched

Thanks,
Revital


ChangeLog:

        * modulo-sched.c (sms_schedule): Support new form of doloop pattern
        * loop-doloop.c (doloop_condition_get): Likewise.
        * config/arm/thumb2.md (*thumb2_addsi3_compare0): Remove "*".
        (doloop_end): New.
        * config/arm/arm.md (*addsi3_compare0): Remove "*".


(See attached file: patch_doloop_arm_23.txt)
Richard Earnshaw - Jan. 25, 2011, 5:42 p.m.
On Tue, 2011-01-25 at 11:22 +0200, Revital1 Eres wrote:

> 
> ChangeLog:
> 
>         * modulo-sched.c (sms_schedule): Support new form of doloop pattern
>         * loop-doloop.c (doloop_condition_get): Likewise.
>         * config/arm/thumb2.md (*thumb2_addsi3_compare0): Remove "*".
>         (doloop_end): New.
>         * config/arm/arm.md (*addsi3_compare0): Remove "*".
> 
> 
> (See attached file: patch_doloop_arm_23.txt)
+        For the third form we expect:
+
+        (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
+                   (set (reg) (plus (reg) (const_int -1)))])
+        (set (pc) (if_then_else (cc == NE)
+                                (label_ref (label))
+                                (pc))) 
+
+        which is equivalent to the following:
+
+        (parallel [(set (cc) (compare (reg,  1))
+                   (set (reg) (plus (reg) (const_int -1)))
+                   (set (pc) (if_then_else (NE == cc)
+                                           (label_ref (label))
+                                           (pc))))])

No it's not equivalent.  The first one sets CC and then examines it in
the second insn.  The second one sets CC but examines the /previous/
value.

+     if (INTVAL (operands[3]) > 1)
+       FAIL;
+     if (GET_MODE (operands[0]) != SImode)

blank line at the end of independent if blocks

+     if (TARGET_THUMB2)
+       insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
+     else
+       insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
+     cmp = XVECEXP (PATTERN (insn), 0, 0);

likewise.

+     bcomp = gen_rtx_NE(VOIDmode, cc_reg, const0_rtx);

missing space before parenthesis.

The ARM bits are OK for 4.7 (not 4.6) with the above changes, but I've
not reviewed the generic changes.

R

Patch

Index: modulo-sched.c

===================================================================
--- modulo-sched.c	(revision 168987)

+++ modulo-sched.c	(working copy)

@@ -1009,9 +1009,11 @@  sms_schedule (void)

 	continue;
       }
 
-      /* Don't handle BBs with calls or barriers, or !single_set insns,

-         or auto-increment insns (to avoid creating invalid reg-moves

-         for the auto-increment insns).

+      /* Don't handle BBs with calls or barriers or auto-increment insns 

+	 (to avoid creating invalid reg-moves for the auto-increment insns),

+	 or !single_set with the exception of instructions that include

+	 count_reg---these instructions are part of the control part

+	 that do-loop recognizes.

          ??? Should handle auto-increment insns.
          ??? Should handle insns defining subregs.  */
      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
@@ -1021,7 +1023,8 @@  sms_schedule (void)

         if (CALL_P (insn)
             || BARRIER_P (insn)
             || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
-                && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)

+                && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE

+                && !reg_mentioned_p (count_reg, insn))

             || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
             || (INSN_P (insn) && (set = single_set (insn))
                 && GET_CODE (SET_DEST (set)) == SUBREG))
Index: loop-doloop.c

===================================================================
--- loop-doloop.c	(revision 168987)

+++ loop-doloop.c	(working copy)

@@ -78,6 +78,8 @@  doloop_condition_get (rtx doloop_pat)

   rtx inc_src;
   rtx condition;
   rtx pattern;
+  rtx cc_reg = NULL_RTX;

+  rtx reg_orig = NULL_RTX;

 
   /* The canonical doloop pattern we expect has one of the following
      forms:
@@ -96,7 +98,16 @@  doloop_condition_get (rtx doloop_pat)

      2)  (set (reg) (plus (reg) (const_int -1))
          (set (pc) (if_then_else (reg != 0)
 	                         (label_ref (label))
-			         (pc))).  */

+			         (pc))).  

+

+     Some targets (ARM) do the comparison before the branch, as in the

+     following form:

+

+     3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))

+                   (set (reg) (plus (reg) (const_int -1)))])

+        (set (pc) (if_then_else (cc == NE)

+                                (label_ref (label))

+                                (pc))) */

 
   pattern = PATTERN (doloop_pat);
 
@@ -104,19 +115,47 @@  doloop_condition_get (rtx doloop_pat)

     {
       rtx cond;
       rtx prev_insn = prev_nondebug_insn (doloop_pat);
+      rtx cmp_arg1, cmp_arg2;

+      rtx cmp_orig;

 
-      /* We expect the decrement to immediately precede the branch.  */

+      /* In case the pattern is not PARALLEL we expect two forms

+	 of doloop which are cases 2) and 3) above: in case 2) the

+	 decrement immediately precedes the branch, while in case 3)

+	 the compare and decrement instructions immediately precede

+	 the branch.  */

 
       if (prev_insn == NULL_RTX || !INSN_P (prev_insn))
         return 0;
 
       cmp = pattern;
-      inc = PATTERN (PREV_INSN (doloop_pat));

+      if (GET_CODE (PATTERN (prev_insn)) == PARALLEL)

+        {

+	  /* The third case: the compare and decrement instructions

+	     immediately precede the branch.  */

+	  cmp_orig = XVECEXP (PATTERN (prev_insn), 0, 0);

+	  if (GET_CODE (cmp_orig) != SET)

+	    return 0;

+	  if (GET_CODE (SET_SRC (cmp_orig)) != COMPARE)

+	    return 0;

+	  cmp_arg1 = XEXP (SET_SRC (cmp_orig), 0);

+          cmp_arg2 = XEXP (SET_SRC (cmp_orig), 1);

+	  if (cmp_arg2 != const0_rtx 

+	      || GET_CODE (cmp_arg1) != PLUS)

+	    return 0;

+	  reg_orig = XEXP (cmp_arg1, 0);

+	  if (XEXP (cmp_arg1, 1) != GEN_INT (-1) 

+	      || !REG_P (reg_orig))

+	    return 0;

+	  cc_reg = SET_DEST (cmp_orig);

+	  

+	  inc = XVECEXP (PATTERN (prev_insn), 0, 1);

+	}

+      else

+        inc = PATTERN (PREV_INSN (doloop_pat));

       /* We expect the condition to be of the form (reg != 0)  */
       cond = XEXP (SET_SRC (cmp), 0);
       if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
         return 0;
-

     }
   else
     {
@@ -162,11 +201,15 @@  doloop_condition_get (rtx doloop_pat)

     return 0;
 
   if ((XEXP (condition, 0) == reg)
+      /* For the third case:  */  

+      || ((cc_reg != NULL_RTX)

+	  && (XEXP (condition, 0) == cc_reg)

+	  && (reg_orig == reg))

       || (GET_CODE (XEXP (condition, 0)) == PLUS
-		   && XEXP (XEXP (condition, 0), 0) == reg))

+	  && XEXP (XEXP (condition, 0), 0) == reg))

    {
      if (GET_CODE (pattern) != PARALLEL)
-     /*  The second form we expect:

+     /*  For the second form we expect:

 
          (set (reg) (plus (reg) (const_int -1))
          (set (pc) (if_then_else (reg != 0)
@@ -181,7 +224,24 @@  doloop_condition_get (rtx doloop_pat)

                      (set (reg) (plus (reg) (const_int -1)))
                      (additional clobbers and uses)])
 
-         So we return that form instead.

+        For the third form we expect:

+

+        (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))

+                   (set (reg) (plus (reg) (const_int -1)))])

+        (set (pc) (if_then_else (cc == NE)

+                                (label_ref (label))

+                                (pc))) 

+

+        which is equivalent to the following:

+

+        (parallel [(set (cc) (compare (reg,  1))

+                   (set (reg) (plus (reg) (const_int -1)))

+                   (set (pc) (if_then_else (NE == cc)

+                                           (label_ref (label))

+                                           (pc))))])

+

+        So we return the second form instead for the two cases.

+

      */
         condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
 
Index: config/arm/thumb2.md

===================================================================
--- config/arm/thumb2.md	(revision 168987)

+++ config/arm/thumb2.md	(working copy)

@@ -836,7 +836,7 @@ 

   "operands[4] = GEN_INT (- INTVAL (operands[2]));"
 )
 
-(define_insn "*thumb2_addsi3_compare0"

+(define_insn "thumb2_addsi3_compare0"

   [(set (reg:CC_NOOV CC_REGNUM)
 	(compare:CC_NOOV
 	  (plus:SI (match_operand:SI 1 "s_register_operand" "l,  0, r")
@@ -1118,3 +1118,53 @@ 

   "
   operands[2] = GEN_INT (32 - INTVAL (operands[2]));
   ")
+

+;; Define the subtract-one-and-jump insns so loop.c

+;; knows what to generate.

+(define_expand "doloop_end"

+  [(use (match_operand 0 "" ""))      ; loop pseudo

+   (use (match_operand 1 "" ""))      ; iterations; zero if unknown

+   (use (match_operand 2 "" ""))      ; max iterations

+   (use (match_operand 3 "" ""))      ; loop level

+   (use (match_operand 4 "" ""))]     ; label

+  "TARGET_32BIT"

+  "

+ {

+   /* Currently SMS relies on the do-loop pattern to recognize loops

+      where (1) the control part consists of all insns defining and/or

+      using a certain 'count' register and (2) the loop count can be

+      adjusted by modifying this register prior to the loop.

+      ??? The possible introduction of a new block to initialize the

+      new IV can potentially affect branch optimizations.  */

+   if (optimize > 0 && flag_modulo_sched)

+   {

+     rtx s0;

+     rtx bcomp;

+     rtx loc_ref;

+     rtx cc_reg;

+     rtx insn;

+     rtx cmp;

+

+     /* Only use this on innermost loops.  */

+     if (INTVAL (operands[3]) > 1)

+       FAIL;

+     if (GET_MODE (operands[0]) != SImode)

+       FAIL;

+

+     s0 = operands [0];

+     if (TARGET_THUMB2)

+       insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));

+     else

+       insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));

+     cmp = XVECEXP (PATTERN (insn), 0, 0);

+     cc_reg = SET_DEST (cmp);

+     bcomp = gen_rtx_NE(VOIDmode, cc_reg, const0_rtx);

+     loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [4]);

+     emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,

+                                  gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,

+                                                        loc_ref, pc_rtx)));

+     DONE;

+   }else

+      FAIL;

+ }")

+

Index: config/arm/arm.md

===================================================================
--- config/arm/arm.md	(revision 168987)

+++ config/arm/arm.md	(working copy)

@@ -791,7 +791,7 @@ 

   ""
 )
 
-(define_insn "*addsi3_compare0"

+(define_insn "addsi3_compare0"

   [(set (reg:CC_NOOV CC_REGNUM)
 	(compare:CC_NOOV
 	 (plus:SI (match_operand:SI 1 "s_register_operand" "r, r")