Patchwork Fwd: [PATCH] Scheduling result adjustment to enable macro-fusion

login
register
mail settings
Submitter Wei Mi
Date Sept. 13, 2013, 5:28 p.m.
Message ID <CA+4CFy7mAS=Lz5errn3dTnYcXcndRKBQ8O998QebMv5ggF=-EQ@mail.gmail.com>
Download mbox | patch
Permalink /patch/274815/
State New
Headers show

Comments

Wei Mi - Sept. 13, 2013, 5:28 p.m.
> Thanks.  At this point you need feedback from x86 and scheduler maintainers.
> I would recommend you to resubmit the patch with a Changelog text, and with
> the text of the patch inline in the email (your last mail has the patch as a
> binary attachment, which makes it harder to review and respond to).  Please
> mention if the updated patch passes bootstrap and regtest.

Thanks! Here is the new patch. bootstrap and regression pass. ok for trunk?

2013-09-13  Wei Mi  <wmi@google.com>

        * sched-rgn.c (add_branch_dependences): Keep insns in
        a SCHED_GROUP at the end of bb to remain their locations.
        * config/i386/x86-tune.def (DEF_TUNE): Add m_COREI7 for
        X86_TUNE_FUSE_CMP_AND_BRANCH.
        * config/i386/i386.c (ix86_macro_fusion_p): New Function.
        (ix86_macro_fusion_pair_p): Ditto.
        * doc/tm.texi.in: Generated.
        * doc/tm.texi: Ditto.
        * target.def: Add two hooks: macro_fusion_p and
        macro_fusion_pair_p.
        * haifa-sched.c (try_group_insn): New function.
        (group_insns_for_macro_fusion): New function.
        (sched_init): Call group_insns_for_macro_fusion.
H.J. Lu - Sept. 13, 2013, 6:07 p.m.
On Fri, Sep 13, 2013 at 10:28 AM, Wei Mi <wmi@google.com> wrote:
>> Thanks.  At this point you need feedback from x86 and scheduler maintainers.
>> I would recommend you to resubmit the patch with a Changelog text, and with
>> the text of the patch inline in the email (your last mail has the patch as a
>> binary attachment, which makes it harder to review and respond to).  Please
>> mention if the updated patch passes bootstrap and regtest.
>
> Thanks! Here is the new patch. bootstrap and regression pass. ok for trunk?
>
> 2013-09-13  Wei Mi  <wmi@google.com>
>
>         * sched-rgn.c (add_branch_dependences): Keep insns in
>         a SCHED_GROUP at the end of bb to remain their locations.
>         * config/i386/x86-tune.def (DEF_TUNE): Add m_COREI7 for
>         X86_TUNE_FUSE_CMP_AND_BRANCH.
>         * config/i386/i386.c (ix86_macro_fusion_p): New Function.
>         (ix86_macro_fusion_pair_p): Ditto.
>         * doc/tm.texi.in: Generated.
>         * doc/tm.texi: Ditto.
>         * target.def: Add two hooks: macro_fusion_p and
>         macro_fusion_pair_p.
>         * haifa-sched.c (try_group_insn): New function.
>         (group_insns_for_macro_fusion): New function.
>         (sched_init): Call group_insns_for_macro_fusion.
>

> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 201963)
> +++ config/i386/i386.c  (working copy)
> @@ -24850,6 +24850,99 @@ ia32_multipass_dfa_lookahead (void)
>      }
>  }
>
> +/* Return true if target platform supports macro-fusion.  */
> +
> +static bool
> +ix86_macro_fusion_p ()
> +{
> +  if (TARGET_FUSE_CMP_AND_BRANCH)
> +    return true;
> +  else
> +    return false;
> +}
> +
> +/* Check whether current microarchitecture support macro fusion
> +   for insn pair "CONDGEN + CONDJMP". Refer to
> +   "Intel Architectures Optimization Reference Manual". */
> +
> +static bool
> +ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
> +{
> +  rtx src;
> +  if (!strcmp (ix86_tune_string, "corei7"))
> +    {
> +      /* For Nehalem.  */
> +      rtx single_set = single_set (condgen);
> +      /* Nehalem doesn't support macro-fusion for add/sub+jmp.  */
> +      if (single_set == NULL_RTX)
> +        return false;
> +
> +      src = SET_SRC (single_set);
> +      if (GET_CODE (src) != COMPARE)
> +       return false;
> +
> +      /* Nehalem doesn't support macro-fusion for cmp/test MEM-IMM
> +        insn pattern.  */
> +      if ((MEM_P (XEXP (src, 0))
> +          && CONST_INT_P (XEXP (src, 1)))
> +         || (MEM_P (XEXP (src, 1))
> +             && CONST_INT_P (XEXP (src, 0))))
> +       return false;
> +
> +      /* Nehalem doesn't support macro-fusion for add/sub/dec/inc + jmp.  */
> +      if (get_attr_type (condgen) != TYPE_TEST
> +         && get_attr_type (condgen) != TYPE_ICMP)
> +       return false;
> +      return true;
> +    }
> +  else if (!strcmp (ix86_tune_string, "corei7-avx"))
> +    {
> +      /* For Sandybridge.  */
> +      enum rtx_code ccode;
> +      rtx compare_set = NULL_RTX, test_if, cond;
> +      rtx single_set = single_set (condgen);
> +      if (single_set != NULL_RTX)
> +        compare_set = single_set;
> +      else
> +       {
> +         int i;
> +         rtx pat = PATTERN (condgen);
> +         for (i = 0; i < XVECLEN (pat, 0); i++)
> +           if (GET_CODE (XVECEXP (pat, 0, i)) == SET
> +               && GET_CODE (SET_SRC (XVECEXP (pat, 0, i))) == COMPARE)
> +             compare_set = XVECEXP (pat, 0, i);
> +       }
> +
> +      if (compare_set == NULL_RTX)
> +       return false;
> +      src = SET_SRC (compare_set);
> +      if (GET_CODE (src) != COMPARE)
> +       return false;
> +
> +      /* Sandybridge doesn't support macro-fusion for cmp/test MEM-IMM
> +        insn pattern.  */
> +      if ((MEM_P (XEXP (src, 0))
> +           && CONST_INT_P (XEXP (src, 1)))
> +          || (MEM_P (XEXP (src, 1))
> +              && CONST_INT_P (XEXP (src, 0))))
> +        return false;
> +
> +      /* Sandybridge doesn't support macro-fusion for inc/dec +
> +        unsigned comparison jmp.  */
> +      test_if = SET_SRC (pc_set (condjmp));
> +      cond = XEXP (test_if, 0);
> +      ccode = GET_CODE (cond);
> +      if (get_attr_type (condgen) == TYPE_INCDEC
> +         && (ccode == GEU
> +             || ccode == GTU
> +             || ccode == LEU
> +             || ccode == LTU))
> +       return false;
> +      return true;
> +    }
> +  return false;
> +}
> +

Checking corei7/corei7-avx explicitly isn't a good idea.
It is also useful for Ivy Bridge and Haswell.  I think you
should use a variable to control it, similar to
TARGET_FUSE_CMP_AND_BRANCH.
Wei Mi - Sept. 13, 2013, 6:28 p.m.
> Checking corei7/corei7-avx explicitly isn't a good idea.
> It is also useful for Ivy Bridge and Haswell.  I think you
> should use a variable to control it, similar to
> TARGET_FUSE_CMP_AND_BRANCH.
>
>
> --
> H.J.

Different x86 microarchitectures support macro-fusion for different
compare and branch combinations. I need to differentiate various x86
microarchitectures. If use TARGET_FUSE_CMP_AND_BRANCH like vars to
control it, it requires a bunch of them. That is why I choose to check
corei7/corei7-avx in that function. I don't add core-avx-i/core-avx2
for now because I don't have those machines for testing.

Thanks,
Wei Mi.
Andi Kleen - Sept. 13, 2013, 11:50 p.m.
Wei Mi <wmi@google.com> writes:

>> Checking corei7/corei7-avx explicitly isn't a good idea.
>> It is also useful for Ivy Bridge and Haswell.  I think you
>> should use a variable to control it, similar to
>> TARGET_FUSE_CMP_AND_BRANCH.
>>
>>
>> --
>> H.J.
>
> Different x86 microarchitectures support macro-fusion for different
> compare and branch combinations. I need to differentiate various x86
> microarchitectures. If use TARGET_FUSE_CMP_AND_BRANCH like vars to
> control it, it requires a bunch of them. That is why I choose to check
> corei7/corei7-avx in that function. I don't add core-avx-i/core-avx2
> for now because I don't have those machines for testing.

They are normally a super set of each other.
So flags as suggested by HJ will work better than explicit matches.

-Andi

Patch

Index: sched-rgn.c
===================================================================
--- sched-rgn.c (revision 201963)
+++ sched-rgn.c (working copy)
@@ -2443,6 +2443,8 @@  add_branch_dependences (rtx head, rtx ta
      cc0 setters remain at the end because they can't be moved away from
      their cc0 user.

+     Predecessors of SCHED_GROUP_P instructions at the end remain at the end.
+
      COND_EXEC insns cannot be moved past a branch (see e.g. PR17808).

      Insns setting TARGET_CLASS_LIKELY_SPILLED_P registers (usually return
@@ -2465,7 +2467,8 @@  add_branch_dependences (rtx head, rtx ta
 #endif
  || (!reload_completed
      && sets_likely_spilled (PATTERN (insn)))))
- || NOTE_P (insn))
+ || NOTE_P (insn)
+ || (last != 0 && SCHED_GROUP_P (last)))
     {
       if (!NOTE_P (insn))
  {
Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def    (revision 201963)
+++ config/i386/x86-tune.def    (working copy)
@@ -196,7 +196,8 @@  DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS,
 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
    with a subsequent conditional jump instruction into a single
    compare-and-branch uop.  */
-DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch",
+          m_COREI7 | m_BDVER)
 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
    will impact LEA instruction selection. */
 DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 201963)
+++ config/i386/i386.c  (working copy)
@@ -24850,6 +24850,99 @@  ia32_multipass_dfa_lookahead (void)
     }
 }

+/* Return true if target platform supports macro-fusion.  */
+
+static bool
+ix86_macro_fusion_p ()
+{
+  if (TARGET_FUSE_CMP_AND_BRANCH)
+    return true;
+  else
+    return false;
+}
+
+/* Check whether current microarchitecture support macro fusion
+   for insn pair "CONDGEN + CONDJMP". Refer to
+   "Intel Architectures Optimization Reference Manual". */
+
+static bool
+ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
+{
+  rtx src;
+  if (!strcmp (ix86_tune_string, "corei7"))
+    {
+      /* For Nehalem.  */
+      rtx single_set = single_set (condgen);
+      /* Nehalem doesn't support macro-fusion for add/sub+jmp.  */
+      if (single_set == NULL_RTX)
+        return false;
+
+      src = SET_SRC (single_set);
+      if (GET_CODE (src) != COMPARE)
+       return false;
+
+      /* Nehalem doesn't support macro-fusion for cmp/test MEM-IMM
+        insn pattern.  */
+      if ((MEM_P (XEXP (src, 0))
+          && CONST_INT_P (XEXP (src, 1)))
+         || (MEM_P (XEXP (src, 1))
+             && CONST_INT_P (XEXP (src, 0))))
+       return false;
+
+      /* Nehalem doesn't support macro-fusion for add/sub/dec/inc + jmp.  */
+      if (get_attr_type (condgen) != TYPE_TEST
+         && get_attr_type (condgen) != TYPE_ICMP)
+       return false;
+      return true;
+    }
+  else if (!strcmp (ix86_tune_string, "corei7-avx"))
+    {
+      /* For Sandybridge.  */
+      enum rtx_code ccode;
+      rtx compare_set = NULL_RTX, test_if, cond;
+      rtx single_set = single_set (condgen);
+      if (single_set != NULL_RTX)
+        compare_set = single_set;
+      else
+       {
+         int i;
+         rtx pat = PATTERN (condgen);
+         for (i = 0; i < XVECLEN (pat, 0); i++)
+           if (GET_CODE (XVECEXP (pat, 0, i)) == SET
+               && GET_CODE (SET_SRC (XVECEXP (pat, 0, i))) == COMPARE)
+             compare_set = XVECEXP (pat, 0, i);
+       }
+
+      if (compare_set == NULL_RTX)
+       return false;
+      src = SET_SRC (compare_set);
+      if (GET_CODE (src) != COMPARE)
+       return false;
+
+      /* Sandybridge doesn't support macro-fusion for cmp/test MEM-IMM
+        insn pattern.  */
+      if ((MEM_P (XEXP (src, 0))
+           && CONST_INT_P (XEXP (src, 1)))
+          || (MEM_P (XEXP (src, 1))
+              && CONST_INT_P (XEXP (src, 0))))
+        return false;
+
+      /* Sandybridge doesn't support macro-fusion for inc/dec +
+        unsigned comparison jmp.  */
+      test_if = SET_SRC (pc_set (condjmp));
+      cond = XEXP (test_if, 0);
+      ccode = GET_CODE (cond);
+      if (get_attr_type (condgen) == TYPE_INCDEC
+         && (ccode == GEU
+             || ccode == GTU
+             || ccode == LEU
+             || ccode == LTU))
+       return false;
+      return true;
+    }
+  return false;
+}
+
 /* Try to reorder ready list to take advantage of Atom pipelined IMUL
    execution. It is applied if
    (1) IMUL instruction is on the top of list;
@@ -42982,6 +43075,10 @@  ix86_memmodel_check (unsigned HOST_WIDE_
 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
   ia32_multipass_dfa_lookahead
+#undef TARGET_SCHED_MACRO_FUSION_P
+#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
+#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
+#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p

 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
Index: doc/tm.texi.in
===================================================================
--- doc/tm.texi.in      (revision 201963)
+++ doc/tm.texi.in      (working copy)
@@ -4940,6 +4940,10 @@  them: try the first ones in this list fi

 @hook TARGET_SCHED_REORDER2

+@hook TARGET_SCHED_MACRO_FUSION_P
+
+@hook TARGET_SCHED_MACRO_FUSION_PAIR_P
+
 @hook TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK

 @hook TARGET_SCHED_INIT
Index: doc/tm.texi
===================================================================
--- doc/tm.texi (revision 201963)
+++ doc/tm.texi (working copy)
@@ -6553,6 +6553,17 @@  scheduling one insn causes other insns t
 cycle.  These other insns can then be taken into account properly.
 @end deftypefn

+@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_P (void)
+This hook is used to check whether target platform supports macro fusion.
+@end deftypefn
+
+@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_PAIR_P (rtx
@var{condgen}, rtx @var{condjmp})
+This hook is used to check whether two insns could be macro fused for
+target microarchitecture. If this hook returns true for the given insn pair
+(@var{condgen} and @var{condjmp}), scheduler will put them into a sched
+group, and they will not be scheduled apart.
+@end deftypefn
+
 @deftypefn {Target Hook} void
TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK (rtx @var{head}, rtx
@var{tail})
 This hook is called after evaluation forward dependencies of insns in
 chain given by two parameter values (@var{head} and @var{tail}
Index: target.def
===================================================================
--- target.def  (revision 201963)
+++ target.def  (working copy)
@@ -1041,6 +1041,19 @@  scheduling one insn causes other insns t
 cycle.  These other insns can then be taken into account properly.",
  int, (FILE *file, int verbose, rtx *ready, int *n_readyp, int clock), NULL)

+DEFHOOK
+(macro_fusion_p,
+ "This hook is used to check whether target platform supports macro fusion.",
+ bool, (void), NULL)
+
+DEFHOOK
+(macro_fusion_pair_p,
+ "This hook is used to check whether two insns could be macro fused for\n\
+target microarchitecture. If this hook returns true for the given insn pair\n\
+(@var{condgen} and @var{condjmp}), scheduler will put them into a sched\n\
+group, and they will not be scheduled apart.",
+ bool, (rtx condgen, rtx condjmp), NULL)
+
 /* The following member value is a pointer to a function called
    after evaluation forward dependencies of insns in chain given
    by two parameter values (head and tail correspondingly).  */
Index: haifa-sched.c
===================================================================
--- haifa-sched.c       (revision 201963)
+++ haifa-sched.c       (working copy)
@@ -6519,6 +6519,44 @@  setup_sched_dump (void)
                ? stderr : dump_file);
 }

+static void
+try_group_insn (rtx insn)
+{
+  unsigned int condreg1, condreg2;
+  rtx cc_reg_1;
+  rtx prev;
+
+  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
+  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+  prev = prev_nonnote_nondebug_insn (insn);
+  if (!any_condjump_p (insn)
+      || !reg_referenced_p (cc_reg_1, PATTERN (insn))
+      || !prev
+      || !modified_in_p (cc_reg_1, prev))
+    return;
+
+  /* Different microarchitectures support macro fusions for different
+     combinations of insn pairs.  */
+  if (!targetm.sched.macro_fusion_pair_p
+      || !targetm.sched.macro_fusion_pair_p (prev, insn))
+    return;
+
+  SCHED_GROUP_P (insn) = 1;
+}
+
+/* If the last cond jump and the cond register defining insn are consecutive
+   before scheduling, we want them to be in a schedule group. This is good
+   for performance on microarchitectures supporting macro-fusion.  */
+
+static void
+group_insns_for_macro_fusion ()
+{
+  basic_block bb;
+
+  FOR_EACH_BB (bb)
+    try_group_insn (BB_END (bb));
+}
+
 /* Initialize some global state for the scheduler.  This function works
    with the common data shared between all the schedulers.  It is called
    from the scheduler specific initialization routine.  */
@@ -6645,6 +6683,11 @@  sched_init (void)
     }

   curr_state = xmalloc (dfa_state_size);
+
+  /* Group compare and branch insns for macro-fusion.  */
+  if (targetm.sched.macro_fusion_p
+      && targetm.sched.macro_fusion_p ())
+    group_insns_for_macro_fusion ();
 }

 static void haifa_init_only_bb (basic_block, basic_block);