diff mbox

Fwd: [PATCH] Scheduling result adjustment to enable macro-fusion

Message ID CA+4CFy6dJFq7VCKXu3Mb7EZN4Xk3V3m_Ss=x6dXz41wR31mYpA@mail.gmail.com
State New
Headers show

Commit Message

Wei Mi Sept. 9, 2013, 5:46 p.m. UTC
Added a testcase. Bootstrap and regression tests are OK for the patch in the last mail.

2013-09-09  Wei Mi  <wmi@google.com>

        * gcc/testsuite/gcc.dg/macro-fusion-1.c: New.

+

On Fri, Sep 6, 2013 at 10:39 AM, Wei Mi <wmi@google.com> wrote:
> SCHED_GROUP works after I add chain_to_prev_insn after
> add_branch_dependences, in order to chain control dependences to prev
> insn for sched group. Here is the new patch. Testing is going on.
>
> Thanks,
> Wei Mi.
>
> 2013-09-06  Wei Mi  <wmi@google.com>
>
>         * config/i386/i386.c (ix86_macro_fusion_p): New function.
>         (ix86_macro_fusion_pair_p): Ditto.
>         * config/i386/x86-tune.def (DEF_TUNE): Add m_COREI7 for
>         X86_TUNE_FUSE_CMP_AND_BRANCH.
>         * sched-deps.c (group_insns_for_macro_fusion): New function.
>         (sched_analyze_insn): Call group_insns_for_macro_fusion.
>         (chain_to_prev_insn): Change it from static to extern.
>         (chain_to_prev_insn_p): Ditto.
>         * doc/tm.texi: Generated.
>         * doc/tm.texi.in: Ditto.
>         * sched-int.h: New declarations.
>         * sched-rgn.c (add_branch_dependences): Chain control
>         dependences to prev insn for sched group.
>         * target.def: Add macro_fusion_p and macro_fusion_pair_p.
>
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 201963)
> +++ config/i386/i386.c  (working copy)
> @@ -24850,6 +24850,99 @@ ia32_multipass_dfa_lookahead (void)
>      }
>  }
>
> +/* Return true if target platform supports macro-fusion.  */
> +
> +static bool
> +ix86_macro_fusion_p ()
> +{
> +  if (TARGET_FUSE_CMP_AND_BRANCH)
> +    return true;
> +  else
> +    return false;
> +}
> +
> +/* Check whether current microarchitecture support macro fusion
> +   for insn pair "CONDGEN + CONDJMP". Refer to
> +   "Intel Architectures Optimization Reference Manual". */
> +
> +static bool
> +ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
> +{
> +  rtx src;
> +  if (!strcmp (ix86_tune_string, "corei7"))
> +    {
> +      /* For Nehalem.  */
> +      rtx single_set = single_set (condgen);
> +      /* Nehalem doesn't support macro-fusion for add/sub+jmp.  */
> +      if (single_set == NULL_RTX)
> +        return false;
> +
> +      src = SET_SRC (single_set);
> +      if (GET_CODE (src) != COMPARE)
> +       return false;
> +
> +      /* Nehalem doesn't support macro-fusion for cmp/test MEM-IMM
> +        insn pattern.  */
> +      if ((MEM_P (XEXP (src, 0))
> +          && CONST_INT_P (XEXP (src, 1)))
> +         || (MEM_P (XEXP (src, 1))
> +             && CONST_INT_P (XEXP (src, 0))))
> +       return false;
> +
> +      /* Nehalem doesn't support macro-fusion for add/sub/dec/inc + jmp.  */
> +      if (get_attr_type (condgen) != TYPE_TEST
> +         && get_attr_type (condgen) != TYPE_ICMP)
> +       return false;
> +      return true;
> +    }
> +  else if (!strcmp (ix86_tune_string, "corei7-avx"))
> +    {
> +      /* For Sandybridge.  */
> +      enum rtx_code ccode;
> +      rtx compare_set = NULL_RTX, test_if, cond;
> +      rtx single_set = single_set (condgen);
> +      if (single_set != NULL_RTX)
> +        compare_set = single_set;
> +      else
> +       {
> +         int i;
> +         rtx pat = PATTERN (condgen);
> +         for (i = 0; i < XVECLEN (pat, 0); i++)
> +           if (GET_CODE (XVECEXP (pat, 0, i)) == SET
> +               && GET_CODE (SET_SRC (XVECEXP (pat, 0, i))) == COMPARE)
> +             compare_set = XVECEXP (pat, 0, i);
> +       }
> +
> +      if (compare_set == NULL_RTX)
> +       return false;
> +      src = SET_SRC (compare_set);
> +      if (GET_CODE (src) != COMPARE)
> +       return false;
> +
> +      /* Sandybridge doesn't support macro-fusion for cmp/test MEM-IMM
> +        insn pattern.  */
> +      if ((MEM_P (XEXP (src, 0))
> +           && CONST_INT_P (XEXP (src, 1)))
> +          || (MEM_P (XEXP (src, 1))
> +              && CONST_INT_P (XEXP (src, 0))))
> +        return false;
> +
> +      /* Sandybridge doesn't support macro-fusion for inc/dec +
> +        unsigned comparison jmp.  */
> +      test_if = SET_SRC (pc_set (condjmp));
> +      cond = XEXP (test_if, 0);
> +      ccode = GET_CODE (cond);
> +      if (get_attr_type (condgen) == TYPE_INCDEC
> +         && (ccode == GEU
> +             || ccode == GTU
> +             || ccode == LEU
> +             || ccode == LTU))
> +       return false;
> +      return true;
> +    }
> +  return false;
> +}
> +
>  /* Try to reorder ready list to take advantage of Atom pipelined IMUL
>     execution. It is applied if
>     (1) IMUL instruction is on the top of list;
> @@ -42982,6 +43075,10 @@ ix86_memmodel_check (unsigned HOST_WIDE_
>  #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
>  #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
>    ia32_multipass_dfa_lookahead
> +#undef TARGET_SCHED_MACRO_FUSION_P
> +#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
> +#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
> +#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
>
>  #undef TARGET_FUNCTION_OK_FOR_SIBCALL
>  #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
> Index: config/i386/x86-tune.def
> ===================================================================
> --- config/i386/x86-tune.def    (revision 201963)
> +++ config/i386/x86-tune.def    (working copy)
> @@ -196,7 +196,8 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS,
>  /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
>     with a subsequent conditional jump instruction into a single
>     compare-and-branch uop.  */
> -DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
> +DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch",
> +          m_COREI7 | m_BDVER)
>  /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>     will impact LEA instruction selection. */
>  DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
> Index: sched-deps.c
> ===================================================================
> --- sched-deps.c        (revision 201963)
> +++ sched-deps.c        (working copy)
> @@ -487,7 +487,6 @@ static void add_dependence_list (rtx, rt
>  static void add_dependence_list_and_free (struct deps_desc *, rtx,
>                                           rtx *, int, enum reg_note, bool);
>  static void delete_all_dependences (rtx);
> -static void chain_to_prev_insn (rtx);
>
>  static void flush_pending_lists (struct deps_desc *, rtx, int, int);
>  static void sched_analyze_1 (struct deps_desc *, rtx, rtx);
> @@ -1660,7 +1659,7 @@ delete_all_dependences (rtx insn)
>     chains backwards. Then we add the dependencies for the group to
>     the previous nonnote insn.  */
>
> -static void
> +void
>  chain_to_prev_insn (rtx insn)
>  {
>    sd_iterator_def sd_it;
> @@ -2821,6 +2820,35 @@ sched_analyze_2 (struct deps_desc *deps,
>      sched_deps_info->finish_rhs ();
>  }
>
> +/* If the last cond jump and the cond register defining insn are consecutive
> +   before scheduling, we want them to be in a schedule group. This is good
> +   for performance on microarchitectures supporting macro-fusion.  */
> +
> +static void
> +group_insns_for_macro_fusion (rtx insn)
> +{
> +  unsigned int condreg1, condreg2;
> +  rtx cc_reg_1;
> +  rtx prev;
> +
> +  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
> +  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
> +  prev = prev_nonnote_nondebug_insn (insn);
> +  if (!any_condjump_p (insn)
> +      || !reg_referenced_p (cc_reg_1, PATTERN (insn))
> +      || !prev
> +      || !modified_in_p (cc_reg_1, prev))
> +    return;
> +
> +  /* Different microarchitectures support macro fusions for different
> +     combinations of insn pairs.  */
> +  if (!targetm.sched.macro_fusion_pair_p
> +      || !targetm.sched.macro_fusion_pair_p (prev, insn))
> +    return;
> +
> +  SCHED_GROUP_P (insn) = 1;
> +}
> +
>  /* Analyze an INSN with pattern X to find all dependencies.  */
>  static void
>  sched_analyze_insn (struct deps_desc *deps, rtx x, rtx insn)
> @@ -2844,6 +2872,10 @@ sched_analyze_insn (struct deps_desc *de
>    can_start_lhs_rhs_p = (NONJUMP_INSN_P (insn)
>                          && code == SET);
>
> +  if (targetm.sched.macro_fusion_p
> +      && targetm.sched.macro_fusion_p ())
> +    group_insns_for_macro_fusion (insn);
> +
>    if (may_trap_p (x))
>      /* Avoid moving trapping instructions across function calls that might
>         not always return.  */
> @@ -3504,7 +3536,7 @@ call_may_noreturn_p (rtx insn)
>     group, and if all INSN's dependencies should be moved to the first
>     instruction of that group.  */
>
> -static bool
> +bool
>  chain_to_prev_insn_p (rtx insn)
>  {
>    rtx prev, x;
> Index: doc/tm.texi
> ===================================================================
> --- doc/tm.texi (revision 201963)
> +++ doc/tm.texi (working copy)
> @@ -6553,6 +6553,17 @@ scheduling one insn causes other insns t
>  cycle.  These other insns can then be taken into account properly.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_P (void)
> +This hook is used to check whether target platform supports macro fusion.
> +@end deftypefn
> +
> +@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_PAIR_P (rtx @var{condgen}, rtx @var{condjmp})
> +This hook is used to check whether two insns could be macro fused for
> +target microarchitecture. If this hook returns true for the given insn pair
> +(@var{condgen} and @var{condjmp}), scheduler will put them into a sched
> +group, and they will not be scheduled apart.
> +@end deftypefn
> +
> @deftypefn {Target Hook} void TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK (rtx @var{head}, rtx @var{tail})
>  This hook is called after evaluation forward dependencies of insns in
>  chain given by two parameter values (@var{head} and @var{tail}
> Index: doc/tm.texi.in
> ===================================================================
> --- doc/tm.texi.in      (revision 201963)
> +++ doc/tm.texi.in      (working copy)
> @@ -4940,6 +4940,10 @@ them: try the first ones in this list fi
>
>  @hook TARGET_SCHED_REORDER2
>
> +@hook TARGET_SCHED_MACRO_FUSION_P
> +
> +@hook TARGET_SCHED_MACRO_FUSION_PAIR_P
> +
>  @hook TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
>
>  @hook TARGET_SCHED_INIT
> Index: sched-int.h
> ===================================================================
> --- sched-int.h (revision 201963)
> +++ sched-int.h (working copy)
> @@ -1302,6 +1302,8 @@ extern void finish_deps_global (void);
>  extern void deps_analyze_insn (struct deps_desc *, rtx);
>  extern void remove_from_deps (struct deps_desc *, rtx);
>  extern void init_insn_reg_pressure_info (rtx);
> +extern bool chain_to_prev_insn_p (rtx insn);
> +extern void chain_to_prev_insn (rtx);
>
>  extern dw_t get_dep_weak (ds_t, ds_t);
>  extern ds_t set_dep_weak (ds_t, ds_t, dw_t);
> Index: sched-rgn.c
> ===================================================================
> --- sched-rgn.c (revision 201963)
> +++ sched-rgn.c (working copy)
> @@ -2507,7 +2507,7 @@ add_branch_dependences (rtx head, rtx ta
>        }
>
>    if (!targetm.have_conditional_execution ())
> -    return;
> +    goto chain_to_prev_insn;
>
>    /* Finally, if the block ends in a jump, and we are doing intra-block
>       scheduling, make sure that the branch depends on any COND_EXEC insns
> @@ -2543,7 +2543,7 @@ add_branch_dependences (rtx head, rtx ta
>       could remove always-true predicates.  */
>
>    if (!reload_completed || ! (JUMP_P (tail) || JUMP_TABLE_DATA_P (tail)))
> -    return;
> +    goto chain_to_prev_insn;
>
>    insn = tail;
>    while (insn != head)
> @@ -2557,6 +2557,23 @@ add_branch_dependences (rtx head, rtx ta
>        if (INSN_P (insn) && GET_CODE (PATTERN (insn)) == COND_EXEC)
>         add_dependence (tail, insn, REG_DEP_ANTI);
>      }
> +
> + chain_to_prev_insn:
> +  /* Control dependences also need to be chained to the prev insn
> +     for sched group.  */
> +  insn = tail;
> +  while (insn != head)
> +    {
> +      /* Fixup the dependencies in the sched group.  */
> +      if (JUMP_P (insn)
> +         && chain_to_prev_insn_p (insn)
> +         && !sel_sched_p ())
> +       chain_to_prev_insn (insn);
> +
> +      insn = PREV_INSN (insn);
> +    }
> +
> +  return;
>  }
>
>  /* Data structures for the computation of data dependences in a regions.  We
> Index: target.def
> ===================================================================
> --- target.def  (revision 201963)
> +++ target.def  (working copy)
> @@ -1041,6 +1041,19 @@ scheduling one insn causes other insns t
>  cycle.  These other insns can then be taken into account properly.",
>   int, (FILE *file, int verbose, rtx *ready, int *n_readyp, int clock), NULL)
>
> +DEFHOOK
> +(macro_fusion_p,
> + "This hook is used to check whether target platform supports macro fusion.",
> + bool, (void), NULL)
> +
> +DEFHOOK
> +(macro_fusion_pair_p,
> + "This hook is used to check whether two insns could be macro fused for\n\
> +target microarchitecture. If this hook returns true for the given insn pair\n\
> +(@var{condgen} and @var{condjmp}), scheduler will put them into a sched\n\
> +group, and they will not be scheduled apart.",
> + bool, (rtx condgen, rtx condjmp), NULL)
> +
>  /* The following member value is a pointer to a function called
>     after evaluation forward dependencies of insns in chain given
>     by two parameter values (head and tail correspondingly).  */
diff mbox

Patch

Index: gcc/testsuite/gcc.dg/macro-fusion-1.c
===================================================================
--- gcc/testsuite/gcc.dg/macro-fusion-1.c       (revision 0)
+++ gcc/testsuite/gcc.dg/macro-fusion-1.c       (revision 0)
@@ -0,0 +1,14 @@ 
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -mtune=corei7 -fdump-rtl-sched2" } */
+/* { dg-final { scan-rtl-dump-not "compare.*insn.*jump_insn.*jump_insn" "sched2" } } */
+
+int a[100];
+
+double bar (double sum)
+{
+  int i;
+  for (i = 0; i < 1000000; i++)
+   sum += (0.5 + (a[i%100] - 128));
+  return sum;
+}