Patch for AMD Dispatch Scheduler

Message ID 47785.88689.qm@web33003.mail.mud.yahoo.com
State New

Commit Message

Reza Yazdani July 8, 2010, 9:35 p.m. UTC
This patch implements the dispatch scheduler for the upcoming AMD Bulldozer processor. It is implemented as an extension of the Haifa scheduler.

Dispatch scheduling is a new Bulldozer (BD) feature. It is composed of two parts: a scheduling part and an alignment part.

The scheduling part (this patch) arranges instructions to maximize the throughput of the hardware dispatcher. It makes sure dispatch window boundaries are roughly observed; only roughly, because the byte lengths of instructions are not known at scheduling time. On x86 some instruction lengths are not known until assembly time, when information such as branch offsets is computed. The scheduling part runs once before register allocation and once after register allocation.
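
To make the window bookkeeping concrete, here is a small self-contained model of the accounting the scheduling part performs. The limits (16-byte windows, at most four instructions/uops, two loads and one store per window) mirror the constants used in dispatch-sch.c, but the instruction record, the helper names (model_fits, model_add) and the driver are simplified stand-ins for the rtx-based logic in the patch; this is an illustration only, not part of the patch.

#include <stdbool.h>
#include <stdio.h>

#define WINDOW_BYTES 16   /* DISPATCH_WINDOW_SIZE in the patch.  */
#define WINDOW_INSNS 4    /* MAX_INSN.  */
#define WINDOW_LOADS 2    /* MAX_LOAD.  */
#define WINDOW_STORES 1   /* MAX_STORE.  */

/* Simplified stand-in for a scheduled instruction.  The real code
   derives these values from insn attributes and the RTL pattern.  */
struct model_insn
{
  int bytes;       /* Estimated encoded length in bytes.  */
  int uops;        /* Micro-ops: 1 for single path, 2 for double path.  */
  bool is_load;
  bool is_store;
};

/* Running totals for the window currently being filled.  */
struct model_window
{
  int bytes, insns, uops, loads, stores;
};

/* Return true if INSN can be added to W without breaking any of the
   per-window limits; this plays the role of fits_dispatch_window.  */
static bool
model_fits (const struct model_window *w, const struct model_insn *insn)
{
  return (w->insns + 1 <= WINDOW_INSNS
          && w->uops + insn->uops <= WINDOW_INSNS
          && w->bytes + insn->bytes <= WINDOW_BYTES
          && w->loads + insn->is_load <= WINDOW_LOADS
          && w->stores + insn->is_store <= WINDOW_STORES);
}

/* Account for INSN in W, starting a fresh window first when it does
   not fit; this is the essence of add_to_dispatch_window.  */
static void
model_add (struct model_window *w, const struct model_insn *insn)
{
  if (!model_fits (w, insn))
    {
      printf ("window closed: %d insns, %d bytes\n", w->insns, w->bytes);
      w->bytes = w->insns = w->uops = w->loads = w->stores = 0;
    }
  w->bytes += insn->bytes;
  w->insns += 1;
  w->uops += insn->uops;
  w->loads += insn->is_load;
  w->stores += insn->is_store;
}

int
main (void)
{
  struct model_window w = { 0, 0, 0, 0, 0 };
  struct model_insn load  = { 5, 1, true, false };
  struct model_insn alu   = { 3, 1, false, false };
  struct model_insn store = { 6, 1, false, true };

  model_add (&w, &load);
  model_add (&w, &alu);
  model_add (&w, &store);
  model_add (&w, &store);   /* Second store breaks the store and byte
                               limits, so a new window is started.  */
  printf ("open window: %d insns, %d bytes\n", w.insns, w.bytes);
  return 0;
}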

The alignment part (not in this patch) makes sure dispatch windows align at the correct boundaries.

A new command-line flag, -fdispatch-sched, is defined. It sets the new variable flag_dispatch_sched, which is enabled by default when -march=bdver1 is selected on the command line.

Testing
-------

A self-compile was run with "-fdispatch-sched -O3". No new tests are added for this implementation. "make check" of the i386 tests passes, with no difference in the number of failures with and without the dispatch flag.

ChangeLog

2010-07-08  Reza Yazdani  <reza.yazdani@amd.com>

    * doc/invoke.texi (fdispatch-sched): Documented.
    * haifa-sched.c (choose_ready): Add support for dispatch
    scheduling for bdver1.
    (schedule_block): Call add_to_dispatch_window.
    (sched_init): Call init_dispatch_sched.
    (ready_remove_first_dispatch): New.
    (print_ready_dispatch): New.
    (debug_ready_dispatch): New.
    * common.opt (fdispatch-sched): New.
    * sched-int.h (init_dispatch_sched): Declared.
    (add_to_dispatch_window): Declared.
    (fits_dispatch_window): Declared.
    (is_cmp): Declared.
    (dispatch_violation): Declared.
    (debug_print_dispatch_window): Declared.
    (debug_print_insn_dispatch_info): Declared.
    (debug_ready_dispatch): Declared.
    * Makefile.in (OBJS-common): Add dispatch-sch.o.
    (dispatch-sch.o): New rule.
    * dispatch-sch.c: New file.
    * config/i386/i386.c (override_options): Set flag_dispatch_sched
    for PROCESSOR_BDVER1.

Reza Yazdani

Patch

Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 161301)
+++ doc/invoke.texi	(working copy)
@@ -376,6 +376,7 @@  Objective-C and Objective-C++ Dialects}.
 -fsched-spec-insn-heuristic -fsched-rank-heuristic @gol
 -fsched-last-insn-heuristic -fsched-dep-count-heuristic @gol
 -fschedule-insns -fschedule-insns2 -fsection-anchors @gol
+-fdispatch-sched @gol
 -fselective-scheduling -fselective-scheduling2 @gol
 -fsel-sched-pipelining -fsel-sched-pipelining-outer-loops @gol
 -fsignaling-nans -fsingle-precision-constant -fsplit-ivs-in-unroller @gol
@@ -6464,6 +6465,16 @@  generated code and decrease its size by 
 increase above the number of available hard registers and as a
 consequence register spills in the register allocation.
 
+@item -fdispatch-sched
+@opindex fdispatch-sched
+Enable dispatch scheduling.  Dispatch scheduling is on when both
+@option{-march=bdver1} and instruction scheduling are enabled,
+i.e.@: with @option{-fschedule-insns}.  Some machines such as AMD
+Bulldozer perform more efficiently if a group of instructions has a
+certain alignment and a certain operation mix.  The dispatch scheduler
+aligns windows and puts the desired mix of instructions in dispatch
+windows.
+
 @item -fsched-spec-load
 @opindex fsched-spec-load
 Allow speculative motion of some load instructions.  This only makes
Index: haifa-sched.c
===================================================================
--- haifa-sched.c	(revision 161542)
+++ haifa-sched.c	(working copy)
@@ -532,6 +532,7 @@  static void extend_h_i_d (void);
 
 static void ready_add (struct ready_list *, rtx, bool);
 static rtx ready_remove_first (struct ready_list *);
+static rtx ready_remove_first_dispatch (struct ready_list *ready);
 
 static void queue_to_ready (struct ready_list *);
 static int early_queue_to_ready (state_t, struct ready_list *);
@@ -2646,7 +2647,10 @@  choose_ready (struct ready_list *ready, 
   if (lookahead <= 0 || SCHED_GROUP_P (ready_element (ready, 0))
       || DEBUG_INSN_P (ready_element (ready, 0)))
     {
-      *insn_ptr = ready_remove_first (ready);
+      if (flag_dispatch_sched)
+	*insn_ptr = ready_remove_first_dispatch (ready);
+      else
+	*insn_ptr = ready_remove_first (ready);
       return 0;
     }
   else
@@ -2656,6 +2660,7 @@  choose_ready (struct ready_list *ready, 
       rtx insn;
       int try_data = 1, try_control = 1;
       ds_t ts;
+      bool fits_in_dispatch_window_p = false;
 
       insn = ready_element (ready, 0);
       if (INSN_CODE (insn) < 0)
@@ -2664,6 +2669,14 @@  choose_ready (struct ready_list *ready, 
 	  return 0;
 	}
 
+      if (flag_dispatch_sched)
+	  for (i = 1; i < ready->n_ready; i++)
+	    if (fits_dispatch_window (ready_element (ready, i)))
+	      {
+		fits_in_dispatch_window_p = true;
+		break;
+	      }
+
       if (spec_info
 	  && spec_info->flags & (PREFER_NON_DATA_SPEC
 				 | PREFER_NON_CONTROL_SPEC))
@@ -2715,9 +2728,14 @@  choose_ready (struct ready_list *ready, 
 	{
 	  insn = ready_element (ready, i);
 
-	  ready_try [i]
-	    = ((!try_data && (TODO_SPEC (insn) & DATA_SPEC))
-               || (!try_control && (TODO_SPEC (insn) & CONTROL_SPEC)));
+	  if (fits_in_dispatch_window_p)
+	    ready_try [i] = fits_dispatch_window (insn)
+	      && ((!try_data && (TODO_SPEC (insn) & DATA_SPEC))
+		  || (!try_control && (TODO_SPEC (insn) & CONTROL_SPEC)));
+	  else
+	    ready_try [i] = ((!try_data && (TODO_SPEC (insn) & DATA_SPEC))
+			     || (!try_control
+				 && (TODO_SPEC (insn) & CONTROL_SPEC)));
 	}
 
       /* Let the target filter the search space.  */
@@ -3144,6 +3162,10 @@  schedule_block (basic_block *target_bb)
 						       last_scheduled_insn);
 
 	  move_insn (insn, last_scheduled_insn, current_sched_info->next_tail);
+
+	  if (flag_dispatch_sched)
+	    add_to_dispatch_window (insn);
+
 	  reemit_notes (insn);
 	  last_scheduled_insn = insn;
 
@@ -3368,8 +3390,11 @@  sched_init (void)
   flag_schedule_speculative_load = 0;
 #endif
 
+  if (flag_dispatch_sched)
+    init_dispatch_sched ();
   sched_pressure_p = (flag_sched_pressure && ! reload_completed
 		      && common_sched_info->sched_pass_id == SCHED_RGN_PASS);
+
   if (sched_pressure_p)
     ira_setup_eliminable_regset ();
 
@@ -5567,4 +5592,73 @@  sched_emit_insn (rtx pat)
   return insn;
 }
 
+/* Get first insn that fits in the dispatch window.  If none exists return
+   first insn.  */
+
+static rtx
+ready_remove_first_dispatch (struct ready_list *ready)
+{
+  int i;
+  rtx insn = ready_element (ready, 0);
+
+  if (ready->n_ready == 1
+      || INSN_CODE (insn) < 0
+      || !INSN_P (insn)
+      || !active_insn_p (insn)
+      || fits_dispatch_window (insn))
+    return ready_remove_first (ready);
+
+  for (i = 1; i < ready->n_ready; i++)
+    {
+      insn = ready_element (ready, i);
+      if (INSN_CODE (insn) < 0
+	  || !INSN_P (insn)
+	  || !active_insn_p (insn))
+	continue;
+      if (fits_dispatch_window (insn))
+	{
+	  /* Return ith element of ready.  */
+	  insn = ready_remove (ready, i);
+	  return insn;
+	}
+    }
+
+  if (dispatch_violation ())
+    return ready_remove_first (ready);
+
+  for (i = 1; i < ready->n_ready; i++)
+    {
+      insn = ready_element (ready, i);
+      if (INSN_CODE (insn) < 0
+	  || !INSN_P (insn)
+	  || !active_insn_p (insn))
+	continue;
+      if (is_cmp (insn))
+	  /* Return ith element of ready.  */
+	  return ready_remove (ready, i);
+    }
+
+  return ready_remove_first (ready);
+}
+
+/* Print status of the ready list with respect to dispatch windows.  */
+static void
+print_ready_dispatch (FILE *file)
+{
+  int i;
+
+  fprintf (file, "Number of ready: %d\n", ready.n_ready);
+  for (i = 0; i < ready.n_ready; i++)
+    debug_print_insn_dispatch_info (ready_element (&ready, i));
+}
+
+/* Print to STDERR the status of the ready list with respect to
+   dispatch windows.  */
+
+DEBUG_FUNCTION void
+debug_ready_dispatch (void)
+{
+  print_ready_dispatch (stderr);
+}
+
 #endif /* INSN_SCHEDULING */
Index: common.opt
===================================================================
--- common.opt	(revision 161301)
+++ common.opt	(working copy)
@@ -1078,6 +1078,10 @@  fschedule-insns2
 Common Report Var(flag_schedule_insns_after_reload) Optimization
 Reschedule instructions after register allocation
 
+fdispatch-sched
+Common Report Var(flag_dispatch_sched) Optimization
+Perform dispatch scheduling before and after register allocation
+
 ; This flag should be on when a target implements non-trivial
 ; scheduling hooks, maybe saving some information for its own sake.
 ; On IA64, for example, this is used for correct bundling. 
Index: sched-int.h
===================================================================
--- sched-int.h	(revision 161301)
+++ sched-int.h	(working copy)
@@ -1497,4 +1497,14 @@  extern void print_insn (char *, const_rt
 extern void print_pattern (char *, const_rtx, int);
 extern void print_value (char *, const_rtx, int);
 
+/* Functions in dispatch-sch.c.  */
+extern void init_dispatch_sched (void);
+extern void add_to_dispatch_window (rtx);
+extern bool fits_dispatch_window (rtx);
+extern bool is_cmp (rtx insn);
+extern bool dispatch_violation (void);
+extern void debug_print_dispatch_window (int);
+extern void debug_print_insn_dispatch_info (rtx);
+extern void debug_ready_dispatch (void);
+
 #endif /* GCC_SCHED_INT_H */
Index: Makefile.in
===================================================================
--- Makefile.in	(revision 161301)
+++ Makefile.in	(working copy)
@@ -1203,6 +1203,7 @@  OBJS-common = \
 	df-scan.o \
 	dfp.o \
 	diagnostic.o \
+	dispatch-sch.o \
 	dojump.o \
 	dominance.o \
 	domwalk.o \
@@ -2790,6 +2791,11 @@  fold-const.o : fold-const.c $(CONFIG_H) 
    $(GIMPLE_H) realmpfr.h
 diagnostic.o : diagnostic.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    version.h $(INPUT_H) intl.h $(DIAGNOSTIC_H) diagnostic.def
+dispatch-sch.o : dispatch-sch.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
+   $(TM_H) $(RTL_H) $(SCHED_INT_H) $(REGS_H) hard-reg-set.h \
+   $(FLAGS_H) insn-config.h $(FUNCTION_H) \
+   $(INSN_ATTR_H) $(TOPLEV_H) $(RECOG_H) $(EXCEPT_H) $(TM_P_H) \
+   $(TARGET_H) output.h $(PARAMS_H) $(DBGCNT_H) $(CFGLOOP_H) ira.h
 opts.o : opts.c opts.h options.h $(TOPLEV_H) $(CONFIG_H) $(SYSTEM_H) \
    coretypes.h $(TREE_H) $(TM_H) langhooks.h $(GGC_H) $(EXPR_H) $(RTL_H) \
    output.h $(DIAGNOSTIC_H) $(TM_P_H) $(INSN_ATTR_H) intl.h $(TARGET_H) \
Index: dispatch-sch.c
===================================================================
--- dispatch-sch.c	(revision 0)
+++ dispatch-sch.c	(revision 0)
@@ -0,0 +1,874 @@ 
+/* Dispatch scheduling pass.
+   Copyright (C) 2010
+   Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Dispatch Scheduling is part of the Haifa scheduler.  Some machines
+   such as AMD Bulldozer perform more efficiently if a group of
+   instructions have certain alignment and certain operation mix.
+
+   Scheduled instructions are dispatched in windows of size W.  N of
+   these windows are scheduled together as a dispatch group.
+
+   There are two phases in dispatch scheduling, before and after
+   register allocation.
+
+   Dispatch scheduling is done before register allocation, to avoid
+   barriers in moving instructions caused by extra dependencies imposed
+   by GRA.  It is also done after register allocation, because GRA may
+   change the instruction sequence by generating spill code or eliminating
+   register copies, etc.
+
+   The same algorithm is used before and after register allocation.
+   There is an extra pass after register allocation to generate
+   alignment code.  This pass is done by the assembler, because the
+   exact length of instructions may not be known until then.
+
+   The main interface between the dispatch scheduler and the Haifa
+   scheduler is the function fits_dispatch_window.  It returns true
+   if the dispatch restrictions are satisfied for the scheduling candidate.
+
+   After an instruction is scheduled, the Haifa scheduler calls
+   add_to_dispatch_window to update the dispatch scheduler data base.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "toplev.h"
+#include "rtl.h"
+#include "tm_p.h"
+#include "hard-reg-set.h"
+#include "regs.h"
+#include "function.h"
+#include "insn-config.h"
+#include "insn-attr.h"
+#include "toplev.h"
+#include "recog.h"
+#include "sched-int.h"
+#include "target.h"
+#include "output.h"
+#include "params.h"
+#include "vecprim.h"
+#include "dbgcnt.h"
+#include "cfgloop.h"
+#include "ira.h"
+#include "cselib.h"
+
+/* The size of the dispatch window is the total number of bytes of
+   object code allowed in a window.  */
+#define DISPATCH_WINDOW_SIZE 16
+
+/* Number of dispatch windows considered for scheduling.  */
+#define MAX_DISPATCH_WINDOWS 3
+
+/* Maximum number of instructions in a window.  */
+#define MAX_INSN 4
+
+/* Maximum number of immediate operands in a window.  */
+#define MAX_IMM 4
+
+/* Maximum number of immediate bits allowed in a window.  */
+#define MAX_IMM_SIZE 128
+
+/* Maximum number of 32 bit immediates allowed in a window.  */
+#define MAX_IMM_32 4
+
+/* Maximum number of 64 bit immediates allowed in a window.  */
+#define MAX_IMM_64 2
+
+/* Maximum total of loads or prefetches allowed in a window.  */
+#define MAX_LOAD 2
+
+/* Maximum total of stores allowed in a window.  */
+#define MAX_STORE 1
+
+#undef BIG
+#define BIG 100
+
+
+/* Dispatch groups.  Instructions that affect the mix in a dispatch window.  */
+enum dispatch_group {
+  disp_no_group = 0,
+  disp_load,
+  disp_store,
+  disp_load_store,
+  disp_prefetch,
+  disp_imm,
+  disp_imm_32,
+  disp_imm_64,
+  disp_branch,
+  disp_cmp,
+  disp_jcc,
+  disp_last
+};
+
+/* Number of allowable groups in a dispatch window.  It is an array
+   indexed by dispatch_group enum.  100 is used as a big number,
+   because the number of these kind of operations does not have any
+   effect in dispatch window, but we need them for other reasons in
+   the table.  */
+static int num_allowable_groups[disp_last] = {
+  0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
+};
+
+char group_name[disp_last + 1][16] = {
+  "disp_no_group", "disp_load", "disp_store", "disp_load_store",
+  "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
+  "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
+};
+
+/* Instruction path.  */
+enum insn_path {
+  no_path = 0,
+  path_single, /* Single micro op.  */
+  path_double, /* Double micro op.  */
+  path_multi,  /* Instructions with more than 2 micro ops.  */
+  last_path
+};
+
+/* sched_insn_info defines a window to the instructions scheduled in
+   the basic block.  It contains a pointer to the insn_info table and
+   the instruction scheduled.
+
+   Windows are allocated for each basic block and are linked
+   together.  */
+typedef struct sched_insn_info_s {
+  rtx insn;
+  enum dispatch_group group;
+  enum insn_path path;
+  int byte_len;
+  int imm_bytes;
+} sched_insn_info;
+
+/* Linked list of dispatch windows.  This is a two way list of
+   dispatch windows of a basic block.  It contains information about
+   the number of uops in the window and the total number of
+   instructions and of bytes in the object code for this dispatch
+   window.  */
+typedef struct dispatch_windows_s {
+  int num_insn;            /* Number of insn in the window.  */
+  int num_uops;            /* Number of uops in the window.  */
+  int window_size;         /* Number of bytes in the window.  */
+  int window_num;          /* Window number, either 0 or 1.  */
+  int num_imm;             /* Number of immediates in an insn.  */
+  int num_imm_32;          /* Number of 32 bit immediates in an insn.  */
+  int num_imm_64;          /* Number of 64 bit immediates in an insn.  */
+  int imm_size;            /* Total immediates in the window.  */
+  int num_loads;           /* Total memory loads in the window.  */
+  int num_stores;          /* Total memory stores in the window.  */
+  int violation;          /* Violation exists in window.  */
+  sched_insn_info *window; /* Pointer to the window.  */
+  struct dispatch_windows_s *next;
+  struct dispatch_windows_s *prev;
+} dispatch_windows;
+
+static dispatch_windows *dispatch_window_list;
+static dispatch_windows *dispatch_window_list1;
+
+/* Return the decode type of INSN on an AMDFAM10 machine: "DIRECT",
+   "DOUBLE" or "VECTOR", meaning the insn is decoded into 1, 2 or
+   more than 2 micro-ops respectively.  */
+static enum attr_amdfam10_decode
+dispatch_group_amdfam10_decode (rtx insn)
+{
+  enum attr_amdfam10_decode amdfam10_decode;
+  amdfam10_decode = get_attr_amdfam10_decode (insn);
+  return amdfam10_decode;
+}
+
+/* Return the memory-related dispatch group of INSN.  */
+
+static enum dispatch_group
+get_mem_group (rtx insn)
+{
+  enum attr_memory memory;
+
+  if (INSN_CODE (insn) < 0)
+    return disp_no_group;
+  memory = get_attr_memory (insn);
+  if (memory == MEMORY_STORE)
+    return disp_store;
+
+  if (memory == MEMORY_LOAD)
+    return disp_load;
+
+  if (memory == MEMORY_BOTH)
+    return disp_load_store;
+
+  return disp_no_group;
+}
+
+/* Return true if insn is a compare instruction.  */
+
+bool
+is_cmp (rtx insn)
+{
+  enum attr_type type;
+  type = get_attr_type (insn);
+  return (type == TYPE_TEST || type == TYPE_ICMP || type == TYPE_FCMP);
+}
+
+/* Return true if a dispatch violation was encountered.  */
+
+bool
+dispatch_violation (void)
+{
+  if (dispatch_window_list->next)
+    return dispatch_window_list->next->violation;
+  return dispatch_window_list->violation;
+}
+
+/* Return true if insn is a branch instruction.  */
+
+static bool
+is_branch (rtx insn)
+{
+  return (CALL_P (insn) || JUMP_P (insn));
+}
+
+static bool
+is_prefetch (rtx insn)
+{
+  return ((strcmp (GET_RTX_NAME (GET_CODE (insn)), "prefetch_sse") == 0)
+	  || (strcmp (GET_RTX_NAME (GET_CODE (insn)),
+		      "prefetch_sse_rex") == 0)
+	  || (strcmp (GET_RTX_NAME (GET_CODE (insn)),
+		      "prefetch_3dnow") == 0)
+	  || (strcmp (GET_RTX_NAME (GET_CODE (insn)),
+		      "prefetch_3dnow_rex") == 0));
+}
+
+/* This function initializes a dispatch window and the list container holding a
+   pointer to the window.  */
+
+static void
+init_window (int window_num)
+{
+  int i;
+  dispatch_windows *new_list;
+
+  if (window_num == 0)
+    new_list = dispatch_window_list;
+  else
+    new_list = dispatch_window_list1;
+
+  new_list->num_insn = 0;
+  new_list->num_uops = 0;
+  new_list->window_size = 0;
+  new_list->next = NULL;
+  new_list->prev = NULL;
+  new_list->window_num = window_num;
+  new_list->num_imm = 0;
+  new_list->num_imm_32 = 0;
+  new_list->num_imm_64 = 0;
+  new_list->imm_size = 0;
+  new_list->num_loads = 0;
+  new_list->num_stores = 0;
+  new_list->violation = false;
+
+  for (i = 0; i < MAX_INSN; i++)
+    {
+      new_list->window[i].insn = NULL;
+      new_list->window[i].group = disp_no_group;
+      new_list->window[i].path = no_path;
+      new_list->window[i].byte_len = 0;
+      new_list->window[i].imm_bytes = 0;
+    }
+  return;
+}
+
+/* This function allocates and initializes a dispatch window and the
+   list container holding a pointer to the window.  */
+
+static dispatch_windows *
+allocate_window (void)
+{
+  dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
+  new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
+
+  return new_list;
+}
+
+/* This routine initializes the dispatch scheduling information.  It
+   initiates building dispatch scheduler tables and constructs the
+   first dispatch window.  */
+
+void
+init_dispatch_sched (void)
+{
+  /* Allocate a dispatch list and a window.  */
+  dispatch_window_list = allocate_window ();
+  dispatch_window_list1 = allocate_window ();
+  init_window (0);
+  init_window (1);
+}
+
+
+/* This function returns true if a branch is detected.  End of a basic block
+   does not have to be a branch, but here we assume only branches end a
+   window.  */
+
+static bool
+is_end_basic_block (enum dispatch_group group)
+{
+  return group == disp_branch;
+}
+
+/* This function is called when the end of a window processing is reached.  */
+
+static void
+process_end_window (void)
+{
+  gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
+  if (dispatch_window_list->next)
+    {
+      gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
+      gcc_assert (dispatch_window_list->window_size + dispatch_window_list1->window_size <= 48);
+      init_window (1);
+    }
+  init_window (0);
+}
+
+/* Allocates a new dispatch window and adds it to WINDOW_LIST.
+   WINDOW_NUM is either 0 or 1.  A maximum of two windows are generated
+   for 48 bytes of instructions.  Note that these windows are not dispatch
+   windows whose size is DISPATCH_WINDOW_SIZE.  */
+
+static dispatch_windows *
+allocate_next_window (int window_num)
+{
+  if (window_num == 0)
+    {
+      if (dispatch_window_list->next)
+	  init_window (1);
+      init_window (0);
+      return dispatch_window_list;
+    }
+
+  dispatch_window_list->next = dispatch_window_list1;
+  dispatch_window_list1->prev = dispatch_window_list;
+
+  return dispatch_window_list1;
+}
+
+/* Recursive function returning total sizes of immediate operands of an
+   instruction along with number of corresponding immediate-operands.  */
+
+static void
+find_con (const_rtx in_rtx, int *imm, int *imm32, int *imm64)
+{
+  int i = 0;
+  int j;
+  const char *format_ptr;
+  enum rtx_code code;
+
+  if (in_rtx == 0)
+    return;
+
+  code = GET_CODE (in_rtx);
+  if (code == CONST_INT)
+    {
+      (*imm)++;
+      (*imm32)++;
+    }
+  else if (code == CONST_DOUBLE)
+    {
+      (*imm)++;
+      (*imm64)++;
+    }
+  else if (GET_CODE (in_rtx) == CODE_LABEL)
+    {
+      if (LABEL_KIND (in_rtx) == LABEL_NORMAL)
+	{
+	  (*imm)++;
+	  (*imm32)++;
+	}
+    }
+
+  if (GET_CODE (in_rtx) == VAR_LOCATION)
+    {
+      find_con (PAT_VAR_LOCATION_LOC (in_rtx), imm, imm32, imm64);
+      i = GET_RTX_LENGTH (VAR_LOCATION);
+    }
+
+  if (GET_CODE (in_rtx) == CONST_DOUBLE && FLOAT_MODE_P (GET_MODE (in_rtx)))
+    i = 5;
+
+  format_ptr = GET_RTX_FORMAT (GET_CODE (in_rtx)) + i;
+  for (; i < GET_RTX_LENGTH (GET_CODE (in_rtx)); i++)
+    switch (*format_ptr++)
+      {
+      case 'e':
+	find_con (XEXP (in_rtx, i), imm, imm32, imm64);
+	break;
+
+      case 'E':
+      case 'V':
+	if (XVEC (in_rtx, i) != NULL)
+	    for (j = 0; j < XVECLEN (in_rtx, i); j++)
+	      find_con (XVECEXP (in_rtx, i, j), imm, imm32, imm64);
+	break;
+      }
+}
+
+/* Return total sizes of immediate operands of an instruction along with number
+   of corresponding immediate-operands.  */
+
+static int
+get_num_imm (rtx insn, int *imm, int *imm32, int *imm64)
+{
+  *imm = 0;
+  *imm32 = 0;
+  *imm64 = 0;
+  find_con (insn, imm, imm32, imm64);
+  return *imm32 * 4 + *imm64 * 8;
+}
+
+/* This function indicates whether an instruction has any immediate
+   operands.  */
+
+static bool
+has_imm (rtx insn)
+{
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (insn)
+    return get_num_imm (insn, &num_imm_operand, &num_imm32_operand,
+			&num_imm64_operand);
+  return false;
+}
+
+/* Get the byte length of INSN.
+   This function is very similar to the static function min_insn_size
+   in i386.c.  The main difference is the use of get_attr_length.  */
+
+static int
+get_insn_length (rtx insn)
+{
+  int len, l = 0;
+
+  if (!INSN_P (insn) || !active_insn_p (insn))
+    return 0;
+
+  len = get_attr_length (insn);
+  /* For normal instructions we rely on get_attr_length being exact,
+     with a few exceptions.  */
+  if (!JUMP_P (insn))
+    {
+      enum attr_type type = get_attr_type (insn);
+
+      switch (type)
+	{
+	case TYPE_MULTI:
+	  if (GET_CODE (PATTERN (insn)) == ASM_INPUT
+	      || asm_noperands (PATTERN (insn)) >= 0)
+	    return 0;
+	  break;
+	case TYPE_OTHER:
+	case TYPE_FCMP:
+	  break;
+	default:
+	  /* Otherwise trust get_attr_length.  */
+	  return len;
+	}
+
+      l = get_attr_length_address (insn);
+      if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
+	l = 4;
+    }
+  if (l)
+    return 1+l;
+  else
+    return 2;
+}
+
+
+/* Return single or double path for instructions.  */
+
+static enum insn_path
+get_insn_path (rtx insn)
+{
+  enum attr_amdfam10_decode path = dispatch_group_amdfam10_decode (insn);
+
+  if ((int)path == 0)
+    return path_single;
+
+  if ((int)path == 1)
+    return path_double;
+
+  return path_multi;
+}
+
+/* Return insn dispatch group.  */
+
+static enum dispatch_group
+get_insn_group (rtx insn)
+{
+  enum dispatch_group group = get_mem_group (insn);
+  if (group)
+    return group;
+
+  if (is_branch (insn))
+    return disp_branch;
+
+  if (is_cmp (insn))
+    return disp_cmp;
+
+  if (has_imm (insn))
+    return disp_imm;
+
+  if (is_prefetch (insn))
+    return disp_prefetch;
+
+  return disp_no_group;
+}
+
+/* Add an instruction INSN with NUM_UOPS micro-operations to the
+   dispatch window WINDOW_LIST.  */
+
+static void
+add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
+{
+  int byte_len = get_insn_length (insn);
+  int num_insn = window_list->num_insn;
+  int imm_size;
+  sched_insn_info *window = window_list->window;
+  enum dispatch_group group = get_insn_group (insn);
+  enum insn_path path = get_insn_path (insn);
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (!window_list->violation && group != disp_cmp
+      && !fits_dispatch_window (insn))
+    window_list->violation = true;
+
+  imm_size = get_num_imm (insn, &num_imm_operand, &num_imm32_operand,
+			  &num_imm64_operand);
+  /* Initialize window with new instruction.  */
+  window[num_insn].insn = insn;
+  window[num_insn].byte_len = byte_len;
+  window[num_insn].group = group;
+  window[num_insn].path = path;
+  window[num_insn].imm_bytes = imm_size;
+
+  window_list->window_size += byte_len;
+  window_list->num_insn = num_insn + 1;
+  window_list->num_uops = window_list->num_uops + num_uops;
+  window_list->imm_size += imm_size;
+  window_list->num_imm += num_imm_operand;
+  window_list->num_imm_32 += num_imm32_operand;
+  window_list->num_imm_64 += num_imm64_operand;
+
+  if (group == disp_store)
+    window_list->num_stores += 1;
+  else if (group == disp_load
+	   || group == disp_prefetch)
+    window_list->num_loads += 1;
+  else if (group == disp_load_store)
+    {
+      window_list->num_stores += 1;
+      window_list->num_loads += 1;
+    }
+}
+
+/* Count number of GROUP restricted instructions in a dispatch
+   window WINDOW_LIST.  */
+
+static int
+count_num_restricted (rtx insn, dispatch_windows *window_list)
+{
+  enum dispatch_group group = get_insn_group (insn);
+  int imm_size;
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (group == disp_no_group)
+    return 0;
+
+  if (group == disp_imm)
+    {
+      imm_size = get_num_imm (insn, &num_imm_operand, &num_imm32_operand,
+			      &num_imm64_operand);
+      if (window_list->imm_size + imm_size > MAX_IMM_SIZE
+	  || num_imm_operand + window_list->num_imm > MAX_IMM
+	  || (num_imm32_operand > 0
+	      && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
+		  || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
+	  || (num_imm64_operand > 0
+	      && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
+		  || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
+	  || (window_list->imm_size + imm_size == MAX_IMM_SIZE
+	      && num_imm64_operand > 0
+	      && ((window_list->num_imm_64 > 0
+		   && window_list->num_insn >= 2)
+		  || window_list->num_insn >= 3)))
+	return BIG;
+
+      return 1;
+    }
+
+  if ((group == disp_load_store
+       && (window_list->num_loads >= MAX_LOAD
+	   || window_list->num_stores >= MAX_STORE))
+      || ((group == disp_load
+	   || group == disp_prefetch)
+	  && window_list->num_loads >= MAX_LOAD)
+      || (group == disp_store
+	  && window_list->num_stores >= MAX_STORE))
+    return BIG;
+
+  return 1;
+}
+
+/* Adds a scheduled instruction, INSN, to the current dispatch window.
+   If the total bytes of instructions or the number of instructions in
+   the window exceed the allowable limits, it allocates a new window.  */
+
+void
+add_to_dispatch_window (rtx insn)
+{
+  int byte_len;
+  dispatch_windows *window_list;
+  dispatch_windows *next_list;
+  dispatch_windows *window0_list;
+  enum insn_path path;
+  enum dispatch_group insn_group;
+  bool insn_fits;
+  int num_insn;
+  int num_uops;
+  int window_num;
+  int insn_num_uops;
+  int sum;
+
+  if (INSN_CODE (insn) < 0)
+    return;
+
+  byte_len = get_insn_length (insn);
+  window_list = dispatch_window_list;
+  next_list = window_list->next;
+  path = get_insn_path (insn);
+  insn_group = get_insn_group (insn);
+
+  /* Get the last dispatch window.  */
+  if (next_list)
+      window_list = dispatch_window_list->next;
+
+  if (path == path_single)
+    insn_num_uops = 1;
+  else if (path == path_double)
+    insn_num_uops = 2;
+  else
+    insn_num_uops = (int) path;
+
+  /* If current window is full, get a new window.
+     Window number zero is full if MAX_INSN uops are scheduled in it.
+     Window number one is full if window zero's bytes plus window
+     one's bytes equal 32, or if the bytes of the new instruction
+     added to the total make it greater than 48, or if it already has
+     MAX_INSN instructions in it.  */
+  num_insn = window_list->num_insn;
+  num_uops = window_list->num_uops;
+  window_num = window_list->window_num;
+  insn_fits = fits_dispatch_window (insn);
+
+  if (num_insn >= MAX_INSN
+      || num_uops + insn_num_uops > MAX_INSN
+      || !(insn_fits))
+    {
+      window_num = ~window_num & 1;
+      window_list = allocate_next_window (window_num);
+    }
+
+  if (window_num == 0)
+    {
+      add_insn_window (insn, window_list, insn_num_uops);
+      if (window_list->num_insn >= MAX_INSN
+	  && insn_group == disp_branch)
+	{
+	  process_end_window ();
+	  return;
+	}
+    }
+  else if (window_num == 1)
+    {
+      window0_list = window_list->prev;
+      sum = window0_list->window_size + window_list->window_size;
+      if (sum == 32
+	  || (byte_len + sum) >= 48)
+	{
+	  process_end_window ();
+	  window_list = dispatch_window_list;
+	}
+
+      add_insn_window (insn, window_list, insn_num_uops);
+    }
+  else
+    gcc_unreachable ();
+
+  if (is_end_basic_block (insn_group))
+    {
+      /* End of basic block is reached; do end-of-basic-block processing.  */
+      process_end_window ();
+      return;
+    }
+}
+
+/* This function returns true if insn satisfies dispatch rules on the
+   last window scheduled.  */
+
+bool
+fits_dispatch_window (rtx insn)
+{
+  dispatch_windows *window_list = dispatch_window_list;
+  dispatch_windows *window_list_next = dispatch_window_list->next;
+  int num_restrict;
+  enum dispatch_group group = get_insn_group (insn);
+  enum insn_path path = get_insn_path (insn);
+  int sum;
+
+  /* Make disp_cmp and disp_jcc get scheduled at the latest.  These
+     instructions should be given the lowest priority in the
+     scheduling process in the Haifa scheduler to make sure they will be
+     scheduled in the same dispatch window as the reference to them.  */
+  if (group == disp_jcc || group == disp_cmp)
+    return false;
+
+  /* Check nonrestricted.  */
+  if (group == disp_no_group || group == disp_branch)
+    return true;
+
+  /* Get last dispatch window.  */
+  if (window_list_next)
+    window_list = window_list_next;
+
+  if (window_list->window_num == 1)
+    {
+      sum = window_list->prev->window_size + window_list->window_size;
+      if (sum == 32
+	  || (get_insn_length (insn) + sum) >= 48)
+	/* Window 1 is full.  Go for next window.  */
+	return true;
+    }
+
+  num_restrict = count_num_restricted (insn, window_list);
+
+  if (num_restrict > num_allowable_groups[group])
+    return false;
+
+  /* See if it fits in the first window.  */
+  if (window_list->window_num == 0)
+    {
+      /* The first window should have only single and double path
+	 uops.  */
+      if (path == path_double
+	  && (window_list->num_uops + 2) > MAX_INSN)
+	return false;
+      else if (path != path_single)
+        return false;
+    }
+  return true;
+}
+
+/* Print a dispatch window.  */
+
+static void
+print_dispatch_window (FILE *file, int window_num)
+{
+  dispatch_windows *list;
+  int i;
+
+  if (window_num == 0)
+    list = dispatch_window_list;
+  else
+    list = dispatch_window_list1;
+
+  fprintf (file, "Window #%d:\n", list->window_num);
+  fprintf (file, "  num_insn = %d, num_uops = %d, window_size = %d\n",
+	  list->num_insn, list->num_uops, list->window_size);
+  fprintf (file, "  num_imm = %d, num_imm_32 = %d, num_imm_64 = %d,\
+ imm_size = %d\n",
+	   list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
+
+  fprintf (file, "  num_loads = %d, num_stores = %d\n", list->num_loads,
+	  list->num_stores);
+  fprintf (file, " insn info:\n");
+
+  for (i = 0; i < MAX_INSN; i++)
+    {
+      if (!list->window[i].insn)
+	break;
+      fprintf (file, "    group[%d] = %s, insn[%d] = %p, path[%d] = %d\
+ byte_len[%d] = %d, imm_bytes[%d] = %d\n",
+	      i, group_name[list->window[i].group],
+	      i, (void *)list->window[i].insn,
+	      i, list->window[i].path,
+	      i, list->window[i].byte_len,
+	      i, list->window[i].imm_bytes);
+    }
+}
+
+/* Print to stderr a dispatch window.  */
+
+DEBUG_FUNCTION void
+debug_print_dispatch_window (int window_num)
+{
+  print_dispatch_window (stderr, window_num);
+}
+
+/* Print insn dispatch information.  */
+
+static void
+print_insn_dispatch_info (FILE *file, rtx insn)
+{
+  int byte_len;
+  enum insn_path path;
+  enum dispatch_group group;
+  int imm_size;
+  int num_imm_operand;
+  int num_imm32_operand;
+  int num_imm64_operand;
+
+  if (INSN_CODE (insn) < 0)
+    return;
+
+  byte_len = get_insn_length (insn);
+  path = get_insn_path (insn);
+  group = get_insn_group (insn);
+  imm_size = get_num_imm (insn, &num_imm_operand, &num_imm32_operand,
+			  &num_imm64_operand);
+
+  fprintf (file, " insn info:\n");
+  fprintf (file, "  group = %s, path = %d, byte_len = %d\n", group_name[group],
+	   path, byte_len);
+  fprintf (file, "  num_imm = %d, num_imm_32 = %d, num_imm_64 = %d,\
+ imm_size = %d\n",
+	   num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
+}
+
+/* Print insn dispatch information.  */
+
+DEBUG_FUNCTION void
+debug_print_insn_dispatch_info (rtx insn)
+{
+  print_insn_dispatch_info (stderr, insn);
+}
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 161301)
+++ config/i386/i386.c	(working copy)
@@ -3573,6 +3573,9 @@  override_options (bool main_args_p)
   if (main_args_p)
     target_option_default_node = target_option_current_node
       = build_target_option_node ();
+
+  if (ix86_tune == PROCESSOR_BDVER1)
+    flag_dispatch_sched = 1;
 }
 
 /* Update register usage after having seen the compiler flags.  */