diff mbox

[11/12] i386: Always use TARGET_DEEP_BRANCH_PREDICTION.

Message ID 1309384152-25027-12-git-send-email-rth@redhat.com
State New
Headers show

Commit Message

Richard Henderson June 29, 2011, 9:49 p.m. UTC
While it could be possible to restructure output_set_got such that we
can individually annotate the instructions, it's simpler to just
admit that all processors currently being manufactured do want
deep branch prediction.  At that point, all of the complication
simply goes away.
---
 gcc/config/i386/i386.c |  105 +++++++----------------------------------------
 gcc/config/i386/i386.h |    3 -
 2 files changed, 16 insertions(+), 92 deletions(-)

Comments

Jan Hubicka Sept. 2, 2011, 11:16 a.m. UTC | #1
> While it could be possible to output_set_got such that we can
> individually annotate the instructions, it's simpler to simply
> admit that all processors currently being manufactured do want
> deep branch prediction.  At which point all of the complication
> simply goes away.

Note that most modern CPUs special-case a call to the next instruction,
so they will work well with !X86_TUNE_DEEP_BRANCH_PREDICTION code.

Honza
> ---
>  gcc/config/i386/i386.c |  105 +++++++----------------------------------------
>  gcc/config/i386/i386.h |    3 -
>  2 files changed, 16 insertions(+), 92 deletions(-)
> 
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 014401b..332e65b 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -55,7 +55,6 @@ along with GCC; see the file COPYING3.  If not see
>  #include "params.h"
>  #include "cselib.h"
>  #include "debug.h"
> -#include "dwarf2out.h"
>  #include "sched-int.h"
>  #include "sbitmap.h"
>  #include "fibheap.h"
> @@ -1847,10 +1846,6 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
>    m_486 | m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_K6
>    | m_CORE2I7 | m_GENERIC,
>  
> -  /* X86_TUNE_DEEP_BRANCH_PREDICTION */
> -  m_ATOM | m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
> -  | m_CORE2I7 | m_GENERIC,
> -
>    /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
>       on simulation result. But after P4 was made, no performance benefit
>       was observed with branch hints.  It also increases the code size.
> @@ -8323,31 +8318,11 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>  
>    xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
>  
> -  if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic)
> +  if (!flag_pic)
>      {
>        xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
>  
> -      if (!flag_pic)
> -	output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
> -      else
> -	{
> -	  output_asm_insn ("call\t%a2", xops);
> -#ifdef DWARF2_UNWIND_INFO
> -	  /* The call to next label acts as a push.  */
> -	  if (dwarf2out_do_frame ())
> -	    {
> -	      rtx insn;
> -	      start_sequence ();
> -	      insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
> -					     gen_rtx_PLUS (Pmode,
> -							   stack_pointer_rtx,
> -							   GEN_INT (-4))));
> -	      RTX_FRAME_RELATED_P (insn) = 1;
> -	      dwarf2out_frame_debug (insn, true);
> -	      end_sequence ();
> -	    }
> -#endif
> -	}
> +      output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
>  
>  #if TARGET_MACHO
>        /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
> @@ -8358,29 +8333,6 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>  
>        targetm.asm_out.internal_label (asm_out_file, "L",
>  				      CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
> -
> -      if (flag_pic)
> -	{
> -	  output_asm_insn ("pop%z0\t%0", xops);
> -#ifdef DWARF2_UNWIND_INFO
> -	  /* The pop is a pop and clobbers dest, but doesn't restore it
> -	     for unwind info purposes.  */
> -	  if (dwarf2out_do_frame ())
> -	    {
> -	      rtx insn;
> -	      start_sequence ();
> -	      insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx));
> -	      dwarf2out_frame_debug (insn, true);
> -	      insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
> -					     gen_rtx_PLUS (Pmode,
> -							   stack_pointer_rtx,
> -							   GEN_INT (4))));
> -	      RTX_FRAME_RELATED_P (insn) = 1;
> -	      dwarf2out_frame_debug (insn, true);
> -	      end_sequence ();
> -	    }
> -#endif
> -	}
>      }
>    else
>      {
> @@ -8388,12 +8340,6 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>        get_pc_thunk_name (name, REGNO (dest));
>        pic_labels_used |= 1 << REGNO (dest);
>  
> -#ifdef DWARF2_UNWIND_INFO
> -      /* Ensure all queued register saves are flushed before the
> -	 call.  */
> -      if (dwarf2out_do_frame ())
> -	dwarf2out_flush_queued_reg_saves ();
> -#endif
>        xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
>        xops[2] = gen_rtx_MEM (QImode, xops[2]);
>        output_asm_insn ("call\t%X2", xops);
> @@ -8408,13 +8354,8 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
>  #endif
>      }
>  
> -  if (TARGET_MACHO)
> -    return "";
> -
> -  if (!flag_pic || TARGET_DEEP_BRANCH_PREDICTION)
> +  if (!TARGET_MACHO)
>      output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
> -  else
> -    output_asm_insn ("add%z0\t{%1+[.-%a2], %0|%0, %1+(.-%a2)}", xops);
>  
>    return "";
>  }
> @@ -10138,7 +10079,11 @@ ix86_expand_prologue (void)
>              insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
>  	}
>        else
> -        insn = emit_insn (gen_set_got (pic_offset_table_rtx));
> +	{
> +          insn = emit_insn (gen_set_got (pic_offset_table_rtx));
> +	  RTX_FRAME_RELATED_P (insn) = 1;
> +	  add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
> +	}
>      }
>  
>    /* In the pic_reg_used case, make sure that the got load isn't deleted
> @@ -28979,12 +28924,7 @@ machopic_output_stub (FILE *file, const char *symb, const char *stub)
>    if (MACHOPIC_ATT_STUB)
>      switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
>    else if (MACHOPIC_PURE)
> -    {
> -      if (TARGET_DEEP_BRANCH_PREDICTION)
> -	switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
> -      else
> -    switch_to_section (darwin_sections[machopic_picsymbol_stub_section]);
> -    }
> +    switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
>    else
>      switch_to_section (darwin_sections[machopic_symbol_stub_section]);
>  
> @@ -28998,19 +28938,11 @@ machopic_output_stub (FILE *file, const char *symb, const char *stub)
>    else if (MACHOPIC_PURE)
>      {
>        /* PIC stub.  */
> -      if (TARGET_DEEP_BRANCH_PREDICTION)
> -	{
> -	  /* 25-byte PIC stub using "CALL get_pc_thunk".  */
> -	  rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
> -	  output_set_got (tmp, NULL_RTX);	/* "CALL ___<cpu>.get_pc_thunk.cx".  */
> -	  fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", label, lazy_ptr_name, label);
> -	}
> -      else
> -	{
> -	  /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %eax".  */
> -	  fprintf (file, "\tcall LPC$%d\nLPC$%d:\tpopl %%ecx\n", label, label);
> -	  fprintf (file, "\tmovl %s-LPC$%d(%%ecx),%%ecx\n", lazy_ptr_name, label);
> -	}
> +      /* 25-byte PIC stub using "CALL get_pc_thunk".  */
> +      rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
> +      output_set_got (tmp, NULL_RTX);	/* "CALL ___<cpu>.get_pc_thunk.cx".  */
> +      fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
> +	       label, lazy_ptr_name, label);
>        fprintf (file, "\tjmp\t*%%ecx\n");
>      }
>    else
> @@ -29039,13 +28971,8 @@ machopic_output_stub (FILE *file, const char *symb, const char *stub)
>       compatibility with existing dylibs.  */
>    if (MACHOPIC_PURE)
>      {
> -      /* PIC stubs.  */
> -      if (TARGET_DEEP_BRANCH_PREDICTION)
> -	/* 25-byte PIC stub using "CALL get_pc_thunk".  */
> -	switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
> -      else
> -	/* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %ebx".  */
> -  switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]);
> +      /* 25-byte PIC stub using "CALL get_pc_thunk".  */
> +      switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
>      }
>    else
>      /* 16-byte -mdynamic-no-pic stub.  */
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 8badcbb..1452226 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -249,7 +249,6 @@ enum ix86_tune_indices {
>    X86_TUNE_PUSH_MEMORY,
>    X86_TUNE_ZERO_EXTEND_WITH_AND,
>    X86_TUNE_UNROLL_STRLEN,
> -  X86_TUNE_DEEP_BRANCH_PREDICTION,
>    X86_TUNE_BRANCH_PREDICTION_HINTS,
>    X86_TUNE_DOUBLE_WITH_ADD,
>    X86_TUNE_USE_SAHF,
> @@ -323,8 +322,6 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
>  #define TARGET_ZERO_EXTEND_WITH_AND \
>  	ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
>  #define TARGET_UNROLL_STRLEN	ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
> -#define TARGET_DEEP_BRANCH_PREDICTION \
> -	ix86_tune_features[X86_TUNE_DEEP_BRANCH_PREDICTION]
>  #define TARGET_BRANCH_PREDICTION_HINTS \
>  	ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
>  #define TARGET_DOUBLE_WITH_ADD	ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]
> -- 
> 1.7.5.4
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 014401b..332e65b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -55,7 +55,6 @@  along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "cselib.h"
 #include "debug.h"
-#include "dwarf2out.h"
 #include "sched-int.h"
 #include "sbitmap.h"
 #include "fibheap.h"
@@ -1847,10 +1846,6 @@  static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
   m_486 | m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_K6
   | m_CORE2I7 | m_GENERIC,
 
-  /* X86_TUNE_DEEP_BRANCH_PREDICTION */
-  m_ATOM | m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
-  | m_CORE2I7 | m_GENERIC,
-
   /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
      on simulation result. But after P4 was made, no performance benefit
      was observed with branch hints.  It also increases the code size.
@@ -8323,31 +8318,11 @@  output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
 
   xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
 
-  if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic)
+  if (!flag_pic)
     {
       xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
 
-      if (!flag_pic)
-	output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
-      else
-	{
-	  output_asm_insn ("call\t%a2", xops);
-#ifdef DWARF2_UNWIND_INFO
-	  /* The call to next label acts as a push.  */
-	  if (dwarf2out_do_frame ())
-	    {
-	      rtx insn;
-	      start_sequence ();
-	      insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
-					     gen_rtx_PLUS (Pmode,
-							   stack_pointer_rtx,
-							   GEN_INT (-4))));
-	      RTX_FRAME_RELATED_P (insn) = 1;
-	      dwarf2out_frame_debug (insn, true);
-	      end_sequence ();
-	    }
-#endif
-	}
+      output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
 
 #if TARGET_MACHO
       /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
@@ -8358,29 +8333,6 @@  output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
 
       targetm.asm_out.internal_label (asm_out_file, "L",
 				      CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
-
-      if (flag_pic)
-	{
-	  output_asm_insn ("pop%z0\t%0", xops);
-#ifdef DWARF2_UNWIND_INFO
-	  /* The pop is a pop and clobbers dest, but doesn't restore it
-	     for unwind info purposes.  */
-	  if (dwarf2out_do_frame ())
-	    {
-	      rtx insn;
-	      start_sequence ();
-	      insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx));
-	      dwarf2out_frame_debug (insn, true);
-	      insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
-					     gen_rtx_PLUS (Pmode,
-							   stack_pointer_rtx,
-							   GEN_INT (4))));
-	      RTX_FRAME_RELATED_P (insn) = 1;
-	      dwarf2out_frame_debug (insn, true);
-	      end_sequence ();
-	    }
-#endif
-	}
     }
   else
     {
@@ -8388,12 +8340,6 @@  output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
       get_pc_thunk_name (name, REGNO (dest));
       pic_labels_used |= 1 << REGNO (dest);
 
-#ifdef DWARF2_UNWIND_INFO
-      /* Ensure all queued register saves are flushed before the
-	 call.  */
-      if (dwarf2out_do_frame ())
-	dwarf2out_flush_queued_reg_saves ();
-#endif
       xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
       xops[2] = gen_rtx_MEM (QImode, xops[2]);
       output_asm_insn ("call\t%X2", xops);
@@ -8408,13 +8354,8 @@  output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
 #endif
     }
 
-  if (TARGET_MACHO)
-    return "";
-
-  if (!flag_pic || TARGET_DEEP_BRANCH_PREDICTION)
+  if (!TARGET_MACHO)
     output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
-  else
-    output_asm_insn ("add%z0\t{%1+[.-%a2], %0|%0, %1+(.-%a2)}", xops);
 
   return "";
 }
@@ -10138,7 +10079,11 @@  ix86_expand_prologue (void)
             insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
 	}
       else
-        insn = emit_insn (gen_set_got (pic_offset_table_rtx));
+	{
+          insn = emit_insn (gen_set_got (pic_offset_table_rtx));
+	  RTX_FRAME_RELATED_P (insn) = 1;
+	  add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
+	}
     }
 
   /* In the pic_reg_used case, make sure that the got load isn't deleted
@@ -28979,12 +28924,7 @@  machopic_output_stub (FILE *file, const char *symb, const char *stub)
   if (MACHOPIC_ATT_STUB)
     switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
   else if (MACHOPIC_PURE)
-    {
-      if (TARGET_DEEP_BRANCH_PREDICTION)
-	switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
-      else
-    switch_to_section (darwin_sections[machopic_picsymbol_stub_section]);
-    }
+    switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
   else
     switch_to_section (darwin_sections[machopic_symbol_stub_section]);
 
@@ -28998,19 +28938,11 @@  machopic_output_stub (FILE *file, const char *symb, const char *stub)
   else if (MACHOPIC_PURE)
     {
       /* PIC stub.  */
-      if (TARGET_DEEP_BRANCH_PREDICTION)
-	{
-	  /* 25-byte PIC stub using "CALL get_pc_thunk".  */
-	  rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
-	  output_set_got (tmp, NULL_RTX);	/* "CALL ___<cpu>.get_pc_thunk.cx".  */
-	  fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", label, lazy_ptr_name, label);
-	}
-      else
-	{
-	  /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %eax".  */
-	  fprintf (file, "\tcall LPC$%d\nLPC$%d:\tpopl %%ecx\n", label, label);
-	  fprintf (file, "\tmovl %s-LPC$%d(%%ecx),%%ecx\n", lazy_ptr_name, label);
-	}
+      /* 25-byte PIC stub using "CALL get_pc_thunk".  */
+      rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
+      output_set_got (tmp, NULL_RTX);	/* "CALL ___<cpu>.get_pc_thunk.cx".  */
+      fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
+	       label, lazy_ptr_name, label);
       fprintf (file, "\tjmp\t*%%ecx\n");
     }
   else
@@ -29039,13 +28971,8 @@  machopic_output_stub (FILE *file, const char *symb, const char *stub)
      compatibility with existing dylibs.  */
   if (MACHOPIC_PURE)
     {
-      /* PIC stubs.  */
-      if (TARGET_DEEP_BRANCH_PREDICTION)
-	/* 25-byte PIC stub using "CALL get_pc_thunk".  */
-	switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
-      else
-	/* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %ebx".  */
-  switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]);
+      /* 25-byte PIC stub using "CALL get_pc_thunk".  */
+      switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
     }
   else
     /* 16-byte -mdynamic-no-pic stub.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8badcbb..1452226 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -249,7 +249,6 @@  enum ix86_tune_indices {
   X86_TUNE_PUSH_MEMORY,
   X86_TUNE_ZERO_EXTEND_WITH_AND,
   X86_TUNE_UNROLL_STRLEN,
-  X86_TUNE_DEEP_BRANCH_PREDICTION,
   X86_TUNE_BRANCH_PREDICTION_HINTS,
   X86_TUNE_DOUBLE_WITH_ADD,
   X86_TUNE_USE_SAHF,
@@ -323,8 +322,6 @@  extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_ZERO_EXTEND_WITH_AND \
 	ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
 #define TARGET_UNROLL_STRLEN	ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
-#define TARGET_DEEP_BRANCH_PREDICTION \
-	ix86_tune_features[X86_TUNE_DEEP_BRANCH_PREDICTION]
 #define TARGET_BRANCH_PREDICTION_HINTS \
 	ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
 #define TARGET_DOUBLE_WITH_ADD	ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]