diff mbox

Speed up genattrtab

Message ID 20100616143334.GP7811@tyan-ft48-01.lab.bos.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek June 16, 2010, 2:33 p.m. UTC
On Tue, Jun 15, 2010 at 09:16:33PM +0200, Jakub Jelinek wrote:
> On Tue, Jun 15, 2010 at 11:39:47AM -0700, Mark Mitchell wrote:
> I believe on x86_64/i686 the most time is spent in compiling
> internal_dfa_insn_code, primarily because there are so many different
> schedulings.
> The insn is a big switch on recog_memoized, where most of the cases first
> compare ix86_schedule var to some enum.  I guess it would be certainly
> faster to compile to instead split the big function into separate function
> for each schedule and make internal_dfa_insn_code a function pointer, would
> need to be benchmarked how it would actually perform at runtime.

Here is a WIP untested patch.

A little bit hackish (I hardcode "cpu" attribute name - the final variant
could perhaps look at all EQ_ATTRs in first decl->condexp and see whether
any of the attributes is used in EQ_ATTR in decl->condexp of all
reservations) so far and unfinished (it generates
internal_dfa_insn_code_generic64
internal_dfa_insn_code_amdfam10
internal_dfa_insn_code_core2
...
functions (and similarly for insn_default_latency_), but so far
doesn't emit the glue (in particular some function that would be called
to initialize the fn pointers based on ix86_schedule in x86_64/i686 case
(not sure if we would need to call it once post options, or e.g. during
expand, or before each scheduling), emit the actual fn pointer
and how to ensure the callers know it is a fn pointer instead of a function
(or whether we need to emit insn_dfa_insn_code as a wrapper that calls
a fn pointer).

In any case, the patch (assuming there are no bugs, would in the end
need to test whether these subfunctions give the same return values
as original function) speeded up genattrtab (-g -O0 build, i.e.
non-optimized) from 1m9.429s down to 0m3.621s,  insn-attrtab.c shrunk
from 5139992 bytes to 3822187, compilation time using optimized release
checking cc1 went down from 40.949s to 19.403s, insn-attrtab.o .text
section went down from 1003956 bytes to 635384 bytes.



	Jakub

Comments

Jan Hubicka June 16, 2010, 3:53 p.m. UTC | #1
> On Tue, Jun 15, 2010 at 09:16:33PM +0200, Jakub Jelinek wrote:
> > On Tue, Jun 15, 2010 at 11:39:47AM -0700, Mark Mitchell wrote:
> > I believe on x86_64/i686 the most time is spent in compiling
> > internal_dfa_insn_code, primarily because there are so many different
> > schedulings.
> > The insn is a big switch on recog_memoized, where most of the cases first
> > compare ix86_schedule var to some enum.  I guess it would be certainly
> > faster to compile to instead split the big function into separate function
> > for each schedule and make internal_dfa_insn_code a function pointer, would
> > need to be benchmarked how it would actually perform at runtime.
> 
> Here is a WIP untested patch.
> 
> A little bit hackish (I hardcode "cpu" attribute name - the final variant
> could perhaps look at all EQ_ATTRs in first decl->condexp and see whether
> any of the attributes is used in EQ_ATTR in decl->condexp of all
> reservations) so far and unfinished (it generates
> internal_dfa_insn_code_generic64
> internal_dfa_insn_code_amdfam10
> internal_dfa_insn_code_core2
> ...

Another breakup of the large functions I was thinking if is actually based
on INSN code.  At the moment most of get_attr_* functions are organized as
calls to extract_insn that is followed by swithc on insn code that is
calling other get_attr_* functions doing the same.

Breaking up the individual cases into separate functions _and_ making those
functions not call get_attr_* but the subfunction for specific insn code
should IMO let GCC IPA to do the right thing and improve both compile time
and runtime.

It is very dificult to make IPA to do the right thing without this breakup
since extract_insn is in the way.

Honza
diff mbox

Patch

--- gcc/genattrtab.c.jj	2010-06-11 09:38:08.000000000 +0200
+++ gcc/genattrtab.c	2010-06-16 16:00:00.000000000 +0200
@@ -4379,28 +4379,115 @@  make_automaton_attrs (void)
   int i;
   struct insn_reserv *decl;
   rtx code_exp, lats_exp, byps_exp;
+  const char *cpu_name;
+  struct attr_desc *cpu_attr;
 
   if (n_insn_reservs == 0)
     return;
 
-  code_exp = rtx_alloc (COND);
-  lats_exp = rtx_alloc (COND);
-
-  XVEC (code_exp, 0) = rtvec_alloc (n_insn_reservs * 2);
-  XVEC (lats_exp, 0) = rtvec_alloc (n_insn_reservs * 2);
+  cpu_name = "cpu";
+  cpu_attr = find_attr (&cpu_name, 0);
+  if (cpu_attr != NULL && 0)
+    {
+      rtx *condexps = XNEWVEC (rtx, n_insn_reservs * 3);
+      struct attr_value *val;
 
-  XEXP (code_exp, 1) = make_numeric_value (n_insn_reservs + 1);
-  XEXP (lats_exp, 1) = make_numeric_value (0);
+      gcc_assert (cpu_attr->is_const
+		  && !cpu_attr->is_special
+		  && !cpu_attr->is_numeric);
+      for (val = cpu_attr->first_value; val; val = val->next)
+	{
+	  int j;
+	  char *name;
+	  rtx test = attr_rtx (EQ_ATTR, cpu_name, XSTR (val->value, 0));
+
+	  if (val == cpu_attr->default_val)
+	    continue;
+	  gcc_assert (GET_CODE (val->value) == CONST_STRING);
+	  for (decl = all_insn_reservs, i = 0;
+	       decl;
+	       decl = decl->next)
+	    {
+	      rtx ctest = test;
+	      rtx condexp
+		= simplify_and_tree (decl->condexp, &ctest, -2, 0);
+	      if (condexp == false_rtx)
+		continue;
+	      if (condexp == true_rtx)
+		break;
+	      condexps[i] = condexp;
+	      condexps[i + 1] = make_numeric_value (decl->insn_num);
+	      condexps[i + 2] = make_numeric_value (decl->default_latency);
+	      i += 3;
+	    }
+
+	  code_exp = rtx_alloc (COND);
+	  lats_exp = rtx_alloc (COND);
+
+	  j = i / 3 * 2;
+	  XVEC (code_exp, 0) = rtvec_alloc (j);
+	  XVEC (lats_exp, 0) = rtvec_alloc (j);
+
+	  if (decl)
+	    {
+	      XEXP (code_exp, 1) = make_numeric_value (decl->insn_num);
+	      XEXP (lats_exp, 1) = make_numeric_value (decl->default_latency);
+	    }
+	  else
+	    {
+	      XEXP (code_exp, 1) = make_numeric_value (n_insn_reservs + 1);
+	      XEXP (lats_exp, 1) = make_numeric_value (0);
+	    }
+
+	  while (i > 0)
+	    {
+	      i -= 3;
+	      j -= 2;
+	      XVECEXP (code_exp, 0, j) = condexps[i];
+	      XVECEXP (lats_exp, 0, j) = condexps[i];
+
+	      XVECEXP (code_exp, 0, j + 1) = condexps[i + 1];
+	      XVECEXP (lats_exp, 0, j + 1) = condexps[i + 2];
+	    }
+
+	  name = XNEWVEC (char,
+			  sizeof ("*internal_dfa_insn_code_")
+			  + strlen (XSTR (val->value, 0)));
+	  strcpy (name, "*internal_dfa_insn_code_");
+	  strcat (name, XSTR (val->value, 0));
+	  make_internal_attr (name, code_exp, ATTR_NONE);
+	  strcpy (name, "*insn_default_latency_");
+	  strcat (name, XSTR (val->value, 0));
+	  make_internal_attr (name, lats_exp, ATTR_NONE);
+	  XDELETEVEC (name);
+	}
 
-  for (decl = all_insn_reservs, i = 0;
-       decl;
-       decl = decl->next, i += 2)
+      XDELETEVEC (condexps);
+    }
+  else
     {
-      XVECEXP (code_exp, 0, i)   = decl->condexp;
-      XVECEXP (lats_exp, 0, i)   = decl->condexp;
+      code_exp = rtx_alloc (COND);
+      lats_exp = rtx_alloc (COND);
+
+      XVEC (code_exp, 0) = rtvec_alloc (n_insn_reservs * 2);
+      XVEC (lats_exp, 0) = rtvec_alloc (n_insn_reservs * 2);
 
-      XVECEXP (code_exp, 0, i+1) = make_numeric_value (decl->insn_num);
-      XVECEXP (lats_exp, 0, i+1) = make_numeric_value (decl->default_latency);
+      XEXP (code_exp, 1) = make_numeric_value (n_insn_reservs + 1);
+      XEXP (lats_exp, 1) = make_numeric_value (0);
+
+      for (decl = all_insn_reservs, i = 0;
+	   decl;
+	   decl = decl->next, i += 2)
+	{
+	  XVECEXP (code_exp, 0, i)   = decl->condexp;
+	  XVECEXP (lats_exp, 0, i)   = decl->condexp;
+
+	  XVECEXP (code_exp, 0, i+1) = make_numeric_value (decl->insn_num);
+	  XVECEXP (lats_exp, 0, i+1)
+	    = make_numeric_value (decl->default_latency);
+	}
+      make_internal_attr ("*internal_dfa_insn_code", code_exp, ATTR_NONE);
+      make_internal_attr ("*insn_default_latency",   lats_exp, ATTR_NONE);
     }
 
   if (n_bypasses == 0)
@@ -4423,8 +4510,6 @@  make_automaton_attrs (void)
 	  }
     }
 
-  make_internal_attr ("*internal_dfa_insn_code", code_exp, ATTR_NONE);
-  make_internal_attr ("*insn_default_latency",   lats_exp, ATTR_NONE);
   make_internal_attr ("*bypass_p",               byps_exp, ATTR_NONE);
 }