diff mbox series

[3/4] Fix PR90001

Message ID 06c5e43724c4964b455a41c32626115c@ispras.ru
State New
Headers show
Series Addressing modulo-scheduling bugs | expand

Commit Message

Roman Zhuykov April 16, 2019, 12:05 p.m. UTC
Current algorithm which finds recurrence_length for all DDG strongly 
connected components works in like O(N^6) time, where N in the number of 
nodes in DDG.  The time is so bad mostly for graphs with lots of edges, 
like almost N^2 edges.  My proposed algorithm works in O(N^3).  
Algorithm of finding SCCs itself is also not optimal (maybe up to 
O(N^4)), but here it left untouched.

For some situations, when amount of edges is smaller (like equal to N), 
new algorithm can be unfortunately slower than old one.  But I think 
it's better here to add some bail-out when we got more than 1000 nodes 
for example.

Before creating this patch, I tested special version of it, where both 
approaches were in action and asserts were inserted to check that 
algorithms results (longest_simple_path values) are absolutely the same. 
  I can publish this special version if needed.

I’ve described patch testing in cover letter. Ok for trunk?

gcc/ChangeLog:

2019-04-08  Roman Zhuykov  <zhroma@ispras.ru>

	PR rtl-optimization/90001
	* ddg.c (create_ddg): Init max_dist array for each node.
	(free_ddg): Free max_dist array.
	(create_ddg_edge): Use bool field instead of aux union.
	(set_recurrence_length): Use prepared max_dist information instead
	of calling longest_simple_path.
	(create_scc): Remove graph argument, fill node's aux.count with
	SCC id, and move set_recurrence_length call to...
	(create_ddg_all_sccs): ...here, after filling all max_dist arrays
	using Floyd–Warshall-like algorithm.
	(update_dist_to_successors): Remove the whole function.
	(longest_simple_path): Likewise.
	* ddg.h (struct ddg_node): Add max_dist pointer.
	(struct ddg_edge): Use bool field instead of unused aux union.


*/
@@ -178,7 +179,6 @@ ddg_all_sccs_ptr create_ddg_all_sccs (ddg_ptr);
  void free_ddg_all_sccs (ddg_all_sccs_ptr);

  int find_nodes_on_paths (sbitmap result, ddg_ptr, sbitmap from, sbitmap 
to);
-int longest_simple_path (ddg_ptr, int from, int to, sbitmap via);

  bool autoinc_var_is_used_p (rtx_insn *, rtx_insn *);

Comments

Jeff Law April 26, 2019, 3:40 p.m. UTC | #1
On 4/16/19 6:05 AM, Roman Zhuykov wrote:
> Current algorithm which finds recurrence_length for all DDG strongly
> connected components works in like O(N^6) time, where N in the number of
> nodes in DDG.  The time is so bad mostly for graphs with lots of edges,
> like almost N^2 edges.  My proposed algorithm works in O(N^3). 
> Algorithm of finding SCCs itself is also not optimal (maybe up to
> O(N^4)), but here it left untouched.
> 
> For some situations, when amount of edges is smaller (like equal to N),
> new algorithm can be unfortunately slower than old one.  But I think
> it's better here to add some bail-out when we got more than 1000 nodes
> for example.
> 
> Before creating this patch, I tested special version of it, where both
> approaches were in action and asserts were inserted to check that
> algorithms results (longest_simple_path values) are absolutely the same.
>  I can publish this special version if needed.
> 
> I’ve described patch testing in cover letter. Ok for trunk?
> 
> gcc/ChangeLog:
> 
> 2019-04-08  Roman Zhuykov  <zhroma@ispras.ru>
> 
>     PR rtl-optimization/90001
>     * ddg.c (create_ddg): Init max_dist array for each node.
>     (free_ddg): Free max_dist array.
>     (create_ddg_edge): Use bool field instead of aux union.
>     (set_recurrence_length): Use prepared max_dist information instead
>     of calling longest_simple_path.
>     (create_scc): Remove graph argument, fill node's aux.count with
>     SCC id, and move set_recurrence_length call to...
>     (create_ddg_all_sccs): ...here, after filling all max_dist arrays
>     using Floyd–Warshall-like algorithm.
>     (update_dist_to_successors): Remove the whole function.
>     (longest_simple_path): Likewise.
>     * ddg.h (struct ddg_node): Add max_dist pointer.
>     (struct ddg_edge): Use bool field instead of unused aux union.
So just an FYI.  If ddg.c is only used by the modulo scheduler, then it
falls under your maintainership and you can decide when to install this
patch.

We generally try to avoid major changes right after the branch is cut,
but I doubt we'll be doing a lot of backporting in this code, so I think
you can go whenever you're ready.

jeff
>
Roman Zhuykov April 26, 2019, 5:34 p.m. UTC | #2
> So just an FYI.  If ddg.c is only used by the modulo scheduler, then it
> falls under your maintainership and you can decide when to install this
> patch.

Yes, I understand about ddg.c and ddg.h. I also studied everything
about loop-doloop.c because it is connected. Will try to participate
in doloop discussions, and soon present some (mostly refactoring)
doloop patch for approval.

> We generally try to avoid major changes right after the branch is cut,
> but I doubt we'll be doing a lot of backporting in this code, so I think
> you can go whenever you're ready.

I'll now wait the "not disrupting code RC phase" to finish, and
than will commit it together with posting my next patchset.
diff mbox series

Patch

diff --git a/gcc/ddg.c b/gcc/ddg.c
--- a/gcc/ddg.c
+++ b/gcc/ddg.c
@@ -32,9 +32,6 @@  along with GCC; see the file COPYING3.  If not see

  #ifdef INSN_SCHEDULING

-/* A flag indicating that a ddg edge belongs to an SCC or not.  */
-enum edge_flag {NOT_IN_SCC = 0, IN_SCC};
-
  /* Forward declarations.  */
  static void add_backarc_to_ddg (ddg_ptr, ddg_edge_ptr);
  static void add_backarc_to_scc (ddg_scc_ptr, ddg_edge_ptr);
@@ -564,7 +561,7 @@  create_ddg (basic_block bb, int closing_branch_deps)
  {
    ddg_ptr g;
    rtx_insn *insn, *first_note;
-  int i;
+  int i, j;
    int num_nodes = 0;

    g = (ddg_ptr) xcalloc (1, sizeof (struct ddg));
@@ -632,6 +629,12 @@  create_ddg (basic_block bb, int 
closing_branch_deps)
        g->nodes[i].predecessors = sbitmap_alloc (num_nodes);
        bitmap_clear (g->nodes[i].predecessors);
        g->nodes[i].first_note = (first_note ? first_note : insn);
+
+      g->nodes[i].aux.count = -1;
+      g->nodes[i].max_dist = XCNEWVEC (int, num_nodes);
+      for (j = 0; j < num_nodes; j++)
+         g->nodes[i].max_dist[j] = -1;
+
        g->nodes[i++].insn = insn;
        first_note = NULL;
      }
@@ -668,6 +671,7 @@  free_ddg (ddg_ptr g)
  	}
        sbitmap_free (g->nodes[i].successors);
        sbitmap_free (g->nodes[i].predecessors);
+      free (g->nodes[i].max_dist);
      }
    if (g->num_backarcs > 0)
      free (g->backarcs);
@@ -792,7 +796,7 @@  create_ddg_edge (ddg_node_ptr src, ddg_node_ptr 
dest,
    e->latency = l;
    e->distance = d;
    e->next_in = e->next_out = NULL;
-  e->aux.info = 0;
+  e->in_scc = false;
    return e;
  }

@@ -820,7 +824,7 @@  add_edge_to_ddg (ddg_ptr g ATTRIBUTE_UNUSED, 
ddg_edge_ptr e)
     for now that cycles in the data dependence graph contain a single 
backarc.
     This simplifies the algorithm, and can be generalized later.  */
  static void
-set_recurrence_length (ddg_scc_ptr scc, ddg_ptr g)
+set_recurrence_length (ddg_scc_ptr scc)
  {
    int j;
    int result = -1;
@@ -828,17 +832,14 @@  set_recurrence_length (ddg_scc_ptr scc, ddg_ptr g)
    for (j = 0; j < scc->num_backarcs; j++)
      {
        ddg_edge_ptr backarc = scc->backarcs[j];
-      int length;
        int distance = backarc->distance;
        ddg_node_ptr src = backarc->dest;
        ddg_node_ptr dest = backarc->src;
+      int length = src->max_dist[dest->cuid];
+
+      if (length < 0)
+        continue;

-      length = longest_simple_path (g, src->cuid, dest->cuid, 
scc->nodes);
-      if (length < 0 )
-	{
-	  /* fprintf (stderr, "Backarc not on simple cycle in SCC.\n"); */
-	  continue;
-	}
        length += backarc->latency;
        result = MAX (result, (length / distance));
      }
@@ -846,9 +847,9 @@  set_recurrence_length (ddg_scc_ptr scc, ddg_ptr g)
  }

  /* Create a new SCC given the set of its nodes.  Compute its 
recurrence_length
-   and mark edges that belong to this scc as IN_SCC.  */
+   and mark edges that belong to this scc.  */
  static ddg_scc_ptr
-create_scc (ddg_ptr g, sbitmap nodes)
+create_scc (ddg_ptr g, sbitmap nodes, int id)
  {
    ddg_scc_ptr scc;
    unsigned int u = 0;
@@ -866,16 +867,18 @@  create_scc (ddg_ptr g, sbitmap nodes)
        ddg_edge_ptr e;
        ddg_node_ptr n = &g->nodes[u];

+      gcc_assert (n->aux.count == -1);
+      n->aux.count = id;
+
        for (e = n->out; e; e = e->next_out)
  	if (bitmap_bit_p (nodes, e->dest->cuid))
  	  {
-	    e->aux.count = IN_SCC;
+	    e->in_scc = true;
  	    if (e->distance > 0)
  	      add_backarc_to_scc (scc, e);
  	  }
      }

-  set_recurrence_length (scc, g);
    return scc;
  }

@@ -1018,7 +1021,7 @@  check_sccs (ddg_all_sccs_ptr sccs, int num_nodes)
  ddg_all_sccs_ptr
  create_ddg_all_sccs (ddg_ptr g)
  {
-  int i;
+  int i, j, k, scc, way;
    int num_nodes = g->num_nodes;
    auto_sbitmap from (num_nodes);
    auto_sbitmap to (num_nodes);
@@ -1038,7 +1041,7 @@  create_ddg_all_sccs (ddg_ptr g)
        ddg_node_ptr dest = backarc->dest;

        /* If the backarc already belongs to an SCC, continue.  */
-      if (backarc->aux.count == IN_SCC)
+      if (backarc->in_scc)
  	continue;

        bitmap_clear (scc_nodes);
@@ -1049,10 +1052,52 @@  create_ddg_all_sccs (ddg_ptr g)

        if (find_nodes_on_paths (scc_nodes, g, from, to))
  	{
-	  scc = create_scc (g, scc_nodes);
+	  scc = create_scc (g, scc_nodes, sccs->num_sccs);
  	  add_scc_to_ddg (sccs, scc);
  	}
      }
+
+  /* Init max_dist arrays for Floyd–Warshall-like
+     longest patch calculation algorithm.  */
+  for (k = 0; k < num_nodes; k++)
+    {
+      ddg_edge_ptr e;
+      ddg_node_ptr n = &g->nodes[k];
+
+      if (n->aux.count == -1)
+        continue;
+
+      n->max_dist[k] = 0;
+      for (e = n->out; e; e = e->next_out)
+        if (e->distance == 0 && g->nodes[e->dest->cuid].aux.count == 
n->aux.count)
+          n->max_dist[e->dest->cuid] = e->latency;
+    }
+
+  /* Run main Floid-Warshall loop.  We use only non-backarc edges
+     inside each scc.  */
+  for (k = 0; k < num_nodes; k++)
+    {
+      scc = g->nodes[k].aux.count;
+      if (scc != -1)
+        {
+          for (i = 0; i < num_nodes; i++)
+            if (g->nodes[i].aux.count == scc)
+              for (j = 0; j < num_nodes; j++)
+                if (g->nodes[j].aux.count == scc
+                    && g->nodes[i].max_dist[k] >= 0
+                    && g->nodes[k].max_dist[j] >= 0)
+                  {
+                    way = g->nodes[i].max_dist[k] + 
g->nodes[k].max_dist[j];
+                    if (g->nodes[i].max_dist[j] < way)
+                      g->nodes[i].max_dist[j] = way;
+                  }
+        }
+    }
+
+  /* Calculate recurrence_length using max_dist info.  */
+  for (i = 0; i < sccs->num_sccs; i++)
+    set_recurrence_length (sccs->sccs[i]);
+
    order_sccs (sccs);

    if (flag_checking)
@@ -1155,72 +1200,4 @@  find_nodes_on_paths (sbitmap result, ddg_ptr g, 
sbitmap from, sbitmap to)
    return bitmap_and (result, reachable_from, reach_to);
  }

-
-/* Updates the counts of U_NODE's successors (that belong to NODES) to 
be
-   at-least as large as the count of U_NODE plus the latency between 
them.
-   Sets a bit in TMP for each successor whose count was changed 
(increased).
-   Returns nonzero if any count was changed.  */
-static int
-update_dist_to_successors (ddg_node_ptr u_node, sbitmap nodes, sbitmap 
tmp)
-{
-  ddg_edge_ptr e;
-  int result = 0;
-
-  for (e = u_node->out; e; e = e->next_out)
-    {
-      ddg_node_ptr v_node = e->dest;
-      int v = v_node->cuid;
-
-      if (bitmap_bit_p (nodes, v)
-	  && (e->distance == 0)
-	  && (v_node->aux.count < u_node->aux.count + e->latency))
-	{
-	  v_node->aux.count = u_node->aux.count + e->latency;
-	  bitmap_set_bit (tmp, v);
-	  result = 1;
-	}
-    }
-  return result;
-}
-
-
-/* Find the length of a longest path from SRC to DEST in G,
-   going only through NODES, and disregarding backarcs.  */
-int
-longest_simple_path (struct ddg * g, int src, int dest, sbitmap nodes)
-{
-  int i;
-  unsigned int u = 0;
-  int change = 1;
-  int num_nodes = g->num_nodes;
-  auto_sbitmap workset (num_nodes);
-  auto_sbitmap tmp (num_nodes);
-  for (i = 0; i < g->num_nodes; i++)
-    g->nodes[i].aux.count = -1;
-  g->nodes[src].aux.count = 0;
-
-  bitmap_clear (tmp);
-  bitmap_set_bit (tmp, src);
-
-  while (change)
-    {
-      sbitmap_iterator sbi;
-
-      change = 0;
-      bitmap_copy (workset, tmp);
-      bitmap_clear (tmp);
-      EXECUTE_IF_SET_IN_BITMAP (workset, 0, u, sbi)
-	{
-	  ddg_node_ptr u_node = &g->nodes[u];
-
-	  change |= update_dist_to_successors (u_node, nodes, tmp);
-	}
-    }
-  return g->nodes[dest].aux.count;
-}
-
  #endif /* INSN_SCHEDULING */
diff --git a/gcc/ddg.h b/gcc/ddg.h
--- a/gcc/ddg.h
+++ b/gcc/ddg.h
@@ -64,6 +64,10 @@  struct ddg_node
    sbitmap successors;
    sbitmap predecessors;

+  /* Temporary array used for Floyd-Warshall algorithm to find
+     scc recurrence length.  */
+  int *max_dist;
+
    /* For general use by algorithms manipulating the ddg.  */
    union {
      int count;
@@ -95,11 +99,8 @@  struct ddg_edge
    ddg_edge_ptr next_in;
    ddg_edge_ptr next_out;

-  /* For general use by algorithms manipulating the ddg.  */
-  union {
-    int count;
-    void *info;
-  } aux;
+  /* Is true when edge is already in scc.  */
+  bool in_scc;
  };

  /* This structure holds the Data Dependence Graph for a basic block.