[05/10,OpenACC] Handle conditional execution of loops in OpenACC, kernels regions
diff mbox series

Message ID a0b54348-883d-2789-61b2-ce0370296674@codesourcery.com
State New
Headers show
Series
  • Rework handling of OpenACC kernels regions
Related show

Commit Message

Kwok Cheung Yeung July 17, 2019, 9:10 p.m. UTC
Any OpenACC loop controlled by an if statement or a non-OpenACC loop must be 
executed in a gang-single region. Detecting such loops is not trivial as OpenACC 
kernels expansion is done on GIMPLE but before computation of the control flow 
graph. This patch adds an auxiliary analysis for determining whether a statement 
is inside a conditionally executed region (relative to the kernels region'sentry).

2019-07-16  Gergö Barany  <gergo@codesourcery.com>

	gcc/
	* omp-oacc-kernels.c (control_flow_regions): New class.
	(control_flow_regions::control_flow_regions): New constructor.
	(control_flow_regions::is_unconditional_oacc_for_loop): New method.
	(control_flow_regions::find_rep): Likewise.
	(control_flow_regions::union_reps): Likewise.
	(control_flow_regions::compute_regions): Likewise.
	(decompose_kernels_region_body): Use test for conditional execution.

	gcc/testsuite/
	* c-c++-common/goacc/kernels-conversion.c: Add test for conditionally
	executed code.
	* gfortran.dg/goacc/kernels-conversion.f95: Likewise.
---
  gcc/omp-oacc-kernels.c                             | 216 ++++++++++++++++++++-
  .../c-c++-common/goacc/kernels-conversion.c        |  20 +-
  .../gfortran.dg/goacc/kernels-conversion.f95       |  21 +-
  3 files changed, 245 insertions(+), 12 deletions(-)


@@ -29,10 +42,10 @@ end program main
  ! parallel regions.
  ! { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 
"convert_oacc_kernels" } }

-! The two loop regions are parallelized, the sequential part in between is
-! made gang-single.
-! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 2 
"convert_oacc_kernels" } }
-! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 
"convert_oacc_kernels" } }
+! The three unconditional loop regions are parallelized, the sequential part
+! in between and the conditional loop are made gang-single.
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3 
"convert_oacc_kernels" } }
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2 
"convert_oacc_kernels" } }

  ! Check that the original kernels region is removed.
  ! { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" }}

Patch
diff mbox series

diff --git a/gcc/omp-oacc-kernels.c b/gcc/omp-oacc-kernels.c
index 6e08366..80a82fa 100644
--- a/gcc/omp-oacc-kernels.c
+++ b/gcc/omp-oacc-kernels.c
@@ -385,6 +385,208 @@  maybe_build_inner_data_region (location_t loc, gimple*body,
    return body;
  }

+/* Auxiliary analysis of the body of a kernels region, to determine for each
+   OpenACC loop whether it is control-dependent (i.e., not necessarily
+   executed every time the kernels region is entered) or not.
+   We say that a loop is control-dependent if there is some cond, switch, or
+   goto statement that jumps over it, forwards or backwards.  For example,
+   if the loop is controlled by an if statement, then a jump to the true
+   block, the false block, or from one of those blocks to the control flow
+   join point will necessarily jump over the loop.
+   This analysis implements an ad-hoc union-find data structure classifying
+   statements into "control-flow regions" as follows: Most statements are in
+   the same region as their predecessor, except that each OpenACC loop is in
+   a region of its own, and each OpenACC loop's successor starts a new
+   region.  We then unite the regions of any statements linked by jumps,
+   placing any cond, switch, or goto statement in the same region as its
+   target label(s).
+   In the end, control dependence of OpenACC loops can be determined by
+   comparing their immediate predecessor and successor statements' regions.
+   A jump crosses the loop if and only if the predecessor and successor are
+   in the same region.  (If there is no predecessor or successor, the loop
+   is executed unconditionally.)
+   The methods in this class identify statements by their index in the
+   kernels region's body.  */
+
+class control_flow_regions
+{
+  public:
+    /* Initialize an instance and pre-compute the control-flow region
+       information for the statement sequence SEQ.  */
+    control_flow_regions (gimple_seq seq);
+
+    /* Return true if the STMT with the given index IDX in the analyzed
+       statement sequence is an unconditionally executed OpenACC loop.  */
+    bool is_unconditional_oacc_for_loop (gimple *stmt, size_t idx);
+
+  private:
+    /* Find the region representative for the statement identified by index
+       STMT_IDX.  */
+    size_t find_rep (size_t stmt_idx);
+
+    /* Union the regions containing the statements represented by
+       representatives A and B.  */
+    void union_reps (size_t a, size_t b);
+
+    /* Helper for the constructor.  Performs the actual computation of the
+       control-flow regions in the statement sequence SEQ.  */
+    void compute_regions (gimple_seq seq);
+
+    /* The mapping from statement indices to region representatives.  */
+    vec <size_t> representatives;
+
+    /* A cache mapping statement indices to a flag indicating whether the
+       statement is a top level OpenACC for loop.  */
+    vec <bool> omp_for_loops;
+};
+
+control_flow_regions::control_flow_regions (gimple_seq seq)
+{
+  representatives.create (1);
+  omp_for_loops.create (1);
+  compute_regions (seq);
+}
+
+bool
+control_flow_regions::is_unconditional_oacc_for_loop (gimple *stmt, size_tidx)
+{
+  if (top_level_omp_for_in_stmt (stmt) == NULL)
+    /* Not an OpenACC for loop.  */
+    return false;
+  if (idx == 0 || idx == representatives.length () - 1)
+    /* The first or last statement in the kernels region.  This means that
+       there is no room before or after it for a jump or a label.  Thus
+       there cannot be a jump across it, so it is unconditional.  */
+    return true;
+  /* Otherwise, the loop is unconditional if the statements before and after
+     it are in different control flow regions.  Scan forward and backward,
+     skipping over neighboring OpenACC for loops, to find these preceding
+     statements.  */
+  size_t prev_index = idx - 1;
+  while (prev_index > 0 && omp_for_loops [prev_index] == true)
+    prev_index--;
+  /* If all preceding statements are also OpenACC loops, all of these are
+     unconditional.  */
+  if (prev_index == 0)
+    return true;
+  size_t succ_index = idx + 1;
+  while (succ_index < omp_for_loops.length ()
+         && omp_for_loops [succ_index] == true)
+    succ_index++;
+  /* If all following statements are also OpenACC loops, all of these are
+     unconditional.  */
+  if (succ_index == omp_for_loops.length ())
+    return true;
+  return (find_rep (prev_index) != find_rep (succ_index));
+}
+
+size_t
+control_flow_regions::find_rep (size_t stmt_idx)
+{
+  size_t rep = stmt_idx, aux = stmt_idx;
+  /* Find the root representative of this statement.  */
+  while (representatives[rep] != rep)
+    rep = representatives[rep];
+  /* Compress the path from the original statement to the representative. */
+  while (representatives[aux] != rep)
+    {
+      size_t tmp = representatives[aux];
+      representatives[aux] = rep;
+      aux = tmp;
+    }
+  return rep;
+}
+
+void
+control_flow_regions::union_reps (size_t a, size_t b)
+{
+  a = find_rep (a);
+  b = find_rep (b);
+  representatives[b] = a;
+}
+
+void
+control_flow_regions::compute_regions (gimple_seq seq)
+{
+  hash_map <gimple *, size_t> control_flow_reps;
+  hash_map <tree, size_t> label_reps;
+  size_t current_region = 0, idx = 0;
+
+  /* In a first pass, assign an initial region to each statement. Except in
+     the case of OpenACC loops, each statement simply gets the same region
+     representative as its predecessor.  */
+  for (gimple_stmt_iterator gsi = gsi_start (seq);
+       !gsi_end_p (gsi);
+       gsi_next (&gsi))
+    {
+      gimple *stmt = gsi_stmt (gsi);
+      gimple *omp_for = top_level_omp_for_in_stmt (stmt);
+      omp_for_loops.safe_push (omp_for != NULL);
+      if (omp_for != NULL)
+        {
+          /* Assign a new region to this loop and to its successor.  */
+          current_region = idx;
+          representatives.safe_push (current_region);
+          current_region++;
+        }
+      else
+        {
+          representatives.safe_push (current_region);
+          /* Remember any jumps and labels for the second pass below.  */
+          if (gimple_code (stmt) == GIMPLE_COND
+              || gimple_code (stmt) == GIMPLE_SWITCH
+              || gimple_code (stmt) == GIMPLE_GOTO)
+            control_flow_reps.put (stmt, current_region);
+          else if (gimple_code (stmt) == GIMPLE_LABEL)
+            label_reps.put (gimple_label_label (as_a <glabel *> (stmt)),
+                            current_region);
+        }
+      idx++;
+    }
+  gcc_assert (representatives.length () == omp_for_loops.length ());
+
+  /* Revisit all the control flow statements and union the region of each
+     cond, switch, or goto statement with the target labels' regions.  */
+  for (hash_map <gimple *, size_t>::iterator it = control_flow_reps.begin ();
+       it != control_flow_reps.end ();
+       ++it)
+    {
+      gimple *stmt = (*it).first;
+      size_t stmt_rep = (*it).second;
+      switch (gimple_code (stmt))
+        {
+          tree label;
+          unsigned int n;
+
+        case GIMPLE_COND:
+          label = gimple_cond_true_label (as_a <gcond *> (stmt));
+          union_reps (stmt_rep, *label_reps.get (label));
+          label = gimple_cond_false_label (as_a <gcond *> (stmt));
+          union_reps (stmt_rep, *label_reps.get (label));
+          break;
+
+        case GIMPLE_SWITCH:
+          n = gimple_switch_num_labels (as_a <gswitch *> (stmt));
+          for (unsigned int i = 0; i < n; i++)
+            {
+              tree switch_case
+                = gimple_switch_label (as_a <gswitch *> (stmt), i);
+              label = CASE_LABEL (switch_case);
+              union_reps (stmt_rep, *label_reps.get (label));
+            }
+          break;
+
+        case GIMPLE_GOTO:
+          label = gimple_goto_dest (stmt);
+          union_reps (stmt_rep, *label_reps.get (label));
+          break;
+
+        default:
+          gcc_unreachable ();
+        }
+    }
+}
+
  /* Decompose the body of the KERNELS_REGION, which was originally annotated
     with the KERNELS_CLAUSES, into a series of parallel regions.  */

@@ -486,9 +688,14 @@  decompose_kernels_region_body (gimple *kernels_region,tree 
kernels_clauses)
       separated from the loop.  */
    bool only_simple_assignments = true;

+  /* Precompute the control flow region information to determine whether an
+     OpenACC loop is executed conditionally or unconditionally.  */
+  control_flow_regions cf_regions (body_sequence);
+
    /* Iterate over the statements in the kernels region's body.  */
+  size_t idx = 0;
    gimple_stmt_iterator gsi, gsi_n;
-  for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n)
+  for (gsi = gsi_start (body_sequence); !gsi_end_p (gsi); gsi = gsi_n,idx++)
      {
        /* Advance the iterator here because otherwise it would be invalidated
           by moving statements below.  */
@@ -497,7 +704,8 @@  decompose_kernels_region_body (gimple *kernels_region, tree 
kernels_clauses)

        gimple *stmt = gsi_stmt (gsi);
        gimple *omp_for = top_level_omp_for_in_stmt (stmt);
-      if (omp_for != NULL)
+      if (omp_for != NULL
+          && cf_regions.is_unconditional_oacc_for_loop (stmt, idx))
          {
            /* This is an OMP for statement, put it into a parallel region.
               But first, construct a gang-single region containing any
@@ -532,8 +740,8 @@  decompose_kernels_region_body (gimple *kernels_region, tree 
kernels_clauses)
          }
        else
          {
-          /* This is not an OMP for statement, so it will be put into a
-             gang-single region.  */
+          /* This is not an unconditional OMP for statement, so it will be
+             put into a gang-single region.  */
            gimple_seq_add_stmt (&gang_single_seq, stmt);
            /* Is this a simple assignment? We call it simple if it is an
               assignment to an artificial local variable.  This captures
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c 
b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
index ec5db02..ed4d642 100644
--- a/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-conversion.c
@@ -12,6 +12,7 @@  main (void)
    unsigned int sum = 1;

  #pragma acc kernels copyin(a[0:N]) copy(sum)
+  /* { dg-bogus "region contains gang partitoned code but is not gang 
partitioned" "gang partitioned" { xfail *-*-* } .-1 } */
    {
      #pragma acc loop
      for (i = 0; i < N; ++i)
@@ -23,6 +24,17 @@  main (void)
      #pragma acc loop
      for (i = 0; i < N; ++i)
        sum += a[i];
+
+    if (sum > 10)
+      {
+        #pragma acc loop
+        for (i = 0; i < N; ++i)
+          sum += a[i];
+      }
+
+    #pragma acc loop
+    for (i = 0; i < N; ++i)
+      sum += a[i];
    }

    return 0;
@@ -32,10 +44,10 @@  main (void)
     parallel regions.  */
  /* { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 
"convert_oacc_kernels" } } */

-/* The two loop regions are parallelized, the sequential part in between is
-   made gang-single.  */
-/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 2 
"convert_oacc_kernels" } } */
-/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 1 
"convert_oacc_kernels" } } */
+/* The three unconditional loop regions are parallelized, the sequential
+   part in between and the conditional loop are made gang-single.  */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_parallelized" 3 
"convert_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-times "oacc_parallel_kernels_gang_single" 2 
"convert_oacc_kernels" } } */

  /* Check that the original kernels region is removed.  */
  /* { dg-final { scan-tree-dump-not "oacc_kernels" "convert_oacc_kernels" } } */
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95 
b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
index 4aba2b1..f89e46b 100644
--- a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
@@ -22,6 +22,19 @@  program main
      sum = sum + a(i)
    end do

+  if (sum .gt. 10) then
+    !$acc loop
+    do i = 1, N
+      sum = sum + a(i)
+    end do
+  end if
+
+  !$acc loop
+  ! { dg-bogus "region contains gang partitoned code but is not gang 
partitioned" "gang partitioned" { xfail *-*-* } .-1 }
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
    !$acc end kernels
  end program main