diff mbox series

lto: LTO cgraph support for late declare variant resolution

Message ID 20201022112714.GM2176@tucnak
State New
Headers show
Series lto: LTO cgraph support for late declare variant resolution | expand

Commit Message

Jakub Jelinek Oct. 22, 2020, 11:27 a.m. UTC
On Thu, May 14, 2020 at 02:17:45PM +0200, Jakub Jelinek via Gcc-patches wrote:
> > For LTO, the patch only saves/restores the two cgraph_node bits added in the
> > patch, but doesn't yet stream out and back in the on the side info for the
> > declare_variant_alt.  For the LTO partitioning, I believe those artificial
> > FUNCTION_DECLs with declare_variant_alt need to go into partition together
> > with anything that calls them (possibly duplicated), any way how to achieve
> > that?  Say if declare variant artificial fn foobar is directly
> > called from all of foo, bar and baz and not from qux and we want 4
> > partitions, one for each of foo, bar, baz, qux, then foobar is needed in the
> > first 3 partitions, and the IPA_REF_ADDRs recorded for foobar that right
> > after IPA the foobar call will be replaced with calls to foobar1, foobar2,
> > foobar3 or foobar (non-artificial) can of course stay in different
> > partitions if needed.
> 
> I've tried to add the saving/restoring next to ipa refs saving/restoring, as
> the declare variant alt stuff is kind of extension of those, unfortunately
> following doesn't compile, because I need to also write or read a tree there
> (ctx is a portion of DECL_ATTRIBUTES of the base function), but the ipa refs
> write/read back functions don't have arguments that can be used for that.

This patch adds streaming out and back in of the on-the-side
omp_declare_variant_alt hash table data for the declare_variant_alt
cgraph_nodes and, for LTO purposes, treats the declare_variant_alt nodes
(which have no body) as if they contained a body that calls all the
possible variants.  After IPA, all the calls to these magic
declare_variant_alt functions are replaced with a call to one of the
variants, depending on which one has the highest score in the context.

Honza, any comments/suggestions on this?

So far tested just on the new testcase.

2020-10-22  Jakub Jelinek  <jakub@redhat.com>

gcc/
	* lto-streamer.h (omp_lto_output_declare_variant_alt,
	omp_lto_input_declare_variant_alt): Declare.
	* symtab.c (symtab_node::get_partitioning_class): Return
	SYMBOL_DUPLICATE for declare_variant_alt nodes.
	* passes.c (ipa_write_summaries): Add declare_variant_alt to
	partition.
	* lto-cgraph.c (output_refs): Call omp_lto_output_declare_variant_alt
	on declare_variant_alt nodes.
	(input_refs): Call omp_lto_input_declare_variant_alt on
	declare_variant_alt nodes.
	* lto-streamer-out.c (output_function): Don't call
	collect_block_tree_leafs if DECL_INITIAL is error_mark_node.
	(lto_output): Call output_function even for declare_variant_alt
	nodes.
	* omp-general.c (omp_lto_output_declare_variant_alt,
	omp_lto_input_declare_variant_alt): New functions.
gcc/lto/
	* lto-common.c (lto_fixup_prevailing_decls): Don't use
	LTO_NO_PREVAIL on TREE_LIST's TREE_PURPOSE.
	* lto-partition.c (lto_balanced_map): Treat declare_variant_alt
	nodes like definitions.
libgomp/
	* testsuite/libgomp.c/declare-variant-1.c: New test.



	Jakub
diff mbox series

Patch

--- gcc/lto-streamer.h.jj	2020-10-20 13:11:56.669053784 +0200
+++ gcc/lto-streamer.h	2020-10-22 11:17:37.806472939 +0200
@@ -927,6 +927,12 @@  bool reachable_from_this_partition_p (st
 lto_symtab_encoder_t compute_ltrans_boundary (lto_symtab_encoder_t encoder);
 void select_what_to_stream (void);
 
+/* In omp-general.c.  */
+void omp_lto_output_declare_variant_alt (lto_simple_output_block *,
+					 cgraph_node *, lto_symtab_encoder_t);
+void omp_lto_input_declare_variant_alt (lto_input_block *, cgraph_node *,
+					vec<symtab_node *>);
+
 /* In options-save.c.  */
 void cl_target_option_stream_out (struct output_block *, struct bitpack_d *,
 				  struct cl_target_option *);
--- gcc/symtab.c.jj	2020-10-20 13:11:56.670053770 +0200
+++ gcc/symtab.c	2020-10-22 11:17:37.824472676 +0200
@@ -1998,7 +1998,7 @@  symtab_node::get_partitioning_class (voi
   if (DECL_ABSTRACT_P (decl))
     return SYMBOL_EXTERNAL;
 
-  if (cnode && cnode->inlined_to)
+  if (cnode && (cnode->inlined_to || cnode->declare_variant_alt))
     return SYMBOL_DUPLICATE;
 
   /* Transparent aliases are always duplicated.  */
--- gcc/passes.c.jj	2020-08-27 18:42:35.622711897 +0200
+++ gcc/passes.c	2020-10-22 12:25:38.173798438 +0200
@@ -2722,7 +2722,8 @@  ipa_write_summaries (void)
     {
       struct cgraph_node *node = order[i];
 
-      if (node->definition && node->need_lto_streaming)
+      if ((node->definition || node->declare_variant_alt)
+	  && node->need_lto_streaming)
 	{
 	  if (gimple_has_body_p (node->decl))
 	    lto_prepare_function_for_streaming (node);
--- gcc/lto-cgraph.c.jj	2020-10-20 13:11:56.664053856 +0200
+++ gcc/lto-cgraph.c	2020-10-22 11:17:37.806472939 +0200
@@ -766,6 +766,9 @@  output_refs (lto_symtab_encoder_t encode
 	  for (int i = 0; node->iterate_reference (i, ref); i++)
 	    lto_output_ref (ob, ref, encoder);
 	}
+      if (cgraph_node *cnode = dyn_cast <cgraph_node *> (node))
+	if (cnode->declare_variant_alt)
+	  omp_lto_output_declare_variant_alt (ob, cnode, encoder);
     }
 
   streamer_write_uhwi_stream (ob->main_stream, 0);
@@ -1614,6 +1617,9 @@  input_refs (class lto_input_block *ib,
 	  input_ref (ib, node, nodes);
 	  count--;
 	}
+      if (cgraph_node *cnode = dyn_cast <cgraph_node *> (node))
+	if (cnode->declare_variant_alt)
+	  omp_lto_input_declare_variant_alt (ib, cnode, nodes);
     }
 }
 	    
--- gcc/lto-streamer-out.c.jj	2020-09-10 20:55:43.000000000 +0200
+++ gcc/lto-streamer-out.c	2020-10-22 12:30:40.129382789 +0200
@@ -2424,7 +2424,7 @@  output_function (struct cgraph_node *nod
   /* As we do not recurse into BLOCK_SUBBLOCKS but only BLOCK_SUPERCONTEXT
      collect block tree leafs and stream those.  */
   auto_vec<tree> block_tree_leafs;
-  if (DECL_INITIAL (function))
+  if (DECL_INITIAL (function) && DECL_INITIAL (function) != error_mark_node)
     collect_block_tree_leafs (DECL_INITIAL (function), block_tree_leafs);
   streamer_write_uhwi (ob, block_tree_leafs.length ());
   for (unsigned i = 0; i < block_tree_leafs.length (); ++i)
@@ -2788,7 +2788,8 @@  lto_output (void)
 		  && flag_incremental_link != INCREMENTAL_LINK_LTO)
 	      /* Thunks have no body but they may be synthetized
 		 at WPA time.  */
-	      || DECL_ARGUMENTS (cnode->decl)))
+	      || DECL_ARGUMENTS (cnode->decl)
+	      || cnode->declare_variant_alt))
 	output_function (cnode);
       else if ((vnode = dyn_cast <varpool_node *> (snode))
 	       && (DECL_INITIAL (vnode->decl) != error_mark_node
--- gcc/lto/lto-common.c.jj	2020-08-27 18:42:35.620711925 +0200
+++ gcc/lto/lto-common.c	2020-10-22 11:18:14.388937402 +0200
@@ -2592,7 +2592,6 @@  lto_fixup_prevailing_decls (tree t)
 	case TREE_LIST:
 	  LTO_SET_PREVAIL (TREE_VALUE (t));
 	  LTO_SET_PREVAIL (TREE_PURPOSE (t));
-	  LTO_NO_PREVAIL (TREE_PURPOSE (t));
 	  break;
 	default:
 	  gcc_unreachable ();
--- gcc/lto/lto-partition.c.jj	2020-01-17 09:31:28.000000000 +0100
+++ gcc/lto/lto-partition.c	2020-10-22 12:50:07.522311010 +0200
@@ -593,7 +593,8 @@  lto_balanced_map (int n_lto_partitions,
 
 	      last_visited_node++;
 
-	      gcc_assert (node->definition || node->weakref);
+	      gcc_assert (node->definition || node->weakref
+			  || node->declare_variant_alt);
 
 	      /* Compute boundary cost of callgraph edges.  */
 	      for (edge = node->callees; edge; edge = edge->next_callee)
@@ -704,7 +705,7 @@  lto_balanced_map (int n_lto_partitions,
 		int index;
 
 		node = dyn_cast <cgraph_node *> (ref->referring);
-		gcc_assert (node->definition);
+		gcc_assert (node->definition || node->declare_variant_alt);
 		index = lto_symtab_encoder_lookup (partition->encoder,
 						   node);
 		if (index != LCC_NOT_FOUND
--- gcc/omp-general.c.jj	2020-10-20 13:11:56.669053784 +0200
+++ gcc/omp-general.c	2020-10-22 11:17:37.807472924 +0200
@@ -42,6 +42,8 @@  along with GCC; see the file COPYING3.
 #include "tree-pass.h"
 #include "omp-device-properties.h"
 #include "tree-iterator.h"
+#include "data-streamer.h"
+#include "streamer-hooks.h"
 
 enum omp_requires omp_requires_mask;
 
@@ -2337,6 +2339,125 @@  omp_resolve_declare_variant (tree base)
 	  ? TREE_PURPOSE (TREE_VALUE (variant1)) : base);
 }
 
+void
+omp_lto_output_declare_variant_alt (lto_simple_output_block *ob,
+				    cgraph_node *node,
+				    lto_symtab_encoder_t encoder)
+{
+  gcc_assert (node->declare_variant_alt);
+  /* Find NODE's entry in the omp_declare_variant_alt hash table.  */
+  omp_declare_variant_base_entry entry;
+  entry.base = NULL;
+  entry.node = node;
+  entry.variants = NULL;
+  omp_declare_variant_base_entry *entryp
+    = omp_declare_variant_alt->find_with_hash (&entry, DECL_UID (node->decl));
+  gcc_assert (entryp);
+  /* Stream the symtab encoder index of the base node.  */
+  int nbase = lto_symtab_encoder_lookup (encoder, entryp->base);
+  gcc_assert (nbase != LCC_NOT_FOUND);
+  streamer_write_hwi_stream (ob->main_stream, nbase);
+  /* Stream the number of variants.  */
+  streamer_write_hwi_stream (ob->main_stream, entryp->variants->length ());
+
+  unsigned int i;
+  omp_declare_variant_entry *varentry;
+  FOR_EACH_VEC_SAFE_ELT (entryp->variants, i, varentry)
+    {
+      int nvar = lto_symtab_encoder_lookup (encoder, varentry->variant);
+      gcc_assert (nvar != LCC_NOT_FOUND);
+      streamer_write_hwi_stream (ob->main_stream, nvar);
+      /* Stream both scores, each as its length followed by its HWIs.  */
+      for (widest_int *w = &varentry->score; ;
+	   w = &varentry->score_in_declare_simd_clone)
+	{
+	  unsigned len = w->get_len ();
+	  streamer_write_hwi_stream (ob->main_stream, len);
+	  const HOST_WIDE_INT *val = w->get_val ();
+	  for (unsigned j = 0; j < len; j++)
+	    streamer_write_hwi_stream (ob->main_stream, val[j]);
+	  if (w == &varentry->score_in_declare_simd_clone)
+	    break;
+	}
+      /* Stream ctx as 2 * its attribute position on base, + 1 if matches.  */
+      HOST_WIDE_INT cnt = -1;
+      HOST_WIDE_INT i = varentry->matches ? 1 : 0;
+      for (tree attr = DECL_ATTRIBUTES (entryp->base->decl);
+	   attr; attr = TREE_CHAIN (attr), i += 2)
+	{
+	  attr = lookup_attribute ("omp declare variant base", attr);
+	  if (attr == NULL_TREE)
+	    break;
+
+	  if (varentry->ctx == TREE_VALUE (TREE_VALUE (attr)))
+	    {
+	      cnt = i;
+	      break;
+	    }
+	}
+
+      gcc_assert (cnt != -1);
+      streamer_write_hwi_stream (ob->main_stream, cnt);
+    }
+}
+
+void
+omp_lto_input_declare_variant_alt (lto_input_block *ib, cgraph_node *node,
+				   vec<symtab_node *> nodes)
+{
+  gcc_assert (node->declare_variant_alt);
+  omp_declare_variant_base_entry *entryp
+    = ggc_cleared_alloc<omp_declare_variant_base_entry> ();
+  entryp->base = dyn_cast<cgraph_node *> (nodes[streamer_read_hwi (ib)]);
+  entryp->node = node;
+  unsigned int len = streamer_read_hwi (ib);
+  vec_alloc (entryp->variants, len);
+  /* Each variant: node index, two scores, then the ctx/matches code.  */
+  for (unsigned int i = 0; i < len; i++)
+    {
+      omp_declare_variant_entry varentry;
+      varentry.variant
+	= dyn_cast<cgraph_node *> (nodes[streamer_read_hwi (ib)]);
+      for (widest_int *w = &varentry.score; ;
+	   w = &varentry.score_in_declare_simd_clone)
+	{
+	  unsigned len2 = streamer_read_hwi (ib);
+	  HOST_WIDE_INT arr[WIDE_INT_MAX_ELTS];
+	  gcc_assert (len2 <= WIDE_INT_MAX_ELTS);
+	  for (unsigned int j = 0; j < len2; j++)
+	    arr[j] = streamer_read_hwi (ib);
+	  *w = widest_int::from_array (arr, len2, true);
+	  if (w == &varentry.score_in_declare_simd_clone)
+	    break;
+	}
+      /* Bit 0 of cnt is matches; the rest picks ctx among base's attrs.  */
+      HOST_WIDE_INT cnt = streamer_read_hwi (ib);
+      HOST_WIDE_INT j = 0;
+      varentry.ctx = NULL_TREE;
+      varentry.matches = (cnt & 1) ? true : false;
+      cnt &= ~HOST_WIDE_INT_1;
+      for (tree attr = DECL_ATTRIBUTES (entryp->base->decl);
+	   attr; attr = TREE_CHAIN (attr), j += 2)
+	{
+	  attr = lookup_attribute ("omp declare variant base", attr);
+	  if (attr == NULL_TREE)
+	    break;
+
+	  if (cnt == j)
+	    {
+	      varentry.ctx = TREE_VALUE (TREE_VALUE (attr));
+	      break;
+	    }
+	}
+      gcc_assert (varentry.ctx != NULL_TREE);
+      entryp->variants->quick_push (varentry);
+    }
+  if (omp_declare_variant_alt == NULL)
+    omp_declare_variant_alt
+      = hash_table<omp_declare_variant_alt_hasher>::create_ggc (64);
+  *omp_declare_variant_alt->find_slot_with_hash (entryp, DECL_UID (node->decl),
+						 INSERT) = entryp;
+}
 
 /* Encode an oacc launch argument.  This matches the GOMP_LAUNCH_PACK
    macro on gomp-constants.h.  We do not check for overflow.  */
--- libgomp/testsuite/libgomp.c/declare-variant-1.c.jj	2020-10-22 12:37:19.528542176 +0200
+++ libgomp/testsuite/libgomp.c/declare-variant-1.c	2020-10-22 13:01:52.875987620 +0200
@@ -0,0 +1,54 @@ 
+/* { dg-do link { target vect_simd_clones } } */
+/* { dg-require-effective-target lto } */
+/* { dg-require-effective-target fpic } */
+/* { dg-require-effective-target shared } */
+/* { dg-additional-options "-fdump-tree-gimple -fdump-tree-optimized -O2 -fPIC -shared -flto -flto-partition=one" } */
+/* { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } */
+
+int
+f01 (int a)
+{
+  asm volatile ("" : "+g" (a) : "g" (1) : "memory");
+  return a;
+}
+
+int
+f02 (int a)
+{
+  asm volatile ("" : "+g" (a) : "g" (2) : "memory");
+  return a;
+}
+
+int
+f03 (int a)
+{
+  asm volatile ("" : "+g" (a) : "g" (3) : "memory");
+  return a;
+}
+
+#pragma omp declare variant (f01) match (device={isa("avx512f")}) /* 4 or 8 */
+#pragma omp declare variant (f02) match (implementation={vendor(score(3):gnu)},device={kind(cpu)}) /* (1 or 2) + 3 */
+#pragma omp declare variant (f03) match (implementation={vendor(score(5):gnu)},device={kind(host)}) /* (1 or 2) + 5 */
+int
+f04 (int a)
+{
+  asm volatile ("" : "+g" (a) : "g" (4) : "memory");
+  return a;
+}
+
+#pragma omp declare simd
+int
+test1 (int x)
+{
+  /* At gimplification time, we can't decide yet which function to call.  */
+  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */
+  /* After simd clones are created, the original non-clone test1 shall
+     call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones
+     shall call f01 with score 8.  */
+  /* { dg-final { scan-ltrans-tree-dump-not "f04 \\\(x" "optimized" } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" } } */
+  int a = f04 (x);
+  int b = f04 (x);
+  return a + b;
+}