diff mbox

[RFA,PR,tree-optimization/61912,2/4] Trimming CONSTRUCTOR stores in DSE

Message ID 10dba827-c4aa-4bfe-83d1-7cf4c95e873a@redhat.com
State New
Headers show

Commit Message

Jeff Law Dec. 16, 2016, 1:54 a.m. UTC
This is the second patch in the kit to improve our DSE implementation.

This patch recognizes when a CONSTRUCTOR assignment could be trimmed at 
the head or tail because those bytes are dead.

The first implementation of this turned the CONSTRUCTOR into a memset. 
This version actually rewrites the RHS and LHS of the CONSTRUCTOR 
assignment.

You'll note that the implementation computes head and tail trim counts, 
then masks them to an even byte count.  We might even consider masking 
off the two low bits in the counts.  This masking keeps higher 
alignments on the CONSTRUCTOR remnant which helps keep things efficient 
when the CONSTRUCTOR results in a memset call.

This patch hits a lot statically in GCC and the testsuite.  There were 
hundreds of hits in each.

There may be some room for tuning.  Trimming shouldn't ever result in 
poorer performance, but it may also not result in any measurable gain 
(it depends on how much gets trimmed relative to the size of the 
CONSTRUCTOR node and how the CONSTRUCTOR node gets expanded, the 
processor's capabilities for merging stores internally, etc etc).  I 
suspect the main benefit comes when the CONSTRUCTOR collapses down to 
some thing small that gets expanded inline, thus exposing the internals 
to the rest of the optimization pipeline.

We could, in theory, split the CONSTRUCTOR to pick up dead bytes in the 
middle of the CONSTRUCTOR.  I haven't looked to see how applicable that 
is in real code and what the cost/benefit analysis might look like.


Bootstrapped and regression tested on x86_64-linux-gnu.  OK for the trunk?
PR tree-optimization/61912
	PR tree-optimization/77485
	* tree-sra.h: New file.
	* ipa-cp.c: Include tree-sra.h
	* ipa-prop.h (build_ref_for_offset): Remove prototype.
	* tree-ssa-dse.c: Include expr.h and tree-sra.h.
	(compute_trims, trim_constructor_store): New functions.
	(trim_partially_dead_store): Call trim_constructor_store.
	

	* g++.dg/tree-ssa/ssa-dse-1.C: New test.
	* gcc.dg/tree-ssa/pr30375: Adjust expected output.
	* gcc.dg/tree-ssa/ssa-dse-24.c: New test.
diff mbox

Patch

diff --git a/gcc/ipa-cp.c b/gcc/ipa-cp.c
index 4ec7cc5..772dd68 100644
--- a/gcc/ipa-cp.c
+++ b/gcc/ipa-cp.c
@@ -122,6 +122,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "ipa-inline.h"
 #include "ipa-utils.h"
 #include "tree-ssa-ccp.h"
+#include "tree-sra.h"
 
 template <typename valtype> class ipcp_value;
 
diff --git a/gcc/ipa-prop.h b/gcc/ipa-prop.h
index 0e75cf4..6d7b480 100644
--- a/gcc/ipa-prop.h
+++ b/gcc/ipa-prop.h
@@ -820,10 +820,6 @@  ipa_parm_adjustment *ipa_get_adjustment_candidate (tree **, bool *,
 void ipa_release_body_info (struct ipa_func_body_info *);
 tree ipa_get_callee_param_type (struct cgraph_edge *e, int i);
 
-/* From tree-sra.c:  */
-tree build_ref_for_offset (location_t, tree, HOST_WIDE_INT, bool, tree,
-			   gimple_stmt_iterator *, bool);
-
 /* In ipa-cp.c  */
 void ipa_cp_c_finalize (void);
 
diff --git a/gcc/testsuite/g++.dg/tree-ssa/ssa-dse-1.C b/gcc/testsuite/g++.dg/tree-ssa/ssa-dse-1.C
new file mode 100644
index 0000000..f928947
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/ssa-dse-1.C
@@ -0,0 +1,101 @@ 
+/* { dg-do compile } */
+/* { dg-options "-std=c++14 -O -fdump-tree-dse1-details" } */
+
+using uint = unsigned int;
+
+template<typename C, uint S>
+struct FixBuf
+{
+	C buf[S] = {};
+};
+
+template<typename C>
+struct OutBuf
+{
+	C*	cur;
+	C*	end;
+	C*	beg;
+
+	template<uint S>
+	constexpr
+	OutBuf(FixBuf<C, S>& b) : cur{b.buf}, end{b.buf + S}, beg{b.buf} { }
+
+	OutBuf(C* b, C* e) : cur{b}, end{e} { }
+	OutBuf(C* b, uint s) : cur{b}, end{b + s} { }
+
+	constexpr
+	OutBuf& operator<<(C v)
+	{
+		if (cur < end) {
+			*cur = v;
+		}
+		++cur;
+		return *this;
+	}
+
+	constexpr
+	OutBuf& operator<<(uint v)
+	{
+		uint q = v / 10U;
+		uint r = v % 10U;
+		if (q) {
+			*this << q;
+		}
+		*this << static_cast<C>(r + '0');
+		return *this;
+	}
+};
+
+template<bool BOS>
+struct BufOrSize
+{
+	template<typename C, uint S>
+	static constexpr auto Select(FixBuf<C, S>& fb, OutBuf<C>&)
+	{
+		return fb;
+	}
+};
+
+template<>
+struct BufOrSize<true>
+{
+	template<typename C, uint S>
+	static constexpr auto Select(FixBuf<C, S>&, OutBuf<C>& ob)
+	{
+		return ob.cur - ob.beg;
+	}
+};
+
+// if BOS=1, it will return the size of the generated data, else the data itself
+template<uint N, uint S, bool BOS = 0>
+constexpr
+auto fixbuf()
+{
+	FixBuf<char, S> fb;
+	OutBuf<char> ob{fb};
+	for (uint i = 0; i <= N; ++i) {
+		ob << i << static_cast<char>(i == N ? 0 : ' ');
+	}
+	return BufOrSize<BOS>::Select(fb, ob);
+}
+
+auto foo()
+{
+	constexpr auto x = fixbuf<13, 200>();
+	return x;
+}
+
+auto foo_sized()
+{
+	constexpr auto s = fixbuf<13, 0, 1>();
+	constexpr auto x = fixbuf<13, s>();
+	return x;
+}
+
+int main()
+{
+}
+
+
+/* { dg-final { scan-tree-dump-times "MEM\\\[\\(char\\\[\[0-9\]+\\\] \\*\\)&<retval> \\+ \[0-9\]+B\\\] = {}" 1 "dse1" } } */
+
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr30375.c b/gcc/testsuite/gcc.dg/tree-ssa/pr30375.c
index 0439b1c..cf0a93b 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/pr30375.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr30375.c
@@ -22,4 +22,5 @@  void test_signed_msg_encoding(void)
     f();
 }
 
-/* { dg-final { scan-tree-dump-times "signInfo = {}" 1 "dse1" } } */
+/* { dg-final { scan-tree-dump-times "MEM\\\[\\(char\\\[\[0-9\]+\\\] \\*\\)&signInfo \\+ \[0-9\]+B\\\] = {}" 1 "dse1" } } */
+
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-24.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-24.c
new file mode 100644
index 0000000..80a35ca
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-24.c
@@ -0,0 +1,62 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-dse1" } */
+
+
+typedef unsigned int wchar_t;
+struct printf_info
+{
+  int prec;
+  int width;
+  wchar_t spec;
+  unsigned int is_long_double:1;
+  unsigned int is_short:1;
+  unsigned int is_long:1;
+  unsigned int alt:1;
+  unsigned int space:1;
+  unsigned int left:1;
+  unsigned int showsign:1;
+  unsigned int group:1;
+  unsigned int extra:1;
+  unsigned int is_char:1;
+  unsigned int wide:1;
+  unsigned int i18n:1;
+  unsigned int __pad:4;
+  unsigned short int user;
+  wchar_t pad;
+} info;
+
+void bar (struct printf_info *);
+
+void foo(int prec,
+  int width,
+  wchar_t spec,
+  unsigned int is_long_double,
+  unsigned int is_short,
+  unsigned int is_long,
+  unsigned int alt,
+  unsigned int space,
+  unsigned int left,
+  unsigned int showsign,
+  unsigned int group,
+  wchar_t pad)
+{
+    struct printf_info info = {
+        .prec = prec,
+        .width = width,
+        .spec = spec,
+        .is_long_double = is_long_double,
+        .is_short = is_short,
+        .is_long = is_long,
+        .alt = alt,
+        .space = space,
+        .left = left,
+        .showsign = showsign,
+        .group = group,
+        .pad = pad,
+        .extra = 0,
+        .wide = sizeof (char) != 1 };
+
+    bar (&info);
+}
+
+/* { dg-final { scan-tree-dump-times "MEM\\\[\\(char\\\[\[0-9\]+\\\] \\*\\)&info \\+ \[0-9\]+B\\\] = {}" 1 "dse1" } } */
diff --git a/gcc/tree-sra.h b/gcc/tree-sra.h
new file mode 100644
index 0000000..6d88ece
--- /dev/null
+++ b/gcc/tree-sra.h
@@ -0,0 +1,27 @@ 
+/* Header file for SRA helper functions
+   Copyright (C) 2016 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_TREE_SRA_H
+#define GCC_TREE_SRA_H
+
+tree build_ref_for_offset (location_t, tree, HOST_WIDE_INT, bool, tree,
+			   gimple_stmt_iterator *, bool);
+
+
+#endif /* GCC_TREE_SRA_H */
diff --git a/gcc/tree-ssa-dse.c b/gcc/tree-ssa-dse.c
index eea185c..a868f11 100644
--- a/gcc/tree-ssa-dse.c
+++ b/gcc/tree-ssa-dse.c
@@ -34,6 +34,8 @@  along with GCC; see the file COPYING3.  If not see
 #include "domwalk.h"
 #include "tree-cfgcleanup.h"
 #include "params.h"
+#include "expr.h"
+#include "tree-sra.h"
 
 /* This file implements dead store elimination.
 
@@ -149,6 +151,53 @@  clear_bytes_written_by (bitmap live_bytes, gimple *stmt, ao_ref *ref)
 			write.size / BITS_PER_UNIT);
 }
 
+/* Compute the number of elements that we can trim from the head and
+   tail of ORIG resulting in a bitmap that is a superset of LIVE.
+
+   Store the number of elements trimmed from the head and tail in
+   TRIM_HEAD and TRIM_TAIL.  */
+
+static void
+compute_trims (bitmap orig, bitmap live, int *trim_head, int *trim_tail)
+{
+  bitmap dead = BITMAP_ALLOC (NULL);
+  bitmap_and_compl (dead, orig, live);
+
+  /* Now identify how much, if any of the tail we can chop off.  */
+  *trim_tail = 0;
+  unsigned last = bitmap_last_set_bit (orig);
+  if (last == bitmap_last_set_bit (dead))
+    {
+      /* We don't have a reverse iterator on bitmaps.  */
+      unsigned int i;
+      for (i = last; i > 0; i--)
+	{
+	  if (!bitmap_bit_p (dead, i))
+	    break;
+	}
+
+      /* I is the first bit not set in DEAD walking in reverse.  */
+      *trim_tail = (bitmap_last_set_bit (orig) - i) & ~0x1;
+    }
+
+  /* Identify how much, if any of the head we can chop off.  */
+  *trim_head = 0;
+  unsigned int first = bitmap_first_set_bit (orig);
+  if (first == bitmap_first_set_bit (dead))
+    {
+      unsigned int i;
+      for (i = first; i < last; i++)
+	{
+	  if (!bitmap_bit_p (dead, i))
+	    break;
+	}
+
+      /* I is the first bit not set in DEAD.  */
+      *trim_head = (i - first) & ~0x1;
+    }
+  BITMAP_FREE (dead);
+}
+
 /* STMT initializes an object from COMPLEX_CST where one or more of the
    bytes written are dead stores.  ORIG is the bitmap of bytes stored by
    STMT.  LIVE is the bitmap of stores that are actually live.
@@ -197,6 +246,69 @@  trim_complex_store (bitmap orig, bitmap live, gimple *stmt)
   BITMAP_FREE (dead);
 }
 
+/* STMT initializes an object using a CONSTRUCTOR where one or more of the
+   bytes written are dead stores.  ORIG is the bitmap of bytes stored by
+   STMT.  LIVE is the bitmap of stores that are actually live.
+
+   Attempt to rewrite STMT so that only the real or imaginary part of
+   the object is actually stored.
+
+   The most common case for getting here is a CONSTRUCTOR with no elements
+   being used to zero initialize an object.  We do not try to handle other
+   cases as those would force us to fully cover the object with the
+   CONSTRUCTOR node except for the components that are dead.  */
+
+static void
+trim_constructor_store (bitmap orig, bitmap live, gimple *stmt)
+{
+  tree ctor = gimple_assign_rhs1 (stmt);
+  extern int all_zeros_p (const_tree);
+
+  HOST_WIDE_INT nz_elts, init_elts;
+  bool complete;
+  categorize_ctor_elements (ctor, &nz_elts, &init_elts, &complete);
+
+  /* This is the only case we currently handle.  It actually seems to
+     catch most cases of actual interest.  */
+  if (init_elts == 0 && !CONSTRUCTOR_NO_CLEARING (ctor))
+    {
+      int head_trim = 0;
+      int tail_trim = 0;
+      compute_trims (orig, live, &head_trim, &tail_trim);
+
+      /* Now we want to replace the constructor initializer
+	 with memset (object + head_trim, 0, size - head_trim - tail_trim).  */
+      if (head_trim || tail_trim)
+	{
+	  /* We want &lhs for the MEM_REF expression.  */
+	  tree lhs_addr = build_fold_addr_expr (gimple_assign_lhs (stmt));
+
+	  /* The number of bytes for the new constructor.  */
+	  int count = bitmap_count_bits (orig) - head_trim - tail_trim - 1;
+
+	  /* And the new type for the CONSTRUCTOR.  Essentially it's just
+	     a char array large enough to cover the non-trimmed parts of
+	     the original CONSTRUCTOR.  Note we want explicit bounds here
+	     so that we know how many bytes to clear when expanding the
+	     CONSTRUCTOR.  */
+	  tree type = build_range_type (sizetype, size_zero_node,
+					build_int_cst (sizetype, count));
+	  type = build_array_type (char_type_node, type);
+
+
+	  /* Build a MEM_REF representing the whole accessed area, starting
+	     at the first byte not trimmed.  */
+	  tree exp = fold_build2 (MEM_REF, type, lhs_addr,
+				  build_int_cst (build_pointer_type (type),
+						 head_trim));
+
+	  /* Now update STMT with a new RHS and LHS.  */
+	  gimple_assign_set_lhs (stmt, exp);
+	  gimple_assign_set_rhs1 (stmt, build_constructor (type, NULL));
+	}
+    }
+}
+
 /* STMT is a memory write where one or more bytes written are dead
    stores.  ORIG is the bitmap of bytes stored by STMT.  LIVE is the
    bitmap of stores that are actually live.
@@ -213,6 +325,9 @@  trim_partially_dead_store (bitmap orig, bitmap live, gimple *stmt)
     {
       switch (gimple_assign_rhs_code (stmt))
 	{
+	case CONSTRUCTOR:
+	  trim_constructor_store (orig, live, stmt);
+	  break;
 	case COMPLEX_CST:
 	  trim_complex_store (orig, live, stmt);
 	  break;