diff mbox

Fix up wi::lrshift (PR c++/69399)

Message ID 20160126212616.GK3017@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek Jan. 26, 2016, 9:26 p.m. UTC
On Tue, Jan 26, 2016 at 09:45:19PM +0100, Richard Biener wrote:
> On January 26, 2016 8:00:52 PM GMT+01:00, Mike Stump <mikestump@comcast.net> wrote:
> >On Jan 26, 2016, at 10:21 AM, Jakub Jelinek <jakub@redhat.com> wrote
> >> The question is, shall we do what wi::lshift does and have the fast
> >path
> >> only for the constant shifts, as the untested patch below does, or
> >shall we
> >> check shift dynamically (thus use
> >> shift < HOST_BITS_PER_WIDE_INT
> >> instead of
> >> STATIC_CONSTANT_P (shift < HOST_BITS_PER_WIDE_INT)
> >> in the patch),
> >
> >Hum…  I think I prefer the dynamic check.  The reasoning is that when
> >we fast path, we can tolerate the conditional branch to retain the fast
> >path, as most of the time, that condition will usually be true.  If the
> >compiler had troubles with knowing the usual truth value of the
> >expression, seems like we can hint that it will be true and influence
> >the static prediction of the branch.  This permits us to fast path
> >almost all the time in the non-constant, but small enough case.  For
> >known shifts, there is no code gen difference, so it doesn’t matter.
> 
> The original reasoning was to inline only the fast path if it is known at compile time and otherwise have a call. Exactly to avoid bloating callers with inlined conditionals.

I'm now also bootstrapping/regtesting following patch, the previous one
passed bootstrap/regtest, and will do cc1plus size comparison afterwards.
That said, I have done a quick check, where I believe that unless xi and
shift are both compile time constants, for the
STATIC_CONSTANT_P (xi.precision > HOST_BITS_PER_WIDE_INT)
case there should be some comparison and the lrshift_large call with
the non-STATIC_CONSTANT_P variant, but in the bootstrapped (non-LTO)
cc1plus I only see 14 calls to lrshift_large, thus I bet it will likely affect
only <= 14 places right now:

0000000000776990 <_ZL11build_new_1PP3vecIP9tree_node5va_gc8vl_embedES1_S1_S6_bi>:
  777ca4:	e8 17 28 91 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
00000000008b8bc0 <_ZL31adjust_offset_for_component_refP9tree_nodePbPl.part.91>:
  8b8cc2:	e8 f9 17 7d 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
0000000000a7b550 <_ZN2wi7rrotateIPK9tree_node16generic_wide_intI16wide_int_storageEEENS_12unary_traitsIT_E11result_typeERKS8_RKT0_j>:
  a7baea:	e8 d1 e9 60 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
0000000000a7bd90 <_ZN2wi7lrotateIPK9tree_node16generic_wide_intI16wide_int_storageEEENS_12unary_traitsIT_E11result_typeERKS8_RKT0_j>:
  a7c457:	e8 64 e0 60 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
0000000000a7c710 <_ZN2wi7lrshiftIP9tree_nodelEENS_12unary_traitsIT_E11result_typeERKS4_RKT0_>:
  a7c851:	e8 6a dc 60 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
0000000000a7d280 <_ZN2wi7lrshiftIPK9tree_node16generic_wide_intI16wide_int_storageEEENS_12unary_traitsIT_E11result_typeERKS8_RKT0_>:
  a7d3b9:	e8 02 d1 60 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
0000000000cc1370 <_Z15real_to_integerPK10real_valuePbi>:
  cc1752:	e8 69 8d 3c 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
0000000000cc27f0 <_Z17real_from_integerP10real_value13format_helperRK16generic_wide_intI20wide_int_ref_storageILb0EEE6signop>:
  cc2f05:	e8 b6 75 3c 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
0000000000d6c420 <_Z31simplify_const_binary_operation8rtx_code12machine_modeP7rtx_defS2_>:
  d6f5a5:	e8 16 af 31 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
0000000000d7ca40 <_ZN2wi7lrotateISt4pairIP7rtx_def12machine_modeES5_EENS_12unary_traitsIT_E11result_typeERKS7_RKT0_j>:
  d7d17a:	e8 41 d3 30 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
0000000000d7d310 <_ZN2wi7rrotateISt4pairIP7rtx_def12machine_modeES5_EENS_12unary_traitsIT_E11result_typeERKS7_RKT0_j>:
  d7d99d:	e8 1e cb 30 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
0000000000ea3c40 <_ZN2wi7lrshiftI16generic_wide_intI22fixed_wide_int_storageILi192EEES4_EENS_12unary_traitsIT_E11result_typeERKS6_RKT0_>:
  ea3ca7:	e8 14 68 1e 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
0000000000f435a0 <_ZL27copy_reference_ops_from_refP9tree_nodeP3vecI22vn_reference_op_struct7va_heap6vl_ptrE>:
  f444b2:	e8 09 60 14 00       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>
--
000000000161c840 <_ZL21restructure_referencePP9tree_nodeS1_P16generic_wide_intI22fixed_wide_int_storageILi192EEES1_.constprop.133>:
 161cb51:	e8 6a d9 a6 ff       	callq  108a4c0 <_ZN2wi13lrshift_largeEPlPKljjjj>

2016-01-26  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/69399
	* wide-int.h (wi::lrshift): For larger precisions, only
	use fast path if shift is known to be < HOST_BITS_PER_WIDE_INT.

	* gcc.dg/torture/pr69399.c: New test.



	Jakub

Comments

Mike Stump Jan. 26, 2016, 9:55 p.m. UTC | #1
On Jan 26, 2016, at 1:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> will do cc1plus size comparison afterwards.

We know the dynamic check is larger.  You can’t tell the advantage of speed from size.  Better would be to time compiling any random large translation unit.

Nice to see that only 14 calls remain, that’s way better than the 34 I thought.
Jakub Jelinek Jan. 26, 2016, 11:41 p.m. UTC | #2
On Tue, Jan 26, 2016 at 01:55:41PM -0800, Mike Stump wrote:
> On Jan 26, 2016, at 1:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> > will do cc1plus size comparison afterwards.
> 
> We know the dynamic check is larger.  You can’t tell the advantage of
> speed from size.  Better would be to time compiling any random large
> translation unit.
> 
> Nice to see that only 14 calls remain, that’s way better than the 34 I
> thought.

So, it seems probably the PR65656 changes made things actually significantly
worse, while I see a (small) difference in the generated code between the two
patches if I compile say tree-ssa-ccp.c with g++ 5.x, in the bootstrapped
compiler there is no difference at all, the compilers with either patch
have identical objdump -dr cc1plus.  Already at *.gimple time all the
relevant __builtin_constant_p are resolved and it seems all to 0.

So please agree on one of the two patches (don't care which), and I'll try
to distill a testcase to look at for PR65656.

	Jakub
Richard Biener Jan. 27, 2016, 10:38 a.m. UTC | #3
On Wed, Jan 27, 2016 at 12:41 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Tue, Jan 26, 2016 at 01:55:41PM -0800, Mike Stump wrote:
>> On Jan 26, 2016, at 1:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>> > will do cc1plus size comparison afterwards.
>>
>> We know the dynamic check is larger.  You can’t tell the advantage of
>> speed from size.  Better would be to time compiling any random large
>> translation unit.
>>
>> Nice to see that only 14 calls remain, that’s way better than the 34 I
>> thought.
>
> So, it seems probably the PR65656 changes made things actually significantly
> worse, while I see a (small) difference in the generated code between the two
> patches if I compile say tree-ssa-ccp.c with g++ 5.x, in the bootstrapped
> compiler there is no difference at all, the compilers with either patch
> have identical objdump -dr cc1plus.  Already at *.gimple time all the
> relevant __builtin_constant_p are resolved and it seems all to 0.
>
> So please agree on one of the two patches (don't care which), and I'll try
> to distill a testcase to look at for PR65656.

I don't care if the wi::lshift by LOG2_BITS_PER_UNIT done by
get_ref_base_and_extent
are compiled to as good quality as before (I suspect it doesn't matter
in this case
as the shift amount is constant, but maybe due to PR65656 the
non-STATIC_CONSTANT_P
variant is better).

Richard.

>         Jakub
Jakub Jelinek Jan. 27, 2016, 11:36 a.m. UTC | #4
On Wed, Jan 27, 2016 at 11:38:33AM +0100, Richard Biener wrote:
> On Wed, Jan 27, 2016 at 12:41 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> > On Tue, Jan 26, 2016 at 01:55:41PM -0800, Mike Stump wrote:
> >> On Jan 26, 2016, at 1:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> >> > will do cc1plus size comparison afterwards.
> >>
> >> We know the dynamic check is larger.  You can’t tell the advantage of
> >> speed from size.  Better would be to time compiling any random large
> >> translation unit.
> >>
> >> Nice to see that only 14 calls remain, that’s way better than the 34 I
> >> thought.
> >
> > So, it seems probably the PR65656 changes made things actually significantly
> > worse, while I see a (small) difference in the generated code between the two
> > patches if I compile say tree-ssa-ccp.c with g++ 5.x, in the bootstrapped
> > compiler there is no difference at all, the compilers with either patch
> > have identical objdump -dr cc1plus.  Already at *.gimple time all the
> > relevant __builtin_constant_p are resolved and it seems all to 0.
> >
> > So please agree on one of the two patches (don't care which), and I'll try
> > to distill a testcase to look at for PR65656.
> 
> I don't care if the wi::lshift by LOG2_BITS_PER_UNIT done by
> get_ref_base_and_extent
> are compiled to as good quality as before (I suspect it doesn't matter
> in this case
> as the shift amount is constant, but maybe due to PR65656 the
> non-STATIC_CONSTANT_P
> variant is better).

Ok, I'll commit the non-STATIC_CONSTANT_P variant for now, we can revisit it
later.  I have distilled a testcase for Jason in PR65656.  It seems the
problematic STATIC_CONSTANT_P is only if it is inside of the condition of
?: expression.  So we could work around it by using something like:
-      if (STATIC_CONSTANT_P (xi.precision > HOST_BITS_PER_WIDE_INT)
-          ? (STATIC_CONSTANT_P (shift < HOST_BITS_PER_WIDE_INT - 1)
-             && xi.len == 1
-             && xi.val[0] <= (HOST_WIDE_INT) ((unsigned HOST_WIDE_INT)
-                                              HOST_WIDE_INT_MAX >> shift))
-          : precision <= HOST_BITS_PER_WIDE_INT)
+      bool fast_path = false;
+      if (STATIC_CONSTANT_P (xi.precision > HOST_BITS_PER_WIDE_INT))
+        {
+          if (STATIC_CONSTANT_P (shift < HOST_BITS_PER_WIDE_INT - 1)
+              && xi.len == 1
+              && xi.val[0] <= (HOST_WIDE_INT) ((unsigned HOST_WIDE_INT)
+                                               HOST_WIDE_INT_MAX >> shift))
+            fast_path = true;
+        }
+      else if (precision <= HOST_BITS_PER_WIDE_INT)
+        fast_path = true;
+      if (fast_path)
        {
          val[0] = xi.ulow () << shift;
          result.set_len (1);
        }
      else
        result.set_len (lshift_large (val, xi.val, xi.len,
                                      precision, shift));
and similarly in lrshift.

	Jakub
diff mbox

Patch

--- gcc/wide-int.h.jj	2016-01-04 14:55:50.000000000 +0100
+++ gcc/wide-int.h	2016-01-26 19:05:20.715269366 +0100
@@ -2909,7 +2909,9 @@  wi::lrshift (const T1 &x, const T2 &y)
 	 For variable-precision integers like wide_int, handle HWI
 	 and sub-HWI integers inline.  */
       if (STATIC_CONSTANT_P (xi.precision > HOST_BITS_PER_WIDE_INT)
-	  ? xi.len == 1 && xi.val[0] >= 0
+	  ? (STATIC_CONSTANT_P (shift < HOST_BITS_PER_WIDE_INT)
+	     && xi.len == 1
+	     && xi.val[0] >= 0)
 	  : xi.precision <= HOST_BITS_PER_WIDE_INT)
 	{
 	  val[0] = xi.to_uhwi () >> shift;
--- gcc/testsuite/gcc.dg/torture/pr69399.c.jj	2016-01-26 21:39:45.732927748 +0100
+++ gcc/testsuite/gcc.dg/torture/pr69399.c	2016-01-26 21:41:09.081753051 +0100
@@ -0,0 +1,18 @@ 
+/* { dg-do run { target int128 } } */
+
+static unsigned __attribute__((noinline, noclone))
+foo (unsigned long long u)
+{
+  unsigned __int128 v = u | 0xffffff81U;
+  v >>= 64;
+  return v;
+}
+
+int
+main ()
+{
+  unsigned x = foo (27);
+  if (x != 0)
+    __builtin_abort ();
+  return 0;
+}