Reassociate X == CST1 || X == CST2 if popcount (CST2 - CST1) == 1 into ((X - CST1) & ~(CST2 - CST1)) == 0

On Sat, Oct 12, 2013 at 10:08:12AM +0800, Zhenqiang Chen wrote:
> As you had mentioned, the transition in this patch does not reduce
> instructions. But the preexisting optimization does. So we prefer to do the
> preexisting optimization first. E.g.
> 
> X == 10 || X == 12 || X == 26

Ok, that makes sense.

@@ -2279,6 +2275,71 @@ optimize_range_tests (enum tree_code opcode,
 	}
     }
 
+  /* Optimize X == CST1 || X == CST2
+     if popcount (CST2 - CST1) == 1 into
+     ((X - CST1) & ~(CST2 - CST1)) == 0.  */

Mention here also similarly to the other comment that it works also
with ranges.

+  if (BRANCH_COST (optimize_function_for_speed_p (cfun), false) >= 2)
+    for (i = first; i < length; i++)
+      {
+	tree lowi, highi, lowj, highj, type, tem1, tem2, mask;
+
+	if (ranges[i].exp == NULL_TREE || ranges[i].in_p)
+	  continue;
+	type = TREE_TYPE (ranges[i].exp);
+	if (!INTEGRAL_TYPE_P (type))
+	  continue;
+	lowi = ranges[i].low;
+	if (lowi == NULL_TREE)
+	  continue;

The other loop has here:
      if (lowi == NULL_TREE)
        lowi = TYPE_MIN_VALUE (type);
instead, which is better, can you please change it?

+	highi = ranges[i].high;
+	if (highi == NULL_TREE)
+	  continue;
+	for (j = i + 1; j < length && j < i + 64; j++)
+	  {
+	    lowj = ranges[j].low;
+	    if (lowj == NULL_TREE)
+	      continue;
+	    highj = ranges[j].high;
+	    if (highj == NULL_TREE)
+	      continue;

The other loop has
	if (highj == NULL_TREE)
	  highj = TYPE_MAX_VALUE (type);
here instead.

+	    if (ranges[j].exp == NULL_TREE || ranges[j].in_p
+		|| (ranges[i].exp != ranges[j].exp))
+	      continue;

Can you please move this test the lowj = assignment, and
remove the ranges[j].exp == NULL_TREE test (both here and
in the earlier loop)?  I mean, we've checked at the
beginning of for (i = first; i < length; i++) loop that
ranges[i].exp is not NULL_TREE, and if ranges[j].exp is NULL_TREE,
it will be different than ranges[i].exp.
So
	if (ranges[i].exp != ranges[j].exp || ranges[j].in_p)
	  continue;
(and please also collapse the two checks in the first loop,
so that both look similar).

+	    /* Check lowj > highi.  */
+	    tem1 = fold_binary (GT_EXPR, boolean_type_node,
+			       lowj, highi);
+	    if (tem1 == NULL_TREE || !integer_onep (tem1))
+	      continue;
+	    /* Check highi - lowi == highj - lowj.  */
+	    tem1 = fold_binary (MINUS_EXPR, type, highi, lowi);
+	    if (tem1 == NULL_TREE || TREE_CODE (tem1) != INTEGER_CST)
+	      continue;
+	    tem2 = fold_binary (MINUS_EXPR, type, highj, lowj);
+	    if (tem2 == NULL_TREE || TREE_CODE (tem2) != INTEGER_CST)
+	      continue;
+	    if (!tree_int_cst_equal (tem1, tem2))
+	      continue;
+	    /* Check popcount (lowj - lowi) == 1.  */
+	    tem1 = fold_binary (MINUS_EXPR, type, lowj, lowi);
+	    if (tem1 == NULL_TREE || TREE_CODE (tem1) != INTEGER_CST)
+	      continue;
+	    if (tree_log2 (tem1) < 0)
+	      continue;
+	    mask = fold_build1 (BIT_NOT_EXPR, type, tem1);
+	    tem1 = fold_binary (MINUS_EXPR, type, ranges[i].exp, lowi);
+	    tem1 = fold_build2 (BIT_AND_EXPR, type, tem1, mask);
+	    lowi = build_int_cst (type, 0);

Please use lowj instead of lowi here.  Because, if update_range_test
fails, we continue looking for further matches in following ranges,
and while lowj will be computed again, lowi will be assumed to contain
the low bound of the first range.

+	    if (update_range_test (ranges + i, ranges + j, 1, opcode, ops, tem1,
+				   ranges[i].in_p, lowi, tem2,

And here too.

Or, alternatively to avoid duplication, you could put the whole
for (i = first; i < length; i++)
loop from the first optimization into another
int i;
  for (pass = 0; pass < (BRANCH_COST (optimize_function_for_speed_p (cfun),
				      false) >= 2) + 1; pass++)
  {
  }
loop, use whatever is common in the loop unconditionally, and just
conditionalize the rest on pass == 0 vs. pass == 1.
Or maybe even better just move this whole loop into a new function,
with ranges, opcode, first, length arguments plus another (bool?) argument
which of the two optimizations you want to perform, and return the bool
any_changes.  Then you wouldn't run into line length issues that you could
run into with the extra loop.


int
main ()
{
  volatile int n43, n47, n75, n79;
  n43 = 43; n47 = n43 + 4; n75 = 75; n79 = n75 + 4;
  int i;
  for (i = -10; i <= 100; i++)
    if (test (i, 20, 30) != 30 - ((i >= n43 && i < n47)
				  || (i >= n75 && i < n79)) * 10)
      __builtin_abort ();
  return 0;
}

?

	Jakub


Reassociate X == CST1 || X == CST2 if popcount (CST2 - CST1) == 1 into ((X - CST1) & ~(CST2 - CST1)) == 0

Commit Message

Comments

Patch