| Field | Value |
|---|---|
| Message ID | 20141016220912.GN10376@tucnak.redhat.com |
| State | New |
| Headers | show |
On Fri, 17 Oct 2014, Jakub Jelinek wrote: > Hi! > > This patch optimizes some range tests into bit tests, like we > already do for switches in emit_case_bit_tests, but this time > for a series of ored equality or anded non-equality comparisons. > If at least 3 comparisons (after the contiguous range, xor and > diff + xor optimizations are performed) are needed and range of > values is at most number of bits in a word, we instead check > whether the operand is >= smallest and <= highest number in the > range and if it is, test (word) 1 << operand against a bitmask. > > Bootstrapped/regtested on x86_64-linux and i686-linux (for i686-linux > I had to go back to r216304 because there are multiple ICF related > bootstrap issues on i686-linux), ok for trunk? Ok. Thanks, Richard. > 2014-10-16 Jakub Jelinek <jakub@redhat.com> > > PR tree-optimization/63464 > * gimple.h (gimple_seq_discard): New prototype. > * gimple.c: Include stringpool.h and tree-ssanames.h. > (gimple_seq_discard): New function. > * optabs.h (lshift_cheap_p): New prototype. > * optabs.c (lshift_cheap_p): New function, moved from... > * tree-switch-conversion.c (lshift_cheap_p): ... here. > * tree-ssa-reassoc.c: Include gimplify.h and optabs.h. > (reassoc_branch_fixups): New variable. > (update_range_test): Add otherrangep and seq arguments. > Unshare exp. If otherrange is NULL, use for other ranges > array of pointers pointed by otherrangep instead. > Emit seq before gimplified statements for tem. > (optimize_range_tests_diff): Adjust update_range_test > caller. > (optimize_range_tests_xor): Likewise. Fix up comment. > (extract_bit_test_mask, optimize_range_tests_to_bit_test): New > functions. > (optimize_range_tests): Adjust update_range_test caller. > Call optimize_range_tests_to_bit_test. > (branch_fixup): New function. > (execute_reassoc): Call branch_fixup. > > * gcc.dg/torture/pr63464.c: New test. > * gcc.dg/tree-ssa/reassoc-37.c: New test. > * gcc.dg/tree-ssa/reassoc-38.c: New test. 
> > --- gcc/gimple.h.jj 2014-10-15 12:28:06.428498079 +0200 > +++ gcc/gimple.h 2014-10-15 13:43:18.967491428 +0200 > @@ -1269,9 +1269,10 @@ extern bool gimple_asm_clobbers_memory_p > extern void dump_decl_set (FILE *, bitmap); > extern bool nonfreeing_call_p (gimple); > extern bool infer_nonnull_range (gimple, tree, bool, bool); > -extern void sort_case_labels (vec<tree> ); > -extern void preprocess_case_label_vec_for_gimple (vec<tree> , tree, tree *); > -extern void gimple_seq_set_location (gimple_seq , location_t); > +extern void sort_case_labels (vec<tree>); > +extern void preprocess_case_label_vec_for_gimple (vec<tree>, tree, tree *); > +extern void gimple_seq_set_location (gimple_seq, location_t); > +extern void gimple_seq_discard (gimple_seq); > > /* Formal (expression) temporary table handling: multiple occurrences of > the same scalar expression are evaluated into the same temporary. */ > --- gcc/gimple.c.jj 2014-10-15 12:28:19.917235900 +0200 > +++ gcc/gimple.c 2014-10-15 13:43:18.970491368 +0200 > @@ -47,6 +47,8 @@ along with GCC; see the file COPYING3. > #include "demangle.h" > #include "langhooks.h" > #include "bitmap.h" > +#include "stringpool.h" > +#include "tree-ssanames.h" > > > /* All the tuples have their operand vector (if present) at the very bottom > @@ -2826,3 +2828,19 @@ gimple_seq_set_location (gimple_seq seq, > for (gimple_stmt_iterator i = gsi_start (seq); !gsi_end_p (i); gsi_next (&i)) > gimple_set_location (gsi_stmt (i), loc); > } > + > +/* Release SSA_NAMEs in SEQ as well as the GIMPLE statements. 
*/ > + > +void > +gimple_seq_discard (gimple_seq seq) > +{ > + gimple_stmt_iterator gsi; > + > + for (gsi = gsi_start (seq); !gsi_end_p (gsi); ) > + { > + gimple stmt = gsi_stmt (gsi); > + gsi_remove (&gsi, true); > + release_defs (stmt); > + ggc_free (stmt); > + } > +} > --- gcc/optabs.h.jj 2014-10-15 12:28:06.479497088 +0200 > +++ gcc/optabs.h 2014-10-15 13:43:18.970491368 +0200 > @@ -538,5 +538,6 @@ extern void gen_satfractuns_conv_libfunc > enum machine_mode, > enum machine_mode); > extern void init_tree_optimization_optabs (tree); > +extern bool lshift_cheap_p (bool); > > #endif /* GCC_OPTABS_H */ > --- gcc/optabs.c.jj 2014-10-15 12:28:06.433497982 +0200 > +++ gcc/optabs.c 2014-10-15 13:43:18.969491387 +0200 > @@ -8624,4 +8624,31 @@ get_best_mem_extraction_insn (extraction > struct_bits, field_mode); > } > > +/* Determine whether "1 << x" is relatively cheap in word_mode. */ > + > +bool > +lshift_cheap_p (bool speed_p) > +{ > + /* FIXME: This should be made target dependent via this "this_target" > + mechanism, similar to e.g. can_copy_init_p in gcse.c. */ > + static bool init[2] = { false, false }; > + static bool cheap[2] = { true, true }; > + > + /* If the targer has no lshift in word_mode, the operation will most > + probably not be cheap. ??? Does GCC even work for such targets? */ > + if (optab_handler (ashl_optab, word_mode) == CODE_FOR_nothing) > + return false; > + > + if (!init[speed_p]) > + { > + rtx reg = gen_raw_REG (word_mode, 10000); > + int cost = set_src_cost (gen_rtx_ASHIFT (word_mode, const1_rtx, reg), > + speed_p); > + cheap[speed_p] = cost < COSTS_N_INSNS (3); > + init[speed_p] = true; > + } > + > + return cheap[speed_p]; > +} > + > #include "gt-optabs.h" > --- gcc/tree-switch-conversion.c.jj 2014-10-15 12:28:06.386498896 +0200 > +++ gcc/tree-switch-conversion.c 2014-10-15 13:43:18.963491510 +0200 > @@ -125,35 +125,6 @@ hoist_edge_and_branch_if_true (gimple_st > } > > > -/* Determine whether "1 << x" is relatively cheap in word_mode. 
*/ > -/* FIXME: This is the function that we need rtl.h and optabs.h for. > - This function (and similar RTL-related cost code in e.g. IVOPTS) should > - be moved to some kind of interface file for GIMPLE/RTL interactions. */ > -static bool > -lshift_cheap_p (bool speed_p) > -{ > - /* FIXME: This should be made target dependent via this "this_target" > - mechanism, similar to e.g. can_copy_init_p in gcse.c. */ > - static bool init[2] = {false, false}; > - static bool cheap[2] = {true, true}; > - > - /* If the targer has no lshift in word_mode, the operation will most > - probably not be cheap. ??? Does GCC even work for such targets? */ > - if (optab_handler (ashl_optab, word_mode) == CODE_FOR_nothing) > - return false; > - > - if (!init[speed_p]) > - { > - rtx reg = gen_raw_REG (word_mode, 10000); > - int cost = set_src_cost (gen_rtx_ASHIFT (word_mode, const1_rtx, reg), > - speed_p); > - cheap[speed_p] = cost < COSTS_N_INSNS (MAX_CASE_BIT_TESTS); > - init[speed_p] = true; > - } > - > - return cheap[speed_p]; > -} > - > /* Return true if a switch should be expanded as a bit test. > RANGE is the difference between highest and lowest case. > UNIQ is number of unique case node targets, not counting the default case. > --- gcc/tree-ssa-reassoc.c.jj 2014-10-15 13:41:45.145301213 +0200 > +++ gcc/tree-ssa-reassoc.c 2014-10-16 16:44:58.776901531 +0200 > @@ -61,6 +61,8 @@ along with GCC; see the file COPYING3. > #include "params.h" > #include "diagnostic-core.h" > #include "builtins.h" > +#include "gimplify.h" > +#include "optabs.h" > > /* This is a simple global reassociation pass. It is, in part, based > on the LLVM pass of the same name (They do some things more/less > @@ -218,6 +220,12 @@ static long *bb_rank; > /* Operand->rank hashtable. 
*/ > static hash_map<tree, long> *operand_rank; > > +/* Vector of SSA_NAMEs on which after reassociate_bb is done with > + all basic blocks the CFG should be adjusted - basic blocks > + split right after that SSA_NAME's definition statement and before > + the only use, which must be a bit ior. */ > +static vec<tree> reassoc_branch_fixups; > + > /* Forward decls. */ > static long get_rank (tree); > static bool reassoc_stmt_dominates_stmt_p (gimple, gimple); > @@ -2071,7 +2079,9 @@ range_entry_cmp (const void *a, const vo > /* Helper routine of optimize_range_test. > [EXP, IN_P, LOW, HIGH, STRICT_OVERFLOW_P] is a merged range for > RANGE and OTHERRANGE through OTHERRANGE + COUNT - 1 ranges, > - OPCODE and OPS are arguments of optimize_range_tests. Return > + OPCODE and OPS are arguments of optimize_range_tests. If OTHERRANGE > + is NULL, OTHERRANGEP should not be and then OTHERRANGEP points to > + an array of COUNT pointers to other ranges. Return > true if the range merge has been successful. > If OPCODE is ERROR_MARK, this is called from within > maybe_optimize_range_tests and is performing inter-bb range optimization. > @@ -2080,9 +2090,10 @@ range_entry_cmp (const void *a, const vo > > static bool > update_range_test (struct range_entry *range, struct range_entry *otherrange, > + struct range_entry **otherrangep, > unsigned int count, enum tree_code opcode, > - vec<operand_entry_t> *ops, tree exp, bool in_p, > - tree low, tree high, bool strict_overflow_p) > + vec<operand_entry_t> *ops, tree exp, gimple_seq seq, > + bool in_p, tree low, tree high, bool strict_overflow_p) > { > operand_entry_t oe = (*ops)[range->idx]; > tree op = oe->op; > @@ -2090,9 +2101,11 @@ update_range_test (struct range_entry *r > last_stmt (BASIC_BLOCK_FOR_FN (cfun, oe->id)); > location_t loc = gimple_location (stmt); > tree optype = op ? 
TREE_TYPE (op) : boolean_type_node; > - tree tem = build_range_check (loc, optype, exp, in_p, low, high); > + tree tem = build_range_check (loc, optype, unshare_expr (exp), > + in_p, low, high); > enum warn_strict_overflow_code wc = WARN_STRICT_OVERFLOW_COMPARISON; > gimple_stmt_iterator gsi; > + unsigned int i; > > if (tem == NULL_TREE) > return false; > @@ -2112,8 +2125,12 @@ update_range_test (struct range_entry *r > fprintf (dump_file, ", "); > print_generic_expr (dump_file, range->high, 0); > fprintf (dump_file, "]"); > - for (r = otherrange; r < otherrange + count; r++) > + for (i = 0; i < count; i++) > { > + if (otherrange) > + r = otherrange + i; > + else > + r = otherrangep[i]; > fprintf (dump_file, " and %c[", r->in_p ? '+' : '-'); > print_generic_expr (dump_file, r->low, 0); > fprintf (dump_file, ", "); > @@ -2135,10 +2152,14 @@ update_range_test (struct range_entry *r > In that case we have to insert after the stmt rather then before > it. */ > if (op == range->exp) > - tem = force_gimple_operand_gsi (&gsi, tem, true, NULL_TREE, false, > - GSI_CONTINUE_LINKING); > + { > + gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING); > + tem = force_gimple_operand_gsi (&gsi, tem, true, NULL_TREE, false, > + GSI_CONTINUE_LINKING); > + } > else > { > + gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); > tem = force_gimple_operand_gsi (&gsi, tem, true, NULL_TREE, true, > GSI_SAME_STMT); > gsi_prev (&gsi); > @@ -2156,8 +2177,12 @@ update_range_test (struct range_entry *r > range->in_p = in_p; > range->strict_overflow_p = false; > > - for (range = otherrange; range < otherrange + count; range++) > + for (i = 0; i < count; i++) > { > + if (otherrange) > + range = otherrange + i; > + else > + range = otherrangep[i]; > oe = (*ops)[range->idx]; > /* Now change all the other range test immediate uses, so that > those tests will be optimized away. 
*/ > @@ -2195,8 +2220,8 @@ optimize_range_tests_xor (enum tree_code > struct range_entry *rangej) > { > tree lowxor, highxor, tem, exp; > - /* Check highi ^ lowi == highj ^ lowj and > - popcount (highi ^ lowi) == 1. */ > + /* Check lowi ^ lowj == highi ^ highj and > + popcount (lowi ^ lowj) == 1. */ > lowxor = fold_binary (BIT_XOR_EXPR, type, lowi, lowj); > if (lowxor == NULL_TREE || TREE_CODE (lowxor) != INTEGER_CST) > return false; > @@ -2210,8 +2235,8 @@ optimize_range_tests_xor (enum tree_code > exp = fold_build2 (BIT_AND_EXPR, type, rangei->exp, tem); > lowj = fold_build2 (BIT_AND_EXPR, type, lowi, tem); > highj = fold_build2 (BIT_AND_EXPR, type, highi, tem); > - if (update_range_test (rangei, rangej, 1, opcode, ops, exp, > - rangei->in_p, lowj, highj, > + if (update_range_test (rangei, rangej, NULL, 1, opcode, ops, exp, > + NULL, rangei->in_p, lowj, highj, > rangei->strict_overflow_p > || rangej->strict_overflow_p)) > return true; > @@ -2259,8 +2284,8 @@ optimize_range_tests_diff (enum tree_cod > fold_convert (type, rangei->exp), lowi); > tem1 = fold_build2 (BIT_AND_EXPR, type, tem1, mask); > lowj = build_int_cst (type, 0); > - if (update_range_test (rangei, rangej, 1, opcode, ops, tem1, > - rangei->in_p, lowj, tem2, > + if (update_range_test (rangei, rangej, NULL, 1, opcode, ops, tem1, > + NULL, rangei->in_p, lowj, tem2, > rangei->strict_overflow_p > || rangej->strict_overflow_p)) > return true; > @@ -2328,6 +2353,255 @@ optimize_range_tests_1 (enum tree_code o > return any_changes; > } > > +/* Helper function of optimize_range_tests_to_bit_test. Handle a single > + range, EXP, LOW, HIGH, compute bit mask of bits to test and return > + EXP on success, NULL otherwise. 
*/ > + > +static tree > +extract_bit_test_mask (tree exp, int prec, tree totallow, tree low, tree high, > + wide_int *mask, tree *totallowp) > +{ > + tree tem = int_const_binop (MINUS_EXPR, high, low); > + if (tem == NULL_TREE > + || TREE_CODE (tem) != INTEGER_CST > + || TREE_OVERFLOW (tem) > + || tree_int_cst_sgn (tem) == -1 > + || compare_tree_int (tem, prec) != -1) > + return NULL_TREE; > + > + unsigned HOST_WIDE_INT max = tree_to_uhwi (tem) + 1; > + *mask = wi::shifted_mask (0, max, false, prec); > + if (TREE_CODE (exp) == BIT_AND_EXPR > + && TREE_CODE (TREE_OPERAND (exp, 1)) == INTEGER_CST) > + { > + widest_int msk = wi::to_widest (TREE_OPERAND (exp, 1)); > + msk = wi::zext (~msk, TYPE_PRECISION (TREE_TYPE (exp))); > + if (wi::popcount (msk) == 1 > + && wi::ltu_p (msk, prec - max)) > + { > + *mask |= wi::shifted_mask (msk.to_uhwi (), max, false, prec); > + max += msk.to_uhwi (); > + exp = TREE_OPERAND (exp, 0); > + if (integer_zerop (low) > + && TREE_CODE (exp) == PLUS_EXPR > + && TREE_CODE (TREE_OPERAND (exp, 1)) == INTEGER_CST) > + { > + widest_int bias > + = wi::neg (wi::sext (wi::to_widest (TREE_OPERAND (exp, 1)), > + TYPE_PRECISION (TREE_TYPE (low)))); > + tree tbias = wide_int_to_tree (TREE_TYPE (low), bias); > + if (totallowp) > + { > + *totallowp = tbias; > + exp = TREE_OPERAND (exp, 0); > + STRIP_NOPS (exp); > + return exp; > + } > + else if (!tree_int_cst_lt (totallow, tbias)) > + return NULL_TREE; > + bias -= wi::to_widest (totallow); > + if (wi::ges_p (bias, 0) && wi::lts_p (bias, prec - max)) > + { > + *mask = wi::lshift (*mask, bias); > + exp = TREE_OPERAND (exp, 0); > + STRIP_NOPS (exp); > + return exp; > + } > + } > + } > + } > + if (totallowp) > + return exp; > + if (!tree_int_cst_lt (totallow, low)) > + return exp; > + tem = int_const_binop (MINUS_EXPR, low, totallow); > + if (tem == NULL_TREE > + || TREE_CODE (tem) != INTEGER_CST > + || TREE_OVERFLOW (tem) > + || compare_tree_int (tem, prec - max) == 1) > + return NULL_TREE; > + > + *mask = 
wi::lshift (*mask, wi::to_widest (tem)); > + return exp; > +} > + > +/* Attempt to optimize small range tests using bit test. > + E.g. > + X != 43 && X != 76 && X != 44 && X != 78 && X != 49 > + && X != 77 && X != 46 && X != 75 && X != 45 && X != 82 > + has been by earlier optimizations optimized into: > + ((X - 43U) & ~32U) > 3U && X != 49 && X != 82 > + As all the 43 through 82 range is less than 64 numbers, > + for 64-bit word targets optimize that into: > + (X - 43U) > 40U && ((1 << (X - 43U)) & 0x8F0000004FULL) == 0 */ > + > +static bool > +optimize_range_tests_to_bit_test (enum tree_code opcode, int first, int length, > + vec<operand_entry_t> *ops, > + struct range_entry *ranges) > +{ > + int i, j; > + bool any_changes = false; > + int prec = GET_MODE_BITSIZE (word_mode); > + auto_vec<struct range_entry *, 64> candidates; > + > + for (i = first; i < length - 2; i++) > + { > + tree lowi, highi, lowj, highj, type; > + > + if (ranges[i].exp == NULL_TREE || ranges[i].in_p) > + continue; > + type = TREE_TYPE (ranges[i].exp); > + if (!INTEGRAL_TYPE_P (type)) > + continue; > + lowi = ranges[i].low; > + if (lowi == NULL_TREE) > + lowi = TYPE_MIN_VALUE (type); > + highi = ranges[i].high; > + if (highi == NULL_TREE) > + continue; > + wide_int mask; > + tree exp = extract_bit_test_mask (ranges[i].exp, prec, lowi, lowi, > + highi, &mask, &lowi); > + if (exp == NULL_TREE) > + continue; > + bool strict_overflow_p = ranges[i].strict_overflow_p; > + candidates.truncate (0); > + int end = MIN (i + 64, length); > + for (j = i + 1; j < end; j++) > + { > + tree exp2; > + if (ranges[j].exp == NULL_TREE || ranges[j].in_p) > + continue; > + if (ranges[j].exp == exp) > + ; > + else if (TREE_CODE (ranges[j].exp) == BIT_AND_EXPR) > + { > + exp2 = TREE_OPERAND (ranges[j].exp, 0); > + if (exp2 == exp) > + ; > + else if (TREE_CODE (exp2) == PLUS_EXPR) > + { > + exp2 = TREE_OPERAND (exp2, 0); > + STRIP_NOPS (exp2); > + if (exp2 != exp) > + continue; > + } > + else > + continue; > + } > + 
else > + continue; > + lowj = ranges[j].low; > + if (lowj == NULL_TREE) > + continue; > + highj = ranges[j].high; > + if (highj == NULL_TREE) > + highj = TYPE_MAX_VALUE (type); > + wide_int mask2; > + exp2 = extract_bit_test_mask (ranges[j].exp, prec, lowi, lowj, > + highj, &mask2, NULL); > + if (exp2 != exp) > + continue; > + mask |= mask2; > + strict_overflow_p |= ranges[j].strict_overflow_p; > + candidates.safe_push (&ranges[j]); > + } > + > + /* If we need otherwise 3 or more comparisons, use a bit test. */ > + if (candidates.length () >= 2) > + { > + tree high = wide_int_to_tree (TREE_TYPE (lowi), > + wi::to_widest (lowi) > + + prec - wi::clz (mask)); > + operand_entry_t oe = (*ops)[ranges[i].idx]; > + tree op = oe->op; > + gimple stmt = op ? SSA_NAME_DEF_STMT (op) > + : last_stmt (BASIC_BLOCK_FOR_FN (cfun, oe->id)); > + location_t loc = gimple_location (stmt); > + tree optype = op ? TREE_TYPE (op) : boolean_type_node; > + > + /* See if it isn't cheaper to pretend the minimum value of the > + range is 0, if maximum value is small enough. > + We can avoid then subtraction of the minimum value, but the > + mask constant could be perhaps more expensive. 
*/ > + if (compare_tree_int (lowi, 0) > 0 > + && compare_tree_int (high, prec) < 0) > + { > + int cost_diff; > + HOST_WIDE_INT m = tree_to_uhwi (lowi); > + rtx reg = gen_raw_REG (word_mode, 10000); > + bool speed_p = optimize_bb_for_speed_p (gimple_bb (stmt)); > + cost_diff = set_rtx_cost (gen_rtx_PLUS (word_mode, reg, > + GEN_INT (-m)), speed_p); > + rtx r = immed_wide_int_const (mask, word_mode); > + cost_diff += set_src_cost (gen_rtx_AND (word_mode, reg, r), > + speed_p); > + r = immed_wide_int_const (wi::lshift (mask, m), word_mode); > + cost_diff -= set_src_cost (gen_rtx_AND (word_mode, reg, r), > + speed_p); > + if (cost_diff > 0) > + { > + mask = wi::lshift (mask, m); > + lowi = build_zero_cst (TREE_TYPE (lowi)); > + } > + } > + > + tree tem = build_range_check (loc, optype, unshare_expr (exp), > + false, lowi, high); > + if (tem == NULL_TREE || is_gimple_val (tem)) > + continue; > + tree etype = unsigned_type_for (TREE_TYPE (exp)); > + exp = fold_build2_loc (loc, MINUS_EXPR, etype, > + fold_convert_loc (loc, etype, exp), > + fold_convert_loc (loc, etype, lowi)); > + exp = fold_convert_loc (loc, integer_type_node, exp); > + tree word_type = lang_hooks.types.type_for_mode (word_mode, 1); > + exp = fold_build2_loc (loc, LSHIFT_EXPR, word_type, > + build_int_cst (word_type, 1), exp); > + exp = fold_build2_loc (loc, BIT_AND_EXPR, word_type, exp, > + wide_int_to_tree (word_type, mask)); > + exp = fold_build2_loc (loc, EQ_EXPR, optype, exp, > + build_zero_cst (word_type)); > + if (is_gimple_val (exp)) > + continue; > + > + /* The shift might have undefined behavior if TEM is true, > + but reassociate_bb isn't prepared to have basic blocks > + split when it is running. So, temporarily emit a code > + with BIT_IOR_EXPR instead of &&, and fix it up in > + branch_fixup. 
*/ > + gimple_seq seq; > + tem = force_gimple_operand (tem, &seq, true, NULL_TREE); > + gcc_assert (TREE_CODE (tem) == SSA_NAME); > + gimple_set_visited (SSA_NAME_DEF_STMT (tem), true); > + gimple_seq seq2; > + exp = force_gimple_operand (exp, &seq2, true, NULL_TREE); > + gimple_seq_add_seq_without_update (&seq, seq2); > + gcc_assert (TREE_CODE (exp) == SSA_NAME); > + gimple_set_visited (SSA_NAME_DEF_STMT (exp), true); > + gimple g > + = gimple_build_assign_with_ops (BIT_IOR_EXPR, > + make_ssa_name (optype, NULL), > + tem, exp); > + gimple_set_location (g, loc); > + gimple_seq_add_stmt_without_update (&seq, g); > + exp = gimple_assign_lhs (g); > + tree val = build_zero_cst (optype); > + if (update_range_test (&ranges[i], NULL, candidates.address (), > + candidates.length (), opcode, ops, exp, > + seq, false, val, val, strict_overflow_p)) > + { > + any_changes = true; > + reassoc_branch_fixups.safe_push (tem); > + } > + else > + gimple_seq_discard (seq); > + } > + } > + return any_changes; > +} > + > /* Optimize range tests, similarly how fold_range_test optimizes > it on trees. The tree code for the binary > operation between all the operands is OPCODE. 
> @@ -2391,9 +2665,9 @@ optimize_range_tests (enum tree_code opc > if (j == i + 1) > continue; > > - if (update_range_test (ranges + i, ranges + i + 1, j - i - 1, opcode, > - ops, ranges[i].exp, in_p, low, high, > - strict_overflow_p)) > + if (update_range_test (ranges + i, ranges + i + 1, NULL, j - i - 1, > + opcode, ops, ranges[i].exp, NULL, in_p, > + low, high, strict_overflow_p)) > { > i = j - 1; > any_changes = true; > @@ -2412,6 +2686,9 @@ optimize_range_tests (enum tree_code opc > if (BRANCH_COST (optimize_function_for_speed_p (cfun), false) >= 2) > any_changes |= optimize_range_tests_1 (opcode, first, length, false, > ops, ranges); > + if (lshift_cheap_p (optimize_function_for_speed_p (cfun))) > + any_changes |= optimize_range_tests_to_bit_test (opcode, first, length, > + ops, ranges); > > if (any_changes && opcode != ERROR_MARK) > { > @@ -4581,6 +4858,82 @@ reassociate_bb (basic_block bb) > reassociate_bb (son); > } > > +/* Add jumps around shifts for range tests turned into bit tests. 
> + For each SSA_NAME VAR we have code like: > + VAR = ...; // final stmt of range comparison > + // bit test here...; > + OTHERVAR = ...; // final stmt of the bit test sequence > + RES = VAR | OTHERVAR; > + Turn the above into: > + VAR = ...; > + if (VAR != 0) > + goto <l3>; > + else > + goto <l2>; > + <l2>: > + // bit test here...; > + OTHERVAR = ...; > + <l3>: > + # RES = PHI<1(l1), OTHERVAR(l2)>; */ > + > +static void > +branch_fixup (void) > +{ > + tree var; > + unsigned int i; > + > + FOR_EACH_VEC_ELT (reassoc_branch_fixups, i, var) > + { > + gimple def_stmt = SSA_NAME_DEF_STMT (var); > + gimple use_stmt; > + use_operand_p use; > + bool ok = single_imm_use (var, &use, &use_stmt); > + gcc_assert (ok > + && is_gimple_assign (use_stmt) > + && gimple_assign_rhs_code (use_stmt) == BIT_IOR_EXPR > + && gimple_bb (def_stmt) == gimple_bb (use_stmt)); > + > + basic_block cond_bb = gimple_bb (def_stmt); > + basic_block then_bb = split_block (cond_bb, def_stmt)->dest; > + basic_block merge_bb = split_block (then_bb, use_stmt)->dest; > + > + gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt); > + gimple g = gimple_build_cond (NE_EXPR, var, > + build_zero_cst (TREE_TYPE (var)), > + NULL_TREE, NULL_TREE); > + location_t loc = gimple_location (use_stmt); > + gimple_set_location (g, loc); > + gsi_insert_after (&gsi, g, GSI_NEW_STMT); > + > + edge etrue = make_edge (cond_bb, merge_bb, EDGE_TRUE_VALUE); > + etrue->probability = REG_BR_PROB_BASE / 2; > + etrue->count = cond_bb->count / 2; > + edge efalse = find_edge (cond_bb, then_bb); > + efalse->flags = EDGE_FALSE_VALUE; > + efalse->probability -= etrue->probability; > + efalse->count -= etrue->count; > + then_bb->count -= etrue->count; > + > + tree othervar = NULL_TREE; > + if (gimple_assign_rhs1 (use_stmt) == var) > + othervar = gimple_assign_rhs2 (use_stmt); > + else if (gimple_assign_rhs2 (use_stmt) == var) > + othervar = gimple_assign_rhs1 (use_stmt); > + else > + gcc_unreachable (); > + tree lhs = gimple_assign_lhs 
(use_stmt); > + gimple phi = create_phi_node (lhs, merge_bb); > + add_phi_arg (phi, build_one_cst (TREE_TYPE (lhs)), etrue, loc); > + add_phi_arg (phi, othervar, single_succ_edge (then_bb), loc); > + gsi = gsi_for_stmt (use_stmt); > + gsi_remove (&gsi, true); > + > + set_immediate_dominator (CDI_DOMINATORS, merge_bb, cond_bb); > + set_immediate_dominator (CDI_POST_DOMINATORS, cond_bb, merge_bb); > + } > + reassoc_branch_fixups.release (); > +} > + > void dump_ops_vector (FILE *file, vec<operand_entry_t> ops); > void debug_ops_vector (vec<operand_entry_t> ops); > > @@ -4695,6 +5048,7 @@ execute_reassoc (void) > > do_reassoc (); > repropagate_negates (); > + branch_fixup (); > > fini_reassoc (); > return 0; > --- gcc/testsuite/gcc.dg/torture/pr63464.c.jj 2014-10-15 15:15:27.018725273 +0200 > +++ gcc/testsuite/gcc.dg/torture/pr63464.c 2014-10-15 15:14:58.000000000 +0200 > @@ -0,0 +1,92 @@ > +/* PR tree-optimization/63464 */ > +/* { dg-do run { target int32plus } } */ > + > +int cnt; > + > +__attribute__((noinline, noclone)) void > +bar (int x, int y) > +{ > + cnt++; > + switch (y) > + { > + case 1: > + if ((unsigned) x < 24U && ((1U << x) & 0x860c0cU) != 0) > + __builtin_abort (); > + break; > + case 2: > + if ((unsigned) x >= 24U || ((1U << x) & 0x860c0cU) == 0) > + __builtin_abort (); > + break; > + case 3: > + if ((unsigned) x - 43U < 40U && ((1ULL << (x - 43U)) & 0x8f0000004fULL) != 0) > + __builtin_abort (); > + break; > + case 4: > + if ((unsigned) x - 43U >= 40U || ((1ULL << (x - 43U)) & 0x8f0000004fULL) == 0) > + __builtin_abort (); > + break; > + default: > + __builtin_abort (); > + } > +} > + > +__attribute__((noinline, noclone)) void > +f1 (int x) > +{ > + if (x != 2 && x != 3 && x != 10 && x != 11 && x != 17 && x != 18 && x != 23) > + bar (x, 1); > +} > + > +__attribute__((noinline, noclone)) void > +f2 (int x) > +{ > + if (x == 2 || x == 3 || x == 10 || x == 11 || x == 17 || x == 18 || x == 23) > + bar (x, 2); > +} > + > +__attribute__((noinline, 
noclone)) void > +f3 (int x) > +{ > + if (x != 43 && x != 76 && x != 44 && x != 78 && x != 49 > + && x != 77 && x != 46 && x != 75 && x != 45 && x != 82) > + bar (x, 3); > +} > + > +__attribute__((noinline, noclone)) void > +f4 (int x) > +{ > + if (x == 43 || x == 76 || x == 44 || x == 78 || x == 49 > + || x == 77 || x == 46 || x == 75 || x == 45 || x == 82) > + bar (x, 4); > +} > + > +int > +main () > +{ > + int i; > + f1 (-__INT_MAX__ - 1); > + for (i = -3; i < 92; i++) > + f1 (i); > + f1 (__INT_MAX__); > + if (cnt != 97 - 7) > + __builtin_abort (); > + f2 (-__INT_MAX__ - 1); > + for (i = -3; i < 92; i++) > + f2 (i); > + f2 (__INT_MAX__); > + if (cnt != 97) > + __builtin_abort (); > + f3 (-__INT_MAX__ - 1); > + for (i = -3; i < 92; i++) > + f3 (i); > + f3 (__INT_MAX__); > + if (cnt != 97 * 2 - 10) > + __builtin_abort (); > + f4 (-__INT_MAX__ - 1); > + for (i = -3; i < 92; i++) > + f4 (i); > + f4 (__INT_MAX__); > + if (cnt != 97 * 2) > + __builtin_abort (); > + return 0; > +} > --- gcc/testsuite/gcc.dg/tree-ssa/reassoc-37.c.jj 2014-10-15 15:22:07.458048561 +0200 > +++ gcc/testsuite/gcc.dg/tree-ssa/reassoc-37.c 2014-10-15 15:29:16.849807253 +0200 > @@ -0,0 +1,17 @@ > +/* PR tree-optimization/63464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +void bar (void); > + > +void > +foo (int x) > +{ > + if (x != 2 && x != 3 && x != 10 && x != 11 && x != 17 && x != 18 && x != 23) > + bar (); > +} > + > +/* Check if the tests have been folded into a bit test. 
*/ > +/* { dg-final { scan-tree-dump "(8784908|0x0*860c0c)" "optimized" { target i?86-*-* x86_64-*-* } } } */ > +/* { dg-final { scan-tree-dump "(<<|>>)" "optimized" { target i?86-*-* x86_64-*-* } } } */ > +/* { dg-final { cleanup-tree-dump "optimized" } } */ > --- gcc/testsuite/gcc.dg/tree-ssa/reassoc-38.c.jj 2014-10-15 15:28:04.133216268 +0200 > +++ gcc/testsuite/gcc.dg/tree-ssa/reassoc-38.c 2014-10-15 16:01:47.100534040 +0200 > @@ -0,0 +1,18 @@ > +/* PR tree-optimization/63464 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +void bar (void); > + > +void > +foo (int x) > +{ > + if (x == 43 || x == 76 || x == 44 || x == 78 || x == 49 > + || x == 77 || x == 46 || x == 75 || x == 45 || x == 82) > + bar (); > +} > + > +/* Check if the tests have been folded into a bit test. */ > +/* { dg-final { scan-tree-dump "(614180323407|0x0*8f0000004f)" "optimized" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 } } } } } } */ > +/* { dg-final { scan-tree-dump "(<<|>>)" "optimized" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 } } } } } } */ > +/* { dg-final { cleanup-tree-dump "optimized" } } */ > > Jakub > >
diff + xor optimizations are performed) are needed and range of values is at most number of bits in a word, we instead check whether the operand is >= smallest and <= highest number in the range and if it is, test (word) 1 << operand against a bitmask. Bootstrapped/regtested on x86_64-linux and i686-linux (for i686-linux I had to go back to r216304 because there are multiple ICF related bootstrap issues on i686-linux), ok for trunk? 2014-10-16 Jakub Jelinek <jakub@redhat.com> PR tree-optimization/63464 * gimple.h (gimple_seq_discard): New prototype. * gimple.c: Include stringpool.h and tree-ssanames.h. (gimple_seq_discard): New function. * optabs.h (lshift_cheap_p): New prototype. * optabs.c (lshift_cheap_p): New function, moved from... * tree-switch-conversion.c (lshift_cheap_p): ... here. * tree-ssa-reassoc.c: Include gimplify.h and optabs.h. (reassoc_branch_fixups): New variable. (update_range_test): Add otherrangep and seq arguments. Unshare exp. If otherrange is NULL, use for other ranges array of pointers pointed by otherrangep instead. Emit seq before gimplified statements for tem. (optimize_range_tests_diff): Adjust update_range_test caller. (optimize_range_tests_xor): Likewise. Fix up comment. (extract_bit_test_mask, optimize_range_tests_to_bit_test): New functions. (optimize_range_tests): Adjust update_range_test caller. Call optimize_range_tests_to_bit_test. (branch_fixup): New function. (execute_reassoc): Call branch_fixup. * gcc.dg/torture/pr63464.c: New test. * gcc.dg/tree-ssa/reassoc-37.c: New test. * gcc.dg/tree-ssa/reassoc-38.c: New test. 
--- gcc/gimple.h.jj 2014-10-15 12:28:06.428498079 +0200 +++ gcc/gimple.h 2014-10-15 13:43:18.967491428 +0200 @@ -1269,9 +1269,10 @@ extern bool gimple_asm_clobbers_memory_p extern void dump_decl_set (FILE *, bitmap); extern bool nonfreeing_call_p (gimple); extern bool infer_nonnull_range (gimple, tree, bool, bool); -extern void sort_case_labels (vec<tree> ); -extern void preprocess_case_label_vec_for_gimple (vec<tree> , tree, tree *); -extern void gimple_seq_set_location (gimple_seq , location_t); +extern void sort_case_labels (vec<tree>); +extern void preprocess_case_label_vec_for_gimple (vec<tree>, tree, tree *); +extern void gimple_seq_set_location (gimple_seq, location_t); +extern void gimple_seq_discard (gimple_seq); /* Formal (expression) temporary table handling: multiple occurrences of the same scalar expression are evaluated into the same temporary. */ --- gcc/gimple.c.jj 2014-10-15 12:28:19.917235900 +0200 +++ gcc/gimple.c 2014-10-15 13:43:18.970491368 +0200 @@ -47,6 +47,8 @@ along with GCC; see the file COPYING3. #include "demangle.h" #include "langhooks.h" #include "bitmap.h" +#include "stringpool.h" +#include "tree-ssanames.h" /* All the tuples have their operand vector (if present) at the very bottom @@ -2826,3 +2828,19 @@ gimple_seq_set_location (gimple_seq seq, for (gimple_stmt_iterator i = gsi_start (seq); !gsi_end_p (i); gsi_next (&i)) gimple_set_location (gsi_stmt (i), loc); } + +/* Release SSA_NAMEs in SEQ as well as the GIMPLE statements. 
*/ + +void +gimple_seq_discard (gimple_seq seq) +{ + gimple_stmt_iterator gsi; + + for (gsi = gsi_start (seq); !gsi_end_p (gsi); ) + { + gimple stmt = gsi_stmt (gsi); + gsi_remove (&gsi, true); + release_defs (stmt); + ggc_free (stmt); + } +} --- gcc/optabs.h.jj 2014-10-15 12:28:06.479497088 +0200 +++ gcc/optabs.h 2014-10-15 13:43:18.970491368 +0200 @@ -538,5 +538,6 @@ extern void gen_satfractuns_conv_libfunc enum machine_mode, enum machine_mode); extern void init_tree_optimization_optabs (tree); +extern bool lshift_cheap_p (bool); #endif /* GCC_OPTABS_H */ --- gcc/optabs.c.jj 2014-10-15 12:28:06.433497982 +0200 +++ gcc/optabs.c 2014-10-15 13:43:18.969491387 +0200 @@ -8624,4 +8624,31 @@ get_best_mem_extraction_insn (extraction struct_bits, field_mode); } +/* Determine whether "1 << x" is relatively cheap in word_mode. */ + +bool +lshift_cheap_p (bool speed_p) +{ + /* FIXME: This should be made target dependent via this "this_target" + mechanism, similar to e.g. can_copy_init_p in gcse.c. */ + static bool init[2] = { false, false }; + static bool cheap[2] = { true, true }; + + /* If the targer has no lshift in word_mode, the operation will most + probably not be cheap. ??? Does GCC even work for such targets? */ + if (optab_handler (ashl_optab, word_mode) == CODE_FOR_nothing) + return false; + + if (!init[speed_p]) + { + rtx reg = gen_raw_REG (word_mode, 10000); + int cost = set_src_cost (gen_rtx_ASHIFT (word_mode, const1_rtx, reg), + speed_p); + cheap[speed_p] = cost < COSTS_N_INSNS (3); + init[speed_p] = true; + } + + return cheap[speed_p]; +} + #include "gt-optabs.h" --- gcc/tree-switch-conversion.c.jj 2014-10-15 12:28:06.386498896 +0200 +++ gcc/tree-switch-conversion.c 2014-10-15 13:43:18.963491510 +0200 @@ -125,35 +125,6 @@ hoist_edge_and_branch_if_true (gimple_st } -/* Determine whether "1 << x" is relatively cheap in word_mode. */ -/* FIXME: This is the function that we need rtl.h and optabs.h for. - This function (and similar RTL-related cost code in e.g. 
IVOPTS) should - be moved to some kind of interface file for GIMPLE/RTL interactions. */ -static bool -lshift_cheap_p (bool speed_p) -{ - /* FIXME: This should be made target dependent via this "this_target" - mechanism, similar to e.g. can_copy_init_p in gcse.c. */ - static bool init[2] = {false, false}; - static bool cheap[2] = {true, true}; - - /* If the targer has no lshift in word_mode, the operation will most - probably not be cheap. ??? Does GCC even work for such targets? */ - if (optab_handler (ashl_optab, word_mode) == CODE_FOR_nothing) - return false; - - if (!init[speed_p]) - { - rtx reg = gen_raw_REG (word_mode, 10000); - int cost = set_src_cost (gen_rtx_ASHIFT (word_mode, const1_rtx, reg), - speed_p); - cheap[speed_p] = cost < COSTS_N_INSNS (MAX_CASE_BIT_TESTS); - init[speed_p] = true; - } - - return cheap[speed_p]; -} - /* Return true if a switch should be expanded as a bit test. RANGE is the difference between highest and lowest case. UNIQ is number of unique case node targets, not counting the default case. --- gcc/tree-ssa-reassoc.c.jj 2014-10-15 13:41:45.145301213 +0200 +++ gcc/tree-ssa-reassoc.c 2014-10-16 16:44:58.776901531 +0200 @@ -61,6 +61,8 @@ along with GCC; see the file COPYING3. #include "params.h" #include "diagnostic-core.h" #include "builtins.h" +#include "gimplify.h" +#include "optabs.h" /* This is a simple global reassociation pass. It is, in part, based on the LLVM pass of the same name (They do some things more/less @@ -218,6 +220,12 @@ static long *bb_rank; /* Operand->rank hashtable. */ static hash_map<tree, long> *operand_rank; +/* Vector of SSA_NAMEs on which after reassociate_bb is done with + all basic blocks the CFG should be adjusted - basic blocks + split right after that SSA_NAME's definition statement and before + the only use, which must be a bit ior. */ +static vec<tree> reassoc_branch_fixups; + /* Forward decls. 
*/ static long get_rank (tree); static bool reassoc_stmt_dominates_stmt_p (gimple, gimple); @@ -2071,7 +2079,9 @@ range_entry_cmp (const void *a, const vo /* Helper routine of optimize_range_test. [EXP, IN_P, LOW, HIGH, STRICT_OVERFLOW_P] is a merged range for RANGE and OTHERRANGE through OTHERRANGE + COUNT - 1 ranges, - OPCODE and OPS are arguments of optimize_range_tests. Return + OPCODE and OPS are arguments of optimize_range_tests. If OTHERRANGE + is NULL, OTHERRANGEP should not be and then OTHERRANGEP points to + an array of COUNT pointers to other ranges. Return true if the range merge has been successful. If OPCODE is ERROR_MARK, this is called from within maybe_optimize_range_tests and is performing inter-bb range optimization. @@ -2080,9 +2090,10 @@ range_entry_cmp (const void *a, const vo static bool update_range_test (struct range_entry *range, struct range_entry *otherrange, + struct range_entry **otherrangep, unsigned int count, enum tree_code opcode, - vec<operand_entry_t> *ops, tree exp, bool in_p, - tree low, tree high, bool strict_overflow_p) + vec<operand_entry_t> *ops, tree exp, gimple_seq seq, + bool in_p, tree low, tree high, bool strict_overflow_p) { operand_entry_t oe = (*ops)[range->idx]; tree op = oe->op; @@ -2090,9 +2101,11 @@ update_range_test (struct range_entry *r last_stmt (BASIC_BLOCK_FOR_FN (cfun, oe->id)); location_t loc = gimple_location (stmt); tree optype = op ? 
TREE_TYPE (op) : boolean_type_node; - tree tem = build_range_check (loc, optype, exp, in_p, low, high); + tree tem = build_range_check (loc, optype, unshare_expr (exp), + in_p, low, high); enum warn_strict_overflow_code wc = WARN_STRICT_OVERFLOW_COMPARISON; gimple_stmt_iterator gsi; + unsigned int i; if (tem == NULL_TREE) return false; @@ -2112,8 +2125,12 @@ update_range_test (struct range_entry *r fprintf (dump_file, ", "); print_generic_expr (dump_file, range->high, 0); fprintf (dump_file, "]"); - for (r = otherrange; r < otherrange + count; r++) + for (i = 0; i < count; i++) { + if (otherrange) + r = otherrange + i; + else + r = otherrangep[i]; fprintf (dump_file, " and %c[", r->in_p ? '+' : '-'); print_generic_expr (dump_file, r->low, 0); fprintf (dump_file, ", "); @@ -2135,10 +2152,14 @@ update_range_test (struct range_entry *r In that case we have to insert after the stmt rather then before it. */ if (op == range->exp) - tem = force_gimple_operand_gsi (&gsi, tem, true, NULL_TREE, false, - GSI_CONTINUE_LINKING); + { + gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING); + tem = force_gimple_operand_gsi (&gsi, tem, true, NULL_TREE, false, + GSI_CONTINUE_LINKING); + } else { + gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); tem = force_gimple_operand_gsi (&gsi, tem, true, NULL_TREE, true, GSI_SAME_STMT); gsi_prev (&gsi); @@ -2156,8 +2177,12 @@ update_range_test (struct range_entry *r range->in_p = in_p; range->strict_overflow_p = false; - for (range = otherrange; range < otherrange + count; range++) + for (i = 0; i < count; i++) { + if (otherrange) + range = otherrange + i; + else + range = otherrangep[i]; oe = (*ops)[range->idx]; /* Now change all the other range test immediate uses, so that those tests will be optimized away. */ @@ -2195,8 +2220,8 @@ optimize_range_tests_xor (enum tree_code struct range_entry *rangej) { tree lowxor, highxor, tem, exp; - /* Check highi ^ lowi == highj ^ lowj and - popcount (highi ^ lowi) == 1. 
*/ + /* Check lowi ^ lowj == highi ^ highj and + popcount (lowi ^ lowj) == 1. */ lowxor = fold_binary (BIT_XOR_EXPR, type, lowi, lowj); if (lowxor == NULL_TREE || TREE_CODE (lowxor) != INTEGER_CST) return false; @@ -2210,8 +2235,8 @@ optimize_range_tests_xor (enum tree_code exp = fold_build2 (BIT_AND_EXPR, type, rangei->exp, tem); lowj = fold_build2 (BIT_AND_EXPR, type, lowi, tem); highj = fold_build2 (BIT_AND_EXPR, type, highi, tem); - if (update_range_test (rangei, rangej, 1, opcode, ops, exp, - rangei->in_p, lowj, highj, + if (update_range_test (rangei, rangej, NULL, 1, opcode, ops, exp, + NULL, rangei->in_p, lowj, highj, rangei->strict_overflow_p || rangej->strict_overflow_p)) return true; @@ -2259,8 +2284,8 @@ optimize_range_tests_diff (enum tree_cod fold_convert (type, rangei->exp), lowi); tem1 = fold_build2 (BIT_AND_EXPR, type, tem1, mask); lowj = build_int_cst (type, 0); - if (update_range_test (rangei, rangej, 1, opcode, ops, tem1, - rangei->in_p, lowj, tem2, + if (update_range_test (rangei, rangej, NULL, 1, opcode, ops, tem1, + NULL, rangei->in_p, lowj, tem2, rangei->strict_overflow_p || rangej->strict_overflow_p)) return true; @@ -2328,6 +2353,255 @@ optimize_range_tests_1 (enum tree_code o return any_changes; } +/* Helper function of optimize_range_tests_to_bit_test. Handle a single + range, EXP, LOW, HIGH, compute bit mask of bits to test and return + EXP on success, NULL otherwise. 
*/ + +static tree +extract_bit_test_mask (tree exp, int prec, tree totallow, tree low, tree high, + wide_int *mask, tree *totallowp) +{ + tree tem = int_const_binop (MINUS_EXPR, high, low); + if (tem == NULL_TREE + || TREE_CODE (tem) != INTEGER_CST + || TREE_OVERFLOW (tem) + || tree_int_cst_sgn (tem) == -1 + || compare_tree_int (tem, prec) != -1) + return NULL_TREE; + + unsigned HOST_WIDE_INT max = tree_to_uhwi (tem) + 1; + *mask = wi::shifted_mask (0, max, false, prec); + if (TREE_CODE (exp) == BIT_AND_EXPR + && TREE_CODE (TREE_OPERAND (exp, 1)) == INTEGER_CST) + { + widest_int msk = wi::to_widest (TREE_OPERAND (exp, 1)); + msk = wi::zext (~msk, TYPE_PRECISION (TREE_TYPE (exp))); + if (wi::popcount (msk) == 1 + && wi::ltu_p (msk, prec - max)) + { + *mask |= wi::shifted_mask (msk.to_uhwi (), max, false, prec); + max += msk.to_uhwi (); + exp = TREE_OPERAND (exp, 0); + if (integer_zerop (low) + && TREE_CODE (exp) == PLUS_EXPR + && TREE_CODE (TREE_OPERAND (exp, 1)) == INTEGER_CST) + { + widest_int bias + = wi::neg (wi::sext (wi::to_widest (TREE_OPERAND (exp, 1)), + TYPE_PRECISION (TREE_TYPE (low)))); + tree tbias = wide_int_to_tree (TREE_TYPE (low), bias); + if (totallowp) + { + *totallowp = tbias; + exp = TREE_OPERAND (exp, 0); + STRIP_NOPS (exp); + return exp; + } + else if (!tree_int_cst_lt (totallow, tbias)) + return NULL_TREE; + bias -= wi::to_widest (totallow); + if (wi::ges_p (bias, 0) && wi::lts_p (bias, prec - max)) + { + *mask = wi::lshift (*mask, bias); + exp = TREE_OPERAND (exp, 0); + STRIP_NOPS (exp); + return exp; + } + } + } + } + if (totallowp) + return exp; + if (!tree_int_cst_lt (totallow, low)) + return exp; + tem = int_const_binop (MINUS_EXPR, low, totallow); + if (tem == NULL_TREE + || TREE_CODE (tem) != INTEGER_CST + || TREE_OVERFLOW (tem) + || compare_tree_int (tem, prec - max) == 1) + return NULL_TREE; + + *mask = wi::lshift (*mask, wi::to_widest (tem)); + return exp; +} + +/* Attempt to optimize small range tests using bit test. + E.g. 
+ X != 43 && X != 76 && X != 44 && X != 78 && X != 49 + && X != 77 && X != 46 && X != 75 && X != 45 && X != 82 + has been by earlier optimizations optimized into: + ((X - 43U) & ~32U) > 3U && X != 49 && X != 82 + As all the 43 through 82 range is less than 64 numbers, + for 64-bit word targets optimize that into: + (X - 43U) > 40U && ((1 << (X - 43U)) & 0x8F0000004FULL) == 0 */ + +static bool +optimize_range_tests_to_bit_test (enum tree_code opcode, int first, int length, + vec<operand_entry_t> *ops, + struct range_entry *ranges) +{ + int i, j; + bool any_changes = false; + int prec = GET_MODE_BITSIZE (word_mode); + auto_vec<struct range_entry *, 64> candidates; + + for (i = first; i < length - 2; i++) + { + tree lowi, highi, lowj, highj, type; + + if (ranges[i].exp == NULL_TREE || ranges[i].in_p) + continue; + type = TREE_TYPE (ranges[i].exp); + if (!INTEGRAL_TYPE_P (type)) + continue; + lowi = ranges[i].low; + if (lowi == NULL_TREE) + lowi = TYPE_MIN_VALUE (type); + highi = ranges[i].high; + if (highi == NULL_TREE) + continue; + wide_int mask; + tree exp = extract_bit_test_mask (ranges[i].exp, prec, lowi, lowi, + highi, &mask, &lowi); + if (exp == NULL_TREE) + continue; + bool strict_overflow_p = ranges[i].strict_overflow_p; + candidates.truncate (0); + int end = MIN (i + 64, length); + for (j = i + 1; j < end; j++) + { + tree exp2; + if (ranges[j].exp == NULL_TREE || ranges[j].in_p) + continue; + if (ranges[j].exp == exp) + ; + else if (TREE_CODE (ranges[j].exp) == BIT_AND_EXPR) + { + exp2 = TREE_OPERAND (ranges[j].exp, 0); + if (exp2 == exp) + ; + else if (TREE_CODE (exp2) == PLUS_EXPR) + { + exp2 = TREE_OPERAND (exp2, 0); + STRIP_NOPS (exp2); + if (exp2 != exp) + continue; + } + else + continue; + } + else + continue; + lowj = ranges[j].low; + if (lowj == NULL_TREE) + continue; + highj = ranges[j].high; + if (highj == NULL_TREE) + highj = TYPE_MAX_VALUE (type); + wide_int mask2; + exp2 = extract_bit_test_mask (ranges[j].exp, prec, lowi, lowj, + highj, &mask2, 
NULL); + if (exp2 != exp) + continue; + mask |= mask2; + strict_overflow_p |= ranges[j].strict_overflow_p; + candidates.safe_push (&ranges[j]); + } + + /* If we need otherwise 3 or more comparisons, use a bit test. */ + if (candidates.length () >= 2) + { + tree high = wide_int_to_tree (TREE_TYPE (lowi), + wi::to_widest (lowi) + + prec - wi::clz (mask)); + operand_entry_t oe = (*ops)[ranges[i].idx]; + tree op = oe->op; + gimple stmt = op ? SSA_NAME_DEF_STMT (op) + : last_stmt (BASIC_BLOCK_FOR_FN (cfun, oe->id)); + location_t loc = gimple_location (stmt); + tree optype = op ? TREE_TYPE (op) : boolean_type_node; + + /* See if it isn't cheaper to pretend the minimum value of the + range is 0, if maximum value is small enough. + We can avoid then subtraction of the minimum value, but the + mask constant could be perhaps more expensive. */ + if (compare_tree_int (lowi, 0) > 0 + && compare_tree_int (high, prec) < 0) + { + int cost_diff; + HOST_WIDE_INT m = tree_to_uhwi (lowi); + rtx reg = gen_raw_REG (word_mode, 10000); + bool speed_p = optimize_bb_for_speed_p (gimple_bb (stmt)); + cost_diff = set_rtx_cost (gen_rtx_PLUS (word_mode, reg, + GEN_INT (-m)), speed_p); + rtx r = immed_wide_int_const (mask, word_mode); + cost_diff += set_src_cost (gen_rtx_AND (word_mode, reg, r), + speed_p); + r = immed_wide_int_const (wi::lshift (mask, m), word_mode); + cost_diff -= set_src_cost (gen_rtx_AND (word_mode, reg, r), + speed_p); + if (cost_diff > 0) + { + mask = wi::lshift (mask, m); + lowi = build_zero_cst (TREE_TYPE (lowi)); + } + } + + tree tem = build_range_check (loc, optype, unshare_expr (exp), + false, lowi, high); + if (tem == NULL_TREE || is_gimple_val (tem)) + continue; + tree etype = unsigned_type_for (TREE_TYPE (exp)); + exp = fold_build2_loc (loc, MINUS_EXPR, etype, + fold_convert_loc (loc, etype, exp), + fold_convert_loc (loc, etype, lowi)); + exp = fold_convert_loc (loc, integer_type_node, exp); + tree word_type = lang_hooks.types.type_for_mode (word_mode, 1); + exp = 
fold_build2_loc (loc, LSHIFT_EXPR, word_type, + build_int_cst (word_type, 1), exp); + exp = fold_build2_loc (loc, BIT_AND_EXPR, word_type, exp, + wide_int_to_tree (word_type, mask)); + exp = fold_build2_loc (loc, EQ_EXPR, optype, exp, + build_zero_cst (word_type)); + if (is_gimple_val (exp)) + continue; + + /* The shift might have undefined behavior if TEM is true, + but reassociate_bb isn't prepared to have basic blocks + split when it is running. So, temporarily emit a code + with BIT_IOR_EXPR instead of &&, and fix it up in + branch_fixup. */ + gimple_seq seq; + tem = force_gimple_operand (tem, &seq, true, NULL_TREE); + gcc_assert (TREE_CODE (tem) == SSA_NAME); + gimple_set_visited (SSA_NAME_DEF_STMT (tem), true); + gimple_seq seq2; + exp = force_gimple_operand (exp, &seq2, true, NULL_TREE); + gimple_seq_add_seq_without_update (&seq, seq2); + gcc_assert (TREE_CODE (exp) == SSA_NAME); + gimple_set_visited (SSA_NAME_DEF_STMT (exp), true); + gimple g + = gimple_build_assign_with_ops (BIT_IOR_EXPR, + make_ssa_name (optype, NULL), + tem, exp); + gimple_set_location (g, loc); + gimple_seq_add_stmt_without_update (&seq, g); + exp = gimple_assign_lhs (g); + tree val = build_zero_cst (optype); + if (update_range_test (&ranges[i], NULL, candidates.address (), + candidates.length (), opcode, ops, exp, + seq, false, val, val, strict_overflow_p)) + { + any_changes = true; + reassoc_branch_fixups.safe_push (tem); + } + else + gimple_seq_discard (seq); + } + } + return any_changes; +} + /* Optimize range tests, similarly how fold_range_test optimizes it on trees. The tree code for the binary operation between all the operands is OPCODE. 
@@ -2391,9 +2665,9 @@ optimize_range_tests (enum tree_code opc if (j == i + 1) continue; - if (update_range_test (ranges + i, ranges + i + 1, j - i - 1, opcode, - ops, ranges[i].exp, in_p, low, high, - strict_overflow_p)) + if (update_range_test (ranges + i, ranges + i + 1, NULL, j - i - 1, + opcode, ops, ranges[i].exp, NULL, in_p, + low, high, strict_overflow_p)) { i = j - 1; any_changes = true; @@ -2412,6 +2686,9 @@ optimize_range_tests (enum tree_code opc if (BRANCH_COST (optimize_function_for_speed_p (cfun), false) >= 2) any_changes |= optimize_range_tests_1 (opcode, first, length, false, ops, ranges); + if (lshift_cheap_p (optimize_function_for_speed_p (cfun))) + any_changes |= optimize_range_tests_to_bit_test (opcode, first, length, + ops, ranges); if (any_changes && opcode != ERROR_MARK) { @@ -4581,6 +4858,82 @@ reassociate_bb (basic_block bb) reassociate_bb (son); } +/* Add jumps around shifts for range tests turned into bit tests. + For each SSA_NAME VAR we have code like: + VAR = ...; // final stmt of range comparison + // bit test here...; + OTHERVAR = ...; // final stmt of the bit test sequence + RES = VAR | OTHERVAR; + Turn the above into: + VAR = ...; + if (VAR != 0) + goto <l3>; + else + goto <l2>; + <l2>: + // bit test here...; + OTHERVAR = ...; + <l3>: + # RES = PHI<1(l1), OTHERVAR(l2)>; */ + +static void +branch_fixup (void) +{ + tree var; + unsigned int i; + + FOR_EACH_VEC_ELT (reassoc_branch_fixups, i, var) + { + gimple def_stmt = SSA_NAME_DEF_STMT (var); + gimple use_stmt; + use_operand_p use; + bool ok = single_imm_use (var, &use, &use_stmt); + gcc_assert (ok + && is_gimple_assign (use_stmt) + && gimple_assign_rhs_code (use_stmt) == BIT_IOR_EXPR + && gimple_bb (def_stmt) == gimple_bb (use_stmt)); + + basic_block cond_bb = gimple_bb (def_stmt); + basic_block then_bb = split_block (cond_bb, def_stmt)->dest; + basic_block merge_bb = split_block (then_bb, use_stmt)->dest; + + gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt); + gimple g = 
gimple_build_cond (NE_EXPR, var, + build_zero_cst (TREE_TYPE (var)), + NULL_TREE, NULL_TREE); + location_t loc = gimple_location (use_stmt); + gimple_set_location (g, loc); + gsi_insert_after (&gsi, g, GSI_NEW_STMT); + + edge etrue = make_edge (cond_bb, merge_bb, EDGE_TRUE_VALUE); + etrue->probability = REG_BR_PROB_BASE / 2; + etrue->count = cond_bb->count / 2; + edge efalse = find_edge (cond_bb, then_bb); + efalse->flags = EDGE_FALSE_VALUE; + efalse->probability -= etrue->probability; + efalse->count -= etrue->count; + then_bb->count -= etrue->count; + + tree othervar = NULL_TREE; + if (gimple_assign_rhs1 (use_stmt) == var) + othervar = gimple_assign_rhs2 (use_stmt); + else if (gimple_assign_rhs2 (use_stmt) == var) + othervar = gimple_assign_rhs1 (use_stmt); + else + gcc_unreachable (); + tree lhs = gimple_assign_lhs (use_stmt); + gimple phi = create_phi_node (lhs, merge_bb); + add_phi_arg (phi, build_one_cst (TREE_TYPE (lhs)), etrue, loc); + add_phi_arg (phi, othervar, single_succ_edge (then_bb), loc); + gsi = gsi_for_stmt (use_stmt); + gsi_remove (&gsi, true); + + set_immediate_dominator (CDI_DOMINATORS, merge_bb, cond_bb); + set_immediate_dominator (CDI_POST_DOMINATORS, cond_bb, merge_bb); + } + reassoc_branch_fixups.release (); +} + void dump_ops_vector (FILE *file, vec<operand_entry_t> ops); void debug_ops_vector (vec<operand_entry_t> ops); @@ -4695,6 +5048,7 @@ execute_reassoc (void) do_reassoc (); repropagate_negates (); + branch_fixup (); fini_reassoc (); return 0; --- gcc/testsuite/gcc.dg/torture/pr63464.c.jj 2014-10-15 15:15:27.018725273 +0200 +++ gcc/testsuite/gcc.dg/torture/pr63464.c 2014-10-15 15:14:58.000000000 +0200 @@ -0,0 +1,92 @@ +/* PR tree-optimization/63464 */ +/* { dg-do run { target int32plus } } */ + +int cnt; + +__attribute__((noinline, noclone)) void +bar (int x, int y) +{ + cnt++; + switch (y) + { + case 1: + if ((unsigned) x < 24U && ((1U << x) & 0x860c0cU) != 0) + __builtin_abort (); + break; + case 2: + if ((unsigned) x >= 24U || ((1U 
<< x) & 0x860c0cU) == 0) + __builtin_abort (); + break; + case 3: + if ((unsigned) x - 43U < 40U && ((1ULL << (x - 43U)) & 0x8f0000004fULL) != 0) + __builtin_abort (); + break; + case 4: + if ((unsigned) x - 43U >= 40U || ((1ULL << (x - 43U)) & 0x8f0000004fULL) == 0) + __builtin_abort (); + break; + default: + __builtin_abort (); + } +} + +__attribute__((noinline, noclone)) void +f1 (int x) +{ + if (x != 2 && x != 3 && x != 10 && x != 11 && x != 17 && x != 18 && x != 23) + bar (x, 1); +} + +__attribute__((noinline, noclone)) void +f2 (int x) +{ + if (x == 2 || x == 3 || x == 10 || x == 11 || x == 17 || x == 18 || x == 23) + bar (x, 2); +} + +__attribute__((noinline, noclone)) void +f3 (int x) +{ + if (x != 43 && x != 76 && x != 44 && x != 78 && x != 49 + && x != 77 && x != 46 && x != 75 && x != 45 && x != 82) + bar (x, 3); +} + +__attribute__((noinline, noclone)) void +f4 (int x) +{ + if (x == 43 || x == 76 || x == 44 || x == 78 || x == 49 + || x == 77 || x == 46 || x == 75 || x == 45 || x == 82) + bar (x, 4); +} + +int +main () +{ + int i; + f1 (-__INT_MAX__ - 1); + for (i = -3; i < 92; i++) + f1 (i); + f1 (__INT_MAX__); + if (cnt != 97 - 7) + __builtin_abort (); + f2 (-__INT_MAX__ - 1); + for (i = -3; i < 92; i++) + f2 (i); + f2 (__INT_MAX__); + if (cnt != 97) + __builtin_abort (); + f3 (-__INT_MAX__ - 1); + for (i = -3; i < 92; i++) + f3 (i); + f3 (__INT_MAX__); + if (cnt != 97 * 2 - 10) + __builtin_abort (); + f4 (-__INT_MAX__ - 1); + for (i = -3; i < 92; i++) + f4 (i); + f4 (__INT_MAX__); + if (cnt != 97 * 2) + __builtin_abort (); + return 0; +} --- gcc/testsuite/gcc.dg/tree-ssa/reassoc-37.c.jj 2014-10-15 15:22:07.458048561 +0200 +++ gcc/testsuite/gcc.dg/tree-ssa/reassoc-37.c 2014-10-15 15:29:16.849807253 +0200 @@ -0,0 +1,17 @@ +/* PR tree-optimization/63464 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +void bar (void); + +void +foo (int x) +{ + if (x != 2 && x != 3 && x != 10 && x != 11 && x != 17 && x != 18 && x != 23) + 
bar (); +} + +/* Check if the tests have been folded into a bit test. */ +/* { dg-final { scan-tree-dump "(8784908|0x0*860c0c)" "optimized" { target i?86-*-* x86_64-*-* } } } */ +/* { dg-final { scan-tree-dump "(<<|>>)" "optimized" { target i?86-*-* x86_64-*-* } } } */ +/* { dg-final { cleanup-tree-dump "optimized" } } */ --- gcc/testsuite/gcc.dg/tree-ssa/reassoc-38.c.jj 2014-10-15 15:28:04.133216268 +0200 +++ gcc/testsuite/gcc.dg/tree-ssa/reassoc-38.c 2014-10-15 16:01:47.100534040 +0200 @@ -0,0 +1,18 @@ +/* PR tree-optimization/63464 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +void bar (void); + +void +foo (int x) +{ + if (x == 43 || x == 76 || x == 44 || x == 78 || x == 49 + || x == 77 || x == 46 || x == 75 || x == 45 || x == 82) + bar (); +} + +/* Check if the tests have been folded into a bit test. */ +/* { dg-final { scan-tree-dump "(614180323407|0x0*8f0000004f)" "optimized" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 } } } } } } */ +/* { dg-final { scan-tree-dump "(<<|>>)" "optimized" { target { { i?86-*-* x86_64-*-* } && { ! { ia32 } } } } } } */ +/* { dg-final { cleanup-tree-dump "optimized" } } */