[ARM] Thumb2 constant loading optimization

Message ID: 4CFF8170.6020403@codesourcery.com
State: New

Commit Message

Andrew Stubbs Dec. 8, 2010, 1 p.m. UTC
Here is a patch I'd like reviewed for mainline GCC post 4.6. I don't 
think it's suitable for stage 3.

At present, the support for constant loading via immediate operands (as 
opposed to constant pools) is not well tuned for Thumb2. There are a few 
separate issues:

    * 8-bit immediates can have arbitrary shifts applied, but are 
currently limited to even shift offsets. (This appears to be a bug 
rather than deliberate: half the support was present, but the other 
half was missing.)

    * Addw / subw support is completely missing.

    * Replicated constants are recognised, but constant splitting never 
takes advantage of them.

    * Constants that can be inverted/negated are identified only by very 
crude heuristics, which are sometimes harmful even in the existing code, 
and not at all suited to replicated constants.

My patch addresses all of these issues. Here are some before and after 
examples of generated code:

Example 1: subw

    a - 0xfff

    Before:
          sub     r0, r0, #4064 ; 0xfe0
          subs    r0, r0, #31   ; 0x01f
    After:
          subw    r0, r0, #4095 ; 0xfff

Example 2: addw

    a + 0xfffff

    Before:
          movw    r3, #65535       ; 0x0ffff
          movt    r3, 15           ; 0xf0000
          adds    r3, r0, r3
    After:
          add     r0, r0, #1044480 ; 0xff000
          addw    r0, r0, #4095    ; 0x00fff

Example 3: arbitrary shifts bug fix

    a - 0xfff1

    Before:
          sub     r0, r0, #65024   ; 0xfe00
          sub     r0, r0, #496     ; 0x01f0
          sub     r0, r0, #1       ; 0x0001
    After:
          sub     r0, r0, #65280   ; 0xff00
          sub     r0, r0, #241     ; 0x00f1

Example 4: 16-bit replicated patterns

    a + 0x44004401

    Before:
          movw    r3, #17409          ; 0x00004401
          movt    r3, 17408           ; 0x44000000
          adds    r3, r0, r3
    After:
          add     r0, r0, #1140868096 ; 0x44004400
          adds    r0, r0, #1          ; 0x00000001

Example 5: 32-bit replicated patterns

    a & 0xaaaaaa00

    Before:
          mov     r3, #43520           ; 0x0000aa00
          movt    r3, 43690            ; 0xaaaa0000
          and     r3, r0, r3
    After:
          and     r0, r0, #-1431655766 ; 0xaaaaaaaa
          bic     r0, r0, #170         ; 0x000000aa

The constant splitting code was duplicated in two places, and I needed 
to find a new way to tackle the negated/inverted constant cases 
(replicated constants render the old, rather dumb, heuristics completely 
useless), so I have rearranged the code somewhat. Less has changed than 
it appears, but hopefully it's a bit easier to maintain now, and I think 
it now reliably chooses the most efficient sense in which to encode the 
constant.

There is one point in this patch that I am uncertain about: I've renamed 
the 'j' constraint to 'ja'. I did this because it allowed me to add 'jb' 
and 'jB', which are similar but distinct. The 'j' constraint was not 
documented anywhere, but I think it is possible that it has been used by 
third-party code. Is this a problem?

Is the patch OK to commit, once stage 1 returns?

There are still a few extra optimizations that might be worth looking at 
in future:
    * For SET and PLUS operations only, there are some advantages in 
splitting constants arithmetically rather than purely bitwise (this is 
not true of non-replicated constants, so it is not relevant to 
ARM/Thumb1); see the sketch after this list.
      - e.g. 0x01010201 = 0x01010101 + 0x00000100
    * 16-bit replicated constants when there isn't an exact match 
(similar to the 32-bit constants used in this patch).
    * something else?
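
As an illustration of the arithmetic case (a sketch of mine, not part of 
the patch), a testcase in the style of the ones included below would be:

    /* 0x01010201 has no two-insn bitwise split, but an arithmetic
       split into a replicated constant plus one shifted 8-bit
       constant needs only two adds: 0x01010101 + 0x00000100.  */
    int
    foo (int a)
    {
      return a + 0x01010201;
    }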

Andrew

Comments

Andrew Stubbs April 12, 2011, 9:22 a.m. UTC | #1
Ping.

On 08/12/10 13:00, Andrew Stubbs wrote:
> [...]
Richard Earnshaw April 12, 2011, 10:02 a.m. UTC | #2
On Wed, 2010-12-08 at 13:00 +0000, Andrew Stubbs wrote:
> [...]


I'd be happier trying to review this patch if it could be broken down
into smaller hunks.  One immediate problem that stands out is:

 (define_insn_and_split "*arm_subsi3_insn"
-  [(set (match_operand:SI           0 "s_register_operand" "=r,r,rk,r,r")
-       (minus:SI (match_operand:SI 1 "reg_or_int_operand" "rI,r,k,?n,r")
-                 (match_operand:SI 2 "reg_or_int_operand" "r,rI,r, r,?n")))]
+  [(set (match_operand:SI           0 "s_register_operand" "=r,r,rk,r, k, r,r")
+       (minus:SI (match_operand:SI 1 "reg_or_int_operand" "rI,r,k, rk,k, ?n,r")
+                 (match_operand:SI 2 "reg_or_int_operand" "r,rI,r, jb,jb,r,?n")))]
   "TARGET_32BIT"
   "@
    rsb%?\\t%0, %2, %1
    sub%?\\t%0, %1, %2
    sub%?\\t%0, %1, %2
+   subw%?\\t%0, %1, %2
+   subw%?\\t%0, %1, %2
    #
    #"
   "&& ((GET_CODE (operands[1]) == CONST_INT
-               && !const_ok_for_arm (INTVAL (operands[1])))
+               && !(const_ok_for_arm (INTVAL (operands[1]))
+            || satisfies_constraint_jb (operands[2])))
        || (GET_CODE (operands[2]) == CONST_INT
-          && !const_ok_for_arm (INTVAL (operands[2]))))"
+          && !(const_ok_for_arm (INTVAL (operands[2]))
+               || satisfies_constraint_jb (operands[2]))))"
   [(clobber (const_int 0))]
   "
   arm_split_constant (MINUS, SImode, curr_insn,
                       INTVAL (operands[1]), operands[0], operands[2], 0);
   DONE;
   "
-  [(set_attr "length" "4,4,4,16,16")
+  [(set_attr "length" "4,4,4,4,4,16,16")
    (set_attr "predicable" "yes")]
 )
 

Firstly, MINUS (REG, CONST) is not canonical (the compiler should
generate PLUS (REG, -CONST)), so your additional patterns should, in
theory, never match.  If they are matching, then really we should work
out why and then fix those rather than adding more code here...
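
That is (spelling out the RTL, using Example 1 from the patch
description): for "a - 0xfff" the compiler should already have produced

	(set (reg:SI 0)
	     (plus:SI (reg:SI 0)
		      (const_int -4095)))

so a (minus:SI (reg:SI 0) (const_int 4095)) form should never reach
this pattern.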

Secondly, these new alternatives seem to be unconditionally enabled.  It
looks as though you are relying on the constraints never matching when
not compiling for Thumb-2, and thus the alternatives never being
selected.  I'd prefer that an alternative be properly disabled, via the
'enabled' attribute, when it isn't available.  The same applies to the
addition operation.
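
Something like this in the new patterns would do it (a hypothetical
sketch; the exact attribute wiring is up to you):

	;; Mark the two subw alternatives as Thumb-2 only, so the
	;; "enabled" attribute machinery can disable them elsewhere.
	(set_attr "arch" "*,*,*,t2,t2,*,*")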

Also, your change to use a double-letter sequence beginning with 'j'
means any hand-written inline assembly code using a single 'j' will
break (that's a backwards compatibility issue for users); is there
really no other letter that can be used to prefix the operation?  Also,
think very carefully about whether your new constraints should be
internal only or really public.  I think it would be better to leave 'j'
as is, and add some more P-prefixed constraints that are internal only
for the additional cases.

A final note is that you may have missed some cases.  Now that we have
movw,
	reg & ~(16-bit const)
can be done in at most 2 insns:
	movw t1, #16-bit const
	bic  Rd, reg, t1

On Thumb-2 you can use ORN in the same way.
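
A testcase for that (hypothetical, in the style of the ones included in
the patch) might be:

	/* ~0xabcd is not encodable as an immediate, but movw can build
	   0xabcd and bic can clear it: two insns in total.  */
	int
	foo (int a)
	{
	  return a & ~0xabcd;
	}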

R.

Andrew Stubbs April 12, 2011, 10:14 a.m. UTC | #3
[Ignoring the other issues for now ...]

On 12/04/11 11:02, Richard Earnshaw wrote:
> Also, your change to use a double-letter sequence beginning with 'j'
> means any hand-written inline assembly code using a single 'j' will
> break (that's a backwards compatibility issue for users); is there
> really no other letter that can be used to prefix the operation?  Also,
> think very carefully about whether your new constraints should be
> internal only or really public.  I think it would be better to leave 'j'
> as is, and add some more P-prefixed constraints that are internal only
> for the additional cases.

The constraints were part of the patch I expected to have to fix up 
following review. :)

I chose to modify the 'j' constraint because it was not documented, last 
I looked, and the new use was related to the meaning of 'j'. I thought 
this might make the patterns a little easier to read. Do we have to 
assume that all constraints might be used by third parties, documented 
or otherwise?

I can switch to using P-prefixed constraints, of course.

Andrew
Richard Earnshaw April 12, 2011, 12:20 p.m. UTC | #4
On Tue, 2011-04-12 at 11:14 +0100, Andrew Stubbs wrote:
> [Ignoring the other issues for now ...]
> 
> On 12/04/11 11:02, Richard Earnshaw wrote:
> > [...]
> 
> The constraints were part of the patch I expected to have to fix up 
> following review. :)
> 
> I chose to modify the 'j' constraint because it was not documented, last 
> I looked, and the new use was related to the meaning of 'j'. I thought 
> this might make the patterns a little easier to read. Do we have to 
> assume that all constraints might be used by third parties, documented 
> or otherwise?
> 

It may not be in invoke.texi, but it isn't marked @internal in
constraints.md.

> I can switch to using P-prefixed constraints, of course.
> 

I think that would be better.  I think there's a reasonable argument for
keeping 'j' /and/ having it 'public'.

R.

> Andrew
>

Patch

2010-12-08  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/arm/arm-protos.h (const_ok_for_op): Add prototype.
	* config/arm/arm.c (const_ok_for_op): Add support for movw/addw/subw.
	(count_insns_for_constant, find_best_start): Delete functions.
	(optimal_immediate_sequence): New function.
	(optimal_immediate_sequence_1): New function.
	(arm_gen_constant): Remove old movw support.
	Move constant splitting code to optimal_immediate_sequence.
	Rewrite constant negation/inversion code.
	* config/arm/arm.md (*arm_addsi3): Add addw/subw support.
	(*arm_subsi3_insn): Add subw support.
	(*arm_movsi_insn): Change 'j' constraint to 'ja'.
	* config/arm/constraints.md (j): Rename ...
	(ja): ... to this.
	(jb, jB): New constraints.
	* config/arm/thumb2.md (*thumb2_movsi_insn): Change 'j' constraint
	to 'ja'.
	* config/arm/vfp.md (*arm_movsi_vfp): Likewise.

	gcc/testsuite/
	* gcc.target/arm/thumb2-replicated-constant1.c: New file.
	* gcc.target/arm/thumb2-replicated-constant2.c: New file.
	* gcc.target/arm/thumb2-replicated-constant3.c: New file.
	* gcc.target/arm/thumb2-replicated-constant4.c: New file.

---
 src/gcc-mainline/gcc/config/arm/arm-protos.h       |    1 
 src/gcc-mainline/gcc/config/arm/arm.c              |  456 ++++++++++++--------
 src/gcc-mainline/gcc/config/arm/arm.md             |   35 +-
 src/gcc-mainline/gcc/config/arm/constraints.md     |   18 +
 src/gcc-mainline/gcc/config/arm/thumb2.md          |    4 
 src/gcc-mainline/gcc/config/arm/vfp.md             |    8 
 .../gcc.target/arm/thumb2-replicated-constant1.c   |   27 +
 .../gcc.target/arm/thumb2-replicated-constant2.c   |   75 +++
 .../gcc.target/arm/thumb2-replicated-constant3.c   |   28 +
 .../gcc.target/arm/thumb2-replicated-constant4.c   |   22 +
 10 files changed, 476 insertions(+), 198 deletions(-)
 create mode 100644 src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant1.c
 create mode 100644 src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant2.c
 create mode 100644 src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant3.c
 create mode 100644 src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant4.c

diff --git a/src/gcc-mainline/gcc/config/arm/arm-protos.h b/src/gcc-mainline/gcc/config/arm/arm-protos.h
index 53923bd..db405ac 100644
--- a/src/gcc-mainline/gcc/config/arm/arm-protos.h
+++ b/src/gcc-mainline/gcc/config/arm/arm-protos.h
@@ -46,6 +46,7 @@  extern bool arm_vector_mode_supported_p (enum machine_mode);
 extern bool arm_small_register_classes_for_mode_p (enum machine_mode);
 extern int arm_hard_regno_mode_ok (unsigned int, enum machine_mode);
 extern int const_ok_for_arm (HOST_WIDE_INT);
+extern int const_ok_for_op (HOST_WIDE_INT, enum rtx_code);
 extern int arm_split_constant (RTX_CODE, enum machine_mode, rtx,
 			       HOST_WIDE_INT, rtx, rtx, int);
 extern RTX_CODE arm_canonicalize_comparison (RTX_CODE, rtx *, rtx *);
diff --git a/src/gcc-mainline/gcc/config/arm/arm.c b/src/gcc-mainline/gcc/config/arm/arm.c
index 88c43e3..cb0130e 100644
--- a/src/gcc-mainline/gcc/config/arm/arm.c
+++ b/src/gcc-mainline/gcc/config/arm/arm.c
@@ -81,7 +81,6 @@  inline static int thumb1_index_register_rtx_p (rtx, int);
 static bool arm_legitimate_address_p (enum machine_mode, rtx, bool);
 static int thumb_far_jump_used_p (void);
 static bool thumb_force_lr_save (void);
-static int const_ok_for_op (HOST_WIDE_INT, enum rtx_code);
 static rtx emit_sfm (int, int);
 static unsigned arm_size_return_regs (void);
 static bool arm_assemble_integer (rtx, unsigned int, int);
@@ -129,7 +128,12 @@  static void thumb1_output_function_prologue (FILE *, HOST_WIDE_INT);
 static int arm_comp_type_attributes (const_tree, const_tree);
 static void arm_set_default_type_attributes (tree);
 static int arm_adjust_cost (rtx, rtx, rtx, int);
-static int count_insns_for_constant (HOST_WIDE_INT, int);
+static int optimal_immediate_sequence (enum rtx_code code,
+				       unsigned HOST_WIDE_INT val,
+				       int return_sequence[]);
+static int optimal_immediate_sequence_1 (enum rtx_code code,
+					 unsigned HOST_WIDE_INT val,
+					 int return_sequence[], int i);
 static int arm_get_strip_length (int);
 static bool arm_function_ok_for_sibcall (tree, tree);
 static enum machine_mode arm_promote_function_mode (const_tree,
@@ -2426,7 +2430,7 @@  const_ok_for_arm (HOST_WIDE_INT i)
 }
 
 /* Return true if I is a valid constant for the operation CODE.  */
-static int
+int
 const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
 {
   if (const_ok_for_arm (i))
@@ -2434,7 +2438,21 @@  const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
 
   switch (code)
     {
+    case SET:
+      /* See if we can use movw.  */
+      if (arm_arch_thumb2 && (i & 0xffff0000) == 0)
+	  return 1;
+      else
+	  return 0;
+
     case PLUS:
+      /* See if we can use addw or subw.  */
+      if (TARGET_THUMB2
+	  && ((i & 0xfffff000) == 0
+	      || ((-i) & 0xfffff000) == 0))
+	return 1;
+      /* else fall through.  */
+
     case COMPARE:
     case EQ:
     case NE:
@@ -2550,68 +2568,42 @@  arm_split_constant (enum rtx_code code, enum machine_mode mode, rtx insn,
 			   1);
 }
 
-/* Return the number of instructions required to synthesize the given
-   constant, if we start emitting them from bit-position I.  */
+/* Return a sequence of integers, in RETURN_SEQUENCE, that fit into
+   ARM/Thumb-2 immediates, and add up to VAL.
+   RETURN_SEQUENCE must be an int[4].
+   The function return value gives the number of insns required.  */
 static int
-count_insns_for_constant (HOST_WIDE_INT remainder, int i)
-{
-  HOST_WIDE_INT temp1;
-  int step_size = TARGET_ARM ? 2 : 1;
-  int num_insns = 0;
-
-  gcc_assert (TARGET_ARM || i == 0);
-
-  do
-    {
-      int end;
-
-      if (i <= 0)
-	i += 32;
-      if (remainder & (((1 << step_size) - 1) << (i - step_size)))
-	{
-	  end = i - 8;
-	  if (end < 0)
-	    end += 32;
-	  temp1 = remainder & ((0x0ff << end)
-				    | ((i < end) ? (0xff >> (32 - end)) : 0));
-	  remainder &= ~temp1;
-	  num_insns++;
-	  i -= 8 - step_size;
-	}
-      i -= step_size;
-    } while (remainder);
-  return num_insns;
-}
-
-static int
-find_best_start (unsigned HOST_WIDE_INT remainder)
+optimal_immediate_sequence (enum rtx_code code, unsigned HOST_WIDE_INT val,
+			    int return_sequence[])
 {
   int best_consecutive_zeros = 0;
   int i;
   int best_start = 0;
+  int insns1, insns2;
+  int tmp_sequence[4];
 
   /* If we aren't targetting ARM, the best place to start is always at
-     the bottom.  */
-  if (! TARGET_ARM)
-    return 0;
-
-  for (i = 0; i < 32; i += 2)
+     the bottom, otherwise look more closely.  */
+  if (TARGET_ARM)
     {
-      int consecutive_zeros = 0;
-
-      if (!(remainder & (3 << i)))
+      for (i = 0; i < 32; i += 2)
 	{
-	  while ((i < 32) && !(remainder & (3 << i)))
-	    {
-	      consecutive_zeros += 2;
-	      i += 2;
-	    }
-	  if (consecutive_zeros > best_consecutive_zeros)
+	  int consecutive_zeros = 0;
+
+	  if (!(val & (3 << i)))
 	    {
-	      best_consecutive_zeros = consecutive_zeros;
-	      best_start = i - consecutive_zeros;
+	      while ((i < 32) && !(val & (3 << i)))
+		{
+		  consecutive_zeros += 2;
+		  i += 2;
+		}
+	      if (consecutive_zeros > best_consecutive_zeros)
+		{
+		  best_consecutive_zeros = consecutive_zeros;
+		  best_start = i - consecutive_zeros;
+		}
+	      i -= 2;
 	    }
-	  i -= 2;
 	}
     }
 
@@ -2638,13 +2630,161 @@  find_best_start (unsigned HOST_WIDE_INT remainder)
      the constant starting from `best_start', and also starting from
      zero (i.e. with bit 31 first to be output).  If `best_start' doesn't
      yield a shorter sequence, we may as well use zero.  */
+  insns1 = optimal_immediate_sequence_1 (code, val, return_sequence, best_start);
   if (best_start != 0
-      && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder)
-      && (count_insns_for_constant (remainder, 0) <=
-	  count_insns_for_constant (remainder, best_start)))
-    best_start = 0;
+      && ((((unsigned HOST_WIDE_INT) 1) << best_start) < val))
+    {
+      insns2 = optimal_immediate_sequence_1 (code, val, tmp_sequence, 0);
+      if (insns2 <= insns1)
+	{
+	  memcpy (return_sequence, tmp_sequence, sizeof(tmp_sequence));
+	  insns1 = insns2;
+	}
+    }
+
+  return insns1;
+}
+
+/* As for optimal_immediate_sequence, but starting at bit-position I.  */
+static int
+optimal_immediate_sequence_1 (enum rtx_code code, unsigned HOST_WIDE_INT val,
+			     int return_sequence[], int i)
+{
+  int remainder = val & 0xffffffff;
+  int insns = 0;
+
+  /* Try and find a way of doing the job in either two or three
+     instructions.
+     
+     In ARM mode we can use 8-bit constants, rotated to any 2-bit aligned
+     location.  We start at position I.  This may be the MSB, or
+     optimal_immediate_sequence may have positioned it at the largest block
+     of zeros that are aligned on a 2-bit boundary. We then fill up the temps,
+     wrapping around to the top of the word when we drop off the bottom.
+     In the worst case this code should produce no more than four insns.
+
+     In Thumb2 mode, we can use 32/16-bit replicated constants, and 8-bit
+     constants, shifted to any arbitrary location.  We should always start
+     at the MSB.  */
+  do
+    {
+      int end;
+      int b1, b2, b3, b4;
+      unsigned HOST_WIDE_INT result;
+      int loc;
+
+      gcc_assert (insns < 4);
+
+      if (i <= 0)
+	i += 32;
+
+      /* First, find the next normal 12/8-bit shifted/rotated immediate.  */
+      if (remainder & ((TARGET_ARM ? (3 << (i - 2)) : (1 << (i - 1)))))
+	{
+	  loc = i;
+	  if (i <= 12 && TARGET_THUMB2 && code == PLUS)
+	    /* We can use addw/subw for the last 12 bits.  */
+	    result = remainder;
+	  else
+	    {
+	      /* Use an 8-bit shifted/rotated immediate.  */
+	      end = i - 8;
+	      if (end < 0)
+		end += 32;
+	      result = remainder & ((0x0ff << end)
+				   | ((i < end) ? (0xff >> (32 - end))
+						: 0));
+	      i -= 8;
+	    }
+	}
+      else
+	{
+	  /* Arm allows rotates by a multiple of two. Thumb-2 allows
+	     arbitrary shifts.  */
+	  i -= TARGET_ARM ? 2 : 1;
+	  continue;
+	}
+
+      /* Next, see if we can do a better job with a thumb2 replicated
+	 constant.
+       
+         We do it this way around to catch the cases like 0x01F001E0 where
+	 two 8-bit immediates would work, but a replicated constant would
+	 make it worse.
+       
+         TODO: 16-bit constants that don't clear all the bits, but still win.
+         TODO: Arithmetic splitting for set/add/sub, rather than bitwise.  */
+      if (TARGET_THUMB2)
+	{
+	  b1 = (remainder & 0xff000000) >> 24;
+	  b2 = (remainder & 0x00ff0000) >> 16;
+	  b3 = (remainder & 0x0000ff00) >> 8;
+	  b4 = remainder & 0xff;
 
-  return best_start;
+	  if (loc > 24)
+	    {
+	      /* The 8-bit immediate already found clears b1 (and maybe b2),
+		 but must leave b3 and b4 alone.  */
+
+	      /* First try to find a 32-bit replicated constant that clears
+		 almost everything.  We can assume that we can't do it in one,
+		 or else we wouldn't be here.  */
+	      unsigned int tmp = b1 & b2 & b3 & b4;
+	      unsigned int tmp2 = tmp + (tmp << 8) + (tmp << 16)
+				  + (tmp << 24);
+	      unsigned int matching_bytes = (tmp == b1) + (tmp == b2)
+					    + (tmp == b3) + (tmp == b4);
+	      if (tmp
+		  && (matching_bytes >= 3
+		      || (matching_bytes == 2
+			  && const_ok_for_op (remainder & ~tmp2, code))))
+		{
+		  /* At least 3 of the bytes match, and the fourth has at 
+		     least as many bits set, or two of the bytes match
+		     and it will only require one more insn to finish.  */
+		  result = tmp2;
+		  i = tmp != b1 ? 32
+		      : tmp != b2 ? 24
+		      : tmp != b3 ? 16
+		      : 8;
+		}
+
+	      /* Second, try to find a 16-bit replicated constant that can
+		 leave three of the bytes clear.  If b2 or b4 is already
+		 zero, then we can.  If the 8-bit from above would not
+		 clear b2 anyway, then we still win.  */
+	      else if (b1 == b3 && (!b2 || !b4
+			       || (remainder & 0x00ff0000 & ~result)))
+		{
+		  result = remainder & 0xff00ff00;
+		  i = 24;
+		}
+	    }
+	  else if (loc > 16)
+	    {
+	      /* The 8-bit immediate already found clears b2 (and maybe b3)
+	     and we don't get here unless b1 is already clear, but it will
+		 leave b4 unchanged.  */
+
+	      /* If we can clear b2 and b4 at once, then we win, since the
+		 8-bits couldn't possibly reach that far.  */
+	      if (b2 == b4)
+		{
+		  result = remainder & 0x00ff00ff;
+		  i = 16;
+		}
+	    }
+	}
+
+      return_sequence[insns++] = result;
+      remainder &= ~result;
+
+      if (code == SET || code == MINUS)
+	code = PLUS;
+    }
+  while (remainder);
+
+  return insns;
 }
 
 /* Emit an instruction with the indicated PATTERN.  If COND is
@@ -2661,7 +2801,6 @@  emit_constant_insn (rtx cond, rtx pattern)
 
 /* As above, but extra parameter GENERATE which, if clear, suppresses
    RTL generation.  */
-/* ??? This needs more work for thumb2.  */
 
 static int
 arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
@@ -2673,15 +2812,14 @@  arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
   int final_invert = 0;
   int can_negate_initial = 0;
   int i;
-  int num_bits_set = 0;
   int set_sign_bit_copies = 0;
   int clear_sign_bit_copies = 0;
   int clear_zero_bit_copies = 0;
   int set_zero_bit_copies = 0;
-  int insns = 0;
+  int insns = 0, neg_insns, inv_insns;
   unsigned HOST_WIDE_INT temp1, temp2;
   unsigned HOST_WIDE_INT remainder = val & 0xffffffff;
-  int step_size = TARGET_ARM ? 2 : 1;
+  int immediates[4], neg_immediates[4], inv_immediates[4];
 
   /* Find out which operations are safe for a given CODE.  Also do a quick
      check for degenerate cases; these can occur when DImode operations
@@ -2793,9 +2931,7 @@  arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
     }
 
   /* If we can do it in one insn get out quickly.  */
-  if (const_ok_for_arm (val)
-      || (can_negate_initial && const_ok_for_arm (-val))
-      || (can_invert && const_ok_for_arm (~val)))
+  if (const_ok_for_op (val, code))
     {
       if (generate)
 	emit_constant_insn (cond,
@@ -2848,15 +2984,6 @@  arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
   switch (code)
     {
     case SET:
-      /* See if we can use movw.  */
-      if (arm_arch_thumb2 && (remainder & 0xffff0000) == 0)
-	{
-	  if (generate)
-	    emit_constant_insn (cond, gen_rtx_SET (VOIDmode, target,
-						   GEN_INT (val)));
-	  return 1;
-	}
-
       /* See if we can do this by sign_extending a constant that is known
 	 to be negative.  This is a good, way of doing it, since the shift
 	 may well merge into a subsequent insn.  */
@@ -3207,121 +3334,100 @@  arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
       break;
     }
 
-  for (i = 0; i < 32; i++)
-    if (remainder & (1 << i))
-      num_bits_set++;
+  /* Calculate what the instruction sequences would be if we generated it
+     normally, negated, or inverted.  */
+  if (code == AND)
+    /* AND cannot be split into multiple insns, so invert and use BIC.  */
+    insns = 99;
+  else
+    insns = optimal_immediate_sequence (code, remainder, immediates);
 
-  if ((code == AND)
-      || (code != IOR && can_invert && num_bits_set > 16))
-    remainder ^= 0xffffffff;
-  else if (code == PLUS && num_bits_set > 16)
-    remainder = (-remainder) & 0xffffffff;
+  if (can_negate)
+    neg_insns = optimal_immediate_sequence (code, (-remainder) & 0xffffffff,
+					    neg_immediates);
+  else
+    neg_insns = 99;
+
+  if (can_invert)
+    inv_insns = optimal_immediate_sequence (code, remainder ^ 0xffffffff,
+					    inv_immediates);
+  else
+    inv_insns = 99;
 
-  /* For XOR, if more than half the bits are set and there's a sequence
-     of more than 8 consecutive ones in the pattern then we can XOR by the
-     inverted constant and then invert the final result; this may save an
-     instruction and might also lead to the final mvn being merged with
-     some other operation.  */
-  else if (code == XOR && num_bits_set > 16
-	   && (count_insns_for_constant (remainder ^ 0xffffffff,
-					 find_best_start
-					 (remainder ^ 0xffffffff))
-	       < count_insns_for_constant (remainder,
-					   find_best_start (remainder))))
+  /* Is the negated immediate sequence more efficient?  */
+  if (neg_insns < insns && neg_insns <= inv_insns)
     {
-      remainder ^= 0xffffffff;
-      final_invert = 1;
+      insns = neg_insns;
+      memcpy (immediates, neg_immediates, sizeof (immediates));
     }
   else
+    can_negate = 0;
+
+  /* Is the inverted immediate sequence more efficient?
+     We must allow for an extra NOT instruction for XOR operations, although
+     there is some chance that the final 'mvn' will get optimized later.  */
+  if (inv_insns < insns && (code != XOR || (inv_insns + 1) < insns))
     {
-      can_invert = 0;
-      can_negate = 0;
-    }
+      insns = inv_insns;
+      memcpy (immediates, inv_immediates, sizeof (immediates));
 
-  /* Now try and find a way of doing the job in either two or three
-     instructions.
-     We start by looking for the largest block of zeros that are aligned on
-     a 2-bit boundary, we then fill up the temps, wrapping around to the
-     top of the word when we drop off the bottom.
-     In the worst case this code should produce no more than four insns.
-     Thumb-2 constants are shifted, not rotated, so the MSB is always the
-     best place to start.  */
+      if (code == XOR)
+	final_invert = 1;
+    }
+  else
+    can_invert = 0;
 
-  /* ??? Use thumb2 replicated constants when the high and low halfwords are
-     the same.  */
-  {
-    /* Now start emitting the insns.  */
-    i = find_best_start (remainder);
-    do
-      {
-	int end;
+  /* Now output the chosen sequence as instructions.  */
+  if (generate)
+    {
+      for (i = 0; i < insns; i++)
+	{
+	  rtx new_src, temp1_rtx;
 
-	if (i <= 0)
-	  i += 32;
-	if (remainder & (3 << (i - 2)))
-	  {
-	    end = i - 8;
-	    if (end < 0)
-	      end += 32;
-	    temp1 = remainder & ((0x0ff << end)
-				 | ((i < end) ? (0xff >> (32 - end)) : 0));
-	    remainder &= ~temp1;
-
-	    if (generate)
-	      {
-		rtx new_src, temp1_rtx;
+	  temp1 = immediates[i];
 
-		if (code == SET || code == MINUS)
-		  {
-		    new_src = (subtargets ? gen_reg_rtx (mode) : target);
-		    if (can_invert && code != MINUS)
-		      temp1 = ~temp1;
-		  }
-		else
-		  {
-		    if ((final_invert || remainder) && subtargets)
-		      new_src = gen_reg_rtx (mode);
-		    else
-		      new_src = target;
-		    if (can_invert)
-		      temp1 = ~temp1;
-		    else if (can_negate)
-		      temp1 = -temp1;
-		  }
+	  if (code == SET || code == MINUS)
+	    {
+	      new_src = (subtargets ? gen_reg_rtx (mode) : target);
+	      if (can_invert && code != MINUS)
+		temp1 = ~temp1;
+	    }
+	  else
+	    {
+	      if ((final_invert || i < (insns - 1)) && subtargets)
+		new_src = gen_reg_rtx (mode);
+	      else
+		new_src = target;
+	      if (can_invert)
+		temp1 = ~temp1;
+	      else if (can_negate)
+		temp1 = -temp1;
+	    }
 
-		temp1 = trunc_int_for_mode (temp1, mode);
-		temp1_rtx = GEN_INT (temp1);
+	  temp1 = trunc_int_for_mode (temp1, mode);
+	  temp1_rtx = GEN_INT (temp1);
 
-		if (code == SET)
-		  ;
-		else if (code == MINUS)
-		  temp1_rtx = gen_rtx_MINUS (mode, temp1_rtx, source);
-		else
-		  temp1_rtx = gen_rtx_fmt_ee (code, mode, source, temp1_rtx);
+	  if (code == SET)
+	    ;
+	  else if (code == MINUS)
+	    temp1_rtx = gen_rtx_MINUS (mode, temp1_rtx, source);
+	  else
+	    temp1_rtx = gen_rtx_fmt_ee (code, mode, source, temp1_rtx);
 
-		emit_constant_insn (cond,
-				    gen_rtx_SET (VOIDmode, new_src,
-						 temp1_rtx));
-		source = new_src;
-	      }
+	  emit_constant_insn (cond,
+			      gen_rtx_SET (VOIDmode, new_src,
+					   temp1_rtx));
+	  source = new_src;
 
-	    if (code == SET)
-	      {
-		can_invert = 0;
-		code = PLUS;
-	      }
-	    else if (code == MINUS)
+	  if (code == SET)
+	    {
+	      can_invert = 0;
 	      code = PLUS;
-
-	    insns++;
-	    i -= 8 - step_size;
-	  }
-	/* Arm allows rotates by a multiple of two. Thumb-2 allows arbitrary
-	   shifts.  */
-	i -= step_size;
-      }
-    while (remainder);
-  }
+	    }
+	  else if (code == MINUS)
+	    code = PLUS;
+	}
+    }
 
   if (final_invert)
     {
diff --git a/src/gcc-mainline/gcc/config/arm/arm.md b/src/gcc-mainline/gcc/config/arm/arm.md
index 889b86f..8bc9926 100644
--- a/src/gcc-mainline/gcc/config/arm/arm.md
+++ b/src/gcc-mainline/gcc/config/arm/arm.md
@@ -696,21 +696,24 @@ 
 ;;  (plus (reg rN) (reg sp)) into (reg rN).  In this case reload will
 ;; put the duplicated register first, and not try the commutative version.
 (define_insn_and_split "*arm_addsi3"
-  [(set (match_operand:SI          0 "s_register_operand" "=r, k,r,r, k,r")
-	(plus:SI (match_operand:SI 1 "s_register_operand" "%rk,k,r,rk,k,rk")
-		 (match_operand:SI 2 "reg_or_int_operand" "rI,rI,k,L, L,?n")))]
+  [(set (match_operand:SI          0 "s_register_operand" "=r, k,r,r, k, r, k,r, k, r")
+	(plus:SI (match_operand:SI 1 "s_register_operand" "%rk,k,r,rk,k, rk,k,rk,k, rk")
+		 (match_operand:SI 2 "reg_or_int_operand" "rI,rI,k,jb,jb,L, L,jB,jB,?n")))]
   "TARGET_32BIT"
   "@
    add%?\\t%0, %1, %2
    add%?\\t%0, %1, %2
    add%?\\t%0, %2, %1
+   addw%?\\t%0, %1, %2
+   addw%?\\t%0, %1, %2
    sub%?\\t%0, %1, #%n2
    sub%?\\t%0, %1, #%n2
+   subw%?\\t%0, %1, #%n2
+   subw%?\\t%0, %1, #%n2
    #"
   "TARGET_32BIT
    && GET_CODE (operands[2]) == CONST_INT
-   && !(const_ok_for_arm (INTVAL (operands[2]))
-        || const_ok_for_arm (-INTVAL (operands[2])))
+   && !const_ok_for_op (INTVAL (operands[2]), PLUS)
    && (reload_completed || !arm_eliminable_register (operands[1]))"
   [(clobber (const_int 0))]
   "
@@ -719,7 +722,7 @@ 
 		      operands[1], 0);
   DONE;
   "
-  [(set_attr "length" "4,4,4,4,4,16")
+  [(set_attr "length" "4,4,4,4,4,4,4,4,4,16")
    (set_attr "predicable" "yes")]
 )
 
@@ -1173,27 +1176,31 @@ 
 
 ; ??? Check Thumb-2 split length
 (define_insn_and_split "*arm_subsi3_insn"
-  [(set (match_operand:SI           0 "s_register_operand" "=r,r,rk,r,r")
-	(minus:SI (match_operand:SI 1 "reg_or_int_operand" "rI,r,k,?n,r")
-		  (match_operand:SI 2 "reg_or_int_operand" "r,rI,r, r,?n")))]
+  [(set (match_operand:SI           0 "s_register_operand" "=r,r,rk,r, k, r,r")
+	(minus:SI (match_operand:SI 1 "reg_or_int_operand" "rI,r,k, rk,k, ?n,r")
+		  (match_operand:SI 2 "reg_or_int_operand" "r,rI,r, jb,jb,r,?n")))]
   "TARGET_32BIT"
   "@
    rsb%?\\t%0, %2, %1
    sub%?\\t%0, %1, %2
    sub%?\\t%0, %1, %2
+   subw%?\\t%0, %1, %2
+   subw%?\\t%0, %1, %2
    #
    #"
   "&& ((GET_CODE (operands[1]) == CONST_INT
-       	&& !const_ok_for_arm (INTVAL (operands[1])))
+       	&& !(const_ok_for_arm (INTVAL (operands[1]))
+	     || satisfies_constraint_jb (operands[2])))
        || (GET_CODE (operands[2]) == CONST_INT
-	   && !const_ok_for_arm (INTVAL (operands[2]))))"
+	   && !(const_ok_for_arm (INTVAL (operands[2]))
+	        || satisfies_constraint_jb (operands[2]))))"
   [(clobber (const_int 0))]
   "
   arm_split_constant (MINUS, SImode, curr_insn,
                       INTVAL (operands[1]), operands[0], operands[2], 0);
   DONE;
   "
-  [(set_attr "length" "4,4,4,16,16")
+  [(set_attr "length" "4,4,4,4,4,16,16")
    (set_attr "predicable" "yes")]
 )
 
@@ -5128,8 +5135,8 @@ 
 )
 
 (define_insn "*arm_movsi_insn"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m")
-	(match_operand:SI 1 "general_operand"      "rk, I,K,j,mi,rk"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r, rk,m")
+	(match_operand:SI 1 "general_operand"      "rk, I,K,ja,mi,rk"))]
   "TARGET_ARM && ! TARGET_IWMMXT
    && !(TARGET_HARD_FLOAT && TARGET_VFP)
    && (   register_operand (operands[0], SImode)
diff --git a/src/gcc-mainline/gcc/config/arm/constraints.md b/src/gcc-mainline/gcc/config/arm/constraints.md
index 41a0663..0c1dce2 100644
--- a/src/gcc-mainline/gcc/config/arm/constraints.md
+++ b/src/gcc-mainline/gcc/config/arm/constraints.md
@@ -25,13 +25,13 @@ 
 ;; In ARM state, 'l' is an alias for 'r'
 
 ;; The following normal constraints have been used:
-;; in ARM/Thumb-2 state: G, H, I, j, J, K, L, M
+;; in ARM/Thumb-2 state: G, H, I, J, K, L, M
 ;; in Thumb-1 state: I, J, K, L, M, N, O
 
 ;; The following multi-letter normal constraints have been used:
 ;; in ARM/Thumb-2 state: Da, Db, Dc, Dn, Dl, DL, Dv, Dy, Di, Dz
 ;; in Thumb-1 state: Pa, Pb, Pc, Pd
-;; in Thumb-2 state: Ps, Pt, Pu, Pv, Pw, Px
+;; in Thumb-2 state: Ps, Pt, Pu, Pv, Pw, Px, ja, jb, jB
 
 ;; The following memory constraints have been used:
 ;; in ARM/Thumb-2 state: Q, Ut, Uv, Uy, Un, Um, Us
@@ -67,13 +67,25 @@ 
 (define_register_constraint "h" "TARGET_THUMB ? HI_REGS : NO_REGS"
  "In Thumb state the core registers @code{r8}-@code{r15}.")
 
-(define_constraint "j"
+(define_constraint "ja"
  "A constant suitable for a MOVW instruction. (ARM/Thumb-2)"
  (and (match_test "TARGET_32BIT && arm_arch_thumb2")
       (ior (match_code "high")
 	   (and (match_code "const_int")
                 (match_test "(ival & 0xffff0000) == 0")))))
 
+(define_constraint "jb"
+ "A 12-bit constant suitable for an ADDW or SUBW instruction. (Thumb-2)"
+ (and (match_test "TARGET_THUMB2")
+      (and (match_code "const_int")
+	   (match_test "(ival & 0xfffff000) == 0"))))
+
+(define_constraint "jB"
+ "A constant that satisfies the jb constrant if negated."
+ (and (match_test "TARGET_THUMB2")
+      (and (match_code "const_int")
+	   (match_test "((-ival) & 0xfffff000) == 0"))))
+
 (define_register_constraint "k" "STACK_REG"
  "@internal The stack register.")
 
diff --git a/src/gcc-mainline/gcc/config/arm/thumb2.md b/src/gcc-mainline/gcc/config/arm/thumb2.md
index 9ccb4d8..e6014f7 100644
--- a/src/gcc-mainline/gcc/config/arm/thumb2.md
+++ b/src/gcc-mainline/gcc/config/arm/thumb2.md
@@ -165,8 +165,8 @@ 
 ;; regs.  The high register alternatives are not taken into account when
 ;; choosing register preferences in order to reflect their expense.
 (define_insn "*thumb2_movsi_insn"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,l ,*hk,m,*m")
-	(match_operand:SI 1 "general_operand"	   "rk ,I,K,j,mi,*mi,l,*hk"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r, l ,*hk,m,*m")
+	(match_operand:SI 1 "general_operand"	   "rk ,I,K,ja,mi,*mi,l,*hk"))]
   "TARGET_THUMB2 && ! TARGET_IWMMXT
    && !(TARGET_HARD_FLOAT && TARGET_VFP)
    && (   register_operand (operands[0], SImode)
diff --git a/src/gcc-mainline/gcc/config/arm/vfp.md b/src/gcc-mainline/gcc/config/arm/vfp.md
index 4e7b801..b1b5850 100644
--- a/src/gcc-mainline/gcc/config/arm/vfp.md
+++ b/src/gcc-mainline/gcc/config/arm/vfp.md
@@ -50,8 +50,8 @@ 
 ;; ??? For now do not allow loading constants into vfp regs.  This causes
 ;; problems because small constants get converted into adds.
 (define_insn "*arm_movsi_vfp"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,rk,m ,*t,r,*t,*t, *Uv")
-      (match_operand:SI 1 "general_operand"	   "rk, I,K,j,mi,rk,r,*t,*t,*Uvi,*t"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r, rk,m ,*t,r,*t,*t, *Uv")
+      (match_operand:SI 1 "general_operand"	   "rk, I,K,ja,mi,rk,r,*t,*t,*Uvi,*t"))]
   "TARGET_ARM && TARGET_VFP && TARGET_HARD_FLOAT
    && (   s_register_operand (operands[0], SImode)
        || s_register_operand (operands[1], SImode))"
@@ -90,8 +90,8 @@ 
 ;; See thumb2.md:thumb2_movsi_insn for an explanation of the split
 ;; high/low register alternatives for loads and stores here.
 (define_insn "*thumb2_movsi_vfp"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r, l,*hk,m, *m,*t, r,*t,*t,  *Uv")
-	(match_operand:SI 1 "general_operand"	   "rk, I,K,j,mi,*mi,l,*hk, r,*t,*t,*Uvi,*t"))]
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rk,r,r,r,  l,*hk,m, *m,*t, r,*t,*t,  *Uv")
+	(match_operand:SI 1 "general_operand"	   "rk, I,K,ja,mi,*mi,l,*hk, r,*t,*t,*Uvi,*t"))]
   "TARGET_THUMB2 && TARGET_VFP && TARGET_HARD_FLOAT
    && (   s_register_operand (operands[0], SImode)
        || s_register_operand (operands[1], SImode))"
diff --git a/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant1.c b/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant1.c
new file mode 100644
index 0000000..e10ea03
--- /dev/null
+++ b/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant1.c
@@ -0,0 +1,27 @@ 
+/* Ensure simple replicated constant immediates work.  */
+/* { dg-options "-mthumb -O2" } */
+/* { dg-require-effective-target arm_thumb2_ok } */
+
+int
+foo1 (int a)
+{
+  return a + 0xfefefefe;
+}
+
+/* { dg-final { scan-assembler "add.*#-16843010" } } */
+
+int
+foo2 (int a)
+{
+  return a - 0xab00ab00;
+}
+
+/* { dg-final { scan-assembler "sub.*#-1426019584" } } */
+
+int
+foo3 (int a)
+{
+  return a & 0x00cd00cd;
+}
+
+/* { dg-final { scan-assembler "and.*#13435085" } } */
diff --git a/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant2.c b/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant2.c
new file mode 100644
index 0000000..3739adb
--- /dev/null
+++ b/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant2.c
@@ -0,0 +1,75 @@ 
+/* Ensure split constants can use replicated patterns.  */
+/* { dg-options "-mthumb -O2" } */
+/* { dg-require-effective-target arm_thumb2_ok } */
+
+int
+foo1 (int a)
+{
+  return a + 0xfe00fe01;
+}
+
+/* { dg-final { scan-assembler "add.*#-33489408" } } */
+/* { dg-final { scan-assembler "add.*#1" } } */
+
+int
+foo2 (int a)
+{
+  return a + 0xdd01dd00;
+}
+
+/* { dg-final { scan-assembler "add.*#-587145984" } } */
+/* { dg-final { scan-assembler "add.*#65536" } } */
+
+int
+foo3 (int a)
+{
+  return a + 0x00443344;
+}
+
+/* { dg-final { scan-assembler "add.*#4456516" } } */
+/* { dg-final { scan-assembler "add.*#13056" } } */
+
+int
+foo4 (int a)
+{
+  return a + 0x77330033;
+}
+
+/* { dg-final { scan-assembler "add.*#1996488704" } } */
+/* { dg-final { scan-assembler "add.*#3342387" } } */
+
+int
+foo5 (int a)
+{
+  return a + 0x11221122;
+}
+
+/* { dg-final { scan-assembler "add.*#285217024" } } */
+/* { dg-final { scan-assembler "add.*#2228258" } } */
+
+int
+foo6 (int a)
+{
+  return a + 0x66666677;
+}
+
+/* { dg-final { scan-assembler "add.*#1717986918" } } */
+/* { dg-final { scan-assembler "add.*#17" } } */
+
+int
+foo7 (int a)
+{
+  return a + 0x99888888;
+}
+
+/* { dg-final { scan-assembler "add.*#-2004318072" } } */
+/* { dg-final { scan-assembler "add.*#285212672" } } */
+
+int
+foo8 (int a)
+{
+  return a + 0xdddddfff;
+}
+
+/* { dg-final { scan-assembler "add.*#-572662307" } } */
+/* { dg-final { scan-assembler "addw.*#546" } } */
diff --git a/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant3.c b/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant3.c
new file mode 100644
index 0000000..eb6ad44
--- /dev/null
+++ b/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant3.c
@@ -0,0 +1,28 @@ 
+/* Ensure negated/inverted replicated constant immediates work.  */
+/* { dg-options "-mthumb -O2" } */
+/* { dg-require-effective-target arm_thumb2_ok } */
+
+int
+foo1 (int a)
+{
+  return a | 0xffffff00;
+}
+
+/* { dg-final { scan-assembler "orn.*#255" } } */
+
+int
+foo2 (int a)
+{
+  return a & 0xffeeffee;
+}
+
+/* { dg-final { scan-assembler "bic.*#1114129" } } */
+
+int
+foo3 (int a)
+{
+  return a & 0xaaaaaa00;
+}
+
+/* { dg-final { scan-assembler "and.*#-1431655766" } } */
+/* { dg-final { scan-assembler "bic.*#170" } } */
diff --git a/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant4.c b/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant4.c
new file mode 100644
index 0000000..24efdcf
--- /dev/null
+++ b/src/gcc-mainline/gcc/testsuite/gcc.target/arm/thumb2-replicated-constant4.c
@@ -0,0 +1,22 @@ 
+/* Ensure replicated constants don't make things worse.  */
+/* { dg-options "-mthumb -O2" } */
+/* { dg-require-effective-target arm_thumb2_ok } */
+
+int
+foo1 (int a)
+{
+  /* It might be tempting to use 0x01000100, but it wouldn't help. */
+  return a + 0x01f001e0;
+}
+
+/* { dg-final { scan-assembler "add.*#32505856" } } */
+/* { dg-final { scan-assembler "add.*#480" } } */
+
+int
+foo2 (int a)
+{
+  return a + 0x0f100e10;
+}
+
+/* { dg-final { scan-assembler "add.*#252706816" } } */
+/* { dg-final { scan-assembler "add.*#3600" } } */