From patchwork Thu Nov 17 21:10:57 2011
X-Patchwork-Submitter: Jan Hubicka
X-Patchwork-Id: 126315
Date: Thu, 17 Nov 2011 22:10:57 +0100
From: Jan Hubicka
To: "H.J. Lu"
Cc: Jan Hubicka, Michael Zolotukhin, gcc-patches@gcc.gnu.org, izamyatin@gmail.com
Subject: Re: Memset/memcpy patch
Message-ID: <20111117211057.GE30593@kam.mff.cuni.cz>
References: <20111114170335.GA29877@kam.mff.cuni.cz>
In-Reply-To: 
User-Agent: Mutt/1.5.18 (2008-05-17)

> The current x86 memset/memcpy expansion is broken.  It miscompiles
> many programs, including GCC itself.  Should it be reverted for now?

There was a problem in the new code doing loopy epilogues.  I am currently
testing the following patch that should fix the problem.  We can either
revert now and I will apply a combined patch later, or I hope to fix it
tonight.

Honza
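For reference, the "loopy epilogue" mentioned above works roughly like this:
the unrolled main loop only handles full move_mode-sized pieces, and the
remaining tail is finished by a plain, not-unrolled loop that reuses the same
iterator, instead of a long byte-by-byte store sequence.  A rough standalone
C model of the idea (illustrative names only, this is not code from the
patch):

/* Illustrative sketch only -- not code from the patch.  Models the
   "loopy epilogue": the main loop stores full WORD-sized pieces and a
   plain (not unrolled) loop reuses the same index to finish the tail,
   instead of emitting one store per remaining byte.  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define WORD 8  /* bytes handled per main-loop iteration */

void
sketch_memset (unsigned char *dst, unsigned char val, size_t count)
{
  uint64_t wide = 0x0101010101010101ULL * val;  /* promoted value */
  size_t i = 0;

  /* Main loop: only full WORD-sized pieces.  */
  for (; i + WORD <= count; i += WORD)
    memcpy (dst + i, &wide, WORD);

  /* Loopy epilogue: the leftover 0..WORD-1 bytes.  */
  for (; i < count; i++)
    dst[i] = val;
}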
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 181442)
+++ config/i386/i386.h	(working copy)
@@ -276,6 +276,7 @@ enum ix86_tune_indices {
   X86_TUNE_PROMOTE_QIMODE,
   X86_TUNE_FAST_PREFIX,
   X86_TUNE_SINGLE_STRINGOP,
+  X86_TUNE_ALIGN_STRINGOP,
   X86_TUNE_QIMODE_MATH,
   X86_TUNE_HIMODE_MATH,
   X86_TUNE_PROMOTE_QI_REGS,
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 181442)
+++ config/i386/i386.md	(working copy)
@@ -15944,6 +15944,17 @@
 	      (clobber (reg:CC FLAGS_REG))])]
   ""
 {
+  rtx vec_reg;
+  enum machine_mode mode = GET_MODE (operands[2]);
+  if (vector_extensions_used_for_mode (mode)
+      && CONSTANT_P (operands[2]))
+    {
+      if (mode == DImode)
+	mode = TARGET_64BIT ? V2DImode : V4SImode;
+      vec_reg = gen_reg_rtx (mode);
+      emit_move_insn (vec_reg, operands[2]);
+      operands[2] = vec_reg;
+    }
   if (GET_MODE (operands[1]) != GET_MODE (operands[2]))
     operands[1] = adjust_address_nv (operands[1], GET_MODE (operands[2]), 0);
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 181442)
+++ config/i386/i386.c	(working copy)
@@ -1785,7 +1785,7 @@ struct processor_costs atom_cost = {
      if that fails.  */
   {{{libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment.  */
    {libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}},
-  {{libcall, {{-1, libcall}}}, /* Unknown alignment.  */
+  {{libcall, {{2048, sse_loop}, {2048, unrolled_loop}, {-1, libcall}}}, /* Unknown alignment.  */
    {libcall, {{2048, sse_loop}, {2048, unrolled_loop}, {-1, libcall}}}}},
@@ -2178,6 +2178,9 @@ static unsigned int initial_ix86_tune_fe
   /* X86_TUNE_SINGLE_STRINGOP */
   m_386 | m_P4_NOCONA,
 
+  /* X86_TUNE_ALIGN_STRINGOP */
+  ~(m_BDVER | m_CORE2I7),
+
   /* X86_TUNE_QIMODE_MATH */
   ~0,
 
@@ -3724,6 +3727,14 @@ ix86_option_override_internal (bool main
 	target_flags |= MASK_NO_RED_ZONE;
     }
 
+  if (!(target_flags_explicit & MASK_NO_ALIGN_STRINGOPS))
+    {
+      if (ix86_tune_features[X86_TUNE_ALIGN_STRINGOP] & ix86_tune_mask)
+	target_flags &= ~MASK_NO_ALIGN_STRINGOPS;
+      else
+	target_flags |= MASK_NO_ALIGN_STRINGOPS;
+    }
+
   /* Keep nonleaf frame pointers.  */
   if (flag_omit_frame_pointer)
     target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
@@ -21149,20 +21160,25 @@ expand_set_or_movmem_via_loop_with_iter
   top_label = gen_label_rtx ();
   out_label = gen_label_rtx ();
-  if (!reuse_iter)
-    iter = gen_reg_rtx (iter_mode);
-
   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
-			      NULL, 1, OPTAB_DIRECT);
-  /* Those two should combine.  */
-  if (piece_size == const1_rtx)
+			      NULL, 1, OPTAB_DIRECT);
+  if (!reuse_iter)
     {
-      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+      iter = gen_reg_rtx (iter_mode);
+      /* Those two should combine.  */
+      if (piece_size == const1_rtx)
+	{
+	  emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+				   true, out_label);
+	  predict_jump (REG_BR_PROB_BASE * 10 / 100);
+	}
+      emit_move_insn (iter, const0_rtx);
+    }
+  else
+    {
+      emit_cmp_and_jump_insns (iter, size, GE, NULL_RTX, iter_mode,
 			       true, out_label);
-      predict_jump (REG_BR_PROB_BASE * 10 / 100);
     }
-  if (!reuse_iter)
-    emit_move_insn (iter, const0_rtx);
 
   emit_label (top_label);
@@ -21588,17 +21604,28 @@ expand_setmem_epilogue (rtx destmem, rtx
      Remaining part we'll move using Pmode and narrower modes.  */
   if (promoted_to_vector_value)
-    while (remainder_size >= 16)
-      {
-	if (GET_MODE (destmem) != move_mode)
-	  destmem = adjust_automodify_address_nv (destmem, move_mode,
-						  destptr, offset);
-	emit_strset (destmem, promoted_to_vector_value, destptr,
-		     move_mode, offset);
+    {
+      if (promoted_to_vector_value)
+	{
+	  if (max_size >= GET_MODE_SIZE (V4SImode))
+	    move_mode = V4SImode;
+	  else if (max_size >= GET_MODE_SIZE (DImode))
+	    move_mode = DImode;
+	}
+      while (remainder_size >= GET_MODE_SIZE (move_mode))
+	{
+	  if (GET_MODE (destmem) != move_mode)
+	    destmem = adjust_automodify_address_nv (destmem, move_mode,
+						    destptr, offset);
+	  emit_strset (destmem,
+		       promoted_to_vector_value,
+		       destptr,
+		       move_mode, offset);
-	offset += 16;
-	remainder_size -= 16;
-      }
+	  offset += GET_MODE_SIZE (move_mode);
+	  remainder_size -= GET_MODE_SIZE (move_mode);
+	}
+    }
 
   /* Move the remaining part of epilogue - its size might be
      a size of the widest mode.  */
@@ -22022,10 +22049,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
 				   || (memset
 				      ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
-#define ALG_USABLE_P(alg) (rep_prefix_usable			\
-			   || (alg != rep_prefix_1_byte		\
-			       && alg != rep_prefix_4_byte	\
-			       && alg != rep_prefix_8_byte))
+#define ALG_USABLE_P(alg) ((rep_prefix_usable			\
+			   || (alg != rep_prefix_1_byte		\
+			       && alg != rep_prefix_4_byte	\
+			       && alg != rep_prefix_8_byte))	\
+			   && (TARGET_SSE2 || alg != sse_loop))
 
   const struct processor_costs *cost;
   /* Even if the string operation call is cold, we still might spend a lot
@@ -22037,6 +22065,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
   else
     optimize_for_speed = true;
 
+  if (!optimize)
+    return (rep_prefix_usable ? rep_prefix_1_byte : libcall);
+
   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
 
   *dynamic_check = -1;
@@ -22049,10 +22080,10 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
   /* rep; movq or rep; movl is the smallest variant.  */
   else if (!optimize_for_speed)
     {
-      if (!count || (count & 3))
-	return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
+      if (!count || (count & 3) || memset)
+	return rep_prefix_usable ? rep_prefix_1_byte : libcall;
       else
-	return rep_prefix_usable ? rep_prefix_4_byte : loop;
+	return rep_prefix_usable ? rep_prefix_4_byte : libcall;
     }
   /* Very tiny blocks are best handled via the loop, REP is expensive to
      setup.  */
@@ -22106,13 +22137,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
       int max = -1;
       enum stringop_alg alg;
       int i;
-      bool any_alg_usable_p = true;
       bool only_libcall_fits = true;
 
       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
 	{
 	  enum stringop_alg candidate = algs->size[i].alg;
-	  any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
 
 	  if (candidate != libcall && candidate
 	      && ALG_USABLE_P (candidate))
@@ -22124,7 +22153,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
       /* If there aren't any usable algorithms, then recursing on
	 smaller sizes isn't going to find anything.  Just return the
	 simple byte-at-a-time copy loop.  */
-      if (!any_alg_usable_p || only_libcall_fits)
+      if (only_libcall_fits)
 	{
 	  /* Pick something reasonable.  */
 	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
@@ -22253,7 +22282,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
   int dynamic_check;
   bool need_zero_guard = false;
   bool align_unknown;
-  int unroll_factor;
+  unsigned int unroll_factor;
   enum machine_mode move_mode;
   rtx loop_iter = NULL_RTX;
   int dst_offset, src_offset;
@@ -22316,14 +22345,27 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     case unrolled_loop:
       need_zero_guard = true;
       move_mode = Pmode;
-      unroll_factor = TARGET_64BIT ? 4 : 2;
+      unroll_factor = 1;
+      /* Select maximal available 1,2 or 4 unroll factor.
+	 In 32bit we can not afford to use 4 registers inside the loop.  */
+      if (!count)
+	unroll_factor = TARGET_64BIT ? 4 : 2;
+      else
+	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+	       && unroll_factor < (TARGET_64BIT ? 4 : 2))
+	  unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case sse_loop:
       need_zero_guard = true;
       /* Use SSE instructions, if possible.  */
-      move_mode = align_unknown ? DImode : V4SImode;
-      unroll_factor = TARGET_64BIT ? 4 : 2;
+      move_mode = V4SImode;
+      unroll_factor = 1;
+      /* Select maximal available 1,2 or 4 unroll factor.  */
+      if (!count)
+	unroll_factor = 4;
+      else
+	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+	       && unroll_factor < 4)
+	  unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case rep_prefix_8_byte:
@@ -22568,7 +22610,13 @@ ix86_expand_movmem (rtx dst, rtx src, rt
   if (alg == sse_loop || alg == unrolled_loop)
     {
       rtx tmp;
-      if (align_unknown && unroll_factor > 1)
+      int remainder_size = epilogue_size_needed;
+
+      /* We may not need the epilogue loop at all when the count is known
+	 and alignment is not adjusted.  */
+      if (count && desired_align <= align)
+	remainder_size = count & epilogue_size_needed;
+      if (remainder_size > GET_MODE_SIZE (move_mode) * 2)
 	{
 	  /* Reduce epilogue's size by creating not-unrolled loop.  If we won't
	     do this, we can have very big epilogue - when alignment is statically
	     unknown we'll have the epilogue byte by byte which may be very slow.  */
@@ -22764,6 +22812,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
   unsigned int unroll_factor;
   enum machine_mode move_mode;
   rtx loop_iter = NULL_RTX;
+  bool early_jump = false;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22813,9 +22862,12 @@ ix86_expand_setmem (rtx dst, rtx count_e
       move_mode = Pmode;
       unroll_factor = 1;
       /* Select maximal available 1,2 or 4 unroll factor.  */
-      while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
-	     && unroll_factor < 4)
-	unroll_factor *= 2;
+      if (!count)
+	unroll_factor = 4;
+      else
+	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+	       && unroll_factor < 4)
+	  unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case sse_loop:
@@ -22823,9 +22875,12 @@ ix86_expand_setmem (rtx dst, rtx count_e
       move_mode = TARGET_64BIT ? V2DImode : V4SImode;
       unroll_factor = 1;
       /* Select maximal available 1,2 or 4 unroll factor.  */
-      while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
-	     && unroll_factor < 4)
-	unroll_factor *= 2;
+      if (!count)
+	unroll_factor = 4;
+      else
+	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+	       && unroll_factor < 4)
+	  unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case rep_prefix_8_byte:
@@ -22904,6 +22959,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
	  emit_move_insn (loop_iter, const0_rtx);
	}
       label = gen_label_rtx ();
+      early_jump = true;
       emit_cmp_and_jump_insns (count_exp, GEN_INT (epilogue_size_needed),
			       LTU, 0, counter_mode (count_exp), 1, label);
@@ -23065,21 +23121,26 @@ ix86_expand_setmem (rtx dst, rtx count_e
       LABEL_NUSES (label) = 1;
       /* We can not rely on fact that promoved value is known. */
       vec_promoted_val = 0;
-      gpr_promoted_val = 0;
+      if (early_jump)
+	gpr_promoted_val = 0;
     }
  epilogue:
   if (alg == unrolled_loop || alg == sse_loop)
     {
       rtx tmp;
-      if (align_unknown && unroll_factor > 1
-	  && epilogue_size_needed >= GET_MODE_SIZE (move_mode)
-	  && vec_promoted_val)
+      int remainder_size = epilogue_size_needed;
+      if (count && desired_align <= align)
+	remainder_size = count & epilogue_size_needed;
+      /* We may not need the epilogue loop at all when the count is known
+	 and alignment is not adjusted.  */
+      if (remainder_size > GET_MODE_SIZE (move_mode) * 2
+	  && (alg == sse_loop ? vec_promoted_val : gpr_promoted_val))
	{
	  /* Reduce epilogue's size by creating not-unrolled loop.  If we won't
	     do this, we can have very big epilogue - when alignment is statically
	     unknown we'll have the epilogue byte by byte which may be very slow.  */
	  loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
-	      NULL, vec_promoted_val, count_exp,
+	      NULL, (alg == sse_loop ? vec_promoted_val : gpr_promoted_val), count_exp,
	      loop_iter, move_mode, 1,
	      expected_size, false);
	  dst = change_address (dst, BLKmode, destreg);
@@ -23090,17 +23151,14 @@ ix86_expand_setmem (rtx dst, rtx count_e
	  if (tmp != destreg)
	    emit_move_insn (destreg, tmp);
	}
-  if (count_exp == const0_rtx)
+  if (count_exp == const0_rtx || epilogue_size_needed <= 1)
     ;
-  else if (!gpr_promoted_val && epilogue_size_needed > 1)
+  else if (!gpr_promoted_val)
     expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
				     epilogue_size_needed);
   else
-    {
-      if (epilogue_size_needed > 1)
-	expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
-				val_exp, count_exp, epilogue_size_needed);
-    }
+    expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
+			    val_exp, count_exp, epilogue_size_needed);
   if (jump_around_label)
     emit_label (jump_around_label);
   return true;
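For what it is worth, the unroll-factor selection the patch adds in
ix86_expand_movmem/ix86_expand_setmem boils down to the following standalone
C model (illustrative only; mode_size stands in for GET_MODE_SIZE (move_mode),
and a count of 0 means the byte count is not known at compile time):

#include <stdio.h>

/* Model of the patch's unroll-factor selection: grow the factor only while
   at least two full unrolled iterations still fit into the known byte
   count, capped at max_unroll (4, or 2 for the 32-bit GPR loop).  */
static unsigned int
pick_unroll_factor (unsigned long mode_size, unsigned long count,
                    unsigned int max_unroll)
{
  unsigned int unroll_factor = 1;

  if (count == 0)  /* count not known at compile time */
    return max_unroll;

  while (mode_size * unroll_factor * 2 < count
         && unroll_factor < max_unroll)
    unroll_factor *= 2;

  return unroll_factor;
}

int
main (void)
{
  printf ("%u\n", pick_unroll_factor (16, 40, 4));   /* 16-byte SSE moves, 40 bytes -> 2 */
  printf ("%u\n", pick_unroll_factor (16, 200, 4));  /* 200 bytes -> 4 */
  printf ("%u\n", pick_unroll_factor (16, 0, 4));    /* unknown count -> 4 */
  return 0;
}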