From patchwork Thu Nov 17 21:10:57 2011
X-Patchwork-Submitter: Jan Hubicka
X-Patchwork-Id: 126315
Date: Thu, 17 Nov 2011 22:10:57 +0100
From: Jan Hubicka
To: "H.J. Lu"
Cc: Jan Hubicka, Michael Zolotukhin, gcc-patches@gcc.gnu.org, izamyatin@gmail.com
Subject: Re: Memset/memcpy patch
Message-ID: <20111117211057.GE30593@kam.mff.cuni.cz>
References: <20111114170335.GA29877@kam.mff.cuni.cz>
In-Reply-To: 
User-Agent: Mutt/1.5.18 (2008-05-17)

> The current x86 memset/memcpy expansion is broken.  It miscompiles
> many programs, including GCC itself.  Should it be reverted for now?

There was a problem in the new code doing loopy epilogues.  I am currently
testing the following patch that should fix the problem.  We can either
revert now and I will apply a combined patch later, or I hope to fix it
tonight.

Honza
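For reference, the "loopy epilogue" mentioned above works roughly like this:
the unrolled main loop only handles full move_mode-sized pieces, and the
remaining tail is finished by a plain, not-unrolled loop that reuses the same
iterator, instead of a long byte-by-byte store sequence.  A rough standalone
C model of the idea (illustrative names only, this is not code from the
patch):

/* Illustrative sketch only -- not code from the patch.  Models the
   "loopy epilogue": the main loop stores full WORD-sized pieces and a
   plain (not unrolled) loop reuses the same index to finish the tail,
   instead of emitting one store per remaining byte.  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define WORD 8  /* bytes handled per main-loop iteration */

void
sketch_memset (unsigned char *dst, unsigned char val, size_t count)
{
  uint64_t wide = 0x0101010101010101ULL * val;  /* promoted value */
  size_t i = 0;

  /* Main loop: only full WORD-sized pieces.  */
  for (; i + WORD <= count; i += WORD)
    memcpy (dst + i, &wide, WORD);

  /* Loopy epilogue: the leftover 0..WORD-1 bytes.  */
  for (; i < count; i++)
    dst[i] = val;
}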
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 181442)
+++ config/i386/i386.h	(working copy)
@@ -276,6 +276,7 @@ enum ix86_tune_indices {
   X86_TUNE_PROMOTE_QIMODE,
   X86_TUNE_FAST_PREFIX,
   X86_TUNE_SINGLE_STRINGOP,
+  X86_TUNE_ALIGN_STRINGOP,
   X86_TUNE_QIMODE_MATH,
   X86_TUNE_HIMODE_MATH,
   X86_TUNE_PROMOTE_QI_REGS,
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 181442)
+++ config/i386/i386.md	(working copy)
@@ -15944,6 +15944,17 @@
 	      (clobber (reg:CC FLAGS_REG))])]
   ""
 {
+  rtx vec_reg;
+  enum machine_mode mode = GET_MODE (operands[2]);
+  if (vector_extensions_used_for_mode (mode)
+      && CONSTANT_P (operands[2]))
+    {
+      if (mode == DImode)
+	mode = TARGET_64BIT ? V2DImode : V4SImode;
+      vec_reg = gen_reg_rtx (mode);
+      emit_move_insn (vec_reg, operands[2]);
+      operands[2] = vec_reg;
+    }
   if (GET_MODE (operands[1]) != GET_MODE (operands[2]))
     operands[1] = adjust_address_nv (operands[1], GET_MODE (operands[2]), 0);
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 181442)
+++ config/i386/i386.c	(working copy)
@@ -1785,7 +1785,7 @@ struct processor_costs atom_cost = {
      if that fails.  */
   {{{libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}, /* Known alignment.  */
    {libcall, {{4096, sse_loop}, {4096, unrolled_loop}, {-1, libcall}}}},
-  {{libcall, {{-1, libcall}}}, /* Unknown alignment.  */
+  {{libcall, {{2048, sse_loop}, {2048, unrolled_loop}, {-1, libcall}}}, /* Unknown alignment.  */
    {libcall, {{2048, sse_loop}, {2048, unrolled_loop}, {-1, libcall}}}}},
@@ -2178,6 +2178,9 @@ static unsigned int initial_ix86_tune_fe
   /* X86_TUNE_SINGLE_STRINGOP */
   m_386 | m_P4_NOCONA,
 
+  /* X86_TUNE_ALIGN_STRINGOP */
+  ~(m_BDVER | m_CORE2I7),
+
   /* X86_TUNE_QIMODE_MATH */
   ~0,
 
@@ -3724,6 +3727,14 @@ ix86_option_override_internal (bool main
 	target_flags |= MASK_NO_RED_ZONE;
     }
 
+  if (!(target_flags_explicit & MASK_NO_ALIGN_STRINGOPS))
+    {
+      if (ix86_tune_features[X86_TUNE_ALIGN_STRINGOP] & ix86_tune_mask)
+	target_flags &= ~MASK_NO_ALIGN_STRINGOPS;
+      else
+	target_flags |= MASK_NO_ALIGN_STRINGOPS;
+    }
+
   /* Keep nonleaf frame pointers.  */
   if (flag_omit_frame_pointer)
     target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
@@ -21149,20 +21160,25 @@ expand_set_or_movmem_via_loop_with_iter
   top_label = gen_label_rtx ();
   out_label = gen_label_rtx ();
-  if (!reuse_iter)
-    iter = gen_reg_rtx (iter_mode);
-
   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
-			      NULL, 1, OPTAB_DIRECT);
-  /* Those two should combine.  */
-  if (piece_size == const1_rtx)
+			      NULL, 1, OPTAB_DIRECT);
+  if (!reuse_iter)
     {
-      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+      iter = gen_reg_rtx (iter_mode);
+      /* Those two should combine.  */
+      if (piece_size == const1_rtx)
+	{
+	  emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
+				   true, out_label);
+	  predict_jump (REG_BR_PROB_BASE * 10 / 100);
+	}
+      emit_move_insn (iter, const0_rtx);
+    }
+  else
+    {
+      emit_cmp_and_jump_insns (iter, size, GE, NULL_RTX, iter_mode,
 			       true, out_label);
-      predict_jump (REG_BR_PROB_BASE * 10 / 100);
     }
-  if (!reuse_iter)
-    emit_move_insn (iter, const0_rtx);
 
   emit_label (top_label);
@@ -21588,17 +21604,28 @@ expand_setmem_epilogue (rtx destmem, rtx
      Remaining part we'll move using Pmode and narrower modes.  */
   if (promoted_to_vector_value)
-    while (remainder_size >= 16)
-      {
-	if (GET_MODE (destmem) != move_mode)
-	  destmem = adjust_automodify_address_nv (destmem, move_mode,
-						  destptr, offset);
-	emit_strset (destmem, promoted_to_vector_value, destptr,
-		     move_mode, offset);
+    {
+      if (promoted_to_vector_value)
+	{
+	  if (max_size >= GET_MODE_SIZE (V4SImode))
+	    move_mode = V4SImode;
+	  else if (max_size >= GET_MODE_SIZE (DImode))
+	    move_mode = DImode;
+	}
+      while (remainder_size >= GET_MODE_SIZE (move_mode))
+	{
+	  if (GET_MODE (destmem) != move_mode)
+	    destmem = adjust_automodify_address_nv (destmem, move_mode,
+						    destptr, offset);
+	  emit_strset (destmem,
+		       promoted_to_vector_value,
+		       destptr,
+		       move_mode, offset);
-	offset += 16;
-	remainder_size -= 16;
-      }
+	  offset += GET_MODE_SIZE (move_mode);
+	  remainder_size -= GET_MODE_SIZE (move_mode);
+	}
+    }
 
   /* Move the remaining part of epilogue - its size might be
      a size of the widest mode.  */
@@ -22022,10 +22049,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
 				   || (memset
 				      ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
-#define ALG_USABLE_P(alg) (rep_prefix_usable			\
-			   || (alg != rep_prefix_1_byte		\
-			       && alg != rep_prefix_4_byte	\
-			       && alg != rep_prefix_8_byte))
+#define ALG_USABLE_P(alg) ((rep_prefix_usable			\
+			   || (alg != rep_prefix_1_byte		\
+			       && alg != rep_prefix_4_byte	\
+			       && alg != rep_prefix_8_byte))	\
+			   && (TARGET_SSE2 || alg != sse_loop))
 
   const struct processor_costs *cost;
   /* Even if the string operation call is cold, we still might spend a lot
@@ -22037,6 +22065,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
   else
     optimize_for_speed = true;
 
+  if (!optimize)
+    return (rep_prefix_usable ? rep_prefix_1_byte : libcall);
+
   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
 
   *dynamic_check = -1;
@@ -22049,10 +22080,10 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
   /* rep; movq or rep; movl is the smallest variant.  */
   else if (!optimize_for_speed)
     {
-      if (!count || (count & 3))
-	return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
+      if (!count || (count & 3) || memset)
+	return rep_prefix_usable ? rep_prefix_1_byte : libcall;
       else
-	return rep_prefix_usable ? rep_prefix_4_byte : loop;
+	return rep_prefix_usable ? rep_prefix_4_byte : libcall;
     }
   /* Very tiny blocks are best handled via the loop, REP is expensive to
      setup.  */
@@ -22106,13 +22137,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
       int max = -1;
       enum stringop_alg alg;
       int i;
-      bool any_alg_usable_p = true;
       bool only_libcall_fits = true;
 
       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
 	{
 	  enum stringop_alg candidate = algs->size[i].alg;
-	  any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
 
 	  if (candidate != libcall && candidate
 	      && ALG_USABLE_P (candidate))
@@ -22124,7 +22153,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
       /* If there aren't any usable algorithms, then recursing on
	 smaller sizes isn't going to find anything.  Just return the
	 simple byte-at-a-time copy loop.  */
-      if (!any_alg_usable_p || only_libcall_fits)
+      if (only_libcall_fits)
 	{
 	  /* Pick something reasonable.  */
 	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
@@ -22253,7 +22282,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
   int dynamic_check;
   bool need_zero_guard = false;
   bool align_unknown;
-  int unroll_factor;
+  unsigned int unroll_factor;
   enum machine_mode move_mode;
   rtx loop_iter = NULL_RTX;
   int dst_offset, src_offset;
@@ -22316,14 +22345,27 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     case unrolled_loop:
       need_zero_guard = true;
       move_mode = Pmode;
-      unroll_factor = TARGET_64BIT ? 4 : 2;
+      unroll_factor = 1;
+      /* Select maximal available 1,2 or 4 unroll factor.
+	 In 32bit we can not afford to use 4 registers inside the loop.  */
+      if (!count)
+	unroll_factor = TARGET_64BIT ? 4 : 2;
+      else
+	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+	       && unroll_factor < (TARGET_64BIT ? 4 : 2))
+	  unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case sse_loop:
       need_zero_guard = true;
       /* Use SSE instructions, if possible.  */
-      move_mode = align_unknown ? DImode : V4SImode;
-      unroll_factor = TARGET_64BIT ? 4 : 2;
+      move_mode = V4SImode;
+      unroll_factor = 1;
+      /* Select maximal available 1,2 or 4 unroll factor.  */
+      if (!count)
+	unroll_factor = 4;
+      else
+	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+	       && unroll_factor < 4)
+	  unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case rep_prefix_8_byte:
@@ -22568,7 +22610,13 @@ ix86_expand_movmem (rtx dst, rtx src, rt
   if (alg == sse_loop || alg == unrolled_loop)
     {
       rtx tmp;
-      if (align_unknown && unroll_factor > 1)
+      int remainder_size = epilogue_size_needed;
+
+      /* We may not need the epilogue loop at all when the count is known
+	 and alignment is not adjusted.  */
+      if (count && desired_align <= align)
+	remainder_size = count & epilogue_size_needed;
+      if (remainder_size > GET_MODE_SIZE (move_mode) * 2)
 	{
 	  /* Reduce epilogue's size by creating not-unrolled loop.  If we won't
	     do this, we can have very big epilogue - when alignment is statically
	     unknown we'll have the epilogue byte by byte which may be very slow.  */
@@ -22764,6 +22812,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
   unsigned int unroll_factor;
   enum machine_mode move_mode;
   rtx loop_iter = NULL_RTX;
+  bool early_jump = false;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22813,9 +22862,12 @@ ix86_expand_setmem (rtx dst, rtx count_e
       move_mode = Pmode;
       unroll_factor = 1;
       /* Select maximal available 1,2 or 4 unroll factor.  */
-      while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
-	     && unroll_factor < 4)
-	unroll_factor *= 2;
+      if (!count)
+	unroll_factor = 4;
+      else
+	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+	       && unroll_factor < 4)
+	  unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case sse_loop:
@@ -22823,9 +22875,12 @@ ix86_expand_setmem (rtx dst, rtx count_e
       move_mode = TARGET_64BIT ? V2DImode : V4SImode;
       unroll_factor = 1;
       /* Select maximal available 1,2 or 4 unroll factor.  */
-      while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
-	     && unroll_factor < 4)
-	unroll_factor *= 2;
+      if (!count)
+	unroll_factor = 4;
+      else
+	while (GET_MODE_SIZE (move_mode) * unroll_factor * 2 < count
+	       && unroll_factor < 4)
+	  unroll_factor *= 2;
       size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
       break;
     case rep_prefix_8_byte:
@@ -22904,6 +22959,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
	  emit_move_insn (loop_iter, const0_rtx);
	}
       label = gen_label_rtx ();
+      early_jump = true;
       emit_cmp_and_jump_insns (count_exp, GEN_INT (epilogue_size_needed),
			       LTU, 0, counter_mode (count_exp), 1, label);
@@ -23065,21 +23121,26 @@ ix86_expand_setmem (rtx dst, rtx count_e
       LABEL_NUSES (label) = 1;
       /* We can not rely on fact that promoved value is known. */
       vec_promoted_val = 0;
-      gpr_promoted_val = 0;
+      if (early_jump)
+	gpr_promoted_val = 0;
     }
  epilogue:
   if (alg == unrolled_loop || alg == sse_loop)
     {
       rtx tmp;
-      if (align_unknown && unroll_factor > 1
-	  && epilogue_size_needed >= GET_MODE_SIZE (move_mode)
-	  && vec_promoted_val)
+      int remainder_size = epilogue_size_needed;
+      if (count && desired_align <= align)
+	remainder_size = count & epilogue_size_needed;
+      /* We may not need the epilogue loop at all when the count is known
+	 and alignment is not adjusted.  */
+      if (remainder_size > GET_MODE_SIZE (move_mode) * 2
+	  && (alg == sse_loop ? vec_promoted_val : gpr_promoted_val))
	{
	  /* Reduce epilogue's size by creating not-unrolled loop.  If we won't
	     do this, we can have very big epilogue - when alignment is statically
	     unknown we'll have the epilogue byte by byte which may be very slow.  */
	  loop_iter = expand_set_or_movmem_via_loop_with_iter (dst, NULL, destreg,
-	      NULL, vec_promoted_val, count_exp,
+	      NULL, (alg == sse_loop ? vec_promoted_val : gpr_promoted_val), count_exp,
	      loop_iter, move_mode, 1,
	      expected_size, false);
	  dst = change_address (dst, BLKmode, destreg);
@@ -23090,17 +23151,14 @@ ix86_expand_setmem (rtx dst, rtx count_e
	  if (tmp != destreg)
	    emit_move_insn (destreg, tmp);
	}
-  if (count_exp == const0_rtx)
+  if (count_exp == const0_rtx || epilogue_size_needed <= 1)
     ;
-  else if (!gpr_promoted_val && epilogue_size_needed > 1)
+  else if (!gpr_promoted_val)
     expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
				     epilogue_size_needed);
   else
-    {
-      if (epilogue_size_needed > 1)
-	expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
-				val_exp, count_exp, epilogue_size_needed);
-    }
+    expand_setmem_epilogue (dst, destreg, vec_promoted_val, gpr_promoted_val,
+			    val_exp, count_exp, epilogue_size_needed);
   if (jump_around_label)
     emit_label (jump_around_label);
   return true;
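For what it is worth, the unroll-factor selection the patch adds in
ix86_expand_movmem/ix86_expand_setmem boils down to the following standalone
C model (illustrative only; mode_size stands in for GET_MODE_SIZE (move_mode),
and a count of 0 means the byte count is not known at compile time):

#include <stdio.h>

/* Model of the patch's unroll-factor selection: grow the factor only while
   at least two full unrolled iterations still fit into the known byte
   count, capped at max_unroll (4, or 2 for the 32-bit GPR loop).  */
static unsigned int
pick_unroll_factor (unsigned long mode_size, unsigned long count,
                    unsigned int max_unroll)
{
  unsigned int unroll_factor = 1;

  if (count == 0)  /* count not known at compile time */
    return max_unroll;

  while (mode_size * unroll_factor * 2 < count
         && unroll_factor < max_unroll)
    unroll_factor *= 2;

  return unroll_factor;
}

int
main (void)
{
  printf ("%u\n", pick_unroll_factor (16, 40, 4));   /* 16-byte SSE moves, 40 bytes -> 2 */
  printf ("%u\n", pick_unroll_factor (16, 200, 4));  /* 200 bytes -> 4 */
  printf ("%u\n", pick_unroll_factor (16, 0, 4));    /* unknown count -> 4 */
  return 0;
}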