Patchwork [x86] Use vector moves in memmove expanding

login
register
mail settings
Submitter Jan Hubicka
Date April 17, 2013, 2:21 p.m.
Message ID <20130417142100.GA10525@kam.mff.cuni.cz>
Download mbox | patch
Permalink /patch/237235/
State New
Headers show

Comments

Jan Hubicka - April 17, 2013, 2:21 p.m.
> 
> Bootstrap/make check/Specs2k are passing on i686 and x86_64.
Thanks for returning to this!

glibc has a quite comprehensive testsuite for stringops.  It may be useful to test it
with -minline-all-stringops -mstringop-strategy=vector

I tested the patch on my core notebook and my memcpy micro benchmark. 
The vector loop is not a win since apparently we do not produce any SSE code for 64-bit
compilation. What CPUs and block sizes is this intended for?

Also the internal loop with -march=native seems to come out as:
.L7:
        movq    (%rsi,%r8), %rax
        movq    8(%rsi,%r8), %rdx
        movq    48(%rsi,%r8), %r9
        movq    56(%rsi,%r8), %r10
        movdqu  16(%rsi,%r8), %xmm3
        movdqu  32(%rsi,%r8), %xmm1
        movq    %rax, (%rdi,%r8)
        movq    %rdx, 8(%rdi,%r8)
        movdqa  %xmm3, 16(%rdi,%r8)
        movdqa  %xmm1, 32(%rdi,%r8)
        movq    %r9, 48(%rdi,%r8)
        movq    %r10, 56(%rdi,%r8)
        addq    $64, %r8
        cmpq    %r11, %r8

It is not that much of an SSE enablement since the RA seems to home the vars in integer regs.
Could you please look into it?
> 
> Changelog entry:
> 
> 2013-04-10  Michael Zolotukhin  <michael.v.zolotukhin@gmail.com>
> 
>         * config/i386/i386-opts.h (enum stringop_alg): Add vector_loop.
>         * config/i386/i386.c (expand_set_or_movmem_via_loop): Use
>         adjust_address instead of change_address to keep info about alignment.
>         (emit_strmov): Remove.
>         (emit_memmov): New function.
>         (expand_movmem_epilogue): Refactor to properly handle bigger sizes.
>         (expand_movmem_epilogue): Likewise and return updated rtx for
>         destination.
>         (expand_constant_movmem_prologue): Likewise and return updated rtx for
>         destination and source.
>         (decide_alignment): Refactor, handle vector_loop.
>         (ix86_expand_movmem): Likewise.
>         (ix86_expand_setmem): Likewise.
>         * config/i386/i386.opt (Enum): Add vector_loop to option stringop_alg.
>         * emit-rtl.c (get_mem_align_offset): Compute alignment for MEM_REF.

+    }
   else
     return -1;
 
This change ought to go in independently. I cannot review it. 
I will make a first look over the patch shortly, but please send an updated patch fixing
the problem with integer regs.

Honza
Jan Hubicka - April 17, 2013, 3:12 p.m.
@@ -2392,6 +2392,7 @@ static void ix86_set_current_function (tree);
 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
 
 static enum calling_abi ix86_function_abi (const_tree);
+static int smallest_pow2_greater_than (int);

Perhaps it is easier to use existing 1<<ceil_log2?
 
 
 #ifndef SUBTARGET32_DEFAULT_CPU
@@ -21829,11 +21830,10 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
 {
   rtx out_label, top_label, iter, tmp;
   enum machine_mode iter_mode = counter_mode (count);
-  rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
+  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
+  rtx piece_size = GEN_INT (piece_size_n);
   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
   rtx size;
-  rtx x_addr;
-  rtx y_addr;
   int i;
 
   top_label = gen_label_rtx ();
@@ -21854,13 +21854,18 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
   emit_label (top_label);
 
   tmp = convert_modes (Pmode, iter_mode, iter, true);
-  x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
-  destmem = change_address (destmem, mode, x_addr);
+
+  /* This assert could be relaxed - in this case we'll need to compute
+     smallest power of two, containing in PIECE_SIZE_N and pass it to
+     offset_address.  */
+  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
+  destmem = offset_address (destmem, tmp, piece_size_n);
+  destmem = adjust_address (destmem, mode, 0);
 
   if (srcmem)
     {
-      y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
-      srcmem = change_address (srcmem, mode, y_addr);
+      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
+      srcmem = adjust_address (srcmem, mode, 0);
 
       /* When unrolling for chips that reorder memory reads and writes,
 	 we can save registers by using single temporary.
@@ -22039,13 +22044,61 @@ expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
   emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
 }

This change looks OK and can go into mainline independently. Just please ensure that changing
the way the address is computed does not make us preserve the alias set. Memmove cannot rely on the alias
set of the source/destination objects.
 
-static void
-emit_strmov (rtx destmem, rtx srcmem,
-	     rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
-{
-  rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
-  rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
-  emit_insn (gen_strmov (destptr, dest, srcptr, src));
+/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
+   DESTMEM.
+   SRC is passed by pointer to be updated on return.
+   Return value is updated DST.  */
+static rtx
+emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
+	     HOST_WIDE_INT size_to_move)
+{
+  rtx dst = destmem, src = *srcmem, adjust, tempreg;
+  enum insn_code code;
+  enum machine_mode move_mode;
+  int piece_size, i;
+
+  /* Find the widest mode in which we could perform moves.
+     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
+     it until move of such size is supported.  */
+  piece_size = smallest_pow2_greater_than (size_to_move) >> 1;
+  move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);

I suppose this is a problem with SSE moves ending up in integer registers, since
you get TImode rather than a vectorized mode, like V8QImode, in here.  Why not stick
with the original mode parameter?
+  code = optab_handler (mov_optab, move_mode);
+  while (code == CODE_FOR_nothing && piece_size > 1)
+    {
+      piece_size >>= 1;
+      move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
+      code = optab_handler (mov_optab, move_mode);
+    }
+  gcc_assert (code != CODE_FOR_nothing);
+
+  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
+  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
+
+  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
+  gcc_assert (size_to_move % piece_size == 0);
+  adjust = GEN_INT (piece_size);
+  for (i = 0; i < size_to_move; i += piece_size)
+    {
+      /* We move from memory to memory, so we'll need to do it via
+	 a temporary register.  */
+      tempreg = gen_reg_rtx (move_mode);
+      emit_insn (GEN_FCN (code) (tempreg, src));
+      emit_insn (GEN_FCN (code) (dst, tempreg));
+
+      emit_move_insn (destptr,
+		      gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
+      emit_move_insn (srcptr,
+		      gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
+
+      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
+					  piece_size);
+      src = adjust_automodify_address_nv (src, move_mode, srcptr,
+					  piece_size);
+    }
+
+  /* Update DST and SRC rtx.  */
+  *srcmem = src;
+  return dst;

Doesn't this effectively kill support for TARGET_SINGLE_STRINGOP? It is useful as
size optimization.
 }
 
 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
@@ -22057,44 +22110,17 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
   if (CONST_INT_P (count))
     {
       HOST_WIDE_INT countval = INTVAL (count);
-      int offset = 0;
+      HOST_WIDE_INT epilogue_size = countval % max_size;
+      int i;
 
-      if ((countval & 0x10) && max_size > 16)
-	{
-	  if (TARGET_64BIT)
-	    {
-	      emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
-	      emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
-	    }
-	  else
-	    gcc_unreachable ();
-	  offset += 16;
-	}
-      if ((countval & 0x08) && max_size > 8)
-	{
-	  if (TARGET_64BIT)
-	    emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
-	  else
-	    {
-	      emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
-	      emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
-	    }
-	  offset += 8;
-	}
-      if ((countval & 0x04) && max_size > 4)
+      /* For now MAX_SIZE should be a power of 2.  This assert could be
+	 relaxed, but it'll require a bit more complicated epilogue
+	 expanding.  */
+      gcc_assert ((max_size & (max_size - 1)) == 0);
+      for (i = max_size; i >= 1; i >>= 1)
 	{
-          emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
-	  offset += 4;
-	}
-      if ((countval & 0x02) && max_size > 2)
-	{
-          emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
-	  offset += 2;
-	}
-      if ((countval & 0x01) && max_size > 1)
-	{
-          emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
-	  offset += 1;
+	  if (epilogue_size & i)
+	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
 	}
       return;
     }
@@ -22330,47 +22356,33 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_
 }
 
 /* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to
-   DESIRED_ALIGNMENT.  */
-static void
+   DESIRED_ALIGNMENT.
+   Return value is updated DESTMEM.  */
+static rtx
 expand_movmem_prologue (rtx destmem, rtx srcmem,
 			rtx destptr, rtx srcptr, rtx count,
 			int align, int desired_alignment)
 {
-  if (align <= 1 && desired_alignment > 1)
-    {
-      rtx label = ix86_expand_aligntest (destptr, 1, false);
-      srcmem = change_address (srcmem, QImode, srcptr);
-      destmem = change_address (destmem, QImode, destptr);
-      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
-      ix86_adjust_counter (count, 1);
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
-    }
-  if (align <= 2 && desired_alignment > 2)
-    {
-      rtx label = ix86_expand_aligntest (destptr, 2, false);
-      srcmem = change_address (srcmem, HImode, srcptr);
-      destmem = change_address (destmem, HImode, destptr);
-      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
-      ix86_adjust_counter (count, 2);
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
-    }
-  if (align <= 4 && desired_alignment > 4)
+  int i;
+  for (i = 1; i < desired_alignment; i <<= 1)
     {
-      rtx label = ix86_expand_aligntest (destptr, 4, false);
-      srcmem = change_address (srcmem, SImode, srcptr);
-      destmem = change_address (destmem, SImode, destptr);
-      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
-      ix86_adjust_counter (count, 4);
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
+      if (align <= i)
+	{
+	  rtx label = ix86_expand_aligntest (destptr, i, false);
+	  destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
+	  ix86_adjust_counter (count, i);
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
+	}
     }
-  gcc_assert (desired_alignment <= 8);
+  return destmem;
 }
 
 /* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN.
-   ALIGN_BYTES is how many bytes need to be copied.  */
+   ALIGN_BYTES is how many bytes need to be copied.
+   The function updates DST and SRC, namely, it sets proper alignment.
+   DST is returned via return value, SRC is updated via pointer SRCP.  */
 static rtx
 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
 				 int desired_align, int align_bytes)
@@ -22378,62 +22390,34 @@ expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
   rtx src = *srcp;
   rtx orig_dst = dst;
   rtx orig_src = src;
-  int off = 0;
+  int piece_size = 1;
+  int copied_bytes = 0;
   int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
   if (src_align_bytes >= 0)
     src_align_bytes = desired_align - src_align_bytes;
-  if (align_bytes & 1)
-    {
-      dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
-      src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
-      off = 1;
-      emit_insn (gen_strmov (destreg, dst, srcreg, src));
-    }
-  if (align_bytes & 2)
-    {
-      dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
-      src = adjust_automodify_address_nv (src, HImode, srcreg, off);
-      if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
-	set_mem_align (dst, 2 * BITS_PER_UNIT);
-      if (src_align_bytes >= 0
-	  && (src_align_bytes & 1) == (align_bytes & 1)
-	  && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
-	set_mem_align (src, 2 * BITS_PER_UNIT);
-      off = 2;
-      emit_insn (gen_strmov (destreg, dst, srcreg, src));
-    }
-  if (align_bytes & 4)
+
+  for (piece_size = 1;
+       piece_size <= desired_align && copied_bytes < align_bytes;
+       piece_size <<= 1)
     {
-      dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
-      src = adjust_automodify_address_nv (src, SImode, srcreg, off);
-      if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
-	set_mem_align (dst, 4 * BITS_PER_UNIT);
-      if (src_align_bytes >= 0)
+      if (align_bytes & piece_size)
 	{
-	  unsigned int src_align = 0;
-	  if ((src_align_bytes & 3) == (align_bytes & 3))
-	    src_align = 4;
-	  else if ((src_align_bytes & 1) == (align_bytes & 1))
-	    src_align = 2;
-	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
-	    set_mem_align (src, src_align * BITS_PER_UNIT);
+	  dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
+	  copied_bytes += piece_size;
 	}
-      off = 4;
-      emit_insn (gen_strmov (destreg, dst, srcreg, src));
     }
-  dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
-  src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
+
   if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
     set_mem_align (dst, desired_align * BITS_PER_UNIT);
   if (src_align_bytes >= 0)
     {
-      unsigned int src_align = 0;
-      if ((src_align_bytes & 7) == (align_bytes & 7))
-	src_align = 8;
-      else if ((src_align_bytes & 3) == (align_bytes & 3))
-	src_align = 4;
-      else if ((src_align_bytes & 1) == (align_bytes & 1))
-	src_align = 2;
+      unsigned int src_align;
+      for (src_align = desired_align; src_align >= 2; src_align >>= 1)
+	{
+	  if ((src_align_bytes & (src_align - 1))
+	       == (align_bytes & (src_align - 1)))
+	    break;
+	}
       if (src_align > (unsigned int) desired_align)
 	src_align = desired_align;
       if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
@@ -22666,42 +22650,24 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
 static int
 decide_alignment (int align,
 		  enum stringop_alg alg,
-		  int expected_size)
+		  int expected_size,
+		  enum machine_mode move_mode)
 {
   int desired_align = 0;
-  switch (alg)
-    {
-      case no_stringop:
-	gcc_unreachable ();
-      case loop:
-      case unrolled_loop:
-	desired_align = GET_MODE_SIZE (Pmode);
-	break;
-      case rep_prefix_8_byte:
-	desired_align = 8;
-	break;
-      case rep_prefix_4_byte:
-	/* PentiumPro has special logic triggering for 8 byte aligned blocks.
-	   copying whole cacheline at once.  */
-	if (TARGET_PENTIUMPRO)
-	  desired_align = 8;
-	else
-	  desired_align = 4;
-	break;
-      case rep_prefix_1_byte:
-	/* PentiumPro has special logic triggering for 8 byte aligned blocks.
-	   copying whole cacheline at once.  */
-	if (TARGET_PENTIUMPRO)
-	  desired_align = 8;
-	else
-	  desired_align = 1;
-	break;
-      case loop_1_byte:
-	desired_align = 1;
-	break;
-      case libcall:
-	return 0;
-    }
+
+  gcc_assert (alg != no_stringop);
+
+  if (alg == libcall)
+    return 0;
+  if (move_mode == VOIDmode)
+    return 0;
+
+  desired_align = GET_MODE_SIZE (move_mode);
+  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
+     copying whole cacheline at once.  */
+  if (TARGET_PENTIUMPRO
+      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
+    desired_align = 8;
 
   if (optimize_size)
     desired_align = 1;
@@ -22709,6 +22675,7 @@ decide_alignment (int align,
     desired_align = align;
   if (expected_size != -1 && expected_size < 4)
     desired_align = align;
+
   return desired_align;
 }
 
@@ -22765,6 +22732,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
   int dynamic_check;
   bool need_zero_guard = false;
   bool noalign;
+  enum machine_mode move_mode = VOIDmode;
+  int unroll_factor = 1;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -22788,50 +22757,60 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
 
   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
-
   alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
-  desired_align = decide_alignment (align, alg, expected_size);
-
-  if (!TARGET_ALIGN_STRINGOPS || noalign)
-    align = desired_align;
-
   if (alg == libcall)
     return false;
   gcc_assert (alg != no_stringop);
+
   if (!count)
     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
   destreg = copy_addr_to_reg (XEXP (dst, 0));
   srcreg = copy_addr_to_reg (XEXP (src, 0));
+
+  unroll_factor = 1;
+  move_mode = word_mode;
   switch (alg)
     {
     case libcall:
     case no_stringop:
       gcc_unreachable ();
+    case loop_1_byte:
+      need_zero_guard = true;
+      move_mode = QImode;
+      break;
     case loop:
       need_zero_guard = true;
-      size_needed = GET_MODE_SIZE (word_mode);
       break;
     case unrolled_loop:
       need_zero_guard = true;
-      size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
+      unroll_factor = (TARGET_64BIT ? 4 : 2);
+      break;
+    case vector_loop:
+      need_zero_guard = true;
+      unroll_factor = 4;
+      /* Find the widest supported mode.  */
+      move_mode = Pmode;
+      while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
+	     != CODE_FOR_nothing)
+	  move_mode = GET_MODE_WIDER_MODE (move_mode);
       break;
     case rep_prefix_8_byte:
-      size_needed = 8;
+      move_mode = DImode;
       break;
     case rep_prefix_4_byte:
-      size_needed = 4;
+      move_mode = SImode;
       break;
     case rep_prefix_1_byte:
-      size_needed = 1;
-      break;
-    case loop_1_byte:
-      need_zero_guard = true;
-      size_needed = 1;
+      move_mode = QImode;
       break;
     }
-
+  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
   epilogue_size_needed = size_needed;
 
+  desired_align = decide_alignment (align, alg, expected_size, move_mode);
 
+  desired_align = decide_alignment (align, alg, expected_size, move_mode);
+  if (!TARGET_ALIGN_STRINGOPS || noalign)
+    align = desired_align;
+

For SSE codegen, won't we need to track whether the destination was aligned, to generate aligned/unaligned moves?

Otherwise the patch seems reasonable.  Thanks for submitting it in pieces so the review is easier.

Honza
Michael Zolotukhin - April 18, 2013, 11:53 a.m.
Hi,
Jan, thanks for the review, I hope to prepare an updated version of the patch
shortly.  Please see my answers to your comments below.

Uros, there is a question of a better approach for generation of wide moves.
Could you please comment it (see details in bullets 3 and 5)?

1.
> +static int smallest_pow2_greater_than (int);
>
> Perhaps it is easier to use existing 1<<ceil_log2?
Well, yep.  Actually, this routine has already been used there, so I continued
using it.  I guess we could change its implementation to call
ceil_log2/floor_log2 or remove it entirely.

2.
> -      y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
> -      srcmem = change_address (srcmem, mode, y_addr);
> +      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
> +      srcmem = adjust_address (srcmem, mode, 0);
> ...
> This change looks OK and can go into manline independnetly. Just please ensure that changing
> the way address is computed is not making us to preserve alias set. Memmove can not rely on the alias
> set of the src/destination objects.
Could you explain it in more details?  Do you mean that at the beginning DST
and SRC could point to one memory location and have corresponding alias sets,
and I just change addresses they point to without invalidating alias sets?
I haven't thought about this, and that seems like a possible bug, but I guess
it could be simply fixed by calling change_address at the end.

3.
> +  /* Find the widest mode in which we could perform moves.
> +     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
> +     it until move of such size is supported.  */
> +  piece_size = smallest_pow2_greater_than (size_to_move) >> 1;
> +  move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
>
> I suppose this is a problem with SSE moves ending up in integer register, since
> you get TImode rather than vectorized mode, like V8QImode in here.  Why not stick
> with the original mode parmaeter?
Yes, here we choose TImode instead of a vector mode, but that actually was done
intentionally.  I tried several approaches here and decided that using the
widest integer mode is the best one for now.  We could try to find out a
particular (vector)mode in which we want to perform copying, but isn't it better
to rely on a machine-description here?  My idea here was to just request a copy
of, for instance, 128-bit piece (i.e. one TI-move) and leave it to the compiler
to choose the most optimal way of performing it.  Currently, the compiler thinks
that move of 128bits should be splitted into two moves of 64-bits (this
transformation is done in split2 pass) - if it's actually not so optimal, we
should fix it there, IMHO.

I think Uros could give me an advice on whether it's a reasonable approach or it
should be changed.

Also, I tried to avoid such fixes in this patch - that doesn't mean I'm not
going to work on the fixes, quite the contrary.  But it'd be easier to work on
them if we have a code in the trunk that could reveal the problem.

4.
> Doesn't this effectively kill support for TARGET_SINGLE_STRINGOP? It is useful as
> size optimization.
Do you mean removing emit_strmov?  I don't think it'll kill anything, as new
emit_memmov is capable of doing what emit_strmov did and is just an extended
version of it.  BTW, under TARGET_SINGLE_STRINGOP switch gen_strmov is used, not
emit_strmov - behaviour there hasn't been changed by this patch.

5.
> For SSE codegen, won't we need to track down in destination was aligned to generate aligned/unaligned moves?
We try to achieve a required alignment by prologue, so in the main loop
destination is aligned properly.  Source, meanwhile, could be misaligned, so for
it unaligned moves could be generated.  Here I actually also rely on the fact
that we have an optimal description of aligned/unaligned moves in MD-file, i.e.
if it's better to emit two DI-moves instead of one unaligned TI-mode, then
splits/expands will manage to do that.

6.
> +  else if (TREE_CODE (expr) == MEM_REF)
> +    {
> +      tree base = TREE_OPERAND (expr, 0);
> +      tree byte_offset = TREE_OPERAND (expr, 1);
> +      if (TREE_CODE (base) != ADDR_EXPR
> +         || TREE_CODE (byte_offset) != INTEGER_CST)
> +       return -1;
> +      if (!DECL_P (TREE_OPERAND (base, 0))
> +         || DECL_ALIGN (TREE_OPERAND (base, 0)) < align)
>
> You can use TYPE_ALIGN here? In general can't we replace all the GIMPLE
> handling by get_object_alignment?
>
> +       return -1;
> +      offset += tree_low_cst (byte_offset, 1);
> +    }
>    else
>      return -1;
>
> This change out to go independently. I can not review it.
> I will make first look over the patch shortly, but please send updated patch fixing
> the problem with integer regs.
Actually, I don't know what is the right way to find out alignment, but the
existing one didn't work.  Routine get_mem_align_offset didn't handle MEM_REFs
at all, so I added some handling there - I'm not sure it's complete and
absolutely correct, but it currently works for me.  I'd be glad to hear any
suggestions on how that should be done - whom should I ask about it?

---
Thanks, Michael


On 17 April 2013 19:12, Jan Hubicka <hubicka@ucw.cz> wrote:
> @@ -2392,6 +2392,7 @@ static void ix86_set_current_function (tree);
>  static unsigned int ix86_minimum_incoming_stack_boundary (bool);
>
>  static enum calling_abi ix86_function_abi (const_tree);
> +static int smallest_pow2_greater_than (int);
>
> Perhaps it is easier to use existing 1<<ceil_log2?
>
>
>  #ifndef SUBTARGET32_DEFAULT_CPU
> @@ -21829,11 +21830,10 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
>  {
>    rtx out_label, top_label, iter, tmp;
>    enum machine_mode iter_mode = counter_mode (count);
> -  rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
> +  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
> +  rtx piece_size = GEN_INT (piece_size_n);
>    rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
>    rtx size;
> -  rtx x_addr;
> -  rtx y_addr;
>    int i;
>
>    top_label = gen_label_rtx ();
> @@ -21854,13 +21854,18 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
>    emit_label (top_label);
>
>    tmp = convert_modes (Pmode, iter_mode, iter, true);
> -  x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
> -  destmem = change_address (destmem, mode, x_addr);
> +
> +  /* This assert could be relaxed - in this case we'll need to compute
> +     smallest power of two, containing in PIECE_SIZE_N and pass it to
> +     offset_address.  */
> +  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
> +  destmem = offset_address (destmem, tmp, piece_size_n);
> +  destmem = adjust_address (destmem, mode, 0);
>
>    if (srcmem)
>      {
> -      y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
> -      srcmem = change_address (srcmem, mode, y_addr);
> +      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
> +      srcmem = adjust_address (srcmem, mode, 0);
>
>        /* When unrolling for chips that reorder memory reads and writes,
>          we can save registers by using single temporary.
> @@ -22039,13 +22044,61 @@ expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
>    emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
>  }
>
> This change looks OK and can go into manline independnetly. Just please ensure that changing
> the way address is computed is not making us to preserve alias set. Memmove can not rely on the alias
> set of the src/destination objects.
>
> -static void
> -emit_strmov (rtx destmem, rtx srcmem,
> -            rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
> -{
> -  rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
> -  rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
> -  emit_insn (gen_strmov (destptr, dest, srcptr, src));
> +/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
> +   DESTMEM.
> +   SRC is passed by pointer to be updated on return.
> +   Return value is updated DST.  */
> +static rtx
> +emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
> +            HOST_WIDE_INT size_to_move)
> +{
> +  rtx dst = destmem, src = *srcmem, adjust, tempreg;
> +  enum insn_code code;
> +  enum machine_mode move_mode;
> +  int piece_size, i;
> +
> +  /* Find the widest mode in which we could perform moves.
> +     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
> +     it until move of such size is supported.  */
> +  piece_size = smallest_pow2_greater_than (size_to_move) >> 1;
> +  move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
>
> I suppose this is a problem with SSE moves ending up in integer register, since
> you get TImode rather than vectorized mode, like V8QImode in here.  Why not stick
> with the original mode parmaeter?
> +  code = optab_handler (mov_optab, move_mode);
> +  while (code == CODE_FOR_nothing && piece_size > 1)
> +    {
> +      piece_size >>= 1;
> +      move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
> +      code = optab_handler (mov_optab, move_mode);
> +    }
> +  gcc_assert (code != CODE_FOR_nothing);
> +
> +  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
> +  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
> +
> +  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
> +  gcc_assert (size_to_move % piece_size == 0);
> +  adjust = GEN_INT (piece_size);
> +  for (i = 0; i < size_to_move; i += piece_size)
> +    {
> +      /* We move from memory to memory, so we'll need to do it via
> +        a temporary register.  */
> +      tempreg = gen_reg_rtx (move_mode);
> +      emit_insn (GEN_FCN (code) (tempreg, src));
> +      emit_insn (GEN_FCN (code) (dst, tempreg));
> +
> +      emit_move_insn (destptr,
> +                     gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
> +      emit_move_insn (srcptr,
> +                     gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
> +
> +      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
> +                                         piece_size);
> +      src = adjust_automodify_address_nv (src, move_mode, srcptr,
> +                                         piece_size);
> +    }
> +
> +  /* Update DST and SRC rtx.  */
> +  *srcmem = src;
> +  return dst;
>
> Doesn't this effectively kill support for TARGET_SINGLE_STRINGOP? It is useful as
> size optimization.
>  }
>
>  /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
> @@ -22057,44 +22110,17 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
>    if (CONST_INT_P (count))
>      {
>        HOST_WIDE_INT countval = INTVAL (count);
> -      int offset = 0;
> +      HOST_WIDE_INT epilogue_size = countval % max_size;
> +      int i;
>
> -      if ((countval & 0x10) && max_size > 16)
> -       {
> -         if (TARGET_64BIT)
> -           {
> -             emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
> -             emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
> -           }
> -         else
> -           gcc_unreachable ();
> -         offset += 16;
> -       }
> -      if ((countval & 0x08) && max_size > 8)
> -       {
> -         if (TARGET_64BIT)
> -           emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
> -         else
> -           {
> -             emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
> -             emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
> -           }
> -         offset += 8;
> -       }
> -      if ((countval & 0x04) && max_size > 4)
> +      /* For now MAX_SIZE should be a power of 2.  This assert could be
> +        relaxed, but it'll require a bit more complicated epilogue
> +        expanding.  */
> +      gcc_assert ((max_size & (max_size - 1)) == 0);
> +      for (i = max_size; i >= 1; i >>= 1)
>         {
> -          emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
> -         offset += 4;
> -       }
> -      if ((countval & 0x02) && max_size > 2)
> -       {
> -          emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
> -         offset += 2;
> -       }
> -      if ((countval & 0x01) && max_size > 1)
> -       {
> -          emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
> -         offset += 1;
> +         if (epilogue_size & i)
> +           destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
>         }
>        return;
>      }
> @@ -22330,47 +22356,33 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_
>  }
>
>  /* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to
> -   DESIRED_ALIGNMENT.  */
> -static void
> +   DESIRED_ALIGNMENT.
> +   Return value is updated DESTMEM.  */
> +static rtx
>  expand_movmem_prologue (rtx destmem, rtx srcmem,
>                         rtx destptr, rtx srcptr, rtx count,
>                         int align, int desired_alignment)
>  {
> -  if (align <= 1 && desired_alignment > 1)
> -    {
> -      rtx label = ix86_expand_aligntest (destptr, 1, false);
> -      srcmem = change_address (srcmem, QImode, srcptr);
> -      destmem = change_address (destmem, QImode, destptr);
> -      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
> -      ix86_adjust_counter (count, 1);
> -      emit_label (label);
> -      LABEL_NUSES (label) = 1;
> -    }
> -  if (align <= 2 && desired_alignment > 2)
> -    {
> -      rtx label = ix86_expand_aligntest (destptr, 2, false);
> -      srcmem = change_address (srcmem, HImode, srcptr);
> -      destmem = change_address (destmem, HImode, destptr);
> -      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
> -      ix86_adjust_counter (count, 2);
> -      emit_label (label);
> -      LABEL_NUSES (label) = 1;
> -    }
> -  if (align <= 4 && desired_alignment > 4)
> +  int i;
> +  for (i = 1; i < desired_alignment; i <<= 1)
>      {
> -      rtx label = ix86_expand_aligntest (destptr, 4, false);
> -      srcmem = change_address (srcmem, SImode, srcptr);
> -      destmem = change_address (destmem, SImode, destptr);
> -      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
> -      ix86_adjust_counter (count, 4);
> -      emit_label (label);
> -      LABEL_NUSES (label) = 1;
> +      if (align <= i)
> +       {
> +         rtx label = ix86_expand_aligntest (destptr, i, false);
> +         destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
> +         ix86_adjust_counter (count, i);
> +         emit_label (label);
> +         LABEL_NUSES (label) = 1;
> +         set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
> +       }
>      }
> -  gcc_assert (desired_alignment <= 8);
> +  return destmem;
>  }
>
>  /* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN.
> -   ALIGN_BYTES is how many bytes need to be copied.  */
> +   ALIGN_BYTES is how many bytes need to be copied.
> +   The function updates DST and SRC, namely, it sets proper alignment.
> +   DST is returned via return value, SRC is updated via pointer SRCP.  */
>  static rtx
>  expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
>                                  int desired_align, int align_bytes)
> @@ -22378,62 +22390,34 @@ expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
>    rtx src = *srcp;
>    rtx orig_dst = dst;
>    rtx orig_src = src;
> -  int off = 0;
> +  int piece_size = 1;
> +  int copied_bytes = 0;
>    int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
>    if (src_align_bytes >= 0)
>      src_align_bytes = desired_align - src_align_bytes;
> -  if (align_bytes & 1)
> -    {
> -      dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
> -      src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
> -      off = 1;
> -      emit_insn (gen_strmov (destreg, dst, srcreg, src));
> -    }
> -  if (align_bytes & 2)
> -    {
> -      dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
> -      src = adjust_automodify_address_nv (src, HImode, srcreg, off);
> -      if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
> -       set_mem_align (dst, 2 * BITS_PER_UNIT);
> -      if (src_align_bytes >= 0
> -         && (src_align_bytes & 1) == (align_bytes & 1)
> -         && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
> -       set_mem_align (src, 2 * BITS_PER_UNIT);
> -      off = 2;
> -      emit_insn (gen_strmov (destreg, dst, srcreg, src));
> -    }
> -  if (align_bytes & 4)
> +
> +  for (piece_size = 1;
> +       piece_size <= desired_align && copied_bytes < align_bytes;
> +       piece_size <<= 1)
>      {
> -      dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
> -      src = adjust_automodify_address_nv (src, SImode, srcreg, off);
> -      if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
> -       set_mem_align (dst, 4 * BITS_PER_UNIT);
> -      if (src_align_bytes >= 0)
> +      if (align_bytes & piece_size)
>         {
> -         unsigned int src_align = 0;
> -         if ((src_align_bytes & 3) == (align_bytes & 3))
> -           src_align = 4;
> -         else if ((src_align_bytes & 1) == (align_bytes & 1))
> -           src_align = 2;
> -         if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
> -           set_mem_align (src, src_align * BITS_PER_UNIT);
> +         dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
> +         copied_bytes += piece_size;
>         }
> -      off = 4;
> -      emit_insn (gen_strmov (destreg, dst, srcreg, src));
>      }
> -  dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
> -  src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
> +
>    if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
>      set_mem_align (dst, desired_align * BITS_PER_UNIT);
>    if (src_align_bytes >= 0)
>      {
> -      unsigned int src_align = 0;
> -      if ((src_align_bytes & 7) == (align_bytes & 7))
> -       src_align = 8;
> -      else if ((src_align_bytes & 3) == (align_bytes & 3))
> -       src_align = 4;
> -      else if ((src_align_bytes & 1) == (align_bytes & 1))
> -       src_align = 2;
> +      unsigned int src_align;
> +      for (src_align = desired_align; src_align >= 2; src_align >>= 1)
> +       {
> +         if ((src_align_bytes & (src_align - 1))
> +              == (align_bytes & (src_align - 1)))
> +           break;
> +       }
>        if (src_align > (unsigned int) desired_align)
>         src_align = desired_align;
>        if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
> @@ -22666,42 +22650,24 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
>  static int
>  decide_alignment (int align,
>                   enum stringop_alg alg,
> -                 int expected_size)
> +                 int expected_size,
> +                 enum machine_mode move_mode)
>  {
>    int desired_align = 0;
> -  switch (alg)
> -    {
> -      case no_stringop:
> -       gcc_unreachable ();
> -      case loop:
> -      case unrolled_loop:
> -       desired_align = GET_MODE_SIZE (Pmode);
> -       break;
> -      case rep_prefix_8_byte:
> -       desired_align = 8;
> -       break;
> -      case rep_prefix_4_byte:
> -       /* PentiumPro has special logic triggering for 8 byte aligned blocks.
> -          copying whole cacheline at once.  */
> -       if (TARGET_PENTIUMPRO)
> -         desired_align = 8;
> -       else
> -         desired_align = 4;
> -       break;
> -      case rep_prefix_1_byte:
> -       /* PentiumPro has special logic triggering for 8 byte aligned blocks.
> -          copying whole cacheline at once.  */
> -       if (TARGET_PENTIUMPRO)
> -         desired_align = 8;
> -       else
> -         desired_align = 1;
> -       break;
> -      case loop_1_byte:
> -       desired_align = 1;
> -       break;
> -      case libcall:
> -       return 0;
> -    }
> +
> +  gcc_assert (alg != no_stringop);
> +
> +  if (alg == libcall)
> +    return 0;
> +  if (move_mode == VOIDmode)
> +    return 0;
> +
> +  desired_align = GET_MODE_SIZE (move_mode);
> +  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
> +     copying whole cacheline at once.  */
> +  if (TARGET_PENTIUMPRO
> +      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
> +    desired_align = 8;
>
>    if (optimize_size)
>      desired_align = 1;
> @@ -22709,6 +22675,7 @@ decide_alignment (int align,
>      desired_align = align;
>    if (expected_size != -1 && expected_size < 4)
>      desired_align = align;
> +
>    return desired_align;
>  }
>
> @@ -22765,6 +22732,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
>    int dynamic_check;
>    bool need_zero_guard = false;
>    bool noalign;
> +  enum machine_mode move_mode = VOIDmode;
> +  int unroll_factor = 1;
>
>    if (CONST_INT_P (align_exp))
>      align = INTVAL (align_exp);
> @@ -22788,50 +22757,60 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
>
>    /* Step 0: Decide on preferred algorithm, desired alignment and
>       size of chunks to be copied by main loop.  */
> -
>    alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
> -  desired_align = decide_alignment (align, alg, expected_size);
> -
> -  if (!TARGET_ALIGN_STRINGOPS || noalign)
> -    align = desired_align;
> -
>    if (alg == libcall)
>      return false;
>    gcc_assert (alg != no_stringop);
> +
>    if (!count)
>      count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
>    destreg = copy_addr_to_reg (XEXP (dst, 0));
>    srcreg = copy_addr_to_reg (XEXP (src, 0));
> +
> +  unroll_factor = 1;
> +  move_mode = word_mode;
>    switch (alg)
>      {
>      case libcall:
>      case no_stringop:
>        gcc_unreachable ();
> +    case loop_1_byte:
> +      need_zero_guard = true;
> +      move_mode = QImode;
> +      break;
>      case loop:
>        need_zero_guard = true;
> -      size_needed = GET_MODE_SIZE (word_mode);
>        break;
>      case unrolled_loop:
>        need_zero_guard = true;
> -      size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
> +      unroll_factor = (TARGET_64BIT ? 4 : 2);
> +      break;
> +    case vector_loop:
> +      need_zero_guard = true;
> +      unroll_factor = 4;
> +      /* Find the widest supported mode.  */
> +      move_mode = Pmode;
> +      while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
> +            != CODE_FOR_nothing)
> +         move_mode = GET_MODE_WIDER_MODE (move_mode);
>        break;
>      case rep_prefix_8_byte:
> -      size_needed = 8;
> +      move_mode = DImode;
>        break;
>      case rep_prefix_4_byte:
> -      size_needed = 4;
> +      move_mode = SImode;
>        break;
>      case rep_prefix_1_byte:
> -      size_needed = 1;
> -      break;
> -    case loop_1_byte:
> -      need_zero_guard = true;
> -      size_needed = 1;
> +      move_mode = QImode;
>        break;
>      }
> -
> +  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
>    epilogue_size_needed = size_needed;
>
> +  desired_align = decide_alignment (align, alg, expected_size, move_mode);
>
> +  desired_align = decide_alignment (align, alg, expected_size, move_mode);
> +  if (!TARGET_ALIGN_STRINGOPS || noalign)
> +    align = desired_align;
> +
>
> For SSE codegen, won't we need to track down if the destination was aligned to generate aligned/unaligned moves?
>
> Otherwise the patch seems reasonable.  Thanks for submitting it by pieces so the review is easier.
>
> Honza

--
---
Best regards,
Michael V. Zolotukhin,
Software Engineer
Intel Corporation.
Michael Zolotukhin - April 18, 2013, 11:55 a.m.
Forgot to add Uros - adding now.

On 18 April 2013 15:53, Michael Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> Hi,
> Jan, thanks for the review, I hope to prepare an updated version of the patch
> shortly.  Please see my answers to your comments below.
>
> Uros, there is a question of a better approach for generation of wide moves.
> Could you please comment it (see details in bullets 3 and 5)?
>
> 1.
>> +static int smallest_pow2_greater_than (int);
>>
>> Perhaps it is easier to use existing 1<<ceil_log2?
> Well, yep.  Actually, this routine has already been used there, so I continued
> using it.  I guess we could change its implementation to call
> ceil_log2/floor_log2 or remove it entirely.
>
> 2.
>> -      y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
>> -      srcmem = change_address (srcmem, mode, y_addr);
>> +      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
>> +      srcmem = adjust_address (srcmem, mode, 0);
>> ...
>> This change looks OK and can go into mainline independently. Just please ensure that changing
>> the way address is computed is not making us to preserve alias set. Memmove can not rely on the alias
>> set of the src/destination objects.
> Could you explain it in more detail?  Do you mean that at the beginning DST
> and SRC could point to one memory location and have corresponding alias sets,
> and I just change addresses they point to without invalidating alias sets?
> I haven't thought about this, and that seems like a possible bug, but I guess
> it could be simply fixed by calling change_address at the end.
>
> 3.
>> +  /* Find the widest mode in which we could perform moves.
>> +     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
>> +     it until move of such size is supported.  */
>> +  piece_size = smallest_pow2_greater_than (size_to_move) >> 1;
>> +  move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
>>
>> I suppose this is a problem with SSE moves ending up in integer register, since
>> you get TImode rather than vectorized mode, like V8QImode in here.  Why not stick
>> with the original mode parameter?
> Yes, here we choose TImode instead of a vector mode, but that actually was done
> intentionally.  I tried several approaches here and decided that using the
> widest integer mode is the best one for now.  We could try to find out a
> particular (vector)mode in which we want to perform copying, but isn't it better
> to rely on a machine-description here?  My idea here was to just request a copy
> of, for instance, 128-bit piece (i.e. one TI-move) and leave it to the compiler
> to choose the most optimal way of performing it.  Currently, the compiler thinks
> that move of 128bits should be split into two moves of 64-bits (this
> transformation is done in split2 pass) - if it's actually not so optimal, we
> should fix it there, IMHO.
>
> I think Uros could give me an advice on whether it's a reasonable approach or it
> should be changed.
>
> Also, I tried to avoid such fixes in this patch - that doesn't mean I'm not
> going to work on the fixes, quite the contrary.  But it'd be easier to work on
> them if we have a code in the trunk that could reveal the problem.
>
> 4.
>> Doesn't this effectively kill support for TARGET_SINGLE_STRINGOP? It is useful as
>> size optimization.
> Do you mean removing emit_strmov?  I don't think it'll kill anything, as new
> emit_memmov is capable of doing what emit_strmov did and is just an extended
> version of it.  BTW, under TARGET_SINGLE_STRINGOP switch gen_strmov is used, not
> emit_strmov - behaviour there hasn't been changed by this patch.
>
> 5.
>> For SSE codegen, won't we need to track down in destination was aligned to generate aligned/unaligned moves?
> We try to achieve a required alignment by prologue, so in the main loop
> destination is aligned properly.  Source, meanwhile, could be misaligned, so for
> it unaligned moves could be generated.  Here I actually also rely on the fact
> that we have an optimal description of aligned/unaligned moves in MD-file, i.e.
> if it's better to emit two DI-moves instead of one unaligned TI-mode, then
> splits/expands will manage to do that.
>
> 6.
>> +  else if (TREE_CODE (expr) == MEM_REF)
>> +    {
>> +      tree base = TREE_OPERAND (expr, 0);
>> +      tree byte_offset = TREE_OPERAND (expr, 1);
>> +      if (TREE_CODE (base) != ADDR_EXPR
>> +         || TREE_CODE (byte_offset) != INTEGER_CST)
>> +       return -1;
>> +      if (!DECL_P (TREE_OPERAND (base, 0))
>> +         || DECL_ALIGN (TREE_OPERAND (base, 0)) < align)
>>
>> You can use TYPE_ALIGN here? In general can't we replace all the GIMPLE
>> handling by get_object_alignment?
>>
>> +       return -1;
>> +      offset += tree_low_cst (byte_offset, 1);
>> +    }
>>    else
>>      return -1;
>>
>> This change out to go independently. I can not review it.
>> I will make first look over the patch shortly, but please send updated patch fixing
>> the problem with integer regs.
> Actually, I don't know what is a right way to find out alignment, but the
> existing one didn't work.  Routine get_mem_align_offset didn't handle MEM_REFs
> at all, so I added some handling there - I'm not sure it's complete and
> absolutely correct, but that currently works for me.  I'd be glad to hear any
> suggestions of how that should be done - whom should I ask about it?
>
> ---
> Thanks, Michael
>
>
> On 17 April 2013 19:12, Jan Hubicka <hubicka@ucw.cz> wrote:
>> @@ -2392,6 +2392,7 @@ static void ix86_set_current_function (tree);
>>  static unsigned int ix86_minimum_incoming_stack_boundary (bool);
>>
>>  static enum calling_abi ix86_function_abi (const_tree);
>> +static int smallest_pow2_greater_than (int);
>>
>> Perhaps it is easier to use existing 1<<ceil_log2?
>>
>>
>>  #ifndef SUBTARGET32_DEFAULT_CPU
>> @@ -21829,11 +21830,10 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
>>  {
>>    rtx out_label, top_label, iter, tmp;
>>    enum machine_mode iter_mode = counter_mode (count);
>> -  rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
>> +  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
>> +  rtx piece_size = GEN_INT (piece_size_n);
>>    rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
>>    rtx size;
>> -  rtx x_addr;
>> -  rtx y_addr;
>>    int i;
>>
>>    top_label = gen_label_rtx ();
>> @@ -21854,13 +21854,18 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
>>    emit_label (top_label);
>>
>>    tmp = convert_modes (Pmode, iter_mode, iter, true);
>> -  x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
>> -  destmem = change_address (destmem, mode, x_addr);
>> +
>> +  /* This assert could be relaxed - in this case we'll need to compute
>> +     smallest power of two, containing in PIECE_SIZE_N and pass it to
>> +     offset_address.  */
>> +  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
>> +  destmem = offset_address (destmem, tmp, piece_size_n);
>> +  destmem = adjust_address (destmem, mode, 0);
>>
>>    if (srcmem)
>>      {
>> -      y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
>> -      srcmem = change_address (srcmem, mode, y_addr);
>> +      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
>> +      srcmem = adjust_address (srcmem, mode, 0);
>>
>>        /* When unrolling for chips that reorder memory reads and writes,
>>          we can save registers by using single temporary.
>> @@ -22039,13 +22044,61 @@ expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
>>    emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
>>  }
>>
>> This change looks OK and can go into mainline independently. Just please ensure that changing
>> the way address is computed is not making us to preserve alias set. Memmove can not rely on the alias
>> set of the src/destination objects.
>>
>> -static void
>> -emit_strmov (rtx destmem, rtx srcmem,
>> -            rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
>> -{
>> -  rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
>> -  rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
>> -  emit_insn (gen_strmov (destptr, dest, srcptr, src));
>> +/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
>> +   DESTMEM.
>> +   SRC is passed by pointer to be updated on return.
>> +   Return value is updated DST.  */
>> +static rtx
>> +emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
>> +            HOST_WIDE_INT size_to_move)
>> +{
>> +  rtx dst = destmem, src = *srcmem, adjust, tempreg;
>> +  enum insn_code code;
>> +  enum machine_mode move_mode;
>> +  int piece_size, i;
>> +
>> +  /* Find the widest mode in which we could perform moves.
>> +     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
>> +     it until move of such size is supported.  */
>> +  piece_size = smallest_pow2_greater_than (size_to_move) >> 1;
>> +  move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
>>
>> I suppose this is a problem with SSE moves ending up in integer register, since
>> you get TImode rather than vectorized mode, like V8QImode in here.  Why not stick
>> with the original mode parameter?
>> +  code = optab_handler (mov_optab, move_mode);
>> +  while (code == CODE_FOR_nothing && piece_size > 1)
>> +    {
>> +      piece_size >>= 1;
>> +      move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
>> +      code = optab_handler (mov_optab, move_mode);
>> +    }
>> +  gcc_assert (code != CODE_FOR_nothing);
>> +
>> +  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
>> +  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
>> +
>> +  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
>> +  gcc_assert (size_to_move % piece_size == 0);
>> +  adjust = GEN_INT (piece_size);
>> +  for (i = 0; i < size_to_move; i += piece_size)
>> +    {
>> +      /* We move from memory to memory, so we'll need to do it via
>> +        a temporary register.  */
>> +      tempreg = gen_reg_rtx (move_mode);
>> +      emit_insn (GEN_FCN (code) (tempreg, src));
>> +      emit_insn (GEN_FCN (code) (dst, tempreg));
>> +
>> +      emit_move_insn (destptr,
>> +                     gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
>> +      emit_move_insn (srcptr,
>> +                     gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
>> +
>> +      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
>> +                                         piece_size);
>> +      src = adjust_automodify_address_nv (src, move_mode, srcptr,
>> +                                         piece_size);
>> +    }
>> +
>> +  /* Update DST and SRC rtx.  */
>> +  *srcmem = src;
>> +  return dst;
>>
>> Doesn't this effectively kill support for TARGET_SINGLE_STRINGOP? It is useful as
>> size optimization.
>>  }
>>
>>  /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
>> @@ -22057,44 +22110,17 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
>>    if (CONST_INT_P (count))
>>      {
>>        HOST_WIDE_INT countval = INTVAL (count);
>> -      int offset = 0;
>> +      HOST_WIDE_INT epilogue_size = countval % max_size;
>> +      int i;
>>
>> -      if ((countval & 0x10) && max_size > 16)
>> -       {
>> -         if (TARGET_64BIT)
>> -           {
>> -             emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
>> -             emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
>> -           }
>> -         else
>> -           gcc_unreachable ();
>> -         offset += 16;
>> -       }
>> -      if ((countval & 0x08) && max_size > 8)
>> -       {
>> -         if (TARGET_64BIT)
>> -           emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
>> -         else
>> -           {
>> -             emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
>> -             emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
>> -           }
>> -         offset += 8;
>> -       }
>> -      if ((countval & 0x04) && max_size > 4)
>> +      /* For now MAX_SIZE should be a power of 2.  This assert could be
>> +        relaxed, but it'll require a bit more complicated epilogue
>> +        expanding.  */
>> +      gcc_assert ((max_size & (max_size - 1)) == 0);
>> +      for (i = max_size; i >= 1; i >>= 1)
>>         {
>> -          emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
>> -         offset += 4;
>> -       }
>> -      if ((countval & 0x02) && max_size > 2)
>> -       {
>> -          emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
>> -         offset += 2;
>> -       }
>> -      if ((countval & 0x01) && max_size > 1)
>> -       {
>> -          emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
>> -         offset += 1;
>> +         if (epilogue_size & i)
>> +           destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
>>         }
>>        return;
>>      }
>> @@ -22330,47 +22356,33 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_
>>  }
>>
>>  /* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to
>> -   DESIRED_ALIGNMENT.  */
>> -static void
>> +   DESIRED_ALIGNMENT.
>> +   Return value is updated DESTMEM.  */
>> +static rtx
>>  expand_movmem_prologue (rtx destmem, rtx srcmem,
>>                         rtx destptr, rtx srcptr, rtx count,
>>                         int align, int desired_alignment)
>>  {
>> -  if (align <= 1 && desired_alignment > 1)
>> -    {
>> -      rtx label = ix86_expand_aligntest (destptr, 1, false);
>> -      srcmem = change_address (srcmem, QImode, srcptr);
>> -      destmem = change_address (destmem, QImode, destptr);
>> -      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
>> -      ix86_adjust_counter (count, 1);
>> -      emit_label (label);
>> -      LABEL_NUSES (label) = 1;
>> -    }
>> -  if (align <= 2 && desired_alignment > 2)
>> -    {
>> -      rtx label = ix86_expand_aligntest (destptr, 2, false);
>> -      srcmem = change_address (srcmem, HImode, srcptr);
>> -      destmem = change_address (destmem, HImode, destptr);
>> -      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
>> -      ix86_adjust_counter (count, 2);
>> -      emit_label (label);
>> -      LABEL_NUSES (label) = 1;
>> -    }
>> -  if (align <= 4 && desired_alignment > 4)
>> +  int i;
>> +  for (i = 1; i < desired_alignment; i <<= 1)
>>      {
>> -      rtx label = ix86_expand_aligntest (destptr, 4, false);
>> -      srcmem = change_address (srcmem, SImode, srcptr);
>> -      destmem = change_address (destmem, SImode, destptr);
>> -      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
>> -      ix86_adjust_counter (count, 4);
>> -      emit_label (label);
>> -      LABEL_NUSES (label) = 1;
>> +      if (align <= i)
>> +       {
>> +         rtx label = ix86_expand_aligntest (destptr, i, false);
>> +         destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
>> +         ix86_adjust_counter (count, i);
>> +         emit_label (label);
>> +         LABEL_NUSES (label) = 1;
>> +         set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
>> +       }
>>      }
>> -  gcc_assert (desired_alignment <= 8);
>> +  return destmem;
>>  }
>>
>>  /* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN.
>> -   ALIGN_BYTES is how many bytes need to be copied.  */
>> +   ALIGN_BYTES is how many bytes need to be copied.
>> +   The function updates DST and SRC, namely, it sets proper alignment.
>> +   DST is returned via return value, SRC is updated via pointer SRCP.  */
>>  static rtx
>>  expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
>>                                  int desired_align, int align_bytes)
>> @@ -22378,62 +22390,34 @@ expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
>>    rtx src = *srcp;
>>    rtx orig_dst = dst;
>>    rtx orig_src = src;
>> -  int off = 0;
>> +  int piece_size = 1;
>> +  int copied_bytes = 0;
>>    int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
>>    if (src_align_bytes >= 0)
>>      src_align_bytes = desired_align - src_align_bytes;
>> -  if (align_bytes & 1)
>> -    {
>> -      dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
>> -      src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
>> -      off = 1;
>> -      emit_insn (gen_strmov (destreg, dst, srcreg, src));
>> -    }
>> -  if (align_bytes & 2)
>> -    {
>> -      dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
>> -      src = adjust_automodify_address_nv (src, HImode, srcreg, off);
>> -      if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
>> -       set_mem_align (dst, 2 * BITS_PER_UNIT);
>> -      if (src_align_bytes >= 0
>> -         && (src_align_bytes & 1) == (align_bytes & 1)
>> -         && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
>> -       set_mem_align (src, 2 * BITS_PER_UNIT);
>> -      off = 2;
>> -      emit_insn (gen_strmov (destreg, dst, srcreg, src));
>> -    }
>> -  if (align_bytes & 4)
>> +
>> +  for (piece_size = 1;
>> +       piece_size <= desired_align && copied_bytes < align_bytes;
>> +       piece_size <<= 1)
>>      {
>> -      dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
>> -      src = adjust_automodify_address_nv (src, SImode, srcreg, off);
>> -      if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
>> -       set_mem_align (dst, 4 * BITS_PER_UNIT);
>> -      if (src_align_bytes >= 0)
>> +      if (align_bytes & piece_size)
>>         {
>> -         unsigned int src_align = 0;
>> -         if ((src_align_bytes & 3) == (align_bytes & 3))
>> -           src_align = 4;
>> -         else if ((src_align_bytes & 1) == (align_bytes & 1))
>> -           src_align = 2;
>> -         if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
>> -           set_mem_align (src, src_align * BITS_PER_UNIT);
>> +         dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
>> +         copied_bytes += piece_size;
>>         }
>> -      off = 4;
>> -      emit_insn (gen_strmov (destreg, dst, srcreg, src));
>>      }
>> -  dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
>> -  src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
>> +
>>    if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
>>      set_mem_align (dst, desired_align * BITS_PER_UNIT);
>>    if (src_align_bytes >= 0)
>>      {
>> -      unsigned int src_align = 0;
>> -      if ((src_align_bytes & 7) == (align_bytes & 7))
>> -       src_align = 8;
>> -      else if ((src_align_bytes & 3) == (align_bytes & 3))
>> -       src_align = 4;
>> -      else if ((src_align_bytes & 1) == (align_bytes & 1))
>> -       src_align = 2;
>> +      unsigned int src_align;
>> +      for (src_align = desired_align; src_align >= 2; src_align >>= 1)
>> +       {
>> +         if ((src_align_bytes & (src_align - 1))
>> +              == (align_bytes & (src_align - 1)))
>> +           break;
>> +       }
>>        if (src_align > (unsigned int) desired_align)
>>         src_align = desired_align;
>>        if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
>> @@ -22666,42 +22650,24 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
>>  static int
>>  decide_alignment (int align,
>>                   enum stringop_alg alg,
>> -                 int expected_size)
>> +                 int expected_size,
>> +                 enum machine_mode move_mode)
>>  {
>>    int desired_align = 0;
>> -  switch (alg)
>> -    {
>> -      case no_stringop:
>> -       gcc_unreachable ();
>> -      case loop:
>> -      case unrolled_loop:
>> -       desired_align = GET_MODE_SIZE (Pmode);
>> -       break;
>> -      case rep_prefix_8_byte:
>> -       desired_align = 8;
>> -       break;
>> -      case rep_prefix_4_byte:
>> -       /* PentiumPro has special logic triggering for 8 byte aligned blocks.
>> -          copying whole cacheline at once.  */
>> -       if (TARGET_PENTIUMPRO)
>> -         desired_align = 8;
>> -       else
>> -         desired_align = 4;
>> -       break;
>> -      case rep_prefix_1_byte:
>> -       /* PentiumPro has special logic triggering for 8 byte aligned blocks.
>> -          copying whole cacheline at once.  */
>> -       if (TARGET_PENTIUMPRO)
>> -         desired_align = 8;
>> -       else
>> -         desired_align = 1;
>> -       break;
>> -      case loop_1_byte:
>> -       desired_align = 1;
>> -       break;
>> -      case libcall:
>> -       return 0;
>> -    }
>> +
>> +  gcc_assert (alg != no_stringop);
>> +
>> +  if (alg == libcall)
>> +    return 0;
>> +  if (move_mode == VOIDmode)
>> +    return 0;
>> +
>> +  desired_align = GET_MODE_SIZE (move_mode);
>> +  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
>> +     copying whole cacheline at once.  */
>> +  if (TARGET_PENTIUMPRO
>> +      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
>> +    desired_align = 8;
>>
>>    if (optimize_size)
>>      desired_align = 1;
>> @@ -22709,6 +22675,7 @@ decide_alignment (int align,
>>      desired_align = align;
>>    if (expected_size != -1 && expected_size < 4)
>>      desired_align = align;
>> +
>>    return desired_align;
>>  }
>>
>> @@ -22765,6 +22732,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
>>    int dynamic_check;
>>    bool need_zero_guard = false;
>>    bool noalign;
>> +  enum machine_mode move_mode = VOIDmode;
>> +  int unroll_factor = 1;
>>
>>    if (CONST_INT_P (align_exp))
>>      align = INTVAL (align_exp);
>> @@ -22788,50 +22757,60 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
>>
>>    /* Step 0: Decide on preferred algorithm, desired alignment and
>>       size of chunks to be copied by main loop.  */
>> -
>>    alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
>> -  desired_align = decide_alignment (align, alg, expected_size);
>> -
>> -  if (!TARGET_ALIGN_STRINGOPS || noalign)
>> -    align = desired_align;
>> -
>>    if (alg == libcall)
>>      return false;
>>    gcc_assert (alg != no_stringop);
>> +
>>    if (!count)
>>      count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
>>    destreg = copy_addr_to_reg (XEXP (dst, 0));
>>    srcreg = copy_addr_to_reg (XEXP (src, 0));
>> +
>> +  unroll_factor = 1;
>> +  move_mode = word_mode;
>>    switch (alg)
>>      {
>>      case libcall:
>>      case no_stringop:
>>        gcc_unreachable ();
>> +    case loop_1_byte:
>> +      need_zero_guard = true;
>> +      move_mode = QImode;
>> +      break;
>>      case loop:
>>        need_zero_guard = true;
>> -      size_needed = GET_MODE_SIZE (word_mode);
>>        break;
>>      case unrolled_loop:
>>        need_zero_guard = true;
>> -      size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
>> +      unroll_factor = (TARGET_64BIT ? 4 : 2);
>> +      break;
>> +    case vector_loop:
>> +      need_zero_guard = true;
>> +      unroll_factor = 4;
>> +      /* Find the widest supported mode.  */
>> +      move_mode = Pmode;
>> +      while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
>> +            != CODE_FOR_nothing)
>> +         move_mode = GET_MODE_WIDER_MODE (move_mode);
>>        break;
>>      case rep_prefix_8_byte:
>> -      size_needed = 8;
>> +      move_mode = DImode;
>>        break;
>>      case rep_prefix_4_byte:
>> -      size_needed = 4;
>> +      move_mode = SImode;
>>        break;
>>      case rep_prefix_1_byte:
>> -      size_needed = 1;
>> -      break;
>> -    case loop_1_byte:
>> -      need_zero_guard = true;
>> -      size_needed = 1;
>> +      move_mode = QImode;
>>        break;
>>      }
>> -
>> +  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
>>    epilogue_size_needed = size_needed;
>>
>> +  desired_align = decide_alignment (align, alg, expected_size, move_mode);
>>
>> +  desired_align = decide_alignment (align, alg, expected_size, move_mode);
>> +  if (!TARGET_ALIGN_STRINGOPS || noalign)
>> +    align = desired_align;
>> +
>>
>> For SSE codegen, won't we need to track down in destination was aligned to generate aligned/unaligned moves?
>>
>> Otherwise the patch seems reasonable.  Thanks for submitting it by pieces so the review is easier.
>>
>> Honza
>
> --
> ---
> Best regards,
> Michael V. Zolotukhin,
> Software Engineer
> Intel Corporation.



--
---
Best regards,
Michael V. Zolotukhin,
Software Engineer
Intel Corporation.
Michael Zolotukhin - May 14, 2013, 2:34 p.m.
Hi,
I attached an updated version of the patch.  Looks like the 64-bit issue is
resolved in it (now a vector mode is explicitly chosen instead of TI- or
another integer mode).  Also, some of the remarks are fixed in it - some
others are still not changed, because I don't know what to do with them right
now (see my questions in the previous letter).

Could you take a look at the patch?

I checked it on i386/x86_64 bootstrap and make check, and on Specs2000/2006
(I checked only stability).

---
Thanks, Michael

On 18 April 2013 15:55, Michael Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> Forgot to add Uros - adding now.
>
> On 18 April 2013 15:53, Michael Zolotukhin
> <michael.v.zolotukhin@gmail.com> wrote:
>> Hi,
>> Jan, thanks for the review, I hope to prepare an updated version of the patch
>> shortly.  Please see my answers to your comments below.
>>
>> Uros, there is a question of a better approach for generation of wide moves.
>> Could you please comment it (see details in bullets 3 and 5)?
>>
>> 1.
>>> +static int smallest_pow2_greater_than (int);
>>>
>>> Perhaps it is easier to use existing 1<<ceil_log2?
>> Well, yep.  Actually, this routine has already been used there, so I continued
>> using it.  I guess we could change its implementation to call
>> ceil_log2/floor_log2 or remove it entirely.
>>
>> 2.
>>> -      y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
>>> -      srcmem = change_address (srcmem, mode, y_addr);
>>> +      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
>>> +      srcmem = adjust_address (srcmem, mode, 0);
>>> ...
>>> This change looks OK and can go into mainline independently. Just please ensure that changing
>>> the way address is computed is not making us to preserve alias set. Memmove can not rely on the alias
>>> set of the src/destination objects.
>> Could you explain it in more details?  Do you mean that at the beginning DST
>> and SRC could point to one memory location and have corresponding alias sets,
>> and I just change addresses they point to without invalidating alias sets?
>> I haven't thought about this, and that seems like a possible bug, but I guess
>> it could be simply fixed by calling change_address at the end.
>>
>> 3.
>>> +  /* Find the widest mode in which we could perform moves.
>>> +     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
>>> +     it until move of such size is supported.  */
>>> +  piece_size = smallest_pow2_greater_than (size_to_move) >> 1;
>>> +  move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
>>>
>>> I suppose this is a problem with SSE moves ending up in integer register, since
>>> you get TImode rather than vectorized mode, like V8QImode in here.  Why not stick
>>> with the original mode parameter?
>> Yes, here we choose TImode instead of a vector mode, but that actually was done
>> intentionally.  I tried several approaches here and decided that using the
>> widest integer mode is the best one for now.  We could try to find out a
>> particular (vector)mode in which we want to perform copying, but isn't it better
>> to rely on a machine-description here?  My idea here was to just request a copy
>> of, for instance, 128-bit piece (i.e. one TI-move) and leave it to the compiler
>> to choose the most optimal way of performing it.  Currently, the compiler thinks
>> that move of 128bits should be splitted into two moves of 64-bits (this
>> transformation is done in split2 pass) - if it's actually not so optimal, we
>> should fix it there, IMHO.
>>
>> I think Uros could give me an advice on whether it's a reasonable approach or it
>> should be changed.
>>
>> Also, I tried to avoid such fixes in this patch - that doesn't mean I'm not
>> going to work on the fixes, quite the contrary.  But it'd be easier to work on
>> them if we have a code in the trunk that could reveal the problem.
>>
>> 4.
>>> Doesn't this effectively kill support for TARGET_SINGLE_STRINGOP? It is useful as
>>> size optimization.
>> Do you mean removing emit_strmov?  I don't think it'll kill anything, as new
>> emit_memmov is capable of doing what emit_strmov did and is just an extended
>> version of it.  BTW, under TARGET_SINGLE_STRINGOP switch gen_strmov is used, not
>> emit_strmov - behaviour there hasn't been changed by this patch.
>>
>> 5.
>>> For SSE codegen, won't we need to track down in destination was aligned to generate aligned/unaligned moves?
>> We try to achieve a required alignment by prologue, so in the main loop
>> destination is aligned properly.  Source, meanwhile, could be misaligned, so for
>> it unaligned moves could be generated.  Here I actually also rely on the fact
>> that we have an optimal description of aligned/unaligned moves in MD-file, i.e.
>> if it's better to emit two DI-moves instead of one unaligned TI-mode, then
>> splits/expands will manage to do that.
>>
>> 6.
>>> +  else if (TREE_CODE (expr) == MEM_REF)
>>> +    {
>>> +      tree base = TREE_OPERAND (expr, 0);
>>> +      tree byte_offset = TREE_OPERAND (expr, 1);
>>> +      if (TREE_CODE (base) != ADDR_EXPR
>>> +         || TREE_CODE (byte_offset) != INTEGER_CST)
>>> +       return -1;
>>> +      if (!DECL_P (TREE_OPERAND (base, 0))
>>> +         || DECL_ALIGN (TREE_OPERAND (base, 0)) < align)
>>>
>>> You can use TYPE_ALIGN here? In general can't we replace all the GIMPLE
>>> handling by get_object_alignment?
>>>
>>> +       return -1;
>>> +      offset += tree_low_cst (byte_offset, 1);
>>> +    }
>>>    else
>>>      return -1;
>>>
>>> This change out to go independently. I can not review it.
>>> I will make first look over the patch shortly, but please send updated patch fixing
>>> the problem with integer regs.
>> Actually, I don't know what is a right way to find out alignment, but the
>> existing one didn't work.  Routine get_mem_align_offset didn't handle MEM_REFs
>> at all, so I added some handling there - I'm not sure it's complete and
>> absoulutely correct, but that currently works for me.  I'd be glad to hear any
>> suggestions of how that should be done - whom should I ask about it?
>>
>> ---
>> Thanks, Michael
>>
>>
>> On 17 April 2013 19:12, Jan Hubicka <hubicka@ucw.cz> wrote:
>>> @@ -2392,6 +2392,7 @@ static void ix86_set_current_function (tree);
>>>  static unsigned int ix86_minimum_incoming_stack_boundary (bool);
>>>
>>>  static enum calling_abi ix86_function_abi (const_tree);
>>> +static int smallest_pow2_greater_than (int);
>>>
>>> Perhaps it is easier to use existing 1<<ceil_log2?
>>>
>>>
>>>  #ifndef SUBTARGET32_DEFAULT_CPU
>>> @@ -21829,11 +21830,10 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
>>>  {
>>>    rtx out_label, top_label, iter, tmp;
>>>    enum machine_mode iter_mode = counter_mode (count);
>>> -  rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
>>> +  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
>>> +  rtx piece_size = GEN_INT (piece_size_n);
>>>    rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
>>>    rtx size;
>>> -  rtx x_addr;
>>> -  rtx y_addr;
>>>    int i;
>>>
>>>    top_label = gen_label_rtx ();
>>> @@ -21854,13 +21854,18 @@ expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
>>>    emit_label (top_label);
>>>
>>>    tmp = convert_modes (Pmode, iter_mode, iter, true);
>>> -  x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
>>> -  destmem = change_address (destmem, mode, x_addr);
>>> +
>>> +  /* This assert could be relaxed - in this case we'll need to compute
>>> +     smallest power of two, containing in PIECE_SIZE_N and pass it to
>>> +     offset_address.  */
>>> +  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
>>> +  destmem = offset_address (destmem, tmp, piece_size_n);
>>> +  destmem = adjust_address (destmem, mode, 0);
>>>
>>>    if (srcmem)
>>>      {
>>> -      y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
>>> -      srcmem = change_address (srcmem, mode, y_addr);
>>> +      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
>>> +      srcmem = adjust_address (srcmem, mode, 0);
>>>
>>>        /* When unrolling for chips that reorder memory reads and writes,
>>>          we can save registers by using single temporary.
>>> @@ -22039,13 +22044,61 @@ expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
>>>    emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
>>>  }
>>>
>>> This change looks OK and can go into mainline independently. Just please ensure that changing
>>> the way address is computed is not making us to preserve alias set. Memmove can not rely on the alias
>>> set of the src/destination objects.
>>>
>>> -static void
>>> -emit_strmov (rtx destmem, rtx srcmem,
>>> -            rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
>>> -{
>>> -  rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
>>> -  rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
>>> -  emit_insn (gen_strmov (destptr, dest, srcptr, src));
>>> +/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
>>> +   DESTMEM.
>>> +   SRC is passed by pointer to be updated on return.
>>> +   Return value is updated DST.  */
>>> +static rtx
>>> +emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
>>> +            HOST_WIDE_INT size_to_move)
>>> +{
>>> +  rtx dst = destmem, src = *srcmem, adjust, tempreg;
>>> +  enum insn_code code;
>>> +  enum machine_mode move_mode;
>>> +  int piece_size, i;
>>> +
>>> +  /* Find the widest mode in which we could perform moves.
>>> +     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
>>> +     it until move of such size is supported.  */
>>> +  piece_size = smallest_pow2_greater_than (size_to_move) >> 1;
>>> +  move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
>>>
>>> I suppose this is a problem with SSE moves ending up in integer register, since
>>> you get TImode rather than vectorized mode, like V8QImode in here.  Why not stick
>>> with the original mode parameter?
>>> +  code = optab_handler (mov_optab, move_mode);
>>> +  while (code == CODE_FOR_nothing && piece_size > 1)
>>> +    {
>>> +      piece_size >>= 1;
>>> +      move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
>>> +      code = optab_handler (mov_optab, move_mode);
>>> +    }
>>> +  gcc_assert (code != CODE_FOR_nothing);
>>> +
>>> +  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
>>> +  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
>>> +
>>> +  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
>>> +  gcc_assert (size_to_move % piece_size == 0);
>>> +  adjust = GEN_INT (piece_size);
>>> +  for (i = 0; i < size_to_move; i += piece_size)
>>> +    {
>>> +      /* We move from memory to memory, so we'll need to do it via
>>> +        a temporary register.  */
>>> +      tempreg = gen_reg_rtx (move_mode);
>>> +      emit_insn (GEN_FCN (code) (tempreg, src));
>>> +      emit_insn (GEN_FCN (code) (dst, tempreg));
>>> +
>>> +      emit_move_insn (destptr,
>>> +                     gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
>>> +      emit_move_insn (srcptr,
>>> +                     gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
>>> +
>>> +      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
>>> +                                         piece_size);
>>> +      src = adjust_automodify_address_nv (src, move_mode, srcptr,
>>> +                                         piece_size);
>>> +    }
>>> +
>>> +  /* Update DST and SRC rtx.  */
>>> +  *srcmem = src;
>>> +  return dst;
>>>
>>> Doesn't this effectively kill support for TARGET_SINGLE_STRINGOP? It is useful as
>>> size optimization.
>>>  }
>>>
>>>  /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */
>>> @@ -22057,44 +22110,17 @@ expand_movmem_epilogue (rtx destmem, rtx srcmem,
>>>    if (CONST_INT_P (count))
>>>      {
>>>        HOST_WIDE_INT countval = INTVAL (count);
>>> -      int offset = 0;
>>> +      HOST_WIDE_INT epilogue_size = countval % max_size;
>>> +      int i;
>>>
>>> -      if ((countval & 0x10) && max_size > 16)
>>> -       {
>>> -         if (TARGET_64BIT)
>>> -           {
>>> -             emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
>>> -             emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
>>> -           }
>>> -         else
>>> -           gcc_unreachable ();
>>> -         offset += 16;
>>> -       }
>>> -      if ((countval & 0x08) && max_size > 8)
>>> -       {
>>> -         if (TARGET_64BIT)
>>> -           emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
>>> -         else
>>> -           {
>>> -             emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
>>> -             emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
>>> -           }
>>> -         offset += 8;
>>> -       }
>>> -      if ((countval & 0x04) && max_size > 4)
>>> +      /* For now MAX_SIZE should be a power of 2.  This assert could be
>>> +        relaxed, but it'll require a bit more complicated epilogue
>>> +        expanding.  */
>>> +      gcc_assert ((max_size & (max_size - 1)) == 0);
>>> +      for (i = max_size; i >= 1; i >>= 1)
>>>         {
>>> -          emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
>>> -         offset += 4;
>>> -       }
>>> -      if ((countval & 0x02) && max_size > 2)
>>> -       {
>>> -          emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
>>> -         offset += 2;
>>> -       }
>>> -      if ((countval & 0x01) && max_size > 1)
>>> -       {
>>> -          emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
>>> -         offset += 1;
>>> +         if (epilogue_size & i)
>>> +           destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
>>>         }
>>>        return;
>>>      }
>>> @@ -22330,47 +22356,33 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_
>>>  }
>>>
>>>  /* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to
>>> -   DESIRED_ALIGNMENT.  */
>>> -static void
>>> +   DESIRED_ALIGNMENT.
>>> +   Return value is updated DESTMEM.  */
>>> +static rtx
>>>  expand_movmem_prologue (rtx destmem, rtx srcmem,
>>>                         rtx destptr, rtx srcptr, rtx count,
>>>                         int align, int desired_alignment)
>>>  {
>>> -  if (align <= 1 && desired_alignment > 1)
>>> -    {
>>> -      rtx label = ix86_expand_aligntest (destptr, 1, false);
>>> -      srcmem = change_address (srcmem, QImode, srcptr);
>>> -      destmem = change_address (destmem, QImode, destptr);
>>> -      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
>>> -      ix86_adjust_counter (count, 1);
>>> -      emit_label (label);
>>> -      LABEL_NUSES (label) = 1;
>>> -    }
>>> -  if (align <= 2 && desired_alignment > 2)
>>> -    {
>>> -      rtx label = ix86_expand_aligntest (destptr, 2, false);
>>> -      srcmem = change_address (srcmem, HImode, srcptr);
>>> -      destmem = change_address (destmem, HImode, destptr);
>>> -      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
>>> -      ix86_adjust_counter (count, 2);
>>> -      emit_label (label);
>>> -      LABEL_NUSES (label) = 1;
>>> -    }
>>> -  if (align <= 4 && desired_alignment > 4)
>>> +  int i;
>>> +  for (i = 1; i < desired_alignment; i <<= 1)
>>>      {
>>> -      rtx label = ix86_expand_aligntest (destptr, 4, false);
>>> -      srcmem = change_address (srcmem, SImode, srcptr);
>>> -      destmem = change_address (destmem, SImode, destptr);
>>> -      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
>>> -      ix86_adjust_counter (count, 4);
>>> -      emit_label (label);
>>> -      LABEL_NUSES (label) = 1;
>>> +      if (align <= i)
>>> +       {
>>> +         rtx label = ix86_expand_aligntest (destptr, i, false);
>>> +         destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
>>> +         ix86_adjust_counter (count, i);
>>> +         emit_label (label);
>>> +         LABEL_NUSES (label) = 1;
>>> +         set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
>>> +       }
>>>      }
>>> -  gcc_assert (desired_alignment <= 8);
>>> +  return destmem;
>>>  }
>>>
>>>  /* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN.
>>> -   ALIGN_BYTES is how many bytes need to be copied.  */
>>> +   ALIGN_BYTES is how many bytes need to be copied.
>>> +   The function updates DST and SRC, namely, it sets proper alignment.
>>> +   DST is returned via return value, SRC is updated via pointer SRCP.  */
>>>  static rtx
>>>  expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
>>>                                  int desired_align, int align_bytes)
>>> @@ -22378,62 +22390,34 @@ expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
>>>    rtx src = *srcp;
>>>    rtx orig_dst = dst;
>>>    rtx orig_src = src;
>>> -  int off = 0;
>>> +  int piece_size = 1;
>>> +  int copied_bytes = 0;
>>>    int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
>>>    if (src_align_bytes >= 0)
>>>      src_align_bytes = desired_align - src_align_bytes;
>>> -  if (align_bytes & 1)
>>> -    {
>>> -      dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
>>> -      src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
>>> -      off = 1;
>>> -      emit_insn (gen_strmov (destreg, dst, srcreg, src));
>>> -    }
>>> -  if (align_bytes & 2)
>>> -    {
>>> -      dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
>>> -      src = adjust_automodify_address_nv (src, HImode, srcreg, off);
>>> -      if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
>>> -       set_mem_align (dst, 2 * BITS_PER_UNIT);
>>> -      if (src_align_bytes >= 0
>>> -         && (src_align_bytes & 1) == (align_bytes & 1)
>>> -         && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
>>> -       set_mem_align (src, 2 * BITS_PER_UNIT);
>>> -      off = 2;
>>> -      emit_insn (gen_strmov (destreg, dst, srcreg, src));
>>> -    }
>>> -  if (align_bytes & 4)
>>> +
>>> +  for (piece_size = 1;
>>> +       piece_size <= desired_align && copied_bytes < align_bytes;
>>> +       piece_size <<= 1)
>>>      {
>>> -      dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
>>> -      src = adjust_automodify_address_nv (src, SImode, srcreg, off);
>>> -      if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
>>> -       set_mem_align (dst, 4 * BITS_PER_UNIT);
>>> -      if (src_align_bytes >= 0)
>>> +      if (align_bytes & piece_size)
>>>         {
>>> -         unsigned int src_align = 0;
>>> -         if ((src_align_bytes & 3) == (align_bytes & 3))
>>> -           src_align = 4;
>>> -         else if ((src_align_bytes & 1) == (align_bytes & 1))
>>> -           src_align = 2;
>>> -         if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
>>> -           set_mem_align (src, src_align * BITS_PER_UNIT);
>>> +         dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
>>> +         copied_bytes += piece_size;
>>>         }
>>> -      off = 4;
>>> -      emit_insn (gen_strmov (destreg, dst, srcreg, src));
>>>      }
>>> -  dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
>>> -  src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
>>> +
>>>    if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
>>>      set_mem_align (dst, desired_align * BITS_PER_UNIT);
>>>    if (src_align_bytes >= 0)
>>>      {
>>> -      unsigned int src_align = 0;
>>> -      if ((src_align_bytes & 7) == (align_bytes & 7))
>>> -       src_align = 8;
>>> -      else if ((src_align_bytes & 3) == (align_bytes & 3))
>>> -       src_align = 4;
>>> -      else if ((src_align_bytes & 1) == (align_bytes & 1))
>>> -       src_align = 2;
>>> +      unsigned int src_align;
>>> +      for (src_align = desired_align; src_align >= 2; src_align >>= 1)
>>> +       {
>>> +         if ((src_align_bytes & (src_align - 1))
>>> +              == (align_bytes & (src_align - 1)))
>>> +           break;
>>> +       }
>>>        if (src_align > (unsigned int) desired_align)
>>>         src_align = desired_align;
>>>        if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
>>> @@ -22666,42 +22650,24 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
>>>  static int
>>>  decide_alignment (int align,
>>>                   enum stringop_alg alg,
>>> -                 int expected_size)
>>> +                 int expected_size,
>>> +                 enum machine_mode move_mode)
>>>  {
>>>    int desired_align = 0;
>>> -  switch (alg)
>>> -    {
>>> -      case no_stringop:
>>> -       gcc_unreachable ();
>>> -      case loop:
>>> -      case unrolled_loop:
>>> -       desired_align = GET_MODE_SIZE (Pmode);
>>> -       break;
>>> -      case rep_prefix_8_byte:
>>> -       desired_align = 8;
>>> -       break;
>>> -      case rep_prefix_4_byte:
>>> -       /* PentiumPro has special logic triggering for 8 byte aligned blocks.
>>> -          copying whole cacheline at once.  */
>>> -       if (TARGET_PENTIUMPRO)
>>> -         desired_align = 8;
>>> -       else
>>> -         desired_align = 4;
>>> -       break;
>>> -      case rep_prefix_1_byte:
>>> -       /* PentiumPro has special logic triggering for 8 byte aligned blocks.
>>> -          copying whole cacheline at once.  */
>>> -       if (TARGET_PENTIUMPRO)
>>> -         desired_align = 8;
>>> -       else
>>> -         desired_align = 1;
>>> -       break;
>>> -      case loop_1_byte:
>>> -       desired_align = 1;
>>> -       break;
>>> -      case libcall:
>>> -       return 0;
>>> -    }
>>> +
>>> +  gcc_assert (alg != no_stringop);
>>> +
>>> +  if (alg == libcall)
>>> +    return 0;
>>> +  if (move_mode == VOIDmode)
>>> +    return 0;
>>> +
>>> +  desired_align = GET_MODE_SIZE (move_mode);
>>> +  /* PentiumPro has special logic triggering for 8 byte aligned blocks.
>>> +     copying whole cacheline at once.  */
>>> +  if (TARGET_PENTIUMPRO
>>> +      && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
>>> +    desired_align = 8;
>>>
>>>    if (optimize_size)
>>>      desired_align = 1;
>>> @@ -22709,6 +22675,7 @@ decide_alignment (int align,
>>>      desired_align = align;
>>>    if (expected_size != -1 && expected_size < 4)
>>>      desired_align = align;
>>> +
>>>    return desired_align;
>>>  }
>>>
>>> @@ -22765,6 +22732,8 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
>>>    int dynamic_check;
>>>    bool need_zero_guard = false;
>>>    bool noalign;
>>> +  enum machine_mode move_mode = VOIDmode;
>>> +  int unroll_factor = 1;
>>>
>>>    if (CONST_INT_P (align_exp))
>>>      align = INTVAL (align_exp);
>>> @@ -22788,50 +22757,60 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
>>>
>>>    /* Step 0: Decide on preferred algorithm, desired alignment and
>>>       size of chunks to be copied by main loop.  */
>>> -
>>>    alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
>>> -  desired_align = decide_alignment (align, alg, expected_size);
>>> -
>>> -  if (!TARGET_ALIGN_STRINGOPS || noalign)
>>> -    align = desired_align;
>>> -
>>>    if (alg == libcall)
>>>      return false;
>>>    gcc_assert (alg != no_stringop);
>>> +
>>>    if (!count)
>>>      count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
>>>    destreg = copy_addr_to_reg (XEXP (dst, 0));
>>>    srcreg = copy_addr_to_reg (XEXP (src, 0));
>>> +
>>> +  unroll_factor = 1;
>>> +  move_mode = word_mode;
>>>    switch (alg)
>>>      {
>>>      case libcall:
>>>      case no_stringop:
>>>        gcc_unreachable ();
>>> +    case loop_1_byte:
>>> +      need_zero_guard = true;
>>> +      move_mode = QImode;
>>> +      break;
>>>      case loop:
>>>        need_zero_guard = true;
>>> -      size_needed = GET_MODE_SIZE (word_mode);
>>>        break;
>>>      case unrolled_loop:
>>>        need_zero_guard = true;
>>> -      size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
>>> +      unroll_factor = (TARGET_64BIT ? 4 : 2);
>>> +      break;
>>> +    case vector_loop:
>>> +      need_zero_guard = true;
>>> +      unroll_factor = 4;
>>> +      /* Find the widest supported mode.  */
>>> +      move_mode = Pmode;
>>> +      while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
>>> +            != CODE_FOR_nothing)
>>> +         move_mode = GET_MODE_WIDER_MODE (move_mode);
>>>        break;
>>>      case rep_prefix_8_byte:
>>> -      size_needed = 8;
>>> +      move_mode = DImode;
>>>        break;
>>>      case rep_prefix_4_byte:
>>> -      size_needed = 4;
>>> +      move_mode = SImode;
>>>        break;
>>>      case rep_prefix_1_byte:
>>> -      size_needed = 1;
>>> -      break;
>>> -    case loop_1_byte:
>>> -      need_zero_guard = true;
>>> -      size_needed = 1;
>>> +      move_mode = QImode;
>>>        break;
>>>      }
>>> -
>>> +  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
>>>    epilogue_size_needed = size_needed;
>>>
>>> +  desired_align = decide_alignment (align, alg, expected_size, move_mode);
>>>
>>> +  desired_align = decide_alignment (align, alg, expected_size, move_mode);
>>> +  if (!TARGET_ALIGN_STRINGOPS || noalign)
>>> +    align = desired_align;
>>> +
>>>
>>> For SSE codegen, won't we need to track down in destination was aligned to generate aligned/unaligned moves?
>>>
>>> Otherwise the patch seems reasonable.  Thanks for submitting it by pieces so the review is easier.
>>>
>>> Honza
>>
>> --
>> ---
>> Best regards,
>> Michael V. Zolotukhin,
>> Software Engineer
>> Intel Corporation.
>
>
>
> --
> ---
> Best regards,
> Michael V. Zolotukhin,
> Software Engineer
> Intel Corporation.
H.J. Lu - May 14, 2013, 3:55 p.m.
On Tue, May 14, 2013 at 7:34 AM, Michael Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> Hi,
> I attached an updated version of the patch.  Looks like the 64-bit issue is
> resolved in it (now a vector mode is explicitly chosen instead of TI- or
> another integer mode).  Also, some of remarks are fixed in it - some
> others are still not changed, because I don't know what to do with them right
> now (see my questions in the previous letter).
>
> Could you take a look at the patch?
>
> I checked it on i386/x86_64 bootstrap and make check, and on Specs2000/2006
> (I checked only stability).
>

You use Pmode as the largest integer mode.  Is word_mode better
than Pmode since word_mode is DI and Pmode may be SI for x32.


--
H.J.
Michael Zolotukhin - May 15, 2013, 12:47 p.m.
Hi HJ,
> You use Pmode as the largest integer mode.  Is word_mode better
> than Pmode since word_mode is DI and Pmode may be SI for x32.
I updated the patch to use word_mode instead of Pmode.  Bootstrap,
make check and Specs2000 are ok.

Thanks, Michael

On 14 May 2013 19:55, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, May 14, 2013 at 7:34 AM, Michael Zolotukhin
> <michael.v.zolotukhin@gmail.com> wrote:
>> Hi,
>> I attached an updated version of the patch.  Looks like the 64-bit issue is
>> resolved in it (now a vector mode is explicitly chosen instead of TI- or
>> another integer mode).  Also, some of remarks are fixed in it - some
>> others are still not changed, because I don't know what to do with them right
>> now (see my questions in the previous letter).
>>
>> Could you take a look at the patch?
>>
>> I checked it on i386/x86_64 bootstrap and make check, and on Specs2000/2006
>> (I checked only stability).
>>
>
> You use Pmode as the largest integer mode.  Is word_mode better
> than Pmode since word_mode is DI and Pmode may be SI for x32.
>
>
> --
> H.J.
H.J. Lu - May 15, 2013, 3:45 p.m.
On Wed, May 15, 2013 at 5:47 AM, Michael Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> Hi HJ,
>> You use Pmode as the largest integer mode.  Is word_mode better
>> than Pmode since word_mode is DI and Pmode may be SI for x32.
> I updated the patch to use word_mode instead of Pmode.  Bootstrap,
> make check and Specs2000 are ok.
>
> Thanks, Michael
>
> On 14 May 2013 19:55, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Tue, May 14, 2013 at 7:34 AM, Michael Zolotukhin
>> <michael.v.zolotukhin@gmail.com> wrote:
>>> Hi,
>>> I attached an updated version of the patch.  Looks like the 64-bit issue is
>>> resolved in it (now a vector mode is explicitly chosen instead of TI- or
>>> another integer mode).  Also, some of remarks are fixed in it - some
>>> others are still not changed, because I don't know what to do with them right
>>> now (see my questions in the previous letter).
>>>
>>> Could you take a look at the patch?
>>>
>>> I checked it on i386/x86_64 bootstrap and make check, and on Specs2000/2006
>>> (I checked only stability).
>>>
>>
>> You use Pmode as the largest integer mode.  Is word_mode better
>> than Pmode since word_mode is DI and Pmode may be SI for x32.
>>
>>

You should include tests to verify that it works as
expected.


--
H.J.
Michael Zolotukhin - June 5, 2013, 2:10 p.m.
I'll prepare some tests shortly,
What about the rest questions?

Thanks, Michael

On 15 May 2013 19:45, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Wed, May 15, 2013 at 5:47 AM, Michael Zolotukhin
> <michael.v.zolotukhin@gmail.com> wrote:
>> Hi HJ,
>>> You use Pmode as the largest integer mode.  Is word_mode better
>>> than Pmode since word_mode is DI and Pmode may be SI for x32.
>> I updated the patch to use word_mode instead of Pmode.  Bootstrap,
>> make check and Specs2000 are ok.
>>
>> Thanks, Michael
>>
>> On 14 May 2013 19:55, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Tue, May 14, 2013 at 7:34 AM, Michael Zolotukhin
>>> <michael.v.zolotukhin@gmail.com> wrote:
>>>> Hi,
>>>> I attached an updated version of the patch.  Looks like the 64-bit issue is
>>>> resolved in it (now a vector mode is explicitly chosen instead of TI- or
>>>> another integer mode).  Also, some of remarks are fixed in it - some
>>>> others are still not changed, because I don't know what to do with them right
>>>> now (see my questions in the previous letter).
>>>>
>>>> Could you take a look at the patch?
>>>>
>>>> I checked it on i386/x86_64 bootstrap and make check, and on Specs2000/2006
>>>> (I checked only stability).
>>>>
>>>
>>> You use Pmode as the largest integer mode.  Is word_mode better
>>> than Pmode since word_mode is DI and Pmode may be SI for x32.
>>>
>>>
>
> You should include tests to verify that it works as
> expected.
>
>
> --
> H.J.
Michael Zolotukhin - June 20, 2013, 1:16 p.m.
Hi,
I added two tests to verify we generate vector instructions when
vector_loop is used.
Is the patch ok with that change?

Thanks, Michael

On 5 June 2013 18:10, Michael Zolotukhin <michael.v.zolotukhin@gmail.com> wrote:
> I'll prepare some tests shortly,
> What about the rest questions?
>
> Thanks, Michael
>
> On 15 May 2013 19:45, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Wed, May 15, 2013 at 5:47 AM, Michael Zolotukhin
>> <michael.v.zolotukhin@gmail.com> wrote:
>>> Hi HJ,
>>>> You use Pmode as the largest integer mode.  Is word_mode better
>>>> than Pmode since word_mode is DI and Pmode may be SI for x32.
>>> I updated the patch to use word_mode instead of Pmode.  Bootstrap,
>>> make check and Specs2000 are ok.
>>>
>>> Thanks, Michael
>>>
>>> On 14 May 2013 19:55, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>> On Tue, May 14, 2013 at 7:34 AM, Michael Zolotukhin
>>>> <michael.v.zolotukhin@gmail.com> wrote:
>>>>> Hi,
>>>>> I attached an updated version of the patch.  Looks like the 64-bit issue is
>>>>> resolved in it (now a vector mode is explicitly chosen instead of TI- or
>>>>> another integer mode).  Also, some of remarks are fixed in it - some
>>>>> others are still not changed, because I don't know what to do with them right
>>>>> now (see my questions in the previous letter).
>>>>>
>>>>> Could you take a look at the patch?
>>>>>
>>>>> I checked it on i386/x86_64 bootstrap and make check, and on Specs2000/2006
>>>>> (I checked only stability).
>>>>>
>>>>
>>>> You use Pmode as the largest integer mode.  Is word_mode better
>>>> than Pmode since word_mode is DI and Pmode may be SI for x32.
>>>>
>>>>
>>
>> You should include tests to verify that it works as
>> expected.
>>
>>
>> --
>> H.J.
>
>
>
> --
> ---
> Best regards,
> Michael V. Zolotukhin,
> Software Engineer
> Intel Corporation.
Michael Zolotukhin - June 20, 2013, 4:56 p.m.
It seems that one of the tests needed a small fix.
Attached is a corrected version.

On 20 June 2013 17:16, Michael Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> Hi,
> I added two tests to verify we generate vector instructions when
> vector_loop is used.
> Is the patch ok with that change?
>
> Thanks, Michael
>
> On 5 June 2013 18:10, Michael Zolotukhin <michael.v.zolotukhin@gmail.com> wrote:
>> I'll prepare some tests shortly,
>> What about the rest questions?
>>
>> Thanks, Michael
>>
>> On 15 May 2013 19:45, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Wed, May 15, 2013 at 5:47 AM, Michael Zolotukhin
>>> <michael.v.zolotukhin@gmail.com> wrote:
>>>> Hi HJ,
>>>>> You use Pmode as the largest integer mode.  Is word_mode better
>>>>> than Pmode since word_mode is DI and Pmode may be SI for x32.
>>>> I updated the patch to use word_mode instead of Pmode.  Bootstrap,
>>>> make check and Specs2000 are ok.
>>>>
>>>> Thanks, Michael
>>>>
>>>> On 14 May 2013 19:55, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>> On Tue, May 14, 2013 at 7:34 AM, Michael Zolotukhin
>>>>> <michael.v.zolotukhin@gmail.com> wrote:
>>>>>> Hi,
>>>>>> I attached an updated version of the patch.  Looks like the 64-bit issue is
>>>>>> resolved in it (now a vector mode is explicitly chosen instead of TI- or
>>>>>> another integer mode).  Also, some of remarks are fixed in it - some
>>>>>> others are still not changed, because I don't know what to do with them right
>>>>>> now (see my questions in the previous letter).
>>>>>>
>>>>>> Could you take a look at the patch?
>>>>>>
>>>>>> I checked it on i386/x86_64 bootstrap and make check, and on Specs2000/2006
>>>>>> (I checked only stability).
>>>>>>
>>>>>
>>>>> You use Pmode as the largest integer mode.  Is word_mode better
>>>>> than Pmode since word_mode is DI and Pmode may be SI for x32.
>>>>>
>>>>>
>>>
>>> You should include tests to verify that it works as
>>> expected.
>>>
>>>
>>> --
>>> H.J.
>>
>>
>>
>> --
>> ---
>> Best regards,
>> Michael V. Zolotukhin,
>> Software Engineer
>> Intel Corporation.
>
>
>
> --
> ---
> Best regards,
> Michael V. Zolotukhin,
> Software Engineer
> Intel Corporation.
Michael Zolotukhin - June 25, 2013, 1:36 p.m.
Ping.

On 20 June 2013 20:56, Michael Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> It seems that one of the tests needed a small fix.
> Attached is a corrected version.
>
> On 20 June 2013 17:16, Michael Zolotukhin
> <michael.v.zolotukhin@gmail.com> wrote:
>> Hi,
>> I added two tests to verify we generate vector instructions when
>> vector_loop is used.
>> Is the patch ok with that change?
>>
>> Thanks, Michael
>>
>> On 5 June 2013 18:10, Michael Zolotukhin <michael.v.zolotukhin@gmail.com> wrote:
>>> I'll prepare some tests shortly,
>>> What about the rest questions?
>>>
>>> Thanks, Michael
>>>
>>> On 15 May 2013 19:45, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>> On Wed, May 15, 2013 at 5:47 AM, Michael Zolotukhin
>>>> <michael.v.zolotukhin@gmail.com> wrote:
>>>>> Hi HJ,
>>>>>> You use Pmode as the largest integer mode.  Is word_mode better
>>>>>> than Pmode since word_mode is DI and Pmode may be SI for x32.
>>>>> I updated the patch to use word_mode instead of Pmode.  Bootstrap,
>>>>> make check and Specs2000 are ok.
>>>>>
>>>>> Thanks, Michael
>>>>>
>>>>> On 14 May 2013 19:55, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>>> On Tue, May 14, 2013 at 7:34 AM, Michael Zolotukhin
>>>>>> <michael.v.zolotukhin@gmail.com> wrote:
>>>>>>> Hi,
>>>>>>> I attached an updated version of the patch.  Looks like the 64-bit issue is
>>>>>>> resolved in it (now a vector mode is explicitly chosen instead of TI- or
>>>>>>> another integer mode).  Also, some of remarks are fixed in it - some
>>>>>>> others are still not changed, because I don't know what to do with them right
>>>>>>> now (see my questions in the previous letter).
>>>>>>>
>>>>>>> Could you take a look at the patch?
>>>>>>>
>>>>>>> I checked it on i386/x86_64 bootstrap and make check, and on Specs2000/2006
>>>>>>> (I checked only stability).
>>>>>>>
>>>>>>
>>>>>> You use Pmode as the largest integer mode.  Is word_mode better
>>>>>> than Pmode since word_mode is DI and Pmode may be SI for x32.
>>>>>>
>>>>>>
>>>>
>>>> You should include tests to verify that it works as
>>>> expected.
>>>>
>>>>
>>>> --
>>>> H.J.
>>>
>>>
>>>
>>> --
>>> ---
>>> Best regards,
>>> Michael V. Zolotukhin,
>>> Software Engineer
>>> Intel Corporation.
>>
>>
>>
>> --
>> ---
>> Best regards,
>> Michael V. Zolotukhin,
>> Software Engineer
>> Intel Corporation.
>
>
>
> --
> ---
> Best regards,
> Michael V. Zolotukhin,
> Software Engineer
> Intel Corporation.
Uros Bizjak - June 30, 2013, 8:55 a.m.
On Tue, Jun 25, 2013 at 3:36 PM, Michael Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> Ping.
>
> On 20 June 2013 20:56, Michael Zolotukhin
> <michael.v.zolotukhin@gmail.com> wrote:
>> It seems that one of the tests needed a small fix.
>> Attached is a corrected version.

Jan, do you plan to review this patch? It touches the area that you
worked on extensively some time ago, so your expert opinion would be
much appreciated here.

Thanks and best regards,
Uros.
Jan Hubicka - June 30, 2013, 9:06 a.m.
> On Tue, Jun 25, 2013 at 3:36 PM, Michael Zolotukhin
> <michael.v.zolotukhin@gmail.com> wrote:
> > Ping.
> >
> > On 20 June 2013 20:56, Michael Zolotukhin
> > <michael.v.zolotukhin@gmail.com> wrote:
> >> It seems that one of the tests needed a small fix.
> >> Attached is a corrected version.
> 
> Jan, do you plan to review this patch? It touches the area that you
> worked on extensively some time ago, so your expert opinion would be
> much appreciated here.

Yes, I looked on the patch in detail this week (I am currently on a travel with
sporadic internet access and my days before leaving was extremely hectic).  The
patch is OK except for the expr.c bits I cannot approve myself and they ought
to go in separately anyway.

The reason I took so long to decide on the patch is that I did some experiments
with the SSE loop and it is hard to find scenarios with the current
alignment/block size code where it is a win over library call on the current
implementation (or with Ondra's). So I did not find block size/chip combination
where the current inline SSE loop would be default codegen strategy. One of
cases where the inline loop wins is when the extra register pressure imposed by
the call causes IRA to not do a very good job on the surrounding code.

I think this can be improved by putting the loop kernels themselves offline
with custom calling conventions. Doing so seems bit hard however to get right
with unwind info.

In general, IMO it makes sense to add the inline codegen for both memcpy and memset
so it stays on the radar on all chips (I tested only core and recent AMD chips, not
Atoms etc.).

Michael, my apologies for taking so long to decide here. Do you think you can work on the
memset and move_by_pieces/store_by_pieces bits?
I think the by pieces code is a lot more promising then the actual inline SSE loops.

Honza
> 
> Thanks and best regards,
> Uros.
Ondrej Bilka - June 30, 2013, 7:22 p.m.
On Sun, Jun 30, 2013 at 11:06:47AM +0200, Jan Hubicka wrote:
> > On Tue, Jun 25, 2013 at 3:36 PM, Michael Zolotukhin
> > <michael.v.zolotukhin@gmail.com> wrote:
> > > Ping.
> > >
> > > On 20 June 2013 20:56, Michael Zolotukhin
> > > <michael.v.zolotukhin@gmail.com> wrote:
> > >> It seems that one of the tests needed a small fix.
> > >> Attached is a corrected version.
> > 
> > Jan, do you plan to review this patch? It touches the area that you
> > worked on extensively some time ago, so your expert opinion would be
> > much appreciated here.
> 
> Yes, I looked on the patch in detail this week (I am currently on a travel with
> sporadic internet access and my days before leaving was extremely hectic).  The
> patch is OK except for the expr.c bits I can not apporve myself and they ought
> to go in separately anyway.
> 
> The reason I took so long to decide on the patch is that I did some experiments
> with the SSE loop and it is hard to find scenarios with the current
> alignment/block size code where it is a win over library call on the current
> implementation (or with Ondra's). So I did not find block size/chip combination
> where the current inline SSE loop would be default codegen strategy. One of
> cases where inline loop wins is when the extra register pressure impossed by
> the call cause IRA to not do very good job on surrounding code.
>
It does not measure real world issues like icache pressure etc. I would
like more definitive data. One possibility is compile gcc with various
options and repeatedly run it to compile a simple project for a day or so.

I am interested up to which size inlining code makes sense; my guess is
that after 64 bytes and no FP registers inlining is unproductive.

I tried this to see how LD_PRELOADing memcpy affects performance, and it
is measurable after about a day; the performance impact is small — in my case
it was about 0.5% — so you really need that long to converge.

Ondra
Michael Zolotukhin - July 2, 2013, 2:37 p.m.
Hi guys,
Thanks for the review and for your responses - please find my answers
below.

> Yes, I looked on the patch in detail this week (I am currently on a travel with
> sporadic internet access and my days before leaving was extremely hectic).  The
> patch is OK except for the expr.c bits I can not apporve myself and they ought
> to go in separately anyway.
You probably meant changes in emit-rtl.c.  Originally they were made to
overcome an issue with defining alignment of MEM_REF, but now they seem
to be unneeded - everything works well even without them.

> The reason I took so long to decide on the patch is that I did some experiments
> with the SSE loop and it is hard to find scenarios with the current
> alignment/block size code where it is a win over library call on the current
> implementation (or with Ondra's). So I did not find block size/chip combination
> where the current inline SSE loop would be default codegen strategy. One of
> cases where inline loop wins is when the extra register pressure impossed by
> the call cause IRA to not do very good job on surrounding code.
As I mentioned earlier, this implementation doesn't pretend to be a
finished and completely tuned implementation - it's just a base for
future work in this direction.  But nevertheless, I think that even now
it could be used for cases when alignment is statically known and
blocksize is big enough - I doubt that libcall would beat inlining in
that case due to call overheads (but yes, currently that's just my
assumptions).

> Michael, my apologizes for taking so long to decide here.
No problem, Jan, thanks for the review.

> Do you think you can work on the
> memset and move_by_pieces/store_by_pieces bits?
> I think the by pieces code is a lot more promising then the actual inline SSE loops.
I think I'll find some time for implementing memset (in i386.c) - but
I'm going to restrict SSE loops only for the cases when we are zeroing
memory.  That means, we'll give up if we're filling memory with some
other value.  I suppose that zeroing is the most common use of memset,
so it'll be enough.  What do you think, does it sound reasonable?

As for move_by_pieces and store_by_pieces - I thought about them.
Implementation there would be much more complicated and it'd be much
harder to tune it - the reason is obvious: we need to make a common
implementation that will work for all architectures.
Instead of this, I've been thinking of adding yet another algorithm for
expanding memcpy/memset in i386.c - something like fully_unrolled_loop.
It'll perform copying without any loop at all and presumably will be
useful for copying small blocks.  With this implemented, we could change
MOVE_RATIO to always expand memmov/memset with some algorithm from
i386.c and not to use move_by_pieces/store_by_pieces.  I think such
implementation could be made much more optimized than any other in
move_by_pieces.

> It does not measure real world issues like icache pressure etc. I would
> like more definitive data. One possibility is compile gcc with various
> options and repeately run it to compile simple project for day or so.
I agree, but anyway we need some tests to measure performance - whether
these tests are Specs or some others.


I attached the updated patch - it's the same as the previous, but
without emit-rtl.c changes. Is it ok for trunk?

The patch is bootstrapped and regtested on i386 and x86-64.  Specs2000 are also
passing without regressions (I checked only stability, not performance).

Changelog:
2013-07-02  Michael Zolotukhin  <michael.v.zolotukhin@gmail.com>

        * config/i386/i386-opts.h (enum stringop_alg): Add vector_loop.
        * config/i386/i386.c (expand_set_or_movmem_via_loop): Use
        adjust_address instead of change_address to keep info about alignment.
        (emit_strmov): Remove.
        (emit_memmov): New function.
        (expand_movmem_epilogue): Refactor to properly handle bigger sizes.
        (expand_movmem_epilogue): Likewise and return updated rtx for
        destination.
        (expand_constant_movmem_prologue): Likewise and return updated rtx for
        destination and source.
        (decide_alignment): Refactor, handle vector_loop.
        (ix86_expand_movmem): Likewise.
        (ix86_expand_setmem): Likewise.
        * config/i386/i386.opt (Enum): Add vector_loop to option stringop_alg.


Thanks, Michael
> Honza
>
> Ondra

On 30 June 2013 23:22, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Sun, Jun 30, 2013 at 11:06:47AM +0200, Jan Hubicka wrote:
>> > On Tue, Jun 25, 2013 at 3:36 PM, Michael Zolotukhin
>> > <michael.v.zolotukhin@gmail.com> wrote:
>> > > Ping.
>> > >
>> > > On 20 June 2013 20:56, Michael Zolotukhin
>> > > <michael.v.zolotukhin@gmail.com> wrote:
>> > >> It seems that one of the tests needed a small fix.
>> > >> Attached is a corrected version.
>> >
>> > Jan, do you plan to review this patch? It touches the area that you
>> > worked on extensively some time ago, so your expert opinion would be
>> > much appreciated here.
>>
>> Yes, I looked on the patch in detail this week (I am currently on a travel with
>> sporadic internet access and my days before leaving was extremely hectic).  The
>> patch is OK except for the expr.c bits I can not apporve myself and they ought
>> to go in separately anyway.
>>
>> The reason I took so long to decide on the patch is that I did some experiments
>> with the SSE loop and it is hard to find scenarios with the current
>> alignment/block size code where it is a win over library call on the current
>> implementation (or with Ondra's). So I did not find block size/chip combination
>> where the current inline SSE loop would be default codegen strategy. One of
>> cases where inline loop wins is when the extra register pressure impossed by
>> the call cause IRA to not do very good job on surrounding code.
>>
> It does not measure real world issues like icache pressure etc. I would
> like more definitive data. One possibility is compile gcc with various
> options and repeately run it to compile simple project for day or so.
>
> I am interested upto which size inlining code makes sense, my guess is
> that after 64 bytes and no FP registers inlining is unproductive.
>
> I tried this to see how LD_PRELOADing memcpy affects performance and it
> is measurable after about day, a performance impact is small in my case
> it was about 0.5% so you really need that long to converge.
>
> Ondra


--
---
Best regards,
Michael V. Zolotukhin,
Software Engineer
Intel Corporation.
Michael Zolotukhin - July 5, 2013, 7:58 a.m.
Ping.

On 2 July 2013 18:37, Michael Zolotukhin <michael.v.zolotukhin@gmail.com> wrote:
> Hi guys,
> Thanks for the review and for your responses - please find my answers
> below.
>
>> Yes, I looked on the patch in detail this week (I am currently on a travel with
>> sporadic internet access and my days before leaving was extremely hectic).  The
>> patch is OK except for the expr.c bits I can not apporve myself and they ought
>> to go in separately anyway.
> You probably meant changes in emit-rtl.c.  Originally they were made to
> overcome an issue with defining alignment of MEM_REF, but now they seem
> to be unneeded - everything works well even without them.
>
>> The reason I took so long to decide on the patch is that I did some experiments
>> with the SSE loop and it is hard to find scenarios with the current
>> alignment/block size code where it is a win over library call on the current
>> implementation (or with Ondra's). So I did not find block size/chip combination
>> where the current inline SSE loop would be default codegen strategy. One of
>> cases where inline loop wins is when the extra register pressure impossed by
>> the call cause IRA to not do very good job on surrounding code.
> As I mentioned earlier, this implementation doesn't pretend to be a
> finished and completely tuned implementation - it's just a base for
> future work in this direction.  But nevertheless, I think that even now
> it could be used for cases when alignment is statically known and
> blocksize is big enough - I doubt that libcall would beat inlining in
> that case due to call overheads (but yes, currently that's just my
> assumptions).
>
>> Michael, my apologizes for taking so long to decide here.
> No problem, Jan, thanks for the review.
>
>> Do you think you can work on the
>> memset and move_by_pieces/store_by_pieces bits?
>> I think the by pieces code is a lot more promising then the actual inline SSE loops.
> I think I'll find some time for implementing memset (in i386.c) - but
> I'm going to restrict SSE loops only for the cases when we are zeroing
> memory.  That means, we'll give up if we're filling memory with some
> other value.  I suppose that zeroing is the most common use of memset,
> so it'll be enough.  What do you think, does it sound reasonable?
>
> As for move_by_pieces and store_by_pieces - I thought about them.
> Implementation there would be much more complicated and it'd be much
> harder to tune it - the reason is obvious: we need to make common
> implementation which will work for all architectures.
> Instead of this, I've been thinking of adding yet another algorithm for
> expanding memcpy/memset in i386.c - something like fully_unrolled_loop.
> It'll perform copying without any loop at all and assumingly will be
> useful for copying small blocks.  With this implemented, we could change
> MOVE_RATIO to always expand memmov/memset with some algorithm from
> i386.c and not to use move_by_pieces/store_by_pieces.  I think such
> implementation could be made much more optimized than any other in
> move_by_pieces.
>
>> It does not measure real world issues like icache pressure etc. I would
>> like more definitive data. One possibility is compile gcc with various
>> options and repeately run it to compile simple project for day or so.
> I agree, but anyway we need some tests to measure performance - whether
> these tests are Specs or some others.
>
>
> I attached the updated patch - it's the same as the previous, but
> without emit-rtl.c changes. Is it ok for trunk?
>
> The patch is bootstrapped and regtested on i386 and x86-64.  Specs2000 are also
> passing without regressions (I checked only stability, not performance).
>
> Changelog:
> 2013-07-02  Michael Zolotukhin  <michael.v.zolotukhin@gmail.com>
>
>         * config/i386/i386-opts.h (enum stringop_alg): Add vector_loop.
>         * config/i386/i386.c (expand_set_or_movmem_via_loop): Use
>         adjust_address instead of change_address to keep info about alignment.
>         (emit_strmov): Remove.
>         (emit_memmov): New function.
>         (expand_movmem_epilogue): Refactor to properly handle bigger sizes.
>         (expand_movmem_epilogue): Likewise and return updated rtx for
>         destination.
>         (expand_constant_movmem_prologue): Likewise and return updated rtx for
>         destination and source.
>         (decide_alignment): Refactor, handle vector_loop.
>         (ix86_expand_movmem): Likewise.
>         (ix86_expand_setmem): Likewise.
>         * config/i386/i386.opt (Enum): Add vector_loop to option stringop_alg.
>
>
> Thanks, Michael
>> Honza
>>
>> Ondra
>
> On 30 June 2013 23:22, Ondřej Bílka <neleai@seznam.cz> wrote:
>> On Sun, Jun 30, 2013 at 11:06:47AM +0200, Jan Hubicka wrote:
>>> > On Tue, Jun 25, 2013 at 3:36 PM, Michael Zolotukhin
>>> > <michael.v.zolotukhin@gmail.com> wrote:
>>> > > Ping.
>>> > >
>>> > > On 20 June 2013 20:56, Michael Zolotukhin
>>> > > <michael.v.zolotukhin@gmail.com> wrote:
>>> > >> It seems that one of the tests needed a small fix.
>>> > >> Attached is a corrected version.
>>> >
>>> > Jan, do you plan to review this patch? It touches the area that you
>>> > worked on extensively some time ago, so your expert opinion would be
>>> > much appreciated here.
>>>
>>> Yes, I looked on the patch in detail this week (I am currently on a travel with
>>> sporadic internet access and my days before leaving was extremely hectic).  The
>>> patch is OK except for the expr.c bits I can not apporve myself and they ought
>>> to go in separately anyway.
>>>
>>> The reason I took so long to decide on the patch is that I did some experiments
>>> with the SSE loop and it is hard to find scenarios with the current
>>> alignment/block size code where it is a win over library call on the current
>>> implementation (or with Ondra's). So I did not find block size/chip combination
>>> where the current inline SSE loop would be default codegen strategy. One of
>>> cases where inline loop wins is when the extra register pressure impossed by
>>> the call cause IRA to not do very good job on surrounding code.
>>>
>> It does not measure real world issues like icache pressure etc. I would
>> like more definitive data. One possibility is compile gcc with various
>> options and repeately run it to compile simple project for day or so.
>>
>> I am interested upto which size inlining code makes sense, my guess is
>> that after 64 bytes and no FP registers inlining is unproductive.
>>
>> I tried this to see how LD_PRELOADing memcpy affects performance and it
>> is measurable after about day, a performance impact is small in my case
>> it was about 0.5% so you really need that long to converge.
>>
>> Ondra
>
>
> --
> ---
> Best regards,
> Michael V. Zolotukhin,
> Software Engineer
> Intel Corporation.
Jan Hubicka - July 5, 2013, 11:25 a.m.
> > I agree, but anyway we need some tests to measure performance - whether
> > these tests are Specs or some others.
> >
> >
> > I attached the updated patch - it's the same as the previous, but
> > without emit-rtl.c changes. Is it ok for trunk?

Yes, this patch is OK (I meant my previous mail as an approval).
There are a lot of things we can do about string operations; taking incremental steps is a good
plan.

Honza
Kirill Yukhin - July 8, 2013, 6:49 a.m.
Hello,

> Yes, this patch is OK (I meant my previous mail as an approval).
> There is a lot of things we can do about string operations, taking incremental steps is good
> plan.


Changes were checked into trunk:
http://gcc.gnu.org/ml/gcc-cvs/2013-07/msg00179.html

Thanks, K
Michael Zolotukhin - July 8, 2013, 6:56 a.m.
> Changes were checked into trunk:
> http://gcc.gnu.org/ml/gcc-cvs/2013-07/msg00179.html
Thanks, Kirill

>> There is a lot of things we can do about string operations, taking incremental steps is good
>> plan.
Then my next step will be implementation of vector_loop for memset
with 0.  Thanks for the feedback and review!

Best regards, Michael

On 8 July 2013 10:49, Kirill Yukhin <kirill.yukhin@gmail.com> wrote:
> Hello,
>
>> Yes, this patch is OK (I meant my previous mail as an approval).
>> There is a lot of things we can do about string operations, taking incremental steps is good
>> plan.
>
>
> Changes were checked into trunk:
> http://gcc.gnu.org/ml/gcc-cvs/2013-07/msg00179.html
>
> Thanks, K



--
---
Best regards,
Michael V. Zolotukhin,
Software Engineer
Intel Corporation.
Eric Botcazou - Sept. 3, 2013, 7:01 p.m.
> Changes were checked into trunk:
> http://gcc.gnu.org/ml/gcc-cvs/2013-07/msg00179.html

The patch miscompiles the MPFR library on x86 Pentium Pro.  Reduced testcase 
attached, compile for x86 with -mtune=pentiumpro.
Michael Zolotukhin - Sept. 3, 2013, 7:05 p.m.
> The patch miscompiles the MPFR library on x86 Pentium Pro.  Reduced testcase 
> attached, compile for x86 with -mtune=pentiumpro.
Thanks, I'll look at that in the near future.

Best regards, Michael
H.J. Lu - Sept. 3, 2013, 7:25 p.m.
On Tue, Sep 3, 2013 at 12:05 PM, Michael V. Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
>> The patch miscompiles the MPFR library on x86 Pentium Pro.  Reduced testcase
>> attached, compile for x86 with -mtune=pentiumpro.
> Thanks, I'll look at that in the nearest future.
>
> Best regards, Michael

The problem is

        testb   $2, %bl
        je      .L6

It should be cmpb instead of testb.
H.J. Lu - Sept. 6, 2013, 4:58 p.m.
On Tue, Sep 3, 2013 at 12:01 PM, Eric Botcazou <ebotcazou@adacore.com> wrote:
>> Changes were checked into trunk:
>> http://gcc.gnu.org/ml/gcc-cvs/2013-07/msg00179.html
>
> The patch miscompiles the MPFR library on x86 Pentium Pro.  Reduced testcase
> attached, compile for x86 with -mtune=pentiumpro.
>

The change in question has

@@ -23169,7 +23150,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx
count_exp, rtx align_exp,

   if (count_exp != const0_rtx && epilogue_size_needed > 1)
     expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
-                           epilogue_size_needed);
+                           size_needed);
   if (jump_around_label)
     emit_label (jump_around_label);
   return true;

Michael, why did you change epilogue_size_needed to size_needed
here?  It looks wrong to me.
Michael Zolotukhin - Sept. 6, 2013, 8:50 p.m.
> Michael, why did you change epilogue_size_needed to size_needed
> here?  It looks wrong to me.
This function was changed in several places, and the meanings of
'size_needed' and 'epilogue_size_needed' could have changed. It
needs more careful examination and I'll do it shortly.
Briefly, I significantly changed prologue and epilogue generation
routines and that obviously caused that regression. I guess that in
some of these function I began to look at size_needed instead of
desired_align (usually, size_needed >= desired_align, but for pentium
pro that's not true). I hope to dig into it on the weekend.

Thanks for reporting and investigation.

Michael
> H.J.

Patch

diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
index 73a59b5..edb59da 100644
--- a/gcc/emit-rtl.c
+++ b/gcc/emit-rtl.c
@@ -1565,6 +1565,18 @@  get_mem_align_offset (rtx mem, unsigned int align)
 	  expr = inner;
 	}
     }
+  else if (TREE_CODE (expr) == MEM_REF)
+    {
+      tree base = TREE_OPERAND (expr, 0);
+      tree byte_offset = TREE_OPERAND (expr, 1);
+      if (TREE_CODE (base) != ADDR_EXPR
+	  || TREE_CODE (byte_offset) != INTEGER_CST)
+	return -1;
+      if (!DECL_P (TREE_OPERAND (base, 0))
+	  || DECL_ALIGN (TREE_OPERAND (base, 0)) < align)

You can use TYPE_ALIGN here? In general can't we replace all the GIMPLE
handling by get_object_alignment?

+	return -1;
+      offset += tree_low_cst (byte_offset, 1);