Patchwork Add value range support into memcpy/memset expansion

login
register
mail settings
Submitter Jan Hubicka
Date Sept. 28, 2013, 10:05 p.m.
Message ID <20130928220501.GA7126@kam.mff.cuni.cz>
Download mbox | patch
Permalink /patch/278780/
State New
Headers show

Comments

Jan Hubicka - Sept. 28, 2013, 10:05 p.m.
> > Nice extension. Test cases would be great to have.
> Fore those you need i386 changes to actually use the info.  I will post that
> after some cleanup and additional testing.

Hi,
since I already caught your attention, here is the target specific part for
comments.

this patch implements memcpy/memset prologues and epilogues as suggested by
Ondrej Bilka.  His glibc implementation uses an IMO very smart trick: a single
misaligned move to copy the first N and last N bytes of the block.  The remainder
of the block is then copied by the usual loop that gets aligned to the proper
address.

This leads to partial memory stall, but that is handled well by modern x86
chips.

For example in the following testcase:
char *a;
char *b;
t1()
{
  memcpy (a,b,140);
}

We now produce:
        movq    b(%rip), %rsi
        movq    a(%rip), %rcx
        movq    (%rsi), %rax <- first 8 bytes are moved
        leaq    8(%rcx), %rdi
        andq    $-8, %rdi   <- dest is aligned
        movq    %rax, (%rcx)
        movq    132(%rsi), %rax  <- last 8 bytes are moved
        movq    %rax, 132(%rcx)
        subq    %rdi, %rcx  <- alignment is subtracted from count
        subq    %rcx, %rsi  <- source is aligned
        addl    $140, %ecx  <- normal copying of 8 byte chunks
        shrl    $3, %ecx
        rep; movsq
        ret

Instead of:
	movq	a(%rip), %rdi
	movq	b(%rip), %rsi
	movl	$140, %eax
	testb	$1, %dil
	jne	.L28
	testb	$2, %dil
	jne	.L29
.L3:
	testb	$4, %dil
	jne	.L30
.L4:
	movl	%eax, %ecx
	xorl	%edx, %edx
	shrl	$3, %ecx
	testb	$4, %al
	rep movsq
	je	.L5
	movl	(%rsi), %edx
	movl	%edx, (%rdi)
	movl	$4, %edx
.L5:
	testb	$2, %al
	je	.L6
	movzwl	(%rsi,%rdx), %ecx
	movw	%cx, (%rdi,%rdx)
	addq	$2, %rdx
.L6:
	testb	$1, %al
	je	.L8
	movzbl	(%rsi,%rdx), %eax
	movb	%al, (%rdi,%rdx)
.L8:
	rep
	ret
	.p2align 4,,10
	.p2align 3
.L28:
	movzbl	(%rsi), %eax
	addq	$1, %rsi
	movb	%al, (%rdi)
	addq	$1, %rdi
	movl	$139, %eax
	testb	$2, %dil
	je	.L3
	.p2align 4,,10
	.p2align 3
.L29:
	movzwl	(%rsi), %edx
	subl	$2, %eax
	addq	$2, %rsi
	movw	%dx, (%rdi)
	addq	$2, %rdi
	testb	$4, %dil
	je	.L4
	.p2align 4,,10
	.p2align 3
.L30:
	movl	(%rsi), %edx
	subl	$4, %eax
	addq	$4, %rsi
	movl	%edx, (%rdi)
	addq	$4, %rdi
	jmp	.L4

With the proposed value range code we can now take advantage of it even for
non-constant moves.  Somewhat artificial testcase:

char *p,*q;
t(unsigned int a)
{
  if (a>8 && a<100)
    memcpy(q,p,a);

}

Still generates pretty much the same code (while -minline-all-stringops code on
mainline is just horrible):
        leal    -9(%rdi), %edx
        movl    %edi, %eax
        cmpl    $90, %edx
        jbe     .L5
        rep; ret
        .p2align 4,,10
        .p2align 3
.L5:
        movq    p(%rip), %rsi
        movq    q(%rip), %rcx
        movq    (%rsi), %rdx
        movq    %rdx, (%rcx)
        movl    %edi, %edx
        movq    -8(%rsi,%rdx), %rdi
        movq    %rdi, -8(%rcx,%rdx)
        leaq    8(%rcx), %rdi
        andq    $-8, %rdi
        subq    %rdi, %rcx
        subq    %rcx, %rsi
        addl    %eax, %ecx
        shrl    $3, %ecx
        rep; movsq
        ret

Of course it is quite common to know only upper bound on the block.  In this case
we need to generate prologue for first few bytes:
char *p,*q;
t(unsigned int a)
{
  if (a<100)
    memcpy(q,p,a);

}
t:
.LFB0:
        .cfi_startproc
        cmpl    $99, %edi
        jbe     .L15
.L7:
        rep; ret
        .p2align 4,,10
        .p2align 3
.L15:
        cmpl    $8, %edi
        movq    q(%rip), %rdx
        movq    p(%rip), %rsi
        jae     .L3
        testb   $4, %dil
        jne     .L16
        testl   %edi, %edi
        je      .L7
        movzbl  (%rsi), %eax
        testb   $2, %dil
        movb    %al, (%rdx)
        je      .L7
        movl    %edi, %edi
        movzwl  -2(%rsi,%rdi), %eax
        movw    %ax, -2(%rdx,%rdi)
        ret
        .p2align 4,,10
        .p2align 3
.L3:
        movq    (%rsi), %rax
        movq    %rax, (%rdx)
        movl    %edi, %eax
        movq    -8(%rsi,%rax), %rcx
        movq    %rcx, -8(%rdx,%rax)
        leaq    8(%rdx), %rax
        andq    $-8, %rax
        subq    %rax, %rdx
        addl    %edx, %edi
        subq    %rdx, %rsi
        shrl    $3, %edi
        movl    %edi, %ecx
        movq    %rax, %rdi
        rep; movsq
        ret
        .p2align 4,,10
        .p2align 3
.L16:
        movl    (%rsi), %eax
        movl    %edi, %edi
        movl    %eax, (%rdx)
        movl    -4(%rsi,%rdi), %eax
        movl    %eax, -4(%rdx,%rdi)
        ret
        .cfi_endproc
.LFE0:

Mainline would output a libcall here (because size is unknown to it) and with
inlining all stringops it winds up 210 bytes of code instead of 142 bytes
above.

Unfortunately the following testcase:
char *p,*q;
t(int a)
{
  if (a<100)
    memcpy(q,p,a);

}
Won't get inlined.  This is because A is known to be smaller than 100, which
results in an anti range after conversion to size_t.  This anti range allows very
large values (above INT_MAX) and thus we do not know the block size.
I am not sure if the sane range can be recovered somehow.  If not, maybe
this is common enough to add support for "probable" upper bound parameter to
the template.

Use of value ranges makes it harder to choose the proper algorithm since the average
size is no longer known.  For the moment I take a simple average of the lower and
upper bound, but this is wrong.

Libcall starts to win only for pretty large blocks (over 4GB definitely), so it makes
sense to inline functions with range 0...4096 even though the cost tables tell us
to expand a libcall for everything bigger than 140 bytes: if blocks are small we will
get a noticeable win and if blocks are big, we won't lose much.

I am considering assigning value ranges to the algorithms, too, for more sane
choices in decide_alg.

I also think the misaligned move trick can/should be performed by
move_by_pieces and we ought to consider sane use of SSE - the current vector_loop
with an unrolling factor of 4 seems a bit extreme.  At least Bulldozer is happy with
2 and I would expect SSE moves to be especially useful for moving blocks with
known size where they are not used at all.

Currently I disabled misaligned move prologues/epilogues for Michael's vector
loop path since they end up longer than the traditional code (that uses a loop
for the epilogue)

I will deal with that incrementally.

Bootstrapped/regtested x86_64-linux with and without misaligned move prologues
and also with -minline-all-stringops.

I plan to do some further testing tomorrow and add testcases while waiting for
the generic part to be reviewed.
Comments are welcome.

	* i386.h (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES): New macro.
	* i386.md (movmem, setmem insn patterns): Add new parameters for value
	range.
	* i386-protos.h (ix86_expand_movmem, ix86_expand_setmem): Add new
	parameters for value range.
	* i386.c (expand_small_movmem): New function.
	(expand_movmem_prologue_epilogue_by_misaligned_moves): New function.
	(expand_small_setmem): New function.
	(expand_setmem_prologue_epilogue_by_misaligned_moves): New function.
	(alg_usable_p): New function.
	(decide_alg): Add zero_memset parameter; cleanup; consider value ranges.
	(ix86_expand_movmem): Add value range parameters; update block comment;
	add support for value ranges; add support for misaligned move prologues.
	(ix86_expand_setmem): Likewise.
Xinliang David Li - Sept. 29, 2013, midnight
On Sat, Sep 28, 2013 at 3:05 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> > Nice extension. Test cases would be great to have.
>> Fore those you need i386 changes to actually use the info.  I will post that
>> after some cleanup and additional testing.
>
> Hi,
> since I already caught your attention, here is the target specific part for
> comments.
>
> this patch implements memcpy/memset prologues and epilogues as suggested by
> Ondrej Bilka.  His glibc implementation use IMO very smart trick with single
> misaligned move to copy first N and last N bytes of the block.  The remainder
> of the block is then copied by the usual loop that gets aligned to the proper
> address.
>
> This leads to partial memory stall, but that is handled well by modern x86
> chips.
>
> For example in the following testcase:
> char *a;
> char *b;
> t1()
> {
>   memcpy (a,b,140);
> }
>
> We now produce:
>         movq    b(%rip), %rsi
>         movq    a(%rip), %rcx
>         movq    (%rsi), %rax <- first 8 bytes are moved
>         leaq    8(%rcx), %rdi
>         andq    $-8, %rdi   <- dest is aligned
>         movq    %rax, (%rcx)
>         movq    132(%rsi), %rax  <- last 8 bytes are moved
>         movq    %rax, 132(%rcx)
>         subq    %rdi, %rcx  <- alignment is subtracted from count

>         subq    %rcx, %rsi  <- source is aligned

This (source aligned) is not always true, but nevertheless the
sequence is very tight.

>         addl    $140, %ecx  <- normal copying of 8 byte chunks
>         shrl    $3, %ecx
>         rep; movsq
>         ret

> Of course it is quite common to know only upper bound on the block.  In this case
> we need to generate prologue for first few bytes:
> char *p,*q;
> t(unsigned int a)
> {
>   if (a<100)
>     memcpy(q,p,a);
>
> }
> t:
> .LFB0:
>         .cfi_startproc
>         cmpl    $99, %edi
>         jbe     .L15
> .L7:
>         rep; ret
>         .p2align 4,,10
>         .p2align 3
> .L15:
>         cmpl    $8, %edi
>         movq    q(%rip), %rdx
>         movq    p(%rip), %rsi
>         jae     .L3
>         testb   $4, %dil
>         jne     .L16
>         testl   %edi, %edi
>         je      .L7
>         movzbl  (%rsi), %eax
>         testb   $2, %dil
>         movb    %al, (%rdx)
>         je      .L7
>         movl    %edi, %edi
>         movzwl  -2(%rsi,%rdi), %eax
>         movw    %ax, -2(%rdx,%rdi)
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L3:
>         movq    (%rsi), %rax
>         movq    %rax, (%rdx)
>         movl    %edi, %eax
>         movq    -8(%rsi,%rax), %rcx
>         movq    %rcx, -8(%rdx,%rax)
>         leaq    8(%rdx), %rax
>         andq    $-8, %rax
>         subq    %rax, %rdx
>         addl    %edx, %edi
>         subq    %rdx, %rsi
>         shrl    $3, %edi
>         movl    %edi, %ecx
>         movq    %rax, %rdi
>         rep; movsq
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L16:
>         movl    (%rsi), %eax
>         movl    %edi, %edi
>         movl    %eax, (%rdx)
>         movl    -4(%rsi,%rdi), %eax
>         movl    %eax, -4(%rdx,%rdi)
>         ret
>         .cfi_endproc
> .LFE0:
>
> Mainline would output a libcall here (because size is unknown to it) and with
> inlining all stringops it winds up 210 bytes of code instead of 142 bytes
> above.
>
> Unforutnately the following testcase:
> char *p,*q;
> t(int a)
> {
>   if (a<100)
>     memcpy(q,p,a);
>
> }
> Won't get inlined.  This is because A is known to be smaller than 100 that
> results in anti range after conversion to size_t.  This anti range allows very
> large values (above INT_MAX) and thus we do not know the block size.
> I am not sure if the sane range can be recovered somehow.  If not, maybe
> this is common enough to add support for "probable" upper bound parameter to
> the template.

Do we know if there is real code that intentionally does that, other
than security flaws resulting from an improperly done range check?

I think by default GCC should assume the memcpy size range is (0, 100)
here with perhaps an option to override it.

thanks,

David

>
> Use of value ranges makes it harder to choose proper algorithm since the average
> size is no longer known.  For the moment I take simple average of lower and upper
> bound, but this is wrong.
>
> Libcall starts to win only for pretty large blocks (over 4GB definitely) so it makes
> sense to inline functions with range 0....4096 even though the cost tables tells
> to expand libcall for everything bigger than 140 bytes:  if blocks are small we will
> get noticeable win and if blocks are big, we won't lose much.
>
> I am considering assigning value ranges to the algorithms, too, for more sane
> choices in decide_alg.
>
> I also think the misaligned move trick can/should be performed by
> move_by_pieces and we ought to consider sane use of SSE - current vector_loop
> with unrolling factor of 4 seems bit extreme.  At least buldozer is happy with
> 2 and I would expect SSE moves to be especially useful for moving blocks with
> known size where they are not used at all.
>
> Currently I disabled misaligned move prologues/epilogues for Michael's vector
> loop path since they ends up longer than the traditional code (that use loop
> for epilogue)
>
> I will deal with that incrementally.
>
> Bootstrapped/regtested x86_64-linux with and without misaligned move prologues
> and also with -minline-all-stringop.
>
> I plan to do some furhter testing tomorrow and add testcases while waiting for
> the generic part to be reviewed.
> Comments are welcome.
>
>         * i386.h (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES): New macro.
>         * i386.md (movmem, setmem insn patterms): Add new parameters for value
>         range.
>         * i386-protos.h (ix86_expand_movmem, ix86_expand_setmem): Add new
>         parameters for value range.
>         * i386.c (expand_small_movmem): New function.
>         (expand_movmem_prologue_epilogue_by_misaligned_moves): New function.
>         (expand_small_setmem): New function.
>         (expand_setmem_prologue_epilogue_by_misaligned_moves): New funcion.
>         (alg_usable_p): New function.
>         (decide_alg): Add zero_memset parameter; cleanup; consider value ranges.
>         (ix86_expand_movmem): Add value range parameters; update block comment;
>         add support for value ranges; add support for misaligned move prologues.
>         (ix86_expand_setmem): Likewise.
> Index: config/i386/i386.h
> ===================================================================
> --- config/i386/i386.h  (revision 203004)
> +++ config/i386/i386.h  (working copy)
> @@ -378,6 +378,8 @@ extern unsigned char ix86_tune_features[
>         ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE]
>  #define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \
>         ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]
> +#define TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES \
> +       ix86_tune_features[X86_TUNE_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES]
>
>  /* Feature tests against the various architecture variations.  */
>  enum ix86_arch_indices {
> Index: config/i386/i386.md
> ===================================================================
> --- config/i386/i386.md (revision 203004)
> +++ config/i386/i386.md (working copy)
> @@ -15380,11 +15380,13 @@
>     (use (match_operand:SWI48 2 "nonmemory_operand"))
>     (use (match_operand:SWI48 3 "const_int_operand"))
>     (use (match_operand:SI 4 "const_int_operand"))
> -   (use (match_operand:SI 5 "const_int_operand"))]
> +   (use (match_operand:SI 5 "const_int_operand"))
> +   (use (match_operand:SI 6 "const_int_operand"))
> +   (use (match_operand:SI 7 ""))]
>    ""
>  {
>   if (ix86_expand_movmem (operands[0], operands[1], operands[2], operands[3],
> -                        operands[4], operands[5]))
> +                        operands[4], operands[5], operands[6], operands[7]))
>     DONE;
>   else
>     FAIL;
> @@ -15572,12 +15574,15 @@
>      (use (match_operand:QI 2 "nonmemory_operand"))
>      (use (match_operand 3 "const_int_operand"))
>      (use (match_operand:SI 4 "const_int_operand"))
> -    (use (match_operand:SI 5 "const_int_operand"))]
> +    (use (match_operand:SI 5 "const_int_operand"))
> +    (use (match_operand:SI 6 "const_int_operand"))
> +    (use (match_operand:SI 7 ""))]
>    ""
>  {
>   if (ix86_expand_setmem (operands[0], operands[1],
>                          operands[2], operands[3],
> -                        operands[4], operands[5]))
> +                        operands[4], operands[5],
> +                        operands[6], operands[7]))
>     DONE;
>   else
>     FAIL;
> Index: config/i386/x86-tune.def
> ===================================================================
> --- config/i386/x86-tune.def    (revision 203004)
> +++ config/i386/x86-tune.def    (working copy)
> @@ -230,3 +230,7 @@ DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CM
>     fp converts to destination register.  */
>  DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
>            m_SLM)
> +/* Use misaligned move to avoid need for conditionals for string operations.
> +   This is a win on targets that resulve resonably well partial memory stalls.  */
> +DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES, "misaligned_move_string_prologues_epilogues",
> +          m_GENERIC | m_CORE_ALL | m_AMD_MULTIPLE | m_SLM | m_ATOM)
> Index: config/i386/i386-protos.h
> ===================================================================
> --- config/i386/i386-protos.h   (revision 203004)
> +++ config/i386/i386-protos.h   (working copy)
> @@ -58,8 +58,8 @@ extern enum machine_mode ix86_cc_mode (e
>  extern int avx_vpermilp_parallel (rtx par, enum machine_mode mode);
>  extern int avx_vperm2f128_parallel (rtx par, enum machine_mode mode);
>
> -extern bool ix86_expand_movmem (rtx, rtx, rtx, rtx, rtx, rtx);
> -extern bool ix86_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx);
> +extern bool ix86_expand_movmem (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx);
> +extern bool ix86_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx);
>  extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
>
>  extern bool constant_address_p (rtx);
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 203004)
> +++ config/i386/i386.c  (working copy)
> @@ -22661,6 +22661,250 @@ expand_movmem_prologue (rtx destmem, rtx
>    return destmem;
>  }
>
> +/* Test if COUNT&SIZE is nonzero and if so, expand memmov
> +   sequence that is valid for SIZE..2*SIZE-1 bytes
> +   and jump to DONE_LABEL.  */
> +static void
> +expand_small_movmem (rtx destmem, rtx srcmem,
> +                    rtx destptr, rtx srcptr, rtx count,
> +                    int size, rtx done_label)
> +{
> +  rtx label = ix86_expand_aligntest (count, size, false);
> +  enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
> +  rtx modesize;
> +  int n;
> +
> +  if (size >= 32)
> +    mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
> +  else if (size >= 16)
> +    mode = TARGET_SSE ? V16QImode : DImode;
> +
> +  srcmem = change_address (srcmem, mode, srcptr);
> +  destmem = change_address (destmem, mode, destptr);
> +  modesize = GEN_INT (GET_MODE_SIZE (mode));
> +  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
> +    {
> +      emit_move_insn (destmem, srcmem);
> +      srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
> +      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
> +    }
> +
> +  srcmem = offset_address (srcmem, count, 1);
> +  destmem = offset_address (destmem, count, 1);
> +  srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
> +                          GET_MODE_SIZE (mode));
> +  destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
> +                           GET_MODE_SIZE (mode));
> +  emit_move_insn (destmem, srcmem);
> +  emit_jump_insn (gen_jump (done_label));
> +  emit_barrier ();
> +
> +  emit_label (label);
> +  LABEL_NUSES (label) = 1;
> +}
> +
> +/* Handle small memcpy (up to SIZE that is supposed to be small power of 2.
> +   and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN
> +   bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can
> +   proceed with an loop copying SIZE bytes at once.
> +   DONE_LABEL is a label after the whole copying sequence. The label is created
> +   on demand if *DONE_LABEL is NULL.
> +   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
> +   bounds after the initial copies.
> +
> +   DESTMEM/SRCMEM are memory expressions pointing to the copies block,
> +   DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
> +   we will dispatch to a library call for large blocks.
> +
> +   In pseudocode we do:
> +
> +   if (COUNT < SIZE)
> +     {
> +       Assume that SIZE is 4. Bigger sizes are handled analogously
> +       if (COUNT & 4)
> +        {
> +           copy 4 bytes from SRCPTR to DESTPTR
> +           copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
> +           goto done_label
> +        }
> +       if (!COUNT)
> +        goto done_label;
> +       copy 1 byte from SRCPTR to DESTPTR
> +       if (COUNT & 2)
> +        {
> +           copy 2 bytes from SRCPTR to DESTPTR
> +           copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
> +        }
> +     }
> +   else
> +     {
> +       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
> +       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
> +
> +       OLD_DESPTR = DESTPTR;
> +       Align DESTPTR up to DESIRED_ALIGN
> +       SRCPTR += DESTPTR - OLD_DESTPTR
> +       COUNT -= DEST_PTR - OLD_DESTPTR
> +       if (DYNAMIC_CHECK)
> +        Round COUNT down to multiple of SIZE
> +       << optional caller supplied zero size guard is here >>
> +       << optional caller suppplied dynamic check is here >>
> +       << caller supplied main copy loop is here >>
> +     }
> +   done_label:
> +  */
> +static void
> +expand_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
> +                                                    rtx *destptr, rtx *srcptr,
> +                                                    rtx *count,
> +                                                    rtx *done_label,
> +                                                    int size,
> +                                                    int desired_align,
> +                                                    int align,
> +                                                    unsigned HOST_WIDE_INT *min_size,
> +                                                    bool dynamic_check)
> +{
> +  rtx loop_label = NULL, label;
> +  enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 2);
> +  int n;
> +  rtx modesize;
> +  int prolog_size = 0;
> +
> +  if (size >= 32)
> +    mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
> +  else if (size >= 16)
> +    mode = TARGET_SSE ? V16QImode : DImode;
> +
> +  /* See if block is big or small, handle small blocks.  */
> +  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
> +    {
> +      int size2 = size;
> +      loop_label = gen_label_rtx ();
> +
> +      if (!*done_label)
> +       *done_label = gen_label_rtx ();
> +
> +      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
> +                              1, loop_label);
> +      size2 >>= 1;
> +
> +      /* Handle sizes > 3.  */
> +      for (;size2 > 2; size2 >>= 1)
> +       expand_small_movmem (destmem, srcmem, *destptr, *srcptr, *count,
> +                            size2, *done_label);
> +      /* Nothing to copy?  Jump to DONE_LABEL if so */
> +      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
> +                              1, *done_label);
> +
> +      /* Do a byte copy.  */
> +      srcmem = change_address (srcmem, QImode, *srcptr);
> +      destmem = change_address (destmem, QImode, *destptr);
> +      emit_move_insn (destmem, srcmem);
> +
> +      /* Handle sizes 2 and 3.  */
> +      label = ix86_expand_aligntest (*count, 2, false);
> +      srcmem = change_address (srcmem, HImode, *srcptr);
> +      destmem = change_address (destmem, HImode, *destptr);
> +      srcmem = offset_address (srcmem, *count, 1);
> +      destmem = offset_address (destmem, *count, 1);
> +      srcmem = offset_address (srcmem, GEN_INT (-2), 2);
> +      destmem = offset_address (destmem, GEN_INT (-2), 2);
> +      emit_move_insn (destmem, srcmem);
> +
> +      emit_label (label);
> +      LABEL_NUSES (label) = 1;
> +      emit_jump_insn (gen_jump (*done_label));
> +      emit_barrier ();
> +    }
> +  else
> +    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
> +               || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
> +
> +  /* Start memcpy for COUNT >= SIZE.  */
> +  if (loop_label)
> +    {
> +       emit_label (loop_label);
> +       LABEL_NUSES (loop_label) = 1;
> +    }
> +
> +  /* Copy first desired_align bytes.  */
> +  srcmem = change_address (srcmem, mode, *srcptr);
> +  destmem = change_address (destmem, mode, *destptr);
> +  modesize = GEN_INT (GET_MODE_SIZE (mode));
> +  for (n = 0; prolog_size < desired_align - align; n++)
> +    {
> +      emit_move_insn (destmem, srcmem);
> +      srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
> +      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
> +      prolog_size += GET_MODE_SIZE (mode);
> +    }
> +
> +
> +  /* Copy last SIZE bytes.  */
> +  srcmem = offset_address (srcmem, *count, 1);
> +  destmem = offset_address (destmem, *count, 1);
> +  srcmem = offset_address (srcmem,
> +                          GEN_INT (-size - prolog_size),
> +                          1);
> +  destmem = offset_address (destmem,
> +                           GEN_INT (-size - prolog_size),
> +                           1);
> +  emit_move_insn (destmem, srcmem);
> +  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
> +    {
> +      srcmem = offset_address (srcmem, modesize, 1);
> +      destmem = offset_address (destmem, modesize, 1);
> +      emit_move_insn (destmem, srcmem);
> +    }
> +
> +  /* Align destination.  */
> +  if (desired_align > 1 && desired_align > align)
> +    {
> +      rtx saveddest = *destptr;
> +
> +      gcc_assert (desired_align <= size);
> +      /* Align destptr up, place it to new register.  */
> +      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
> +                                     GEN_INT (prolog_size),
> +                                     NULL_RTX, 1, OPTAB_DIRECT);
> +      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
> +                                     GEN_INT (-desired_align),
> +                                     *destptr, 1, OPTAB_DIRECT);
> +      /* See how many bytes we skipped.  */
> +      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
> +                                      *destptr,
> +                                      saveddest, 1, OPTAB_DIRECT);
> +      /* Adjust srcptr and count.  */
> +      *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
> +                                     *srcptr, 1, OPTAB_DIRECT);
> +      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
> +                                   saveddest, *count, 1, OPTAB_DIRECT);
> +      /* We copied at most size + prolog_size.  */
> +      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
> +       *min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
> +      else
> +       *min_size = 0;
> +
> +      /* Our loops always round down the bock size, but for dispatch to library
> +        we need precise value.  */
> +      if (dynamic_check)
> +       *count = expand_simple_binop (GET_MODE (*count), AND, *count,
> +                                     GEN_INT (-size), *count, 1, OPTAB_DIRECT);
> +    }
> +  else
> +    {
> +      gcc_assert (prolog_size == 0);
> +      /* Decrease count, so we won't end up copying last word twice.  */
> +      if (!CONST_INT_P (*count))
> +       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
> +                                     constm1_rtx, *count, 1, OPTAB_DIRECT);
> +      else
> +       *count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
> +      if (*min_size)
> +       *min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
> +    }
> +}
> +
>  /* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN.
>     ALIGN_BYTES is how many bytes need to be copied.
>     The function updates DST and SRC, namely, it sets proper alignment.
> @@ -22749,6 +22993,190 @@ expand_setmem_prologue (rtx destmem, rtx
>    gcc_assert (desired_alignment <= 8);
>  }
>
> +/* Test if COUNT&SIZE is nonzero and if so, expand setmem
> +   sequence that is valid for SIZE..2*SIZE-1 bytes
> +   and jump to DONE_LABEL.  */
> +
> +static void
> +expand_small_setmem (rtx destmem,
> +                    rtx destptr, rtx count,
> +                    rtx value,
> +                    int size, rtx done_label)
> +{
> +  rtx label = ix86_expand_aligntest (count, size, false);
> +  enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
> +  rtx modesize;
> +  int n;
> +
> +  if (size >= 32)
> +    mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
> +  else if (size >= 16)
> +    mode = TARGET_SSE ? V16QImode : DImode;
> +
> +  destmem = change_address (destmem, mode, destptr);
> +  modesize = GEN_INT (GET_MODE_SIZE (mode));
> +  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
> +    {
> +      emit_move_insn (destmem, gen_lowpart (mode, value));
> +      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
> +    }
> +
> +  destmem = offset_address (destmem, count, 1);
> +  destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
> +                           GET_MODE_SIZE (mode));
> +  emit_move_insn (destmem, gen_lowpart (mode, value));
> +  emit_jump_insn (gen_jump (done_label));
> +  emit_barrier ();
> +
> +  emit_label (label);
> +  LABEL_NUSES (label) = 1;
> +}
> +
> +/* Handle small memcpy (up to SIZE that is supposed to be small power of 2)
> +   and get ready for the main memset loop.
> +   See expand_movmem_prologue_epilogue_by_misaligned_moves for detailed
> +   description.  */
> +
> +static void
> +expand_setmem_prologue_epilogue_by_misaligned_moves (rtx destmem,
> +                                                    rtx *destptr,
> +                                                    rtx *count,
> +                                                    rtx value,
> +                                                    rtx *done_label,
> +                                                    int size,
> +                                                    int desired_align,
> +                                                    int align,
> +                                                    unsigned HOST_WIDE_INT *min_size,
> +                                                    bool dynamic_check)
> +{
> +  rtx loop_label = NULL, label;
> +  enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 2);
> +  int n;
> +  rtx modesize;
> +  int prolog_size = 0;
> +
> +  if (size >= 32)
> +    mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
> +  else if (size >= 16)
> +    mode = TARGET_SSE ? V16QImode : DImode;
> +
> +  /* See if block is big or small, handle small blocks.  */
> +  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
> +    {
> +      int size2 = size;
> +      loop_label = gen_label_rtx ();
> +
> +      if (!*done_label)
> +       *done_label = gen_label_rtx ();
> +
> +      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
> +                              1, loop_label);
> +      size2 >>= 1;
> +
> +      /* Handle sizes > 3.  */
> +      for (;size2 > 2; size2 >>= 1)
> +       expand_small_setmem (destmem, *destptr, *count, value,
> +                            size2, *done_label);
> +      /* Nothing to copy?  Jump to DONE_LABEL if so */
> +      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
> +                              1, *done_label);
> +
> +      /* Do a byte copy.  */
> +      destmem = change_address (destmem, QImode, *destptr);
> +      emit_move_insn (destmem, gen_lowpart (QImode, value));
> +
> +      /* Handle sizes 2 and 3.  */
> +      label = ix86_expand_aligntest (*count, 2, false);
> +      destmem = change_address (destmem, HImode, *destptr);
> +      destmem = offset_address (destmem, *count, 1);
> +      destmem = offset_address (destmem, GEN_INT (-2), 2);
> +      emit_move_insn (destmem, gen_lowpart (HImode, value));
> +
> +      emit_label (label);
> +      LABEL_NUSES (label) = 1;
> +      emit_jump_insn (gen_jump (*done_label));
> +      emit_barrier ();
> +    }
> +  else
> +    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
> +               || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
> +
> +  /* Start memcpy for COUNT >= SIZE.  */
> +  if (loop_label)
> +    {
> +       emit_label (loop_label);
> +       LABEL_NUSES (loop_label) = 1;
> +    }
> +
> +  /* Copy first desired_align bytes.  */
> +  destmem = change_address (destmem, mode, *destptr);
> +  modesize = GEN_INT (GET_MODE_SIZE (mode));
> +  for (n = 0; prolog_size < desired_align - align; n++)
> +    {
> +      emit_move_insn (destmem, gen_lowpart (mode, value));
> +      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
> +      prolog_size += GET_MODE_SIZE (mode);
> +    }
> +
> +
> +  /* Copy last SIZE bytes.  */
> +  destmem = offset_address (destmem, *count, 1);
> +  destmem = offset_address (destmem,
> +                           GEN_INT (-size - prolog_size),
> +                           1);
> +  emit_move_insn (destmem, gen_lowpart (mode, value));
> +  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
> +    {
> +      destmem = offset_address (destmem, modesize, 1);
> +      emit_move_insn (destmem, gen_lowpart (mode, value));
> +    }
> +
> +  /* Align destination.  */
> +  if (desired_align > 1 && desired_align > align)
> +    {
> +      rtx saveddest = *destptr;
> +
> +      gcc_assert (desired_align <= size);
> +      /* Align destptr up, place it to new register.  */
> +      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
> +                                     GEN_INT (prolog_size),
> +                                     NULL_RTX, 1, OPTAB_DIRECT);
> +      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
> +                                     GEN_INT (-desired_align),
> +                                     *destptr, 1, OPTAB_DIRECT);
> +      /* See how many bytes we skipped.  */
> +      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
> +                                      *destptr,
> +                                      saveddest, 1, OPTAB_DIRECT);
> +      /* Adjust count.  */
> +      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
> +                                   saveddest, *count, 1, OPTAB_DIRECT);
> +      /* We copied at most size + prolog_size.  */
> +      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
> +       *min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
> +      else
> +       *min_size = 0;
> +
> +      /* Our loops always round down the block size, but for dispatch to library
> +        we need precise value.  */
> +      if (dynamic_check)
> +       *count = expand_simple_binop (GET_MODE (*count), AND, *count,
> +                                     GEN_INT (-size), *count, 1, OPTAB_DIRECT);
> +    }
> +  else
> +    {
> +      gcc_assert (prolog_size == 0);
> +      /* Decrease count, so we won't end up copying last word twice.  */
> +      if (!CONST_INT_P (*count))
> +       *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
> +                                     constm1_rtx, *count, 1, OPTAB_DIRECT);
> +      else
> +       *count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
> +      if (*min_size)
> +       *min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
> +    }
> +}
> +
>  /* Set enough from DST to align DST known to by aligned by ALIGN to
>     DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored.  */
>  static rtx
> @@ -22790,61 +23218,98 @@ expand_constant_setmem_prologue (rtx dst
>    return dst;
>  }
>
> -/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
> -static enum stringop_alg
> -decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
> -           int *dynamic_check, bool *noalign)
> +/* Return true if ALG can be used in current context.
> +   Assume we expand memset if MEMSET is true.  */
> +static bool
> +alg_usable_p (enum stringop_alg alg, bool memset)
>  {
> -  const struct stringop_algs * algs;
> -  bool optimize_for_speed;
> +  if (alg == no_stringop)
> +    return false;
> +  if (alg == vector_loop)
> +    return TARGET_SSE || TARGET_AVX;
>    /* Algorithms using the rep prefix want at least edi and ecx;
>       additionally, memset wants eax and memcpy wants esi.  Don't
>       consider such algorithms if the user has appropriated those
>       registers for their own purposes. */
> -  bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
> -                             || (memset
> -                                ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
> -  *noalign = false;
> +  if (alg == rep_prefix_1_byte
> +      || alg == rep_prefix_4_byte
> +      || alg == rep_prefix_8_byte)
> +    return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
> +             || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
> +  return true;
> +}
> +
> +
>
> -#define ALG_USABLE_P(alg) (rep_prefix_usable                   \
> -                          || (alg != rep_prefix_1_byte         \
> -                              && alg != rep_prefix_4_byte      \
> -                              && alg != rep_prefix_8_byte))
> +/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
> +static enum stringop_alg
> +decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
> +           unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
> +           bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
> +{
> +  const struct stringop_algs * algs;
> +  bool optimize_for_speed;
> +  int max = -1;
>    const struct processor_costs *cost;
> +  int i;
> +  bool any_alg_usable_p = false;
> +
> +  *noalign = false;
> +  *dynamic_check = -1;
>
>    /* Even if the string operation call is cold, we still might spend a lot
>       of time processing large blocks.  */
>    if (optimize_function_for_size_p (cfun)
>        || (optimize_insn_for_size_p ()
> -          && expected_size != -1 && expected_size < 256))
> +         && (max_size < 256
> +              || (expected_size != -1 && expected_size < 256))))
>      optimize_for_speed = false;
>    else
>      optimize_for_speed = true;
>
>    cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
> -
> -  *dynamic_check = -1;
>    if (memset)
>      algs = &cost->memset[TARGET_64BIT != 0];
>    else
>      algs = &cost->memcpy[TARGET_64BIT != 0];
> -  if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
> +
> +  /* See maximal size for user defined algorithm.  */
> +  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
> +    {
> +      enum stringop_alg candidate = algs->size[i].alg;
> +      bool usable = alg_usable_p (candidate, memset);
> +      any_alg_usable_p |= usable;
> +
> +      if (candidate != libcall && candidate && usable)
> +         max = algs->size[i].max;
> +    }
> +
> +  /* If expected size is not known but max size is small enough
> +     so inline version is a win, set expected size into
> +     the range.  */
> +  if (max > 1 && (unsigned HOST_WIDE_INT)max >= max_size && expected_size == -1)
> +    expected_size = min_size / 2 + max_size / 2;
> +
> +  /* If user specified the algorithm, honor it if possible.  */
> +  if (ix86_stringop_alg != no_stringop
> +      && alg_usable_p (ix86_stringop_alg, memset))
>      return ix86_stringop_alg;
>    /* rep; movq or rep; movl is the smallest variant.  */
>    else if (!optimize_for_speed)
>      {
> -      if (!count || (count & 3))
> -       return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
> +      if (!count || (count & 3) || (memset && !zero_memset))
> +       return alg_usable_p (rep_prefix_1_byte, memset)
> +              ? rep_prefix_1_byte : loop_1_byte;
>        else
> -       return rep_prefix_usable ? rep_prefix_4_byte : loop;
> +       return alg_usable_p (rep_prefix_4_byte, memset)
> +              ? rep_prefix_4_byte : loop;
>      }
> -  /* Very tiny blocks are best handled via the loop, REP is expensive to setup.
> -   */
> +  /* Very tiny blocks are best handled via the loop, REP is expensive to
> +     setup.  */
>    else if (expected_size != -1 && expected_size < 4)
>      return loop_1_byte;
>    else if (expected_size != -1)
>      {
> -      unsigned int i;
>        enum stringop_alg alg = libcall;
>        for (i = 0; i < MAX_STRINGOP_ALGS; i++)
>         {
> @@ -22858,7 +23323,7 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
>             {
>               enum stringop_alg candidate = algs->size[i].alg;
>
> -             if (candidate != libcall && ALG_USABLE_P (candidate))
> +             if (candidate != libcall && alg_usable_p (candidate, memset))
>                 alg = candidate;
>               /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
>                  last non-libcall inline algorithm.  */
> @@ -22871,14 +23336,13 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
>                     return alg;
>                   break;
>                 }
> -             else if (ALG_USABLE_P (candidate))
> +             else if (alg_usable_p (candidate, memset))
>                 {
>                   *noalign = algs->size[i].noalign;
>                   return candidate;
>                 }
>             }
>         }
> -      gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
>      }
>    /* When asked to inline the call anyway, try to pick meaningful choice.
>       We look for maximal size of block that is faster to copy by hand and
> @@ -22888,22 +23352,11 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
>       If this turns out to be bad, we might simply specify the preferred
>       choice in ix86_costs.  */
>    if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
> -      && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
> +      && (algs->unknown_size == libcall
> +         || !alg_usable_p (algs->unknown_size, memset)))
>      {
> -      int max = -1;
>        enum stringop_alg alg;
> -      int i;
> -      bool any_alg_usable_p = true;
> -
> -      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
> -        {
> -          enum stringop_alg candidate = algs->size[i].alg;
> -          any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
>
> -          if (candidate != libcall && candidate
> -              && ALG_USABLE_P (candidate))
> -              max = algs->size[i].max;
> -        }
>        /* If there aren't any usable algorithms, then recursing on
>           smaller sizes isn't going to find anything.  Just return the
>           simple byte-at-a-time copy loop.  */
> @@ -22916,15 +23369,16 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
>          }
>        if (max == -1)
>         max = 4096;
> -      alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
> +      alg = decide_alg (count, max / 2, min_size, max_size, memset,
> +                       zero_memset, dynamic_check, noalign);
>        gcc_assert (*dynamic_check == -1);
>        gcc_assert (alg != libcall);
>        if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
>         *dynamic_check = max;
>        return alg;
>      }
> -  return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
> -#undef ALG_USABLE_P
> +  return (alg_usable_p (algs->unknown_size, memset)
> +         ? algs->unknown_size : libcall);
>  }
>
>  /* Decide on alignment.  We know that the operand is already aligned to ALIGN
> @@ -22964,31 +23418,51 @@ decide_alignment (int align,
>  /* Expand string move (memcpy) operation.  Use i386 string operations
>     when profitable.  expand_setmem contains similar code.  The code
>     depends upon architecture, block size and alignment, but always has
> -   the same overall structure:
> +   one of the following overall structures:
> +
> +   Aligned move sequence:
> +
> +     1) Prologue guard: Conditional that jumps up to epilogues for small
> +       blocks that can be handled by epilogue alone.  This is faster
> +       but also needed for correctness, since the prologue assumes the block
> +       is larger than the desired alignment.
> +
> +       Optional dynamic check for size and libcall for large
> +       blocks is emitted here too, with -minline-stringops-dynamically.
> +
> +     2) Prologue: copy first few bytes in order to get destination
> +       aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
> +       than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
> +       copied.  We emit either a jump tree on power of two sized
> +       blocks, or a byte loop.
> +
> +     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
> +       with specified algorithm.
>
> -   1) Prologue guard: Conditional that jumps up to epilogues for small
> -      blocks that can be handled by epilogue alone.  This is faster
> -      but also needed for correctness, since prologue assume the block
> -      is larger than the desired alignment.
> -
> -      Optional dynamic check for size and libcall for large
> -      blocks is emitted here too, with -minline-stringops-dynamically.
> -
> -   2) Prologue: copy first few bytes in order to get destination
> -      aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
> -      than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
> -      copied.  We emit either a jump tree on power of two sized
> -      blocks, or a byte loop.
> +     4) Epilogue: code copying tail of the block that is too small to be
> +       handled by main body (or up to size guarded by prologue guard).
>
> -   3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
> -      with specified algorithm.
> +  Misaligned move sequence
>
> -   4) Epilogue: code copying tail of the block that is too small to be
> -      handled by main body (or up to size guarded by prologue guard).  */
> +     1) misaligned move prologue/epilogue containing:
> +        a) Prologue handling small memory blocks and jumping to done_label
> +          (skipped if blocks are known to be large enough)
> +       b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
> +           needed by single possibly misaligned move
> +          (skipped if alignment is not needed)
> +        c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
> +
> +     2) Zero size guard dispatching to done_label, if needed
> +
> +     3) dispatch to library call, if needed,
> +
> +     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
> +       with specified algorithm.  */
>
>  bool
>  ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
> -                   rtx expected_align_exp, rtx expected_size_exp)
> +                   rtx expected_align_exp, rtx expected_size_exp,
> +                   rtx min_size_exp, rtx max_size_exp)
>  {
>    rtx destreg;
>    rtx srcreg;
> @@ -23006,6 +23480,9 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>    bool noalign;
>    enum machine_mode move_mode = VOIDmode;
>    int unroll_factor = 1;
> +  unsigned HOST_WIDE_INT min_size = UINTVAL (min_size_exp);
> +  unsigned HOST_WIDE_INT max_size = max_size_exp ? UINTVAL (max_size_exp) : -1;
> +  bool misaligned_prologue_used = false;
>
>    if (CONST_INT_P (align_exp))
>      align = INTVAL (align_exp);
> @@ -23029,7 +23506,8 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>
>    /* Step 0: Decide on preferred algorithm, desired alignment and
>       size of chunks to be copied by main loop.  */
> -  alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
> +  alg = decide_alg (count, expected_size, min_size, max_size,
> +                   false, false, &dynamic_check, &noalign);
>    if (alg == libcall)
>      return false;
>    gcc_assert (alg != no_stringop);
> @@ -23115,8 +23593,44 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>      }
>    gcc_assert (desired_align >= 1 && align >= 1);
>
> +  /* Misaligned move sequences handle both prologues and epilogues at once.
> +     Default code generation results in smaller code for large alignments and
> +     also avoids redundant work when sizes are known precisely.  */
> +  if (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES
> +      && MAX (desired_align, epilogue_size_needed) <= 32
> +      && ((desired_align > align && !align_bytes)
> +         || (!count && epilogue_size_needed > 1)))
> +    {
> +      /* Misaligned move prologue handles small blocks by itself.  */
> +      misaligned_prologue_used = true;
> +      expand_movmem_prologue_epilogue_by_misaligned_moves
> +          (dst, src, &destreg, &srcreg, &count_exp, &jump_around_label,
> +            desired_align < align
> +           ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
> +           desired_align, align, &min_size, dynamic_check);
> +      src = change_address (src, BLKmode, srcreg);
> +      dst = change_address (dst, BLKmode, destreg);
> +      set_mem_align (dst, desired_align * BITS_PER_UNIT);
> +      epilogue_size_needed = 0;
> +      if (need_zero_guard && !min_size)
> +       {
> +         /* It is possible that we copied enough so the main loop will not
> +            execute.  */
> +         gcc_assert (size_needed > 1);
> +         if (jump_around_label == NULL_RTX)
> +           jump_around_label = gen_label_rtx ();
> +         emit_cmp_and_jump_insns (count_exp,
> +                                  GEN_INT (size_needed),
> +                                  LTU, 0, counter_mode (count_exp), 1, jump_around_label);
> +         if (expected_size == -1
> +             || expected_size < (desired_align - align) / 2 + size_needed)
> +           predict_jump (REG_BR_PROB_BASE * 20 / 100);
> +         else
> +           predict_jump (REG_BR_PROB_BASE * 60 / 100);
> +       }
> +    }
>    /* Ensure that alignment prologue won't copy past end of block.  */
> -  if (size_needed > 1 || (desired_align > 1 && desired_align > align))
> +  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
>      {
>        epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
>        /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
> @@ -23135,8 +23649,9 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>                 goto epilogue;
>             }
>         }
> -      else
> +      else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
>         {
> +         gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
>           label = gen_label_rtx ();
>           emit_cmp_and_jump_insns (count_exp,
>                                    GEN_INT (epilogue_size_needed),
> @@ -23176,18 +23691,23 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>
>    /* Step 2: Alignment prologue.  */
>
> -  if (desired_align > align)
> +  if (desired_align > align && !misaligned_prologue_used)
>      {
>        if (align_bytes == 0)
>         {
> -         /* Except for the first move in epilogue, we no longer know
> +         /* Except for the first move in prologue, we no longer know
>              constant offset in aliasing info.  It don't seems to worth
>              the pain to maintain it for the first move, so throw away
>              the info early.  */
>           src = change_address (src, BLKmode, srcreg);
>           dst = change_address (dst, BLKmode, destreg);
> -         dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
> -                                       desired_align);
> +         dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp,
> +                                       align, desired_align);
> +         /* At most desired_align - align bytes are copied.  */
> +         if (min_size < (unsigned)(desired_align - align))
> +           min_size = 0;
> +         else
> +           min_size -= desired_align;
>         }
>        else
>         {
> @@ -23200,6 +23720,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>           count -= align_bytes;
>         }
>        if (need_zero_guard
> +         && !min_size
>           && (count < (unsigned HOST_WIDE_INT) size_needed
>               || (align_bytes == 0
>                   && count < ((unsigned HOST_WIDE_INT) size_needed
> @@ -23227,7 +23748,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>        label = NULL;
>        epilogue_size_needed = 1;
>      }
> -  else if (label == NULL_RTX)
> +  else if (label == NULL_RTX && !misaligned_prologue_used)
>      epilogue_size_needed = size_needed;
>
>    /* Step 3: Main loop.  */
> @@ -23395,7 +23916,8 @@ promote_duplicated_reg_to_size (rtx val,
>     steps performed.  */
>  bool
>  ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
> -                   rtx expected_align_exp, rtx expected_size_exp)
> +                   rtx expected_align_exp, rtx expected_size_exp,
> +                   rtx min_size_exp, rtx max_size_exp)
>  {
>    rtx destreg;
>    rtx label = NULL;
> @@ -23414,6 +23936,9 @@ ix86_expand_setmem (rtx dst, rtx count_e
>    bool noalign;
>    enum machine_mode move_mode = VOIDmode;
>    int unroll_factor;
> +  unsigned HOST_WIDE_INT min_size = UINTVAL (min_size_exp);
> +  unsigned HOST_WIDE_INT max_size = max_size_exp ? UINTVAL (max_size_exp) : -1;
> +  bool misaligned_prologue_used = false;
>
>    if (CONST_INT_P (align_exp))
>      align = INTVAL (align_exp);
> @@ -23433,7 +23958,8 @@ ix86_expand_setmem (rtx dst, rtx count_e
>    /* Step 0: Decide on preferred algorithm, desired alignment and
>       size of chunks to be copied by main loop.  */
>
> -  alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
> +  alg = decide_alg (count, expected_size, min_size, max_size,
> +                   true, val_exp == const0_rtx, &dynamic_check, &noalign);
>    if (alg == libcall)
>      return false;
>    gcc_assert (alg != no_stringop);
> @@ -23508,8 +24034,47 @@ ix86_expand_setmem (rtx dst, rtx count_e
>    if (CONST_INT_P (val_exp))
>      promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
>                                                    desired_align, align);
> +  /* Misaligned move sequences handle both prologues and epilogues at once.
> +     Default code generation results in smaller code for large alignments and
> +     also avoids redundant work when sizes are known precisely.  */
> +  if (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES
> +      && MAX (desired_align, epilogue_size_needed) <= 32
> +      && ((desired_align > align && !align_bytes)
> +         || (!count && epilogue_size_needed > 1)))
> +    {
> +      /* We always need promoted value.  */
> +      if (!promoted_val)
> +       promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
> +                                                      desired_align, align);
> +      /* Misaligned move prologue handles small blocks by itself.  */
> +      misaligned_prologue_used = true;
> +      expand_setmem_prologue_epilogue_by_misaligned_moves
> +          (dst, &destreg, &count_exp, promoted_val, &jump_around_label,
> +           desired_align < align
> +          ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
> +           desired_align, align, &min_size, dynamic_check);
> +      dst = change_address (dst, BLKmode, destreg);
> +      set_mem_align (dst, desired_align * BITS_PER_UNIT);
> +      epilogue_size_needed = 0;
> +      if (need_zero_guard && !min_size)
> +        {
> +          /* It is possible that we copied enough so the main loop will not
> +             execute.  */
> +          gcc_assert (size_needed > 1);
> +          if (jump_around_label == NULL_RTX)
> +            jump_around_label = gen_label_rtx ();
> +          emit_cmp_and_jump_insns (count_exp,
> +                                   GEN_INT (size_needed),
> +                                   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
> +          if (expected_size == -1
> +              || expected_size < (desired_align - align) / 2 + size_needed)
> +            predict_jump (REG_BR_PROB_BASE * 20 / 100);
> +          else
> +            predict_jump (REG_BR_PROB_BASE * 60 / 100);
> +        }
> +    }
>    /* Ensure that alignment prologue won't copy past end of block.  */
> -  if (size_needed > 1 || (desired_align > 1 && desired_align > align))
> +  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
>      {
>        epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
>        /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
> @@ -23534,8 +24099,9 @@ ix86_expand_setmem (rtx dst, rtx count_e
>                 goto epilogue;
>             }
>         }
> -      else
> +      else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
>         {
> +         gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
>           label = gen_label_rtx ();
>           emit_cmp_and_jump_insns (count_exp,
>                                    GEN_INT (epilogue_size_needed),
> @@ -23566,7 +24132,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>                                                    desired_align, align);
>    gcc_assert (desired_align >= 1 && align >= 1);
>
> -  if (desired_align > align)
> +  if (desired_align > align && !misaligned_prologue_used)
>      {
>        if (align_bytes == 0)
>         {
> @@ -23577,6 +24143,11 @@ ix86_expand_setmem (rtx dst, rtx count_e
>           dst = change_address (dst, BLKmode, destreg);
>           expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
>                                   desired_align);
> +         /* At most desired_align - align bytes are copied.  */
> +         if (min_size < (unsigned)(desired_align - align))
> +           min_size = 0;
> +         else
> +           min_size -= desired_align;
>         }
>        else
>         {
> @@ -23589,6 +24160,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>           count -= align_bytes;
>         }
>        if (need_zero_guard
> +         && !min_size
>           && (count < (unsigned HOST_WIDE_INT) size_needed
>               || (align_bytes == 0
>                   && count < ((unsigned HOST_WIDE_INT) size_needed
> @@ -23617,7 +24189,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>        promoted_val = val_exp;
>        epilogue_size_needed = 1;
>      }
> -  else if (label == NULL_RTX)
> +  else if (label == NULL_RTX && !misaligned_prologue_used)
>      epilogue_size_needed = size_needed;
>
>    /* Step 3: Main loop.  */
Jan Hubicka - Sept. 29, 2013, 9:24 a.m.
> > We now produce:
> >         movq    b(%rip), %rsi
> >         movq    a(%rip), %rcx
> >         movq    (%rsi), %rax <- first 8 bytes are moved
> >         leaq    8(%rcx), %rdi
> >         andq    $-8, %rdi   <- dest is aligned
> >         movq    %rax, (%rcx)
> >         movq    132(%rsi), %rax  <- last 8 bytes are moved
> >         movq    %rax, 132(%rcx)
> >         subq    %rdi, %rcx  <- alignment is subtracted from count
> 
> >         subq    %rcx, %rsi  <- source is aligned
> 
> This (source aligned) is not always true, but nevertheless the
> sequence is very tight.

Yep, sure, it is aligned only if source-dest is aligned, but that is the best we
can ask for.
> > Unforutnately the following testcase:
> > char *p,*q;
> > t(int a)
> > {
> >   if (a<100)
> >     memcpy(q,p,a);
> >
> > }
> > Won't get inlined.  This is because A is known to be smaller than 100 that
> > results in anti range after conversion to size_t.  This anti range allows very
> > large values (above INT_MAX) and thus we do not know the block size.
> > I am not sure if the sane range can be recovered somehow.  If not, maybe
> > this is common enough to add support for "probable" upper bound parameter to
> > the template.
> 
> Do we know if there is real code that intentionally does that other
> than security flaws as result of improperly done range check?

I do not think so.
> 
> I think by default GCC should assume the memcpy size range is (0, 100)
> here with perhaps an option to override it.

Indeed, this is what I was suggesting.  Problem is what to pass down to the
expanders as a value range.

We either need to update documentation of the expanders that the ranges are
just highly probably - and I do not want to do that since I want to use the
ranges for move_by_pieces, too.  So I think we will have to introduce two upper
bounds parameters - one sure and other very likely if there is no other
solution.

We play similar tricks in niter code.

Honza
Michael Zolotukhin - Sept. 29, 2013, 10:05 a.m.
Hi Jan,

> I also think the misaligned move trick can/should be performed by
> move_by_pieces and we ought to consider sane use of SSE - current vector_loop
> with unrolling factor of 4 seems bit extreme.  At least buldozer is happy with
> 2 and I would expect SSE moves to be especially useful for moving blocks with
> known size where they are not used at all.
> 
> Currently I disabled misaligned move prologues/epilogues for Michael's vector
> loop path since they ends up longer than the traditional code (that use loop
> for epilogue)
Prologues could use this technique even with vector_loop, as they actually
don't have a loop.
As for epilogues - have you tried to use misaligned vector_moves (movdqu)?  It
looks to me that we need approx. the same amount of instructions in vector-loop
and in usual-loop epilogues, if we use vector-instructions in vector-loop
epilogue.

> Comments are welcome.
BTW, maybe we could generalize expand_small_movmem a bit and make a separate
expanding strategy out of it?  It will expand a memmov with no loop (and
probably no alignment prologue) - just with the biggest available moves.  Also,
a cost model could be added here to make decisions on when we actually want to
align the moves.  Here is a couple of examples of that:

memcpy (a, b, 32); // alignment is unknown
will expand to
  movdqu a, %xmm0
  movdqu a+16, %xmm1
  movdqu %xmm0, b
  movdqu %xmm1, b+16
memcpy (a, b, 32); // alignment is known and equals 64bit
will expand to
a)
  movdqu a, %xmm0
  movdqu a+16, %xmm1
  movdqu %xmm0, b
  movdqu %xmm1, b+16
or b)
  movq	  a,   %xmm0
  movdqa  a+8, %xmm1
  movq	  a+24,%xmm2
  movq	  %xmm0, b
  movdqa  %xmm1, b+8
  movq	  %xmm2, b+24

We would compute the total cost of both variant and choose the best - for
computation we need just a cost of aligned and misaligned moves.

This strategy is actually pretty similar to move_by_pieces, but as it has much
more target-specific information, it would be able to produce much more
effective code.

And one more question - in a work on vector_loop for memset I tried to merge
many of movmem and setmem routines (e.g. instead of expand_movmem_prologue and
expand_setmem_prologue I made a single routine
expand_movmem_or_setmem_prologue).  What do you think, is it a good idea?  It
reduces code size in i386.c, but slightly complicates the code.  I'll send a
patch shortly, as soon as the testing completes.

Thanks, Michael
Jan Hubicka - Sept. 29, 2013, 12:38 p.m.
> Hi Jan,
> 
> > I also think the misaligned move trick can/should be performed by
> > move_by_pieces and we ought to consider sane use of SSE - current vector_loop
> > with unrolling factor of 4 seems bit extreme.  At least buldozer is happy with
> > 2 and I would expect SSE moves to be especially useful for moving blocks with
> > known size where they are not used at all.
> > 
> > Currently I disabled misaligned move prologues/epilogues for Michael's vector
> > loop path since they ends up longer than the traditional code (that use loop
> > for epilogue)
> Prologues could use this techniques even with vector_loop, as they actually
> don't have a loop.

Where the new prologues lose is the fact that we need to handle all sizes smaller than
SIZE_NEEDED.  This is 64 bytes, which leads to a variant for 32..64, 16..32, 8..16,
4..8 and the tail.  It is quite a lot of code.

When the block is known to be greater than 64, this is also a win, but my current
patch does not fine-tune this yet.
Similarly, misaligned moves are a win when the size is known, alignment is not
performed and the normal fixed-size epilogue needs more than one move, or when
alignment is known but the offset is non-zero.
It will need a bit of tweaking to handle all the paths well - it is the usual
problem with the stringops: they get way too complex as the number of factors
increases.

It is why I think we may consider vector loop with less than 4 unrollings.
The AMD optimization manual recommends two for Bulldozer... Is there a difference
between four and two for Atom?

To be honest I am not quite sure from where constant of 4 comes.  I think I
introduced it long time ago for K8 where it apparently got some extra % of
performance.

It is used for larger blocks only for PPro.  AMD chips prefer it for small
blocks, apparently because they prefer a loop-less sequence.

> As for epilogues - have you tried to use misaligned vector_moves (movdqu)?  It
> looks to me that we need approx. the same amount of instructions in vector-loop
> and in usual-loop epilogues, if we use vector-instructions in vector-loop
> epilogue.

Yes, the code is in place for this.  You can just remove the check for size_needed
being smaller than 32 and it will produce the movdqu sequence for you (I tested
it on the vector loop testcase in the testsuite).

The code will also use SSE for unrolled_loop prologue expansion at least for
memcpy (for memset it does not have broadcast value so it should skip it).

> 
> > Comments are welcome.
> BTW, maybe we could generalize expand_small_movmem a bit and make a separate
> expanding strategy out of it?  It will expand a memmov with no loop (and
> probably no alignment prologue) - just with the biggest available moves.  Also,
> a cost model could be added here to make decisions on when we actually want to
> align the moves.  Here is a couple of examples of that:
> 
> memcpy (a, b, 32); // alignment is unknown
> will expand to
>   movdqu a, %xmm0
>   movdqu a+16, %xmm1
>   movdqu %xmm0, b
>   movdqu %xmm1, b+16
> memcpy (a, b, 32); // alignment is known and equals 64bit
> will expand to
> a)
>   movdqu a, %xmm0
>   movdqu a+16, %xmm1
>   movdqu %xmm0, b
>   movdqu %xmm1, b+16
> or b)
>   movq	  a,   %xmm0
>   movdqa  a+8, %xmm1
>   movq	  a+24,%xmm2
>   movq	  %xmm0, b
>   movdqa  %xmm1, b+8
>   movq	  %xmm2, b+24
> 
> We would compute the total cost of both variant and choose the best - for
> computation we need just a cost of aligned and misaligned moves.
> 
> This strategy is actually pretty similar to move_by_pieces, but as it have much
> more target-specific information, it would be able to produce much more
> effective code.

I was actually thinking more along lines of teaching move_by_pieces to do the tricks.
It seems there is not that much of x86 specific knowledge in here and other architectures
will also benefit from it.  We can also enable it when value range is close enough.

I plan to look into it today or tomorrow - revisit your old patch to move_by_pieces and see
how much of extra API I need to get move_by_pieces to do what expand_small_movmem does.
> 
> And one more question - in a work on vector_loop for memset I tried to merge
> many of movmem and setmem routines (e.g. instead of expand_movmem_prologue and
> expand_setmem_prologue I made a single routine
> expand_movmem_or_setmem_prologue).  What do you think, is it a good idea?  It
> reduces code size in i386.c, but slightly complicates the code.  I'll send a
> patch shortly, as soon as the testing completes.

I would like to see it.  I am not too thrilled by the duplication.  My original
motivation for that was to keep the number of code paths through the expanders
under control.  We already have many of them (and it is easy to get wrong code),
as different variants of prologues/epilogues and main loops are not exactly the
same and thus the code is not as modular as I would like.  I am not sure whether
adding differences between memset and memmove is going to add too many extra
cases to think of.  Maybe not; as for the misaligned prologues, the change is
actually quite straightforward.

I however do not handle well the case where broadcasting of the constant value
should happen - currently I simply do it at the beginning, which is quite cheap
in integer, but once we add SSE into play we will need to push it down.

Honza

Patch

Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 203004)
+++ config/i386/i386.h	(working copy)
@@ -378,6 +378,8 @@  extern unsigned char ix86_tune_features[
 	ix86_tune_features[X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE]
 #define TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS \
 	ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS]
+#define TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES \
+	ix86_tune_features[X86_TUNE_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 203004)
+++ config/i386/i386.md	(working copy)
@@ -15380,11 +15380,13 @@ 
    (use (match_operand:SWI48 2 "nonmemory_operand"))
    (use (match_operand:SWI48 3 "const_int_operand"))
    (use (match_operand:SI 4 "const_int_operand"))
-   (use (match_operand:SI 5 "const_int_operand"))]
+   (use (match_operand:SI 5 "const_int_operand"))
+   (use (match_operand:SI 6 "const_int_operand"))
+   (use (match_operand:SI 7 ""))]
   ""
 {
  if (ix86_expand_movmem (operands[0], operands[1], operands[2], operands[3],
-			 operands[4], operands[5]))
+			 operands[4], operands[5], operands[6], operands[7]))
    DONE;
  else
    FAIL;
@@ -15572,12 +15574,15 @@ 
     (use (match_operand:QI 2 "nonmemory_operand"))
     (use (match_operand 3 "const_int_operand"))
     (use (match_operand:SI 4 "const_int_operand"))
-    (use (match_operand:SI 5 "const_int_operand"))]
+    (use (match_operand:SI 5 "const_int_operand"))
+    (use (match_operand:SI 6 "const_int_operand"))
+    (use (match_operand:SI 7 ""))]
   ""
 {
  if (ix86_expand_setmem (operands[0], operands[1],
 			 operands[2], operands[3],
-			 operands[4], operands[5]))
+			 operands[4], operands[5],
+			 operands[6], operands[7]))
    DONE;
  else
    FAIL;
Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def	(revision 203004)
+++ config/i386/x86-tune.def	(working copy)
@@ -230,3 +230,7 @@  DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CM
    fp converts to destination register.  */
 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
           m_SLM)
+/* Use misaligned move to avoid need for conditionals for string operations.
+   This is a win on targets that resolve partial memory stalls reasonably well.  */
+DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES, "misaligned_move_string_prologues_epilogues",
+          m_GENERIC | m_CORE_ALL | m_AMD_MULTIPLE | m_SLM | m_ATOM)
Index: config/i386/i386-protos.h
===================================================================
--- config/i386/i386-protos.h	(revision 203004)
+++ config/i386/i386-protos.h	(working copy)
@@ -58,8 +58,8 @@  extern enum machine_mode ix86_cc_mode (e
 extern int avx_vpermilp_parallel (rtx par, enum machine_mode mode);
 extern int avx_vperm2f128_parallel (rtx par, enum machine_mode mode);
 
-extern bool ix86_expand_movmem (rtx, rtx, rtx, rtx, rtx, rtx);
-extern bool ix86_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx);
+extern bool ix86_expand_movmem (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+extern bool ix86_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx);
 extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
 
 extern bool constant_address_p (rtx);
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 203004)
+++ config/i386/i386.c	(working copy)
@@ -22661,6 +22661,250 @@  expand_movmem_prologue (rtx destmem, rtx
   return destmem;
 }
 
+/* Test if COUNT&SIZE is nonzero and if so, expand a memmove
+   sequence that is valid for SIZE..2*SIZE-1 bytes
+   and jump to DONE_LABEL.  */
+static void
+expand_small_movmem (rtx destmem, rtx srcmem,
+		     rtx destptr, rtx srcptr, rtx count,
+		     int size, rtx done_label)
+{
+  rtx label = ix86_expand_aligntest (count, size, false);
+  enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
+  rtx modesize;
+  int n;
+
+  if (size >= 32)
+    mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+  else if (size >= 16)
+    mode = TARGET_SSE ? V16QImode : DImode;
+
+  srcmem = change_address (srcmem, mode, srcptr);
+  destmem = change_address (destmem, mode, destptr);
+  modesize = GEN_INT (GET_MODE_SIZE (mode));
+  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
+    {
+      emit_move_insn (destmem, srcmem);
+      srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+    }
+
+  srcmem = offset_address (srcmem, count, 1);
+  destmem = offset_address (destmem, count, 1);
+  srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
+			   GET_MODE_SIZE (mode));
+  destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
+			    GET_MODE_SIZE (mode));
+  emit_move_insn (destmem, srcmem);
+  emit_jump_insn (gen_jump (done_label));
+  emit_barrier ();
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+}
+
+/* Handle small memcpy (up to SIZE that is supposed to be a small power of 2)
+   and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
+   bytes and last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
+   proceed with a loop copying SIZE bytes at once.
+   DONE_LABEL is a label after the whole copying sequence. The label is created
+   on demand if *DONE_LABEL is NULL.
+   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
+   bounds after the initial copies. 
+
+   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
+   DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
+   we will dispatch to a library call for large blocks.
+
+   In pseudocode we do:
+
+   if (COUNT < SIZE)
+     {
+       Assume that SIZE is 4. Bigger sizes are handled analogously
+       if (COUNT & 4)
+	 {
+	    copy 4 bytes from SRCPTR to DESTPTR
+	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
+	    goto done_label
+	 }
+       if (!COUNT)
+	 goto done_label;
+       copy 1 byte from SRCPTR to DESTPTR
+       if (COUNT & 2)
+	 {
+	    copy 2 bytes from SRCPTR to DESTPTR
+	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
+	 }
+     }
+   else
+     {
+       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
+       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
+
+       OLD_DESTPTR = DESTPTR;
+       Align DESTPTR up to DESIRED_ALIGN
+       SRCPTR += DESTPTR - OLD_DESTPTR
+       COUNT -= DEST_PTR - OLD_DESTPTR
+       if (DYNAMIC_CHECK)
+	 Round COUNT down to multiple of SIZE
+       << optional caller supplied zero size guard is here >>
+       << optional caller supplied dynamic check is here >>
+       << caller supplied main copy loop is here >>
+     }
+   done_label:
+  */
+static void
+expand_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
+						     rtx *destptr, rtx *srcptr,
+						     rtx *count,
+						     rtx *done_label,
+						     int size,
+						     int desired_align,
+						     int align,
+						     unsigned HOST_WIDE_INT *min_size,
+						     bool dynamic_check)
+{
+  rtx loop_label = NULL, label;
+  enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 2);
+  int n;
+  rtx modesize;
+  int prolog_size = 0;
+
+  if (size >= 32)
+    mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+  else if (size >= 16)
+    mode = TARGET_SSE ? V16QImode : DImode;
+
+  /* See if block is big or small, handle small blocks.  */
+  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
+    {
+      int size2 = size;
+      loop_label = gen_label_rtx ();
+
+      if (!*done_label)
+	*done_label = gen_label_rtx ();
+
+      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
+			       1, loop_label);
+      size2 >>= 1;
+
+      /* Handle sizes > 3.  */
+      for (;size2 > 2; size2 >>= 1)
+	expand_small_movmem (destmem, srcmem, *destptr, *srcptr, *count,
+		             size2, *done_label);
+      /* Nothing to copy?  Jump to DONE_LABEL if so */
+      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
+			       1, *done_label);
+
+      /* Do a byte copy.  */
+      srcmem = change_address (srcmem, QImode, *srcptr);
+      destmem = change_address (destmem, QImode, *destptr);
+      emit_move_insn (destmem, srcmem);
+
+      /* Handle sizes 2 and 3.  */
+      label = ix86_expand_aligntest (*count, 2, false);
+      srcmem = change_address (srcmem, HImode, *srcptr);
+      destmem = change_address (destmem, HImode, *destptr);
+      srcmem = offset_address (srcmem, *count, 1);
+      destmem = offset_address (destmem, *count, 1);
+      srcmem = offset_address (srcmem, GEN_INT (-2), 2);
+      destmem = offset_address (destmem, GEN_INT (-2), 2);
+      emit_move_insn (destmem, srcmem);
+
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+      emit_jump_insn (gen_jump (*done_label));
+      emit_barrier ();
+    }
+  else
+    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
+		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
+
+  /* Start memcpy for COUNT >= SIZE.  */
+  if (loop_label)
+    {
+       emit_label (loop_label);
+       LABEL_NUSES (loop_label) = 1;
+    }
+
+  /* Copy first desired_align bytes.  */
+  srcmem = change_address (srcmem, mode, *srcptr);
+  destmem = change_address (destmem, mode, *destptr);
+  modesize = GEN_INT (GET_MODE_SIZE (mode));
+  for (n = 0; prolog_size < desired_align - align; n++)
+    {
+      emit_move_insn (destmem, srcmem);
+      srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
+      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+      prolog_size += GET_MODE_SIZE (mode);
+    }
+
+
+  /* Copy last SIZE bytes.  */
+  srcmem = offset_address (srcmem, *count, 1);
+  destmem = offset_address (destmem, *count, 1);
+  srcmem = offset_address (srcmem,
+			   GEN_INT (-size - prolog_size),
+			   1);
+  destmem = offset_address (destmem,
+			    GEN_INT (-size - prolog_size),
+			    1);
+  emit_move_insn (destmem, srcmem);
+  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
+    {
+      srcmem = offset_address (srcmem, modesize, 1);
+      destmem = offset_address (destmem, modesize, 1);
+      emit_move_insn (destmem, srcmem);
+    }
+
+  /* Align destination.  */
+  if (desired_align > 1 && desired_align > align)
+    {
+      rtx saveddest = *destptr;
+
+      gcc_assert (desired_align <= size);
+      /* Align destptr up, place it to new register.  */
+      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
+				      GEN_INT (prolog_size),
+				      NULL_RTX, 1, OPTAB_DIRECT);
+      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
+				      GEN_INT (-desired_align),
+				      *destptr, 1, OPTAB_DIRECT);
+      /* See how many bytes we skipped.  */
+      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
+				       *destptr,
+				       saveddest, 1, OPTAB_DIRECT);
+      /* Adjust srcptr and count.  */
+      *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
+				      *srcptr, 1, OPTAB_DIRECT);
+      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
+				    saveddest, *count, 1, OPTAB_DIRECT);
+      /* We copied at most size + prolog_size.  */
+      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
+	*min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
+      else
+	*min_size = 0;
+
+      /* Our loops always round down the block size, but for dispatch to the
+	 library we need the precise value.  */
+      if (dynamic_check)
+	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
+				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
+    }
+  else
+    {
+      gcc_assert (prolog_size == 0);
+      /* Decrease count, so we won't end up copying last word twice.  */
+      if (!CONST_INT_P (*count))
+	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
+				      constm1_rtx, *count, 1, OPTAB_DIRECT);
+      else
+	*count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
+      if (*min_size)
+	*min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
+    }
+}
+
 /* Copy enough from DST to SRC to align DST known to DESIRED_ALIGN.
    ALIGN_BYTES is how many bytes need to be copied.
    The function updates DST and SRC, namely, it sets proper alignment.
@@ -22749,6 +22993,190 @@  expand_setmem_prologue (rtx destmem, rtx
   gcc_assert (desired_alignment <= 8);
 }
 
+/* Test if COUNT&SIZE is nonzero and if so, expand setmem
+   sequence that is valid for SIZE..2*SIZE-1 bytes
+   and jump to DONE_LABEL.  */
+
+static void
+expand_small_setmem (rtx destmem, 
+		     rtx destptr, rtx count,
+		     rtx value,
+		     int size, rtx done_label)
+{
+  rtx label = ix86_expand_aligntest (count, size, false);
+  enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
+  rtx modesize;
+  int n;
+
+  if (size >= 32)
+    mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+  else if (size >= 16)
+    mode = TARGET_SSE ? V16QImode : DImode;
+
+  destmem = change_address (destmem, mode, destptr);
+  modesize = GEN_INT (GET_MODE_SIZE (mode));
+  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
+    {
+      emit_move_insn (destmem, gen_lowpart (mode, value));
+      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+    }
+
+  destmem = offset_address (destmem, count, 1);
+  destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
+			    GET_MODE_SIZE (mode));
+  emit_move_insn (destmem, gen_lowpart (mode, value));
+  emit_jump_insn (gen_jump (done_label));
+  emit_barrier ();
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+}
+
+/* Handle small memset (up to SIZE that is supposed to be a small power of 2)
+   and get ready for the main memset loop. 
+   See expand_movmem_prologue_epilogue_by_misaligned_moves for detailed
+   description.  */
+
+static void
+expand_setmem_prologue_epilogue_by_misaligned_moves (rtx destmem,
+						     rtx *destptr, 
+						     rtx *count,
+						     rtx value,
+						     rtx *done_label,
+						     int size,
+						     int desired_align,
+						     int align,
+						     unsigned HOST_WIDE_INT *min_size,
+						     bool dynamic_check)
+{
+  rtx loop_label = NULL, label;
+  enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 2);
+  int n;
+  rtx modesize;
+  int prolog_size = 0;
+
+  if (size >= 32)
+    mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+  else if (size >= 16)
+    mode = TARGET_SSE ? V16QImode : DImode;
+
+  /* See if block is big or small, handle small blocks.  */
+  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
+    {
+      int size2 = size;
+      loop_label = gen_label_rtx ();
+
+      if (!*done_label)
+	*done_label = gen_label_rtx ();
+
+      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
+			       1, loop_label);
+      size2 >>= 1;
+
+      /* Handle sizes > 3.  */
+      for (;size2 > 2; size2 >>= 1)
+	expand_small_setmem (destmem, *destptr, *count, value,
+		             size2, *done_label);
+      /* Nothing to copy?  Jump to DONE_LABEL if so */
+      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
+			       1, *done_label);
+
+      /* Do a byte copy.  */
+      destmem = change_address (destmem, QImode, *destptr);
+      emit_move_insn (destmem, gen_lowpart (QImode, value));
+
+      /* Handle sizes 2 and 3.  */
+      label = ix86_expand_aligntest (*count, 2, false);
+      destmem = change_address (destmem, HImode, *destptr);
+      destmem = offset_address (destmem, *count, 1);
+      destmem = offset_address (destmem, GEN_INT (-2), 2);
+      emit_move_insn (destmem, gen_lowpart (HImode, value));
+
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+      emit_jump_insn (gen_jump (*done_label));
+      emit_barrier ();
+    }
+  else
+    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
+		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
+
+  /* Start memset for COUNT >= SIZE.  */
+  if (loop_label)
+    {
+       emit_label (loop_label);
+       LABEL_NUSES (loop_label) = 1;
+    }
+
+  /* Copy first desired_align bytes.  */
+  destmem = change_address (destmem, mode, *destptr);
+  modesize = GEN_INT (GET_MODE_SIZE (mode));
+  for (n = 0; prolog_size < desired_align - align; n++)
+    {
+      emit_move_insn (destmem, gen_lowpart (mode, value));
+      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
+      prolog_size += GET_MODE_SIZE (mode);
+    }
+
+
+  /* Copy last SIZE bytes.  */
+  destmem = offset_address (destmem, *count, 1);
+  destmem = offset_address (destmem,
+			    GEN_INT (-size - prolog_size),
+			    1);
+  emit_move_insn (destmem, gen_lowpart (mode, value));
+  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
+    {
+      destmem = offset_address (destmem, modesize, 1);
+      emit_move_insn (destmem, gen_lowpart (mode, value));
+    }
+
+  /* Align destination.  */
+  if (desired_align > 1 && desired_align > align)
+    {
+      rtx saveddest = *destptr;
+
+      gcc_assert (desired_align <= size);
+      /* Align destptr up, place it to new register.  */
+      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
+				      GEN_INT (prolog_size),
+				      NULL_RTX, 1, OPTAB_DIRECT);
+      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
+				      GEN_INT (-desired_align),
+				      *destptr, 1, OPTAB_DIRECT);
+      /* See how many bytes we skipped.  */
+      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
+				       *destptr,
+				       saveddest, 1, OPTAB_DIRECT);
+      /* Adjust count.  */
+      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
+				    saveddest, *count, 1, OPTAB_DIRECT);
+      /* We copied at most size + prolog_size.  */
+      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
+	*min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
+      else
+	*min_size = 0;
+
+      /* Our loops always round down the block size, but for dispatch to the
+	 library we need the precise value.  */
+      if (dynamic_check)
+	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
+				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
+    }
+  else
+    {
+      gcc_assert (prolog_size == 0);
+      /* Decrease count, so we won't end up copying last word twice.  */
+      if (!CONST_INT_P (*count))
+	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
+				      constm1_rtx, *count, 1, OPTAB_DIRECT);
+      else
+	*count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
+      if (*min_size)
+	*min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
+    }
+}
+
 /* Set enough from DST to align DST known to by aligned by ALIGN to
    DESIRED_ALIGN.  ALIGN_BYTES is how many bytes need to be stored.  */
 static rtx
@@ -22790,61 +23218,98 @@  expand_constant_setmem_prologue (rtx dst
   return dst;
 }
 
-/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
-static enum stringop_alg
-decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
-	    int *dynamic_check, bool *noalign)
+/* Return true if ALG can be used in current context.  
+   Assume we expand memset if MEMSET is true.  */
+static bool
+alg_usable_p (enum stringop_alg alg, bool memset)
 {
-  const struct stringop_algs * algs;
-  bool optimize_for_speed;
+  if (alg == no_stringop)
+    return false;
+  if (alg == vector_loop)
+    return TARGET_SSE || TARGET_AVX;
   /* Algorithms using the rep prefix want at least edi and ecx;
      additionally, memset wants eax and memcpy wants esi.  Don't
      consider such algorithms if the user has appropriated those
      registers for their own purposes.	*/
-  bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
-                             || (memset
-				 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
-  *noalign = false;
+  if (alg == rep_prefix_1_byte
+      || alg == rep_prefix_4_byte
+      || alg == rep_prefix_8_byte)
+    return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
+             || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
+  return true;
+}
+
+
 
-#define ALG_USABLE_P(alg) (rep_prefix_usable			\
-			   || (alg != rep_prefix_1_byte		\
-			       && alg != rep_prefix_4_byte      \
-			       && alg != rep_prefix_8_byte))
+/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
+static enum stringop_alg
+decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
+	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
+	    bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
+{
+  const struct stringop_algs * algs;
+  bool optimize_for_speed;
+  int max = -1;
   const struct processor_costs *cost;
+  int i;
+  bool any_alg_usable_p = false;
+
+  *noalign = false;
+  *dynamic_check = -1;
 
   /* Even if the string operation call is cold, we still might spend a lot
      of time processing large blocks.  */
   if (optimize_function_for_size_p (cfun)
       || (optimize_insn_for_size_p ()
-          && expected_size != -1 && expected_size < 256))
+ 	  && (max_size < 256
+              || (expected_size != -1 && expected_size < 256))))
     optimize_for_speed = false;
   else
     optimize_for_speed = true;
 
   cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
-
-  *dynamic_check = -1;
   if (memset)
     algs = &cost->memset[TARGET_64BIT != 0];
   else
     algs = &cost->memcpy[TARGET_64BIT != 0];
-  if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
+
+  /* See maximal size for user defined algorithm.  */
+  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
+    {
+      enum stringop_alg candidate = algs->size[i].alg;
+      bool usable = alg_usable_p (candidate, memset);
+      any_alg_usable_p |= usable;
+
+      if (candidate != libcall && candidate && usable)
+	  max = algs->size[i].max;
+    }
+
+  /* If the expected size is not known, but the maximal size is small enough
+     so that the inline version is a win, set the expected size into
+     the range.  */
+  if (max > 1 && (unsigned HOST_WIDE_INT)max >= max_size && expected_size == -1)
+    expected_size = min_size / 2 + max_size / 2;
+
+  /* If the user specified the algorithm, honor it if possible.  */
+  if (ix86_stringop_alg != no_stringop
+      && alg_usable_p (ix86_stringop_alg, memset))
     return ix86_stringop_alg;
   /* rep; movq or rep; movl is the smallest variant.  */
   else if (!optimize_for_speed)
     {
-      if (!count || (count & 3))
-	return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
+      if (!count || (count & 3) || (memset && !zero_memset))
+	return alg_usable_p (rep_prefix_1_byte, memset)
+	       ? rep_prefix_1_byte : loop_1_byte;
       else
-	return rep_prefix_usable ? rep_prefix_4_byte : loop;
+	return alg_usable_p (rep_prefix_4_byte, memset)
+	       ? rep_prefix_4_byte : loop;
     }
-  /* Very tiny blocks are best handled via the loop, REP is expensive to setup.
-   */
+  /* Very tiny blocks are best handled via the loop, REP is expensive to
+     setup.  */
   else if (expected_size != -1 && expected_size < 4)
     return loop_1_byte;
   else if (expected_size != -1)
     {
-      unsigned int i;
       enum stringop_alg alg = libcall;
       for (i = 0; i < MAX_STRINGOP_ALGS; i++)
 	{
@@ -22858,7 +23323,7 @@  decide_alg (HOST_WIDE_INT count, HOST_WI
 	    {
 	      enum stringop_alg candidate = algs->size[i].alg;
 
-	      if (candidate != libcall && ALG_USABLE_P (candidate))
+	      if (candidate != libcall && alg_usable_p (candidate, memset))
 		alg = candidate;
 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
 		 last non-libcall inline algorithm.  */
@@ -22871,14 +23336,13 @@  decide_alg (HOST_WIDE_INT count, HOST_WI
 		    return alg;
 		  break;
 		}
-	      else if (ALG_USABLE_P (candidate))
+	      else if (alg_usable_p (candidate, memset))
 		{
 		  *noalign = algs->size[i].noalign;
 		  return candidate;
 		}
 	    }
 	}
-      gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
     }
   /* When asked to inline the call anyway, try to pick meaningful choice.
      We look for maximal size of block that is faster to copy by hand and
@@ -22888,22 +23352,11 @@  decide_alg (HOST_WIDE_INT count, HOST_WI
      If this turns out to be bad, we might simply specify the preferred
      choice in ix86_costs.  */
   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
-      && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
+      && (algs->unknown_size == libcall
+	  || !alg_usable_p (algs->unknown_size, memset)))
     {
-      int max = -1;
       enum stringop_alg alg;
-      int i;
-      bool any_alg_usable_p = true;
-
-      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
-        {
-          enum stringop_alg candidate = algs->size[i].alg;
-          any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
 
-          if (candidate != libcall && candidate
-              && ALG_USABLE_P (candidate))
-              max = algs->size[i].max;
-        }
       /* If there aren't any usable algorithms, then recursing on
          smaller sizes isn't going to find anything.  Just return the
          simple byte-at-a-time copy loop.  */
@@ -22916,15 +23369,16 @@  decide_alg (HOST_WIDE_INT count, HOST_WI
         }
       if (max == -1)
 	max = 4096;
-      alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
+      alg = decide_alg (count, max / 2, min_size, max_size, memset,
+			zero_memset, dynamic_check, noalign);
       gcc_assert (*dynamic_check == -1);
       gcc_assert (alg != libcall);
       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
 	*dynamic_check = max;
       return alg;
     }
-  return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
-#undef ALG_USABLE_P
+  return (alg_usable_p (algs->unknown_size, memset)
+	  ? algs->unknown_size : libcall);
 }
 
 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
@@ -22964,31 +23418,51 @@  decide_alignment (int align,
 /* Expand string move (memcpy) operation.  Use i386 string operations
    when profitable.  expand_setmem contains similar code.  The code
    depends upon architecture, block size and alignment, but always has
-   the same overall structure:
+   one of the following overall structures:
+
+   Aligned move sequence:
+
+     1) Prologue guard: Conditional that jumps up to epilogues for small
+	blocks that can be handled by epilogue alone.  This is faster
+	but also needed for correctness, since the prologue assumes the block
+	is larger than the desired alignment.
+
+	Optional dynamic check for size and libcall for large
+	blocks is emitted here too, with -minline-stringops-dynamically.
+
+     2) Prologue: copy first few bytes in order to get destination
+	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
+	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
+	copied.  We emit either a jump tree on power of two sized
+	blocks, or a byte loop.
+
+     3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
+	with specified algorithm.
 
-   1) Prologue guard: Conditional that jumps up to epilogues for small
-      blocks that can be handled by epilogue alone.  This is faster
-      but also needed for correctness, since prologue assume the block
-      is larger than the desired alignment.
-
-      Optional dynamic check for size and libcall for large
-      blocks is emitted here too, with -minline-stringops-dynamically.
-
-   2) Prologue: copy first few bytes in order to get destination
-      aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
-      than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
-      copied.  We emit either a jump tree on power of two sized
-      blocks, or a byte loop.
+     4) Epilogue: code copying tail of the block that is too small to be
+	handled by main body (or up to size guarded by prologue guard).
 
-   3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
-      with specified algorithm.
+   Misaligned move sequence:
 
-   4) Epilogue: code copying tail of the block that is too small to be
-      handled by main body (or up to size guarded by prologue guard).  */
+     1) misaligned move prologue/epilogue containing:
+        a) Prologue handling small memory blocks and jumping to done_label
+	   (skipped if blocks are known to be large enough)
+	b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
+           needed by single possibly misaligned move
+	   (skipped if alignment is not needed)
+        c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
+
+     2) Zero size guard dispatching to done_label, if needed
+
+     3) Dispatch to library call, if needed.
+
+     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
+	with specified algorithm.  */
 
 bool
 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
-		    rtx expected_align_exp, rtx expected_size_exp)
+		    rtx expected_align_exp, rtx expected_size_exp,
+		    rtx min_size_exp, rtx max_size_exp)
 {
   rtx destreg;
   rtx srcreg;
@@ -23006,6 +23480,9 @@  ix86_expand_movmem (rtx dst, rtx src, rt
   bool noalign;
   enum machine_mode move_mode = VOIDmode;
   int unroll_factor = 1;
+  unsigned HOST_WIDE_INT min_size = UINTVAL (min_size_exp);
+  unsigned HOST_WIDE_INT max_size = max_size_exp ? UINTVAL (max_size_exp) : -1;
+  bool misaligned_prologue_used = false;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -23029,7 +23506,8 @@  ix86_expand_movmem (rtx dst, rtx src, rt
 
   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
-  alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
+  alg = decide_alg (count, expected_size, min_size, max_size,
+		    false, false, &dynamic_check, &noalign);
   if (alg == libcall)
     return false;
   gcc_assert (alg != no_stringop);
@@ -23115,8 +23593,44 @@  ix86_expand_movmem (rtx dst, rtx src, rt
     }
   gcc_assert (desired_align >= 1 && align >= 1);
 
+  /* Misaligned move sequences handle both prologues and epilogues at once.
+     Default code generation results in smaller code for large alignments and
+     also avoids redundant work when sizes are known precisely.  */
+  if (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES
+      && MAX (desired_align, epilogue_size_needed) <= 32
+      && ((desired_align > align && !align_bytes)
+	  || (!count && epilogue_size_needed > 1)))
+    {
+      /* Misaligned move prologue handles small blocks by itself.  */
+      misaligned_prologue_used = true;
+      expand_movmem_prologue_epilogue_by_misaligned_moves
+	   (dst, src, &destreg, &srcreg, &count_exp, &jump_around_label,
+            desired_align < align
+	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
+	    desired_align, align, &min_size, dynamic_check);
+      src = change_address (src, BLKmode, srcreg);
+      dst = change_address (dst, BLKmode, destreg);
+      set_mem_align (dst, desired_align * BITS_PER_UNIT);
+      epilogue_size_needed = 0;
+      if (need_zero_guard && !min_size)
+	{
+	  /* It is possible that we copied enough so the main loop will not
+	     execute.  */
+	  gcc_assert (size_needed > 1);
+	  if (jump_around_label == NULL_RTX)
+	    jump_around_label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (count_exp,
+				   GEN_INT (size_needed),
+				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
+	  if (expected_size == -1
+	      || expected_size < (desired_align - align) / 2 + size_needed)
+	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
+	  else
+	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
+	}
+    }
   /* Ensure that alignment prologue won't copy past end of block.  */
-  if (size_needed > 1 || (desired_align > 1 && desired_align > align))
+  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
     {
       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
@@ -23135,8 +23649,9 @@  ix86_expand_movmem (rtx dst, rtx src, rt
 		goto epilogue;
 	    }
 	}
-      else
+      else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
 	{
+	  gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
 	  label = gen_label_rtx ();
 	  emit_cmp_and_jump_insns (count_exp,
 				   GEN_INT (epilogue_size_needed),
@@ -23176,18 +23691,23 @@  ix86_expand_movmem (rtx dst, rtx src, rt
 
   /* Step 2: Alignment prologue.  */
 
-  if (desired_align > align)
+  if (desired_align > align && !misaligned_prologue_used)
     {
       if (align_bytes == 0)
 	{
-	  /* Except for the first move in epilogue, we no longer know
+	  /* Except for the first move in prologue, we no longer know
 	     constant offset in aliasing info.  It don't seems to worth
 	     the pain to maintain it for the first move, so throw away
 	     the info early.  */
 	  src = change_address (src, BLKmode, srcreg);
 	  dst = change_address (dst, BLKmode, destreg);
-	  dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
-					desired_align);
+	  dst = expand_movmem_prologue (dst, src, destreg, srcreg, count_exp,
+					align, desired_align);
+	  /* At most desired_align - align bytes are copied; subtracting the
+	     larger desired_align keeps min_size a valid (conservative)
+	     lower bound.  */
+	  if (min_size < (unsigned)(desired_align - align))
+	    min_size = 0;
+	  else
+	    min_size -= desired_align;
 	}
       else
 	{
@@ -23200,6 +23720,7 @@  ix86_expand_movmem (rtx dst, rtx src, rt
 	  count -= align_bytes;
 	}
       if (need_zero_guard
+	  && !min_size
 	  && (count < (unsigned HOST_WIDE_INT) size_needed
 	      || (align_bytes == 0
 		  && count < ((unsigned HOST_WIDE_INT) size_needed
@@ -23227,7 +23748,7 @@  ix86_expand_movmem (rtx dst, rtx src, rt
       label = NULL;
       epilogue_size_needed = 1;
     }
-  else if (label == NULL_RTX)
+  else if (label == NULL_RTX && !misaligned_prologue_used)
     epilogue_size_needed = size_needed;
 
   /* Step 3: Main loop.  */
@@ -23395,7 +23916,8 @@  promote_duplicated_reg_to_size (rtx val,
    steps performed.  */
 bool
 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
-		    rtx expected_align_exp, rtx expected_size_exp)
+		    rtx expected_align_exp, rtx expected_size_exp,
+		    rtx min_size_exp, rtx max_size_exp)
 {
   rtx destreg;
   rtx label = NULL;
@@ -23414,6 +23936,9 @@  ix86_expand_setmem (rtx dst, rtx count_e
   bool noalign;
   enum machine_mode move_mode = VOIDmode;
   int unroll_factor;
+  unsigned HOST_WIDE_INT min_size = UINTVAL (min_size_exp);
+  unsigned HOST_WIDE_INT max_size = max_size_exp ? UINTVAL (max_size_exp) : -1;
+  bool misaligned_prologue_used = false;
 
   if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
@@ -23433,7 +23958,8 @@  ix86_expand_setmem (rtx dst, rtx count_e
   /* Step 0: Decide on preferred algorithm, desired alignment and
      size of chunks to be copied by main loop.  */
 
-  alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
+  alg = decide_alg (count, expected_size, min_size, max_size,
+		    true, val_exp == const0_rtx, &dynamic_check, &noalign);
   if (alg == libcall)
     return false;
   gcc_assert (alg != no_stringop);
@@ -23508,8 +24034,47 @@  ix86_expand_setmem (rtx dst, rtx count_e
   if (CONST_INT_P (val_exp))
     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
 						   desired_align, align);
+  /* Misaligned move sequences handle both prologues and epilogues at once.
+     Default code generation results in smaller code for large alignments and
+     also avoids redundant work when sizes are known precisely.  */
+  if (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES_EPILOGUES
+      && MAX (desired_align, epilogue_size_needed) <= 32
+      && ((desired_align > align && !align_bytes)
+         || (!count && epilogue_size_needed > 1)))
+    {
+      /* We always need promoted value.  */
+      if (!promoted_val)
+	promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+						       desired_align, align);
+      /* Misaligned move prologue handles small blocks by itself.  */
+      misaligned_prologue_used = true;
+      expand_setmem_prologue_epilogue_by_misaligned_moves
+          (dst, &destreg, &count_exp, promoted_val, &jump_around_label,
+           desired_align < align
+	   ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
+           desired_align, align, &min_size, dynamic_check);
+      dst = change_address (dst, BLKmode, destreg);
+      set_mem_align (dst, desired_align * BITS_PER_UNIT);
+      epilogue_size_needed = 0;
+      if (need_zero_guard && !min_size)
+        {
+          /* It is possible that we copied enough so the main loop will not
+             execute.  */
+          gcc_assert (size_needed > 1);
+          if (jump_around_label == NULL_RTX)
+            jump_around_label = gen_label_rtx ();
+          emit_cmp_and_jump_insns (count_exp,
+                                   GEN_INT (size_needed),
+                                   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
+          if (expected_size == -1
+              || expected_size < (desired_align - align) / 2 + size_needed)
+            predict_jump (REG_BR_PROB_BASE * 20 / 100);
+          else
+            predict_jump (REG_BR_PROB_BASE * 60 / 100);
+        }
+    }
   /* Ensure that alignment prologue won't copy past end of block.  */
-  if (size_needed > 1 || (desired_align > 1 && desired_align > align))
+  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
     {
       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
       /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
@@ -23534,8 +24099,9 @@  ix86_expand_setmem (rtx dst, rtx count_e
 		goto epilogue;
 	    }
 	}
-      else
+      else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
 	{
+	  gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
 	  label = gen_label_rtx ();
 	  emit_cmp_and_jump_insns (count_exp,
 				   GEN_INT (epilogue_size_needed),
@@ -23566,7 +24132,7 @@  ix86_expand_setmem (rtx dst, rtx count_e
 						   desired_align, align);
   gcc_assert (desired_align >= 1 && align >= 1);
 
-  if (desired_align > align)
+  if (desired_align > align && !misaligned_prologue_used)
     {
       if (align_bytes == 0)
 	{
@@ -23577,6 +24143,11 @@  ix86_expand_setmem (rtx dst, rtx count_e
 	  dst = change_address (dst, BLKmode, destreg);
 	  expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
 				  desired_align);
+	  /* At most desired_align - align bytes are copied; subtracting the
+	     larger desired_align keeps min_size a valid (conservative)
+	     lower bound.  */
+	  if (min_size < (unsigned)(desired_align - align))
+	    min_size = 0;
+	  else
+	    min_size -= desired_align;
 	}
       else
 	{
@@ -23589,6 +24160,7 @@  ix86_expand_setmem (rtx dst, rtx count_e
 	  count -= align_bytes;
 	}
       if (need_zero_guard
+	  && !min_size
 	  && (count < (unsigned HOST_WIDE_INT) size_needed
 	      || (align_bytes == 0
 		  && count < ((unsigned HOST_WIDE_INT) size_needed
@@ -23617,7 +24189,7 @@  ix86_expand_setmem (rtx dst, rtx count_e
       promoted_val = val_exp;
       epilogue_size_needed = 1;
     }
-  else if (label == NULL_RTX)
+  else if (label == NULL_RTX && !misaligned_prologue_used)
     epilogue_size_needed = size_needed;
 
   /* Step 3: Main loop.  */