diff mbox

Improve { x, x + 3, x + 6, x + 9 } expansion

Message ID 20131120084245.GE892@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek Nov. 20, 2013, 8:42 a.m. UTC
Hi!

I've noticed we generate terrible code for the testcase below.
E.g. with -mavx2 it is:
        leal    6(%rdi), %edx
        leal    12(%rdi), %ecx
        leal    18(%rdi), %esi
        leal    3(%rdi), %eax
        movl    %edx, -20(%rsp)
        movl    %ecx, -24(%rsp)
        leal    9(%rdi), %edx
        movl    %esi, -28(%rsp)
        vmovd   -20(%rsp), %xmm6
        leal    15(%rdi), %ecx
        movl    %edi, -20(%rsp)
        leal    21(%rdi), %esi
        vmovd   -28(%rsp), %xmm4
        vmovd   -24(%rsp), %xmm5
        vpinsrd $1, %edx, %xmm6, %xmm3
        vmovd   -20(%rsp), %xmm7
        vpinsrd $1, %esi, %xmm4, %xmm2
        vpinsrd $1, %ecx, %xmm5, %xmm1
        vpinsrd $1, %eax, %xmm7, %xmm0
        vpunpcklqdq     %xmm2, %xmm1, %xmm1
        vpunpcklqdq     %xmm3, %xmm0, %xmm0
        vinserti128     $0x1, %xmm1, %ymm0, %ymm0
to load the { x, x + 3, x + 6, x + 9, x + 12, x + 15, x + 18, x + 21 }
CONSTRUCTOR into a vector register.  This patch expands it as:
        movl    %edi, -20(%rsp)
        vbroadcastss    -20(%rsp), %ymm0
        vpaddd  .LC0(%rip), %ymm0, %ymm0
instead.  Similarly for SSE2:
        leal    3(%rdi), %eax
        movl    %eax, -20(%rsp)
        leal    6(%rdi), %eax
        movd    -20(%rsp), %xmm4
        movl    %eax, -16(%rsp)
        leal    9(%rdi), %eax
        movd    -16(%rsp), %xmm1
        movl    %edi, -16(%rsp)
        movl    %eax, -12(%rsp)
        movd    -16(%rsp), %xmm0
        movd    -12(%rsp), %xmm3
        punpckldq       %xmm4, %xmm0
        punpckldq       %xmm3, %xmm1
        punpcklqdq      %xmm1, %xmm0
instead of:
        movl    %edi, -12(%rsp)
        movd    -12(%rsp), %xmm3
        pshufd  $0, %xmm3, %xmm0
        paddd   .LC0(%rip), %xmm0

The patch leaves that decision to the target (in it's
*_expand_vector_init).

Ok?

2013-11-20  Jakub Jelinek  <jakub@redhat.com>

	* expr.c (store_constructor): If all CONSTRUCTOR values are
	the same SSA_NAME plus optional constant, pass plus_constant results
	down to vec_init.
	* config/i386/i386.c (ix86_expand_vector_init): Optimize it using
	broadcast followed by addition of a constant vector.

	* gcc.dg/vect/vect-124.c: New test.


	Jakub

Comments

Richard Biener Nov. 20, 2013, 9:31 a.m. UTC | #1
On Wed, 20 Nov 2013, Jakub Jelinek wrote:

> Hi!
> 
> I've noticed we generate terrible code for the testcase below.
> E.g. with -mavx2 it is:
>         leal    6(%rdi), %edx
>         leal    12(%rdi), %ecx
>         leal    18(%rdi), %esi
>         leal    3(%rdi), %eax
>         movl    %edx, -20(%rsp)
>         movl    %ecx, -24(%rsp)
>         leal    9(%rdi), %edx
>         movl    %esi, -28(%rsp)
>         vmovd   -20(%rsp), %xmm6
>         leal    15(%rdi), %ecx
>         movl    %edi, -20(%rsp)
>         leal    21(%rdi), %esi
>         vmovd   -28(%rsp), %xmm4
>         vmovd   -24(%rsp), %xmm5
>         vpinsrd $1, %edx, %xmm6, %xmm3
>         vmovd   -20(%rsp), %xmm7
>         vpinsrd $1, %esi, %xmm4, %xmm2
>         vpinsrd $1, %ecx, %xmm5, %xmm1
>         vpinsrd $1, %eax, %xmm7, %xmm0
>         vpunpcklqdq     %xmm2, %xmm1, %xmm1
>         vpunpcklqdq     %xmm3, %xmm0, %xmm0
>         vinserti128     $0x1, %xmm1, %ymm0, %ymm0
> to load the { x, x + 3, x + 6, x + 9, x + 12, x + 15, x + 18, x + 21 }
> CONSTRUCTOR into a vector register.  This patch expands it as:
>         movl    %edi, -20(%rsp)
>         vbroadcastss    -20(%rsp), %ymm0
>         vpaddd  .LC0(%rip), %ymm0, %ymm0
> instead.  Similarly for SSE2:
>         leal    3(%rdi), %eax
>         movl    %eax, -20(%rsp)
>         leal    6(%rdi), %eax
>         movd    -20(%rsp), %xmm4
>         movl    %eax, -16(%rsp)
>         leal    9(%rdi), %eax
>         movd    -16(%rsp), %xmm1
>         movl    %edi, -16(%rsp)
>         movl    %eax, -12(%rsp)
>         movd    -16(%rsp), %xmm0
>         movd    -12(%rsp), %xmm3
>         punpckldq       %xmm4, %xmm0
>         punpckldq       %xmm3, %xmm1
>         punpcklqdq      %xmm1, %xmm0
> instead of:
>         movl    %edi, -12(%rsp)
>         movd    -12(%rsp), %xmm3
>         pshufd  $0, %xmm3, %xmm0
>         paddd   .LC0(%rip), %xmm0
> 
> The patch leaves that decision to the target (in it's
> *_expand_vector_init).
> 
> Ok?

Aww ;)  Nice improvement.  Generally when I see this I always wonder
whether we want to do this kind of stuff pre RTL expansion.
1st to not rely on being able to TER, 2nd to finally eventually
get rid of TER.

These patches are unfortunately a step backward for #2.

As of the patch, do we have a way to query whether the target
can efficiently broadcast?  If so this IMHO belongs in generic
code, not in ix86_expand_vector_init - passing down plus_constant
results may pessimize other targets that don't understand this
trick, no?  (OTOH, which vector ISA does _not_ know how to
broadcast ...?)

Thanks,
Richard.

> 2013-11-20  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* expr.c (store_constructor): If all CONSTRUCTOR values are
> 	the same SSA_NAME plus optional constant, pass plus_constant results
> 	down to vec_init.
> 	* config/i386/i386.c (ix86_expand_vector_init): Optimize it using
> 	broadcast followed by addition of a constant vector.
> 
> 	* gcc.dg/vect/vect-124.c: New test.
> 
> --- gcc/expr.c.jj	2013-11-19 21:56:27.000000000 +0100
> +++ gcc/expr.c	2013-11-20 09:24:14.435860130 +0100
> @@ -6296,6 +6296,8 @@ store_constructor (tree exp, rtx target,
>  	rtvec vector = NULL;
>  	unsigned n_elts;
>  	alias_set_type alias;
> +	tree base = NULL_TREE;
> +	rtx base_rtx = NULL_RTX;
>  
>  	gcc_assert (eltmode != BLKmode);
>  
> @@ -6334,6 +6336,31 @@ store_constructor (tree exp, rtx target,
>  				    TYPE_SIZE (TREE_TYPE (value)),
>  				    TYPE_SIZE (elttype)));
>  
> +		/* Try to detect fairly common pattern of
> +		   { a_5, a_5 + 1, a_5 + 2, a_5 + 3 }.  */
> +		if (vector
> +		    && (count == 0 || base)
> +		    && TREE_CODE (value) == SSA_NAME)
> +		  {
> +		    gimple g = get_gimple_for_ssa_name (value);
> +		    tree this_base = value;
> +		    if (g && is_gimple_assign (g))
> +		      switch (gimple_assign_rhs_code (g))
> +			{
> +			case PLUS_EXPR:
> +			case POINTER_PLUS_EXPR:
> +			  if (TREE_CODE (gimple_assign_rhs1 (g)) == SSA_NAME
> +			      && tree_fits_shwi_p (gimple_assign_rhs2 (g)))
> +			    this_base = gimple_assign_rhs1 (g);
> +			  break;
> +			default:
> +			  break;
> +			}
> +		    if (count == 0)
> +		      base = this_base;
> +		    else if (base != this_base)
> +		      base = NULL_TREE;
> +		  }
>  		count += n_elts_here;
>  		if (mostly_zeros_p (value))
>  		  zero_count += n_elts_here;
> @@ -6342,6 +6369,9 @@ store_constructor (tree exp, rtx target,
>  	    /* Clear the entire vector first if there are any missing elements,
>  	       or if the incidence of zero elements is >= 75%.  */
>  	    need_to_clear = (count < n_elts || 4 * zero_count >= 3 * count);
> +
> +	    if (count < n_elts)
> +	      base = NULL_TREE;
>  	  }
>  
>  	if (need_to_clear && size > 0 && !vector)
> @@ -6362,6 +6392,9 @@ store_constructor (tree exp, rtx target,
>  	else
>  	  alias = get_alias_set (elttype);
>  
> +	if (base)
> +	  base_rtx = expand_normal (base);
> +
>          /* Store each element of the constructor into the corresponding
>  	   element of TARGET, determined by counting the elements.  */
>  	for (idx = 0, i = 0;
> @@ -6385,8 +6418,21 @@ store_constructor (tree exp, rtx target,
>  	        /* Vector CONSTRUCTORs should only be built from smaller
>  		   vectors in the case of BLKmode vectors.  */
>  		gcc_assert (TREE_CODE (TREE_TYPE (value)) != VECTOR_TYPE);
> -		RTVEC_ELT (vector, eltpos)
> -		  = expand_normal (value);
> +		if (base)
> +		  {
> +		    if (value == base)
> +		      RTVEC_ELT (vector, eltpos) = base_rtx;
> +		    else
> +		      {
> +			gimple g = get_gimple_for_ssa_name (value);
> +			rtx cst = expand_normal (gimple_assign_rhs2 (g));
> +			RTVEC_ELT (vector, eltpos)
> +			  = plus_constant (TYPE_MODE (TREE_TYPE (value)),
> +					   base_rtx, INTVAL (cst));
> +		      }
> +		  }
> +		else
> +		  RTVEC_ELT (vector, eltpos) = expand_normal (value);
>  	      }
>  	    else
>  	      {
> --- gcc/config/i386/i386.c.jj	2013-11-19 21:56:41.000000000 +0100
> +++ gcc/config/i386/i386.c	2013-11-20 09:20:16.837649403 +0100
> @@ -37736,9 +37736,10 @@ ix86_expand_vector_init (bool mmx_ok, rt
>    enum machine_mode inner_mode = GET_MODE_INNER (mode);
>    int n_elts = GET_MODE_NUNITS (mode);
>    int n_var = 0, one_var = -1;
> -  bool all_same = true, all_const_zero = true;
> +  bool all_same = true, all_const_zero = true, need_force_reg = false;
>    int i;
>    rtx x;
> +  rtx base = NULL_RTX;
>  
>    for (i = 0; i < n_elts; ++i)
>      {
> @@ -37746,11 +37747,25 @@ ix86_expand_vector_init (bool mmx_ok, rt
>        if (!(CONST_INT_P (x)
>  	    || GET_CODE (x) == CONST_DOUBLE
>  	    || GET_CODE (x) == CONST_FIXED))
> -	n_var++, one_var = i;
> +	{
> +	  n_var++;
> +	  one_var = i;
> +	  if (!REG_P (x) && !MEM_P (x))
> +	    need_force_reg = true;
> +	}
>        else if (x != CONST0_RTX (inner_mode))
>  	all_const_zero = false;
>        if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
>  	all_same = false;
> +      if (i == 0 || base)
> +	{
> +	  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
> +	    x = XEXP (x, 0);
> +	  if (i == 0)
> +	    base = x;
> +	  else if (!rtx_equal_p (x, base))
> +	    base = NULL_RTX;
> +	}
>      }
>  
>    /* Constants are best loaded from the constant pool.  */
> @@ -37760,6 +37775,49 @@ ix86_expand_vector_init (bool mmx_ok, rt
>        return;
>      }
>  
> +  /* For { a_5, a_5 + 4, a_5 + 2, a_5 + 7 } generate a broadcast
> +     followed by addition of constant.  */
> +  if (!all_same
> +      && base
> +      && n_elts >= 4
> +      && (REG_P (base) || MEM_P (base)))
> +    {
> +      rtx subtarget = gen_reg_rtx (mode);
> +      if (ix86_expand_vector_init_duplicate (mmx_ok, mode, subtarget, base))
> +	{
> +	  rtvec cstvals = rtvec_alloc (n_elts);
> +	  rtx cstsubtarget = gen_reg_rtx (mode);
> +	  for (i = 0; i < n_elts; i++)
> +	    {
> +	      x = XVECEXP (vals, 0, i);
> +	      if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
> +		RTVEC_ELT (cstvals, i) = XEXP (x, 1);
> +	      else
> +		RTVEC_ELT (cstvals, i) = const0_rtx;
> +	    }
> +	  ix86_expand_vector_init (mmx_ok, cstsubtarget,
> +				   gen_rtx_PARALLEL (mode, cstvals));
> +	  subtarget = expand_simple_binop (mode, PLUS, subtarget,
> +					   cstsubtarget, target, 1,
> +					   OPTAB_DIRECT);
> +	  if (subtarget != target)
> +	    emit_move_insn (target, subtarget);
> +	  return;
> +	}
> +    }
> +
> +  if (need_force_reg)
> +    for (i = 0; i < n_elts; ++i)
> +      {
> +	x = XVECEXP (vals, 0, i);
> +	if (!(CONST_INT_P (x)
> +	      || GET_CODE (x) == CONST_DOUBLE
> +	      || GET_CODE (x) == CONST_FIXED
> +	      || REG_P (x)
> +	      || MEM_P (x)))
> +	  XVECEXP (vals, 0, i) = force_reg (GET_MODE (x), x);
> +      }
> +
>    /* If all values are identical, broadcast the value.  */
>    if (all_same
>        && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
> --- gcc/testsuite/gcc.dg/vect/vect-124.c.jj	2013-11-20 09:27:35.760798649 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-124.c	2013-11-20 09:29:32.219199794 +0100
> @@ -0,0 +1,28 @@
> +#include "tree-vect.h"
> +
> +#ifndef N
> +#define N 64
> +#endif
> +
> +int a[N];
> +
> +__attribute__((noinline, noclone)) void
> +foo (int x)
> +{
> +  int i;
> +  for (i = 0; i < N; i++, x += 3)
> +    a[i] = x;
> +}
> +
> +int
> +main ()
> +{
> +  int i;
> +  
> +  check_vect ();
> +  foo (6);
> +  for (i = 0; i < N; i++)
> +    if (a[i] != i * 3 + 6)
> +      abort ();
> +  return 0;
> +}
> 
> 	Jakub
> 
>
Jakub Jelinek Nov. 20, 2013, 9:44 a.m. UTC | #2
On Wed, Nov 20, 2013 at 10:31:38AM +0100, Richard Biener wrote:
> Aww ;)  Nice improvement.  Generally when I see this I always wonder
> whether we want to do this kind of stuff pre RTL expansion.
> 1st to not rely on being able to TER, 2nd to finally eventually
> get rid of TER.
> 
> These patches are unfortunately a step backward for #2.
> 
> As of the patch, do we have a way to query whether the target
> can efficiently broadcast?  If so this IMHO belongs in generic

We don't.  Perhaps if we'd add optab for vec_dup<mode> and mentioned
clearly in the documentation that it should be used only if it is reasonably
efficient.  But still, even with optab, it would probably better to do it
in the veclower* passes than in the vectorizer itself.

	Jakub
Richard Biener Nov. 20, 2013, 10:16 a.m. UTC | #3
On Wed, 20 Nov 2013, Jakub Jelinek wrote:

> On Wed, Nov 20, 2013 at 10:31:38AM +0100, Richard Biener wrote:
> > Aww ;)  Nice improvement.  Generally when I see this I always wonder
> > whether we want to do this kind of stuff pre RTL expansion.
> > 1st to not rely on being able to TER, 2nd to finally eventually
> > get rid of TER.
> > 
> > These patches are unfortunately a step backward for #2.
> > 
> > As of the patch, do we have a way to query whether the target
> > can efficiently broadcast?  If so this IMHO belongs in generic
> 
> We don't.  Perhaps if we'd add optab for vec_dup<mode> and mentioned
> clearly in the documentation that it should be used only if it is reasonably
> efficient.  But still, even with optab, it would probably better to do it
> in the veclower* passes than in the vectorizer itself.

Yes, veclower would be a good fit.

I was of course thinking of a new "apply TER" pass right before cfgexpand
(for the cases where TER ends up doing some kind of "folding").

Richard.
Richard Henderson Nov. 20, 2013, 9:43 p.m. UTC | #4
On 11/20/2013 07:44 PM, Jakub Jelinek wrote:
> On Wed, Nov 20, 2013 at 10:31:38AM +0100, Richard Biener wrote:
>> Aww ;)  Nice improvement.  Generally when I see this I always wonder
>> whether we want to do this kind of stuff pre RTL expansion.
>> 1st to not rely on being able to TER, 2nd to finally eventually
>> get rid of TER.
>>
>> These patches are unfortunately a step backward for #2.
>>
>> As of the patch, do we have a way to query whether the target
>> can efficiently broadcast?  If so this IMHO belongs in generic
> 
> We don't.  Perhaps if we'd add optab for vec_dup<mode> and mentioned
> clearly in the documentation that it should be used only if it is reasonably
> efficient.  But still, even with optab, it would probably better to do it
> in the veclower* passes than in the vectorizer itself.

I think we can assume that broadcast is relatively efficient, whether or not
vec_dup is present.  I'd lean to making the transformation generic to start
with, so that you don't need extra handling in the i386 backend.

Surely any operation with a common operand and a constant operand should be
applicable for this transformation.


r~
diff mbox

Patch

--- gcc/expr.c.jj	2013-11-19 21:56:27.000000000 +0100
+++ gcc/expr.c	2013-11-20 09:24:14.435860130 +0100
@@ -6296,6 +6296,8 @@  store_constructor (tree exp, rtx target,
 	rtvec vector = NULL;
 	unsigned n_elts;
 	alias_set_type alias;
+	tree base = NULL_TREE;
+	rtx base_rtx = NULL_RTX;
 
 	gcc_assert (eltmode != BLKmode);
 
@@ -6334,6 +6336,31 @@  store_constructor (tree exp, rtx target,
 				    TYPE_SIZE (TREE_TYPE (value)),
 				    TYPE_SIZE (elttype)));
 
+		/* Try to detect fairly common pattern of
+		   { a_5, a_5 + 1, a_5 + 2, a_5 + 3 }.  */
+		if (vector
+		    && (count == 0 || base)
+		    && TREE_CODE (value) == SSA_NAME)
+		  {
+		    gimple g = get_gimple_for_ssa_name (value);
+		    tree this_base = value;
+		    if (g && is_gimple_assign (g))
+		      switch (gimple_assign_rhs_code (g))
+			{
+			case PLUS_EXPR:
+			case POINTER_PLUS_EXPR:
+			  if (TREE_CODE (gimple_assign_rhs1 (g)) == SSA_NAME
+			      && tree_fits_shwi_p (gimple_assign_rhs2 (g)))
+			    this_base = gimple_assign_rhs1 (g);
+			  break;
+			default:
+			  break;
+			}
+		    if (count == 0)
+		      base = this_base;
+		    else if (base != this_base)
+		      base = NULL_TREE;
+		  }
 		count += n_elts_here;
 		if (mostly_zeros_p (value))
 		  zero_count += n_elts_here;
@@ -6342,6 +6369,9 @@  store_constructor (tree exp, rtx target,
 	    /* Clear the entire vector first if there are any missing elements,
 	       or if the incidence of zero elements is >= 75%.  */
 	    need_to_clear = (count < n_elts || 4 * zero_count >= 3 * count);
+
+	    if (count < n_elts)
+	      base = NULL_TREE;
 	  }
 
 	if (need_to_clear && size > 0 && !vector)
@@ -6362,6 +6392,9 @@  store_constructor (tree exp, rtx target,
 	else
 	  alias = get_alias_set (elttype);
 
+	if (base)
+	  base_rtx = expand_normal (base);
+
         /* Store each element of the constructor into the corresponding
 	   element of TARGET, determined by counting the elements.  */
 	for (idx = 0, i = 0;
@@ -6385,8 +6418,21 @@  store_constructor (tree exp, rtx target,
 	        /* Vector CONSTRUCTORs should only be built from smaller
 		   vectors in the case of BLKmode vectors.  */
 		gcc_assert (TREE_CODE (TREE_TYPE (value)) != VECTOR_TYPE);
-		RTVEC_ELT (vector, eltpos)
-		  = expand_normal (value);
+		if (base)
+		  {
+		    if (value == base)
+		      RTVEC_ELT (vector, eltpos) = base_rtx;
+		    else
+		      {
+			gimple g = get_gimple_for_ssa_name (value);
+			rtx cst = expand_normal (gimple_assign_rhs2 (g));
+			RTVEC_ELT (vector, eltpos)
+			  = plus_constant (TYPE_MODE (TREE_TYPE (value)),
+					   base_rtx, INTVAL (cst));
+		      }
+		  }
+		else
+		  RTVEC_ELT (vector, eltpos) = expand_normal (value);
 	      }
 	    else
 	      {
--- gcc/config/i386/i386.c.jj	2013-11-19 21:56:41.000000000 +0100
+++ gcc/config/i386/i386.c	2013-11-20 09:20:16.837649403 +0100
@@ -37736,9 +37736,10 @@  ix86_expand_vector_init (bool mmx_ok, rt
   enum machine_mode inner_mode = GET_MODE_INNER (mode);
   int n_elts = GET_MODE_NUNITS (mode);
   int n_var = 0, one_var = -1;
-  bool all_same = true, all_const_zero = true;
+  bool all_same = true, all_const_zero = true, need_force_reg = false;
   int i;
   rtx x;
+  rtx base = NULL_RTX;
 
   for (i = 0; i < n_elts; ++i)
     {
@@ -37746,11 +37747,25 @@  ix86_expand_vector_init (bool mmx_ok, rt
       if (!(CONST_INT_P (x)
 	    || GET_CODE (x) == CONST_DOUBLE
 	    || GET_CODE (x) == CONST_FIXED))
-	n_var++, one_var = i;
+	{
+	  n_var++;
+	  one_var = i;
+	  if (!REG_P (x) && !MEM_P (x))
+	    need_force_reg = true;
+	}
       else if (x != CONST0_RTX (inner_mode))
 	all_const_zero = false;
       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
 	all_same = false;
+      if (i == 0 || base)
+	{
+	  if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
+	    x = XEXP (x, 0);
+	  if (i == 0)
+	    base = x;
+	  else if (!rtx_equal_p (x, base))
+	    base = NULL_RTX;
+	}
     }
 
   /* Constants are best loaded from the constant pool.  */
@@ -37760,6 +37775,49 @@  ix86_expand_vector_init (bool mmx_ok, rt
       return;
     }
 
+  /* For { a_5, a_5 + 4, a_5 + 2, a_5 + 7 } generate a broadcast
+     followed by addition of constant.  */
+  if (!all_same
+      && base
+      && n_elts >= 4
+      && (REG_P (base) || MEM_P (base)))
+    {
+      rtx subtarget = gen_reg_rtx (mode);
+      if (ix86_expand_vector_init_duplicate (mmx_ok, mode, subtarget, base))
+	{
+	  rtvec cstvals = rtvec_alloc (n_elts);
+	  rtx cstsubtarget = gen_reg_rtx (mode);
+	  for (i = 0; i < n_elts; i++)
+	    {
+	      x = XVECEXP (vals, 0, i);
+	      if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
+		RTVEC_ELT (cstvals, i) = XEXP (x, 1);
+	      else
+		RTVEC_ELT (cstvals, i) = const0_rtx;
+	    }
+	  ix86_expand_vector_init (mmx_ok, cstsubtarget,
+				   gen_rtx_PARALLEL (mode, cstvals));
+	  subtarget = expand_simple_binop (mode, PLUS, subtarget,
+					   cstsubtarget, target, 1,
+					   OPTAB_DIRECT);
+	  if (subtarget != target)
+	    emit_move_insn (target, subtarget);
+	  return;
+	}
+    }
+
+  if (need_force_reg)
+    for (i = 0; i < n_elts; ++i)
+      {
+	x = XVECEXP (vals, 0, i);
+	if (!(CONST_INT_P (x)
+	      || GET_CODE (x) == CONST_DOUBLE
+	      || GET_CODE (x) == CONST_FIXED
+	      || REG_P (x)
+	      || MEM_P (x)))
+	  XVECEXP (vals, 0, i) = force_reg (GET_MODE (x), x);
+      }
+
   /* If all values are identical, broadcast the value.  */
   if (all_same
       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
--- gcc/testsuite/gcc.dg/vect/vect-124.c.jj	2013-11-20 09:27:35.760798649 +0100
+++ gcc/testsuite/gcc.dg/vect/vect-124.c	2013-11-20 09:29:32.219199794 +0100
@@ -0,0 +1,28 @@ 
+#include "tree-vect.h"
+
+#ifndef N
+#define N 64
+#endif
+
+int a[N];
+
+__attribute__((noinline, noclone)) void
+foo (int x)
+{
+  int i;
+  for (i = 0; i < N; i++, x += 3)
+    a[i] = x;
+}
+
+int
+main ()
+{
+  int i;
+  
+  check_vect ();
+  foo (6);
+  for (i = 0; i < N; i++)
+    if (a[i] != i * 3 + 6)
+      abort ();
+  return 0;
+}