Patchwork Try to avoid vector mode punning in SET_DEST on i?86

login
register
mail settings
Submitter Jakub Jelinek
Date Oct. 31, 2013, 4:49 p.m.
Message ID <20131031164947.GV27813@tucnak.zalov.cz>
Download mbox | patch
Permalink /patch/287564/
State New
Headers show

Comments

Jakub Jelinek - Oct. 31, 2013, 4:49 p.m.
Hi!

On Wed, Oct 30, 2013 at 12:11:25PM +0100, Jakub Jelinek wrote:
> > > the patch attempts to avoid gen_lowpart on the non-MEM lhs of the unaligned
> > > loads, which usually means combine will fail, by doing the load into a
> > > temporary pseudo in that case and then doing a pseudo to pseudo move with
> > > gen_lowpart on the rhs (which will be merged soon after into following
> > > instructions).
> > 
> > Is this similar to PR44141? There were similar problems with V4SFmode
> > subregs, so combine was not able to merge load to the arithemtic insn.
> 
> >From the work on the vectorization last year I remember many cases where
> subregs (even equal size) on the LHS of instructions prevented combiner or
> other RTL optimizations from doing it's job.  I believe I've changed some
> easy places that did that completely unnecessarily, but certainly have not
> went through all the code to look for other places where this is done.
> 
> Perhaps let's hack up a checking pass that will after expansion walk the
> whole IL and complain about same sized subregs on the LHS of insns, then do make
> check with it for a couple of ISAs (-msse2,-msse4,-mavx,-mavx2 e.g.?

So, I've used a hack in cfgexpand.c:


	Jakub
Richard Henderson - Oct. 31, 2013, 6:53 p.m.
On 10/31/2013 09:49 AM, Jakub Jelinek wrote:
> 2013-10-31  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* optabs.c (expand_vec_perm): Avoid vector mode punning
> 	SUBREGs in SET_DEST.
> 	* expmed.c (store_bit_field_1): Likewise.
> 	* config/i386/sse.md (movdi_to_sse, vec_pack_sfix_trunc_v2df,
> 	vec_pack_sfix_v2df, vec_shl_<mode>, vec_shr_<mode>,
> 	vec_interleave_high<mode>, vec_interleave_low<mode>): Likewise.
> 	* config/i386/i386.c (ix86_expand_vector_move_misalign,
> 	ix86_expand_sse_movcc, ix86_expand_int_vcond, ix86_expand_vec_perm,
> 	ix86_expand_sse_unpack, ix86_expand_args_builtin,
> 	ix86_expand_vector_init_duplicate, ix86_expand_vector_set,
> 	emit_reduc_half, expand_vec_perm_blend, expand_vec_perm_pshufb,
> 	expand_vec_perm_interleave2, expand_vec_perm_pshufb2,
> 	expand_vec_perm_vpshufb2_vpermq,
> 	expand_vec_perm_vpshufb2_vpermq_even_odd, expand_vec_perm_even_odd_1,
> 	expand_vec_perm_broadcast_1, expand_vec_perm_vpshufb4_vpermq2,
> 	ix86_expand_sse2_mulv4si3, ix86_expand_pinsr): Likewise.
> 	(expand_vec_perm_palignr): Likewise.  Modify a copy of *d rather
> 	than *d itself.

Ok.


r~
James Greenhalgh - Nov. 4, 2013, 6:23 p.m.
On Thu, Oct 31, 2013 at 04:49:47PM +0000, Jakub Jelinek wrote:

> 2013-10-31  Jakub Jelinek  <jakub@redhat.com>
> 
>         * optabs.c (expand_vec_perm): Avoid vector mode punning
>         SUBREGs in SET_DEST.
>         * expmed.c (store_bit_field_1): Likewise.
>         * config/i386/sse.md (movdi_to_sse, vec_pack_sfix_trunc_v2df,
>         vec_pack_sfix_v2df, vec_shl_<mode>, vec_shr_<mode>,
>         vec_interleave_high<mode>, vec_interleave_low<mode>): Likewise.
>         * config/i386/i386.c (ix86_expand_vector_move_misalign,
>         ix86_expand_sse_movcc, ix86_expand_int_vcond, ix86_expand_vec_perm,
>         ix86_expand_sse_unpack, ix86_expand_args_builtin,
>         ix86_expand_vector_init_duplicate, ix86_expand_vector_set,
>         emit_reduc_half, expand_vec_perm_blend, expand_vec_perm_pshufb,
>         expand_vec_perm_interleave2, expand_vec_perm_pshufb2,
>         expand_vec_perm_vpshufb2_vpermq,
>         expand_vec_perm_vpshufb2_vpermq_even_odd, expand_vec_perm_even_odd_1,
>         expand_vec_perm_broadcast_1, expand_vec_perm_vpshufb4_vpermq2,
>         ix86_expand_sse2_mulv4si3, ix86_expand_pinsr): Likewise.
>         (expand_vec_perm_palignr): Likewise.  Modify a copy of *d rather
>         than *d itself.
> 
> --- gcc/optabs.c.jj     2013-10-29 09:25:45.000000000 +0100
> +++ gcc/optabs.c        2013-10-31 13:20:40.384808642 +0100
> @@ -6674,7 +6674,7 @@ expand_vec_perm (enum machine_mode mode,
>         }
>        tmp = gen_rtx_CONST_VECTOR (qimode, vec);
>        sel = gen_lowpart (qimode, sel);
> -      sel = expand_vec_perm (qimode, sel, sel, tmp, NULL);
> +      sel = expand_vec_perm (qimode, gen_reg_rtx (qimode), sel, tmp, NULL);
>        gcc_assert (sel != NULL);
> 
>        /* Add the byte offset to each byte element.  */

This hunk causes issues on AArch64 and ARM.

We look to see which permute operation we should generate in
aarch64_expand_vec_perm_const. If we notice that all elements in 
would select from op0 we copy op0 to op1 and generate appropriate code.

With this hunk applied we end up selecting from the register
generated by gen_reg_rtx (qimode), rather than from 'sel' as we
intended. Thus we lose the value of 'sel' and everything starts to
go wrong!

The hunk looks suspicious to me (why do we pick a register out of
thin air?), and reverting it fixes the problems I see. Could you
give me a pointer as to what this hunk fixes on i?86?

I don't think I can fix this up in the expander very easily?

Thanks,
James

Patch

--- cfgexpand.c.jj	2013-10-30 08:15:36.000000000 +0100
+++ cfgexpand.c	2013-10-30 16:19:33.974203309 +0100
@@ -4535,6 +4535,31 @@  expand_stack_alignment (void)
     }
 }
 
+static void
+check_vector_subregs (rtx, const_rtx pat, void *data)
+{
+  rtx x;
+  if (GET_CODE (pat) != SET)
+    return;
+  x = SET_DEST (pat);
+  if (GET_CODE (x) == SUBREG
+      && GET_CODE (SUBREG_REG (x)) == REG
+      && (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT
+	  || GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
+	  || GET_MODE_CLASS (GET_MODE (SUBREG_REG (x))) == MODE_VECTOR_INT
+	  || GET_MODE_CLASS (GET_MODE (SUBREG_REG (x))) == MODE_VECTOR_FLOAT)
+      && GET_MODE_BITSIZE (GET_MODE (x))
+	 == GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (x))))
+    {
+      FILE *f = fopen ("/tmp/subregs", "a");
+      fprintf (f, "%s %d ", main_input_filename ? main_input_filename : "-",
+	       (int) BITS_PER_WORD);
+      print_inline_rtx (f, (rtx) data, 0);
+      putc ('\n', f);
+      fclose (f);
+    }
+}
+
 /* Translate the intermediate representation contained in the CFG
    from GIMPLE trees to RTL.
 
@@ -4922,6 +4947,13 @@  gimple_expand_cfg (void)
   set_block_levels (DECL_INITIAL (cfun->decl), 0);
   default_rtl_profile ();
 
+  {
+    rtx insn;
+    for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+      if (INSN_P (insn))
+	note_stores (PATTERN (insn), check_vector_subregs, insn);
+  }
+
   timevar_pop (TV_POST_EXPAND);
 
   return 0;

which revealed about 18000 vector mode punning subregs in SET_DEST in
x86_64 64-bit i386.exp testing alone, this patch brings that down to
zero (and in the whole make check-gcc down to about 40 which are from
various parts of the middle-end).  The disadvantage of mode punning
subregs in SET_DEST is that then CSE, combine, etc. don't attempt to
optimize those.  Additionally, I've noticed that expand_vec_perm_palignr
would happily clobber *d, that seems undesirable to me, other places always
modify a copy of the structure.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2013-10-31  Jakub Jelinek  <jakub@redhat.com>

	* optabs.c (expand_vec_perm): Avoid vector mode punning
	SUBREGs in SET_DEST.
	* expmed.c (store_bit_field_1): Likewise.
	* config/i386/sse.md (movdi_to_sse, vec_pack_sfix_trunc_v2df,
	vec_pack_sfix_v2df, vec_shl_<mode>, vec_shr_<mode>,
	vec_interleave_high<mode>, vec_interleave_low<mode>): Likewise.
	* config/i386/i386.c (ix86_expand_vector_move_misalign,
	ix86_expand_sse_movcc, ix86_expand_int_vcond, ix86_expand_vec_perm,
	ix86_expand_sse_unpack, ix86_expand_args_builtin,
	ix86_expand_vector_init_duplicate, ix86_expand_vector_set,
	emit_reduc_half, expand_vec_perm_blend, expand_vec_perm_pshufb,
	expand_vec_perm_interleave2, expand_vec_perm_pshufb2,
	expand_vec_perm_vpshufb2_vpermq,
	expand_vec_perm_vpshufb2_vpermq_even_odd, expand_vec_perm_even_odd_1,
	expand_vec_perm_broadcast_1, expand_vec_perm_vpshufb4_vpermq2,
	ix86_expand_sse2_mulv4si3, ix86_expand_pinsr): Likewise.
	(expand_vec_perm_palignr): Likewise.  Modify a copy of *d rather
	than *d itself.

--- gcc/optabs.c.jj	2013-10-29 09:25:45.000000000 +0100
+++ gcc/optabs.c	2013-10-31 13:20:40.384808642 +0100
@@ -6624,8 +6624,8 @@  expand_vec_perm (enum machine_mode mode,
 	  icode = direct_optab_handler (vec_perm_const_optab, qimode);
 	  if (icode != CODE_FOR_nothing)
 	    {
-	      tmp = expand_vec_perm_1 (icode, gen_lowpart (qimode, target),
-				       gen_lowpart (qimode, v0),
+	      tmp = mode != qimode ? gen_reg_rtx (qimode) : target;
+	      tmp = expand_vec_perm_1 (icode, tmp, gen_lowpart (qimode, v0),
 				       gen_lowpart (qimode, v1), sel_qi);
 	      if (tmp)
 		return gen_lowpart (mode, tmp);
@@ -6674,7 +6674,7 @@  expand_vec_perm (enum machine_mode mode,
 	}
       tmp = gen_rtx_CONST_VECTOR (qimode, vec);
       sel = gen_lowpart (qimode, sel);
-      sel = expand_vec_perm (qimode, sel, sel, tmp, NULL);
+      sel = expand_vec_perm (qimode, gen_reg_rtx (qimode), sel, tmp, NULL);
       gcc_assert (sel != NULL);
 
       /* Add the byte offset to each byte element.  */
@@ -6689,8 +6689,8 @@  expand_vec_perm (enum machine_mode mode,
       gcc_assert (sel_qi != NULL);
     }
 
-  tmp = expand_vec_perm_1 (icode, gen_lowpart (qimode, target),
-			   gen_lowpart (qimode, v0),
+  tmp = mode != qimode ? gen_reg_rtx (qimode) : target;
+  tmp = expand_vec_perm_1 (icode, tmp, gen_lowpart (qimode, v0),
 			   gen_lowpart (qimode, v1), sel_qi);
   if (tmp)
     tmp = gen_lowpart (mode, tmp);
--- gcc/expmed.c.jj	2013-09-30 22:13:54.000000000 +0200
+++ gcc/expmed.c	2013-10-30 21:03:28.609613144 +0100
@@ -624,13 +624,28 @@  store_bit_field_1 (rtx str_rtx, unsigned
 	  || (bitsize % BITS_PER_WORD == 0 && bitnum % BITS_PER_WORD == 0)))
     {
       /* Use the subreg machinery either to narrow OP0 to the required
-	 words or to cope with mode punning between equal-sized modes.  */
-      rtx sub = simplify_gen_subreg (fieldmode, op0, GET_MODE (op0),
-				     bitnum / BITS_PER_UNIT);
-      if (sub)
+	 words or to cope with mode punning between equal-sized modes.
+	 In the latter case, use subreg on the rhs side, not lhs.  */
+      rtx sub;
+
+      if (bitsize == GET_MODE_BITSIZE (GET_MODE (op0)))
 	{
-	  emit_move_insn (sub, value);
-	  return true;
+	  sub = simplify_gen_subreg (GET_MODE (op0), value, fieldmode, 0);
+	  if (sub)
+	    {
+	      emit_move_insn (op0, sub);
+	      return true;
+	    }
+	}
+      else
+	{
+	  sub = simplify_gen_subreg (fieldmode, op0, GET_MODE (op0),
+				     bitnum / BITS_PER_UNIT);
+	  if (sub)
+	    {
+	      emit_move_insn (sub, value);
+	      return true;
+	    }
 	}
     }
 
--- gcc/config/i386/sse.md.jj	2013-10-29 09:25:55.000000000 +0100
+++ gcc/config/i386/sse.md	2013-10-31 10:04:26.750274224 +0100
@@ -800,10 +800,13 @@  (define_insn_and_split "movdi_to_sse"
 				  gen_rtx_SUBREG (SImode, operands[1], 4)));
       emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0],
 					     operands[2]));
-    }
+   }
  else if (memory_operand (operands[1], DImode))
-   emit_insn (gen_vec_concatv2di (gen_lowpart (V2DImode, operands[0]),
-				  operands[1], const0_rtx));
+   {
+     rtx tmp = gen_reg_rtx (V2DImode);
+     emit_insn (gen_vec_concatv2di (tmp, operands[1], const0_rtx));
+     emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp));
+   }
  else
    gcc_unreachable ();
 })
@@ -4208,7 +4211,7 @@  (define_expand "vec_pack_sfix_trunc_v2df
    (match_operand:V2DF 2 "nonimmediate_operand")]
   "TARGET_SSE2"
 {
-  rtx tmp0, tmp1;
+  rtx tmp0, tmp1, tmp2;
 
   if (TARGET_AVX && !TARGET_PREFER_AVX128)
     {
@@ -4222,13 +4225,14 @@  (define_expand "vec_pack_sfix_trunc_v2df
     {
       tmp0 = gen_reg_rtx (V4SImode);
       tmp1 = gen_reg_rtx (V4SImode);
+      tmp2 = gen_reg_rtx (V2DImode);
 
       emit_insn (gen_sse2_cvttpd2dq (tmp0, operands[1]));
       emit_insn (gen_sse2_cvttpd2dq (tmp1, operands[2]));
-      emit_insn
-       (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-				    gen_lowpart (V2DImode, tmp0),
-				    gen_lowpart (V2DImode, tmp1)));
+      emit_insn (gen_vec_interleave_lowv2di (tmp2,
+					     gen_lowpart (V2DImode, tmp0),
+					     gen_lowpart (V2DImode, tmp1)));
+      emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp2));
     }
   DONE;
 })
@@ -4289,7 +4293,7 @@  (define_expand "vec_pack_sfix_v2df"
    (match_operand:V2DF 2 "nonimmediate_operand")]
   "TARGET_SSE2"
 {
-  rtx tmp0, tmp1;
+  rtx tmp0, tmp1, tmp2;
 
   if (TARGET_AVX && !TARGET_PREFER_AVX128)
     {
@@ -4303,13 +4307,14 @@  (define_expand "vec_pack_sfix_v2df"
     {
       tmp0 = gen_reg_rtx (V4SImode);
       tmp1 = gen_reg_rtx (V4SImode);
+      tmp2 = gen_reg_rtx (V2DImode);
 
       emit_insn (gen_sse2_cvtpd2dq (tmp0, operands[1]));
       emit_insn (gen_sse2_cvtpd2dq (tmp1, operands[2]));
-      emit_insn
-       (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-				    gen_lowpart (V2DImode, tmp0),
-				    gen_lowpart (V2DImode, tmp1)));
+      emit_insn (gen_vec_interleave_lowv2di (tmp2,
+					     gen_lowpart (V2DImode, tmp0),
+					     gen_lowpart (V2DImode, tmp1)));
+      emit_move_insn (operands[0], gen_lowpart (V4SImode, tmp2));
     }
   DONE;
 })
@@ -7328,14 +7333,16 @@  (define_insn "<shift_insn><mode>3"
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "vec_shl_<mode>"
-  [(set (match_operand:VI_128 0 "register_operand")
+  [(set (match_dup 3)
 	(ashift:V1TI
 	 (match_operand:VI_128 1 "register_operand")
-	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))]
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+   (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
   "TARGET_SSE2"
 {
-  operands[0] = gen_lowpart (V1TImode, operands[0]);
   operands[1] = gen_lowpart (V1TImode, operands[1]);
+  operands[3] = gen_reg_rtx (V1TImode);
+  operands[4] = gen_lowpart (<MODE>mode, operands[3]);
 })
 
 (define_insn "<sse2_avx2>_ashl<mode>3"
@@ -7365,14 +7372,16 @@  (define_insn "<sse2_avx2>_ashl<mode>3"
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "vec_shr_<mode>"
-  [(set (match_operand:VI_128 0 "register_operand")
+  [(set (match_dup 3)
 	(lshiftrt:V1TI
 	 (match_operand:VI_128 1 "register_operand")
-	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))]
+	 (match_operand:SI 2 "const_0_to_255_mul_8_operand")))
+   (set (match_operand:VI_128 0 "register_operand") (match_dup 4))]
   "TARGET_SSE2"
 {
-  operands[0] = gen_lowpart (V1TImode, operands[0]);
   operands[1] = gen_lowpart (V1TImode, operands[1]);
+  operands[3] = gen_reg_rtx (V1TImode);
+  operands[4] = gen_lowpart (<MODE>mode, operands[3]);
 })
 
 (define_insn "<sse2_avx2>_lshr<mode>3"
@@ -8542,12 +8551,13 @@  (define_expand "vec_interleave_high<mode
 {
   rtx t1 = gen_reg_rtx (<MODE>mode);
   rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx t3 = gen_reg_rtx (V4DImode);
   emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
   emit_insn (gen_avx2_interleave_high<mode> (t2,  operands[1], operands[2]));
-  emit_insn (gen_avx2_permv2ti
-	     (gen_lowpart (V4DImode, operands[0]),
-	      gen_lowpart (V4DImode, t1),
-	      gen_lowpart (V4DImode, t2), GEN_INT (1 + (3 << 4))));
+  emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2),
+				GEN_INT (1 + (3 << 4))));
+  emit_move_insn (operands[0], gen_lowpart (<MODE>mode, t3));
   DONE;
 })
 
@@ -8559,12 +8569,13 @@  (define_expand "vec_interleave_low<mode>
 {
   rtx t1 = gen_reg_rtx (<MODE>mode);
   rtx t2 = gen_reg_rtx (<MODE>mode);
+  rtx t3 = gen_reg_rtx (V4DImode);
   emit_insn (gen_avx2_interleave_low<mode> (t1, operands[1], operands[2]));
   emit_insn (gen_avx2_interleave_high<mode> (t2, operands[1], operands[2]));
-  emit_insn (gen_avx2_permv2ti
-	     (gen_lowpart (V4DImode, operands[0]),
-	      gen_lowpart (V4DImode, t1),
-	      gen_lowpart (V4DImode, t2), GEN_INT (0 + (2 << 4))));
+  emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, t1),
+				gen_lowpart (V4DImode, t2),
+				GEN_INT (0 + (2 << 4))));
+  emit_move_insn (operands[0], gen_lowpart (<MODE>mode, t3));
   DONE;
 })
 
--- gcc/config/i386/i386.c.jj	2013-10-30 18:49:26.000000000 +0100
+++ gcc/config/i386/i386.c	2013-10-31 13:05:56.690352315 +0100
@@ -16803,6 +16803,8 @@  ix86_expand_vector_move_misalign (enum m
 	}
       else
         {
+	  rtx t;
+
 	  if (TARGET_AVX
 	      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
 	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
@@ -16821,18 +16823,22 @@  ix86_expand_vector_move_misalign (enum m
 	      return;
             }
 
+	  if (mode != V4SFmode)
+	    t = gen_reg_rtx (V4SFmode);
+	  else
+	    t = op0;
+	    
 	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
-	    emit_move_insn (op0, CONST0_RTX (mode));
+	    emit_move_insn (t, CONST0_RTX (V4SFmode));
 	  else
-	    emit_clobber (op0);
-
-	  if (mode != V4SFmode)
-	    op0 = gen_lowpart (V4SFmode, op0);
+	    emit_clobber (t);
 
 	  m = adjust_address (op1, V2SFmode, 0);
-	  emit_insn (gen_sse_loadlps (op0, op0, m));
+	  emit_insn (gen_sse_loadlps (t, t, m));
 	  m = adjust_address (op1, V2SFmode, 8);
-	  emit_insn (gen_sse_loadhps (op0, op0, m));
+	  emit_insn (gen_sse_loadhps (t, t, m));
+	  if (mode != V4SFmode)
+	    emit_move_insn (op0, gen_lowpart (mode, t));
 	}
     }
   else if (MEM_P (op0))
@@ -20473,6 +20479,7 @@  ix86_expand_sse_movcc (rtx dest, rtx cmp
   else
     {
       rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+      rtx d = dest;
 
       if (!nonimmediate_operand (op_true, mode))
 	op_true = force_reg (mode, op_true);
@@ -20496,7 +20503,8 @@  ix86_expand_sse_movcc (rtx dest, rtx cmp
 	  if (TARGET_SSE4_1)
 	    {
 	      gen = gen_sse4_1_pblendvb;
-	      dest = gen_lowpart (V16QImode, dest);
+	      if (mode != V16QImode)
+		d = gen_reg_rtx (V16QImode);
 	      op_false = gen_lowpart (V16QImode, op_false);
 	      op_true = gen_lowpart (V16QImode, op_true);
 	      cmp = gen_lowpart (V16QImode, cmp);
@@ -20517,7 +20525,8 @@  ix86_expand_sse_movcc (rtx dest, rtx cmp
 	  if (TARGET_AVX2)
 	    {
 	      gen = gen_avx2_pblendvb;
-	      dest = gen_lowpart (V32QImode, dest);
+	      if (mode != V32QImode)
+		d = gen_reg_rtx (V32QImode);
 	      op_false = gen_lowpart (V32QImode, op_false);
 	      op_true = gen_lowpart (V32QImode, op_true);
 	      cmp = gen_lowpart (V32QImode, cmp);
@@ -20528,7 +20537,11 @@  ix86_expand_sse_movcc (rtx dest, rtx cmp
 	}
 
       if (gen != NULL)
-	emit_insn (gen (dest, op_false, op_true, cmp));
+	{
+	  emit_insn (gen (d, op_false, op_true, cmp));
+	  if (d != dest)
+	    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
+	}
       else
 	{
 	  op_true = force_reg (mode, op_true);
@@ -20849,8 +20862,7 @@  ix86_expand_int_vcond (rtx operands[])
   else
     {
       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
-      x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
-			       code, cop0, cop1,
+      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
 			       operands[1+negate], operands[2-negate]);
       x = gen_lowpart (data_mode, x);
     }
@@ -20869,7 +20881,7 @@  ix86_expand_vec_perm (rtx operands[])
   rtx op0 = operands[1];
   rtx op1 = operands[2];
   rtx mask = operands[3];
-  rtx t1, t2, t3, t4, vt, vt2, vec[32];
+  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
   enum machine_mode mode = GET_MODE (op0);
   enum machine_mode maskmode = GET_MODE (mask);
   int w, e, i;
@@ -20937,7 +20949,7 @@  ix86_expand_vec_perm (rtx operands[])
 
 	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
 	  operands[3] = mask = t1;
-	  target = gen_lowpart (mode, target);
+	  target = gen_reg_rtx (mode);
 	  op0 = gen_lowpart (mode, op0);
 	  op1 = gen_lowpart (mode, op1);
 	}
@@ -20949,7 +20961,12 @@  ix86_expand_vec_perm (rtx operands[])
 	     the high bits of the shuffle elements.  No need for us to
 	     perform an AND ourselves.  */
 	  if (one_operand_shuffle)
-	    emit_insn (gen_avx2_permvarv8si (target, op0, mask));
+	    {
+	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
+	      if (target != operands[0])
+		emit_move_insn (operands[0],
+				gen_lowpart (GET_MODE (operands[0]), target));
+	    }
 	  else
 	    {
 	      t1 = gen_reg_rtx (V8SImode);
@@ -21022,13 +21039,13 @@  ix86_expand_vec_perm (rtx operands[])
 	     stands for other 12 bytes.  */
 	  /* The bit whether element is from the same lane or the other
 	     lane is bit 4, so shift it up by 3 to the MSB position.  */
-	  emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
-				    gen_lowpart (V4DImode, mask),
+	  t5 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
 				    GEN_INT (3)));
 	  /* Clear MSB bits from the mask just in case it had them set.  */
 	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
 	  /* After this t1 will have MSB set for elements from other lane.  */
-	  emit_insn (gen_xorv32qi3 (t1, t1, vt2));
+	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
 	  /* Clear bits other than MSB.  */
 	  emit_insn (gen_andv32qi3 (t1, t1, vt));
 	  /* Or in the lower bits from mask into t3.  */
@@ -21037,8 +21054,8 @@  ix86_expand_vec_perm (rtx operands[])
 	     lane.  */
 	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
 	  /* Swap 128-bit lanes in t3.  */
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
-					  gen_lowpart (V4DImode, t3),
+	  t6 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
 					  const2_rtx, GEN_INT (3),
 					  const0_rtx, const1_rtx));
 	  /* And or in the lower bits from mask into t1.  */
@@ -21048,15 +21065,20 @@  ix86_expand_vec_perm (rtx operands[])
 	      /* Each of these shuffles will put 0s in places where
 		 element from the other 128-bit lane is needed, otherwise
 		 will shuffle in the requested value.  */
-	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
+	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
+						gen_lowpart (V32QImode, t6)));
 	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
 	      /* For t3 the 128-bit lanes are swapped again.  */
-	      emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
-					      gen_lowpart (V4DImode, t3),
+	      t7 = gen_reg_rtx (V4DImode);
+	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
 					      const2_rtx, GEN_INT (3),
 					      const0_rtx, const1_rtx));
 	      /* And oring both together leads to the result.  */
-	      emit_insn (gen_iorv32qi3 (target, t1, t3));
+	      emit_insn (gen_iorv32qi3 (target, t1,
+					gen_lowpart (V32QImode, t7)));
+	      if (target != operands[0])
+		emit_move_insn (operands[0],
+				gen_lowpart (GET_MODE (operands[0]), target));
 	      return;
 	    }
 
@@ -21064,20 +21086,22 @@  ix86_expand_vec_perm (rtx operands[])
 	  /* Similarly to the above one_operand_shuffle code,
 	     just for repeated twice for each operand.  merge_two:
 	     code will merge the two results together.  */
-	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
-	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
+					    gen_lowpart (V32QImode, t6)));
+	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
+					    gen_lowpart (V32QImode, t6)));
 	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
 	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
-					  gen_lowpart (V4DImode, t4),
+	  t7 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
 					  const2_rtx, GEN_INT (3),
 					  const0_rtx, const1_rtx));
-	  emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
-					  gen_lowpart (V4DImode, t3),
+	  t8 = gen_reg_rtx (V4DImode);
+	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
 					  const2_rtx, GEN_INT (3),
 					  const0_rtx, const1_rtx));
-	  emit_insn (gen_iorv32qi3 (t4, t2, t4));
-	  emit_insn (gen_iorv32qi3 (t3, t1, t3));
+	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
+	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
 	  t1 = t4;
 	  t2 = t3;
 	  goto merge_two;
@@ -21146,15 +21170,24 @@  ix86_expand_vec_perm (rtx operands[])
   /* The actual shuffle operations all operate on V16QImode.  */
   op0 = gen_lowpart (V16QImode, op0);
   op1 = gen_lowpart (V16QImode, op1);
-  target = gen_lowpart (V16QImode, target);
 
   if (TARGET_XOP)
     {
+      if (GET_MODE (target) != V16QImode)
+	target = gen_reg_rtx (V16QImode);
       emit_insn (gen_xop_pperm (target, op0, op1, mask));
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
     }
   else if (one_operand_shuffle)
     {
+      if (GET_MODE (target) != V16QImode)
+	target = gen_reg_rtx (V16QImode);
       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
     }
   else
     {
@@ -21194,7 +21227,9 @@  ix86_expand_vec_perm (rtx operands[])
       mask = expand_simple_binop (maskmode, AND, mask, vt,
 				  NULL_RTX, 0, OPTAB_DIRECT);
 
-      xops[0] = gen_lowpart (mode, operands[0]);
+      if (GET_MODE (target) != mode)
+	target = gen_reg_rtx (mode);
+      xops[0] = target;
       xops[1] = gen_lowpart (mode, t2);
       xops[2] = gen_lowpart (mode, t1);
       xops[3] = gen_rtx_EQ (maskmode, mask, vt);
@@ -21202,6 +21237,9 @@  ix86_expand_vec_perm (rtx operands[])
       xops[5] = vt;
       ok = ix86_expand_int_vcond (xops);
       gcc_assert (ok);
+      if (target != operands[0])
+	emit_move_insn (operands[0],
+			gen_lowpart (GET_MODE (operands[0]), target));
     }
 }
 
@@ -21280,10 +21318,10 @@  ix86_expand_sse_unpack (rtx dest, rtx sr
       else if (high_p)
 	{
 	  /* Shift higher 8 bytes to lower 8 bytes.  */
-	  tmp = gen_reg_rtx (imode);
-	  emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
-					 gen_lowpart (V1TImode, src),
+	  tmp = gen_reg_rtx (V1TImode);
+	  emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
 					 GEN_INT (64)));
+	  tmp = gen_lowpart (imode, tmp);
 	}
       else
 	tmp = src;
@@ -21324,7 +21362,9 @@  ix86_expand_sse_unpack (rtx dest, rtx sr
 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
 				   src, pc_rtx, pc_rtx);
 
-      emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
+      rtx tmp2 = gen_reg_rtx (imode);
+      emit_insn (unpack (tmp2, src, tmp));
+      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
     }
 }
 
@@ -31967,8 +32007,8 @@  ix86_expand_args_builtin (const struct b
     }
   else
     {
-      target = gen_reg_rtx (rmode);
-      real_target = simplify_gen_subreg (tmode, target, rmode, 0);
+      real_target = gen_reg_rtx (tmode);
+      target = simplify_gen_subreg (rmode, real_target, tmode, 0);
     }
 
   for (i = 0; i < nargs; i++)
@@ -36691,8 +36731,9 @@  ix86_expand_vector_init_duplicate (bool
 	  emit_move_insn (tmp1, gen_lowpart (SImode, val));
 
 	  /* Insert the SImode value as low element of a V4SImode vector. */
-	  tmp2 = gen_lowpart (V4SImode, dperm.op0);
+	  tmp2 = gen_reg_rtx (V4SImode);
 	  emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+	  emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
 
 	  ok = (expand_vec_perm_1 (&dperm)
 		|| expand_vec_perm_broadcast_1 (&dperm));
@@ -36722,9 +36763,10 @@  ix86_expand_vector_init_duplicate (bool
 				 NULL_RTX, 1, OPTAB_LIB_WIDEN);
 	val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
 
-	x = gen_lowpart (wvmode, target);
+	x = gen_reg_rtx (wvmode);
 	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
 	gcc_assert (ok);
+	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
 	return ok;
       }
 
@@ -37599,8 +37641,9 @@  ix86_expand_vector_set (bool mmx_ok, rtx
       else
 	{
 	  /* For SSE1, we have to reuse the V4SF code.  */
-	  ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
-				  gen_lowpart (SFmode, val), elt);
+	  rtx t = gen_reg_rtx (V4SFmode);
+	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
+	  emit_move_insn (target, gen_lowpart (mode, t));
 	}
       return;
 
@@ -37918,7 +37961,7 @@  ix86_expand_vector_extract (bool mmx_ok,
 static void
 emit_reduc_half (rtx dest, rtx src, int i)
 {
-  rtx tem;
+  rtx tem, d = dest;
   switch (GET_MODE (src))
     {
     case V4SFmode:
@@ -37935,8 +37978,8 @@  emit_reduc_half (rtx dest, rtx src, int
     case V8HImode:
     case V4SImode:
     case V2DImode:
-      tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
-				gen_lowpart (V1TImode, src),
+      d = gen_reg_rtx (V1TImode);
+      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
 				GEN_INT (i / 2));
       break;
     case V8SFmode:
@@ -37957,19 +38000,26 @@  emit_reduc_half (rtx dest, rtx src, int
     case V8SImode:
     case V4DImode:
       if (i == 256)
-	tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
-				 gen_lowpart (V4DImode, src),
-				 gen_lowpart (V4DImode, src),
-				 const1_rtx);
-      else
-	tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
-				  gen_lowpart (V2TImode, src),
-				  GEN_INT (i / 2));
+	{
+	  if (GET_MODE (dest) != V4DImode)
+	    d = gen_reg_rtx (V4DImode);
+	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
+				   gen_lowpart (V4DImode, src),
+				   const1_rtx);
+	}
+      else
+	{
+	  d = gen_reg_rtx (V2TImode);
+	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
+				    GEN_INT (i / 2));
+	}
       break;
     default:
       gcc_unreachable ();
     }
   emit_insn (tem);
+  if (d != dest)
+    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
 }
 
 /* Expand a vector reduction.  FN is the binary pattern to reduce;
@@ -39462,6 +39512,8 @@  expand_vec_perm_blend (struct expand_vec
 	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
 	    else
 	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
+	    if (target != d->target)
+	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
 	    return true;
 	  }
 
@@ -39471,7 +39523,7 @@  expand_vec_perm_blend (struct expand_vec
       /* FALLTHRU */
 
     do_subreg:
-      target = gen_lowpart (vmode, target);
+      target = gen_reg_rtx (vmode);
       op0 = gen_lowpart (vmode, op0);
       op1 = gen_lowpart (vmode, op1);
       break;
@@ -39525,7 +39577,7 @@  expand_vec_perm_blend (struct expand_vec
 
 		vmode = V32QImode;
 		nelt = 32;
-		target = gen_lowpart (vmode, target);
+		target = gen_reg_rtx (vmode);
 		op0 = gen_lowpart (vmode, op0);
 		op1 = gen_lowpart (vmode, op1);
 		goto finish_pblendvb;
@@ -39558,6 +39610,8 @@  expand_vec_perm_blend (struct expand_vec
   x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
   x = gen_rtx_SET (VOIDmode, target, x);
   emit_insn (x);
+  if (target != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
 
   return true;
 }
@@ -39663,13 +39717,17 @@  expand_vec_perm_pshufb (struct expand_ve
 
 	      /* Use vperm2i128 insn.  The pattern uses
 		 V4DImode instead of V2TImode.  */
-	      target = gen_lowpart (V4DImode, d->target);
+	      target = d->target;
+	      if (d->vmode != V4DImode)
+		target = gen_reg_rtx (V4DImode);
 	      op0 = gen_lowpart (V4DImode, d->op0);
 	      op1 = gen_lowpart (V4DImode, d->op1);
 	      rperm[0]
 		= GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
 			   || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
 	      emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+	      if (target != d->target)
+		emit_move_insn (d->target, gen_lowpart (d->vmode, target));
 	      return true;
 	    }
 	  return false;
@@ -39704,9 +39762,15 @@  expand_vec_perm_pshufb (struct expand_ve
 		    perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
 		  if (d->testing_p)
 		    return true;
-		  return expand_vselect (gen_lowpart (V4DImode, d->target),
-					 gen_lowpart (V4DImode, d->op0),
-					 perm, 4, false);
+		  target = gen_reg_rtx (V4DImode);
+		  if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
+				      perm, 4, false))
+		    {
+		      emit_move_insn (d->target,
+				      gen_lowpart (d->vmode, target));
+		      return true;
+		    }
+		  return false;
 		}
 
 	      /* Next see if vpermd can be used.  */
@@ -39758,7 +39822,9 @@  expand_vec_perm_pshufb (struct expand_ve
 				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
   vperm = force_reg (vmode, vperm);
 
-  target = gen_lowpart (vmode, d->target);
+  target = d->target;
+  if (d->vmode != vmode)
+    target = gen_reg_rtx (vmode);
   op0 = gen_lowpart (vmode, d->op0);
   if (d->one_operand_p)
     {
@@ -39776,6 +39842,8 @@  expand_vec_perm_pshufb (struct expand_ve
       op1 = gen_lowpart (vmode, d->op1);
       emit_insn (gen_xop_pperm (target, op0, op1, vperm));
     }
+  if (target != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, target));
 
   return true;
 }
@@ -39975,7 +40043,8 @@  expand_vec_perm_palignr (struct expand_v
   unsigned i, nelt = d->nelt;
   unsigned min, max;
   bool in_order, ok;
-  rtx shift;
+  rtx shift, target;
+  struct expand_vec_perm_d dcopy;
 
   /* Even with AVX, palignr only operates on 128-bit vectors.  */
   if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
@@ -39998,29 +40067,33 @@  expand_vec_perm_palignr (struct expand_v
   if (d->testing_p)
     return true;
 
+  dcopy = *d;
   shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
-  emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
-				  gen_lowpart (TImode, d->op1),
+  target = gen_reg_rtx (TImode);
+  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
 				  gen_lowpart (TImode, d->op0), shift));
 
-  d->op0 = d->op1 = d->target;
-  d->one_operand_p = true;
+  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
+  dcopy.one_operand_p = true;
 
   in_order = true;
   for (i = 0; i < nelt; ++i)
     {
-      unsigned e = d->perm[i] - min;
+      unsigned e = dcopy.perm[i] - min;
       if (e != i)
 	in_order = false;
-      d->perm[i] = e;
+      dcopy.perm[i] = e;
     }
 
   /* Test for the degenerate case where the alignment by itself
      produces the desired permutation.  */
   if (in_order)
-    return true;
+    {
+      emit_move_insn (d->target, dcopy.op0);
+      return true;
+    }
 
-  ok = expand_vec_perm_1 (d);
+  ok = expand_vec_perm_1 (&dcopy);
   gcc_assert (ok);
 
   return ok;
@@ -40274,10 +40347,10 @@  expand_vec_perm_interleave2 (struct expa
       else
 	dfinal.perm[i] = e;
     }
-  dfinal.op0 = gen_reg_rtx (dfinal.vmode);
+  dremap.target = gen_reg_rtx (dremap.vmode);
+  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
   dfinal.op1 = dfinal.op0;
   dfinal.one_operand_p = true;
-  dremap.target = dfinal.op0;
 
   /* Test if the final remap can be done with a single insn.  For V4SFmode or
      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
@@ -40294,7 +40367,6 @@  expand_vec_perm_interleave2 (struct expa
 
   if (dremap.vmode != dfinal.vmode)
     {
-      dremap.target = gen_lowpart (dremap.vmode, dremap.target);
       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
     }
@@ -40745,8 +40817,12 @@  expand_vec_perm_pshufb2 (struct expand_v
   op = gen_lowpart (V16QImode, d->op1);
   emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
 
-  op = gen_lowpart (V16QImode, d->target);
+  op = d->target;
+  if (d->vmode != V16QImode)
+    op = gen_reg_rtx (V16QImode);
   emit_insn (gen_iorv16qi3 (op, l, h));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
   return true;
 }
@@ -40812,8 +40888,12 @@  expand_vec_perm_vpshufb2_vpermq (struct
   op = gen_lowpart (V32QImode, d->op0);
   emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
 
-  op = gen_lowpart (V32QImode, d->target);
+  op = d->target;
+  if (d->vmode != V32QImode)
+    op = gen_reg_rtx (V32QImode);
   emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
   return true;
 }
@@ -40889,10 +40969,11 @@  expand_vec_perm_vpshufb2_vpermq_even_odd
   emit_insn (gen_iorv32qi3 (ior, l, h));
 
   /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
-  op = gen_lowpart (V4DImode, d->target);
+  op = gen_reg_rtx (V4DImode);
   ior = gen_lowpart (V4DImode, ior);
   emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
 				  const1_rtx, GEN_INT (3)));
+  emit_move_insn (d->target, gen_lowpart (d->vmode, op));
 
   return true;
 }
@@ -40903,7 +40984,7 @@  expand_vec_perm_vpshufb2_vpermq_even_odd
 static bool
 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
 {
-  rtx t1, t2, t3;
+  rtx t1, t2, t3, t4, t5;
 
   switch (d->vmode)
     {
@@ -41015,10 +41096,17 @@  expand_vec_perm_even_odd_1 (struct expan
 	{
 	  struct expand_vec_perm_d d_copy = *d;
 	  d_copy.vmode = V4DFmode;
-	  d_copy.target = gen_lowpart (V4DFmode, d->target);
+	  d_copy.target = gen_reg_rtx (V4DFmode);
 	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
 	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
-	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
+	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+	    {
+	      if (!d->testing_p)
+		emit_move_insn (d->target,
+				gen_lowpart (V4DImode, d_copy.target));
+	      return true;
+	    }
+	  return false;
 	}
 
       t1 = gen_reg_rtx (V4DImode);
@@ -41041,44 +41129,51 @@  expand_vec_perm_even_odd_1 (struct expan
 	{
 	  struct expand_vec_perm_d d_copy = *d;
 	  d_copy.vmode = V8SFmode;
-	  d_copy.target = gen_lowpart (V8SFmode, d->target);
+	  d_copy.target = gen_reg_rtx (V8SFmode);
 	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
 	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
-	  return expand_vec_perm_even_odd_1 (&d_copy, odd);
+	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
+	    {
+	      if (!d->testing_p)
+		emit_move_insn (d->target,
+				gen_lowpart (V8SImode, d_copy.target));
+	      return true;
+	    }
+	  return false;
 	}
 
       t1 = gen_reg_rtx (V8SImode);
       t2 = gen_reg_rtx (V8SImode);
+      t3 = gen_reg_rtx (V4DImode);
+      t4 = gen_reg_rtx (V4DImode);
+      t5 = gen_reg_rtx (V4DImode);
 
       /* Shuffle the lanes around into
 	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
-      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
-				    gen_lowpart (V4DImode, d->op0),
+      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
 				    gen_lowpart (V4DImode, d->op1),
 				    GEN_INT (0x20)));
-      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
-				    gen_lowpart (V4DImode, d->op0),
+      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
 				    gen_lowpart (V4DImode, d->op1),
 				    GEN_INT (0x31)));
 
       /* Swap the 2nd and 3rd position in each lane into
 	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
-      emit_insn (gen_avx2_pshufdv3 (t1, t1,
+      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
-      emit_insn (gen_avx2_pshufdv3 (t2, t2,
+      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
 				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
 
       /* Now an vpunpck[lh]qdq will produce
 	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
       if (odd)
-	t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
-					   gen_lowpart (V4DImode, t1),
+	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
 					   gen_lowpart (V4DImode, t2));
       else
-	t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
-					  gen_lowpart (V4DImode, t1),
+	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
 					  gen_lowpart (V4DImode, t2));
       emit_insn (t3);
+      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
       break;
 
     default:
@@ -41116,7 +41211,7 @@  expand_vec_perm_broadcast_1 (struct expa
   unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
   enum machine_mode vmode = d->vmode;
   unsigned char perm2[4];
-  rtx op0 = d->op0;
+  rtx op0 = d->op0, dest;
   bool ok;
 
   switch (vmode)
@@ -41162,9 +41257,11 @@  expand_vec_perm_broadcast_1 (struct expa
       while (vmode != V4SImode);
 
       memset (perm2, elt, 4);
-      ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
-			   d->testing_p);
+      dest = gen_reg_rtx (V4SImode);
+      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
       gcc_assert (ok);
+      if (!d->testing_p)
+	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
       return true;
 
     case V32QImode:
@@ -41306,8 +41403,12 @@  expand_vec_perm_vpshufb4_vpermq2 (struct
     }
 
   gcc_assert (l[0] && l[1]);
-  op = gen_lowpart (V32QImode, d->target);
+  op = d->target;
+  if (d->vmode != V32QImode)
+    op = gen_reg_rtx (V32QImode);
   emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+  if (op != d->target)
+    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
   return true;
 }
 
@@ -41875,7 +41976,9 @@  ix86_expand_mul_widen_hilo (rtx dest, rt
 			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
       gcc_assert (t1 && t2);
 
-      ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
+      t3 = gen_reg_rtx (mode);
+      ix86_expand_vec_interleave (t3, t1, t2, high_p);
+      emit_move_insn (dest, gen_lowpart (wmode, t3));
       break;
 
     case V16QImode:
@@ -41896,14 +41999,14 @@  ix86_expand_mul_widen_hilo (rtx dest, rt
 void
 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
 {
-  rtx res_1, res_2;
+  rtx res_1, res_2, res_3, res_4;
 
   res_1 = gen_reg_rtx (V4SImode);
   res_2 = gen_reg_rtx (V4SImode);
-  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
-				 op1, op2, true, false);
-  ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
-				 op1, op2, true, true);
+  res_3 = gen_reg_rtx (V2DImode);
+  res_4 = gen_reg_rtx (V2DImode);
+  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
+  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
 
   /* Move the results in element 2 down to element 1; we don't care
      what goes in elements 2 and 3.  Then we can merge the parts
@@ -41917,9 +42020,11 @@  ix86_expand_sse2_mulv4si3 (rtx op0, rtx
      In both cases the cost of the reformatting stall was too high
      and the overall sequence slower.  */
 
-  emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
+  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
+				const0_rtx, const2_rtx,
 				const0_rtx, const0_rtx));
-  emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
+  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
+				const0_rtx, const2_rtx,
 				const0_rtx, const0_rtx));
   res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
 
@@ -42088,12 +42193,17 @@  ix86_expand_pinsr (rtx *operands)
 	    return false;
 	  }
 
-	dst = gen_lowpart (dstmode, dst);
+	rtx d = dst;
+	if (GET_MODE (dst) != dstmode)
+	  d = gen_reg_rtx (dstmode);
 	src = gen_lowpart (srcmode, src);
 
 	pos /= size;
 
-	emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
+	emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
+			  GEN_INT (1 << pos)));
+	if (d != dst)
+	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
 	return true;
       }