diff mbox

[RS6000] reload_vsx_from_gprsf splitter

Message ID 20160211140410.GC10888@bubble.grove.modra.org
State New
Headers show

Commit Message

Alan Modra Feb. 11, 2016, 2:04 p.m. UTC
This is PR68973 part 2, the failure of a boost test, caused by a
splitter emitting an invalid move in reload_vsx_from_gprsf:
  emit_move_insn (op0_di, op2);

op0 can be any vsx reg, but the mtvsrd destination constraint in
movdi_internal64 is "wj", which only allows fp regs.  I'm not sure why
we have that constraint so left movdi_internal64 alone and used a
special p8_mtvsrd instead, similar to p8_mtvsrd_1 and p8_mtvsrd_2 used
by reload_vsx_from_gpr<mode>.  When looking at those, I noticed we're
restricted to fprs there too so fixed that as well.  (We can't use %L
in asm operands that must accept vsx.)

Bootstrapped and regression tested powerpc64le-linux.  powerpc64-linux
biarch -mcpu=power7 bootstrap still in progress.  OK to apply assuming
no regressions found?

	PR target/68973
	* config/rs6000/vsx.md (vsx_xscvspdpn_directmove): Delete.
	* config/rs6000/rs6000.md (reload_vsx_from_gprsf): Rewrite splitter.
	(p8_mtvsrd): New.
	(p8_mtvsrd_1, p8_mtvsrd_2): Delete.
	(reload_vsx_from_gpr<mode>): Adjust to use p8_mtvsrd.

Comments

David Edelsohn Feb. 11, 2016, 5:34 p.m. UTC | #1
On Thu, Feb 11, 2016 at 6:04 AM, Alan Modra <amodra@gmail.com> wrote:
> This is PR68973 part 2, the failure of a boost test, caused by a
> splitter emitting an invalid move in reload_vsx_from_gprsf:
>   emit_move_insn (op0_di, op2);
>
> op0 can be any vsx reg, but the mtvsrd destination constraint in
> movdi_internal64 is "wj", which only allows fp regs.  I'm not sure why
> we have that constraint so left movdi_internal64 alone and used a
> special p8_mtvsrd instead, similar to p8_mtvsrd_1 and p8_mtvsrd_2 used
> by reload_vsx_from_gpr<mode>.  When looking at those, I noticed we're
> restricted to fprs there too so fixed that as well.  (We can't use %L
> in asm operands that must accept vsx.)

Michael, please review the "wj" constraint in these patterns.

Alan, the explanation says that it uses a special p8_mtvsrd similar to
p8_mtvsrd_[12], but does not explain why the two similar patterns are
removed.  The incorrect use of %L implies those patterns, but the
change is to p8_xxpermdi_<mode> that is not mentioned in the
ChangeLog.

I also would appreciate Uli's comments on this direction because of
his reload expertise.

Thanks, David

>
> Bootstrapped and regression tested powerpc64le-linux.  powerpc64-linux
> biarch -mcpu=power7 bootstrap still in progress.  OK to apply assuming
> no regressions found?
>
>         PR target/68973
>         * config/rs6000/vsx.md (vsx_xscvspdpn_directmove): Delete.
>         * config/rs6000/rs6000.md (reload_vsx_from_gprsf): Rewrite splitter.
>         (p8_mtvsrd): New.
>         (p8_mtvsrd_1, p8_mtvsrd_2): Delete.
>         (reload_vsx_from_gpr<mode>): Adjust to use p8_mtvsrd.
>
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index cdbf873..745293b 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -7543,29 +7543,22 @@
>     (set_attr "type" "three")])
>
>  ;; Move 128 bit values from GPRs to VSX registers in 64-bit mode
> -(define_insn "p8_mtvsrd_1"
> -  [(set (match_operand:TF 0 "register_operand" "=ws")
> -       (unspec:TF [(match_operand:DI 1 "register_operand" "r")]
> +(define_insn "p8_mtvsrd"
> +  [(set (match_operand:DF 0 "register_operand" "=ws")
> +       (unspec:DF [(match_operand:DI 1 "register_operand" "r")]
>                    UNSPEC_P8V_MTVSRD))]
>    "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> -  "mtvsrd %0,%1"
> -  [(set_attr "type" "mftgpr")])
> -
> -(define_insn "p8_mtvsrd_2"
> -  [(set (match_operand:TF 0 "register_operand" "+ws")
> -       (unspec:TF [(match_dup 0)
> -                   (match_operand:DI 1 "register_operand" "r")]
> -                  UNSPEC_P8V_MTVSRD))]
> -  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> -  "mtvsrd %L0,%1"
> +  "mtvsrd %x0,%1"
>    [(set_attr "type" "mftgpr")])
>
>  (define_insn "p8_xxpermdi_<mode>"
>    [(set (match_operand:FMOVE128_GPR 0 "register_operand" "=wa")
> -       (unspec:FMOVE128_GPR [(match_operand:TF 1 "register_operand" "ws")]
> -                            UNSPEC_P8V_XXPERMDI))]
> +       (unspec:FMOVE128_GPR [
> +               (match_operand:DF 1 "register_operand" "ws")
> +               (match_operand:DF 2 "register_operand" "ws")]
> +               UNSPEC_P8V_XXPERMDI))]
>    "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> -  "xxpermdi %x0,%1,%L1,0"
> +  "xxpermdi %x0,%x1,%x2,0"
>    [(set_attr "type" "vecperm")])
>
>  (define_insn_and_split "reload_vsx_from_gpr<mode>"
> @@ -7581,13 +7574,18 @@
>  {
>    rtx dest = operands[0];
>    rtx src = operands[1];
> -  rtx tmp = operands[2];
> +  /* You might think that we could use op0 as one temp and a DF clobber
> +     as the other, but you'd be wrong.  These secondary_reload patterns
> +     don't check that the clobber is different to the destination, which
> +     is probably a reload bug.  */
> +  rtx tmp_hi = gen_rtx_REG (DFmode, REGNO (operands[2]));
> +  rtx tmp_lo = gen_rtx_REG (DFmode, REGNO (operands[2]) + 1);
>    rtx gpr_hi_reg = gen_highpart (DImode, src);
>    rtx gpr_lo_reg = gen_lowpart (DImode, src);
>
> -  emit_insn (gen_p8_mtvsrd_1 (tmp, gpr_hi_reg));
> -  emit_insn (gen_p8_mtvsrd_2 (tmp, gpr_lo_reg));
> -  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp));
> +  emit_insn (gen_p8_mtvsrd (tmp_hi, gpr_hi_reg));
> +  emit_insn (gen_p8_mtvsrd (tmp_lo, gpr_lo_reg));
> +  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp_hi, tmp_lo));
>    DONE;
>  }
>    [(set_attr "length" "12")
> @@ -7622,16 +7620,18 @@
>    rtx op0 = operands[0];
>    rtx op1 = operands[1];
>    rtx op2 = operands[2];
> +
>    /* Also use the destination register to hold the unconverted DImode value.
>       This is conceptually a separate value from OP0, so we use gen_rtx_REG
>       rather than simplify_gen_subreg.  */
> -  rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
> +  rtx op0_df = gen_rtx_REG (DFmode, REGNO (op0));
> +  rtx op0_v4sf = gen_rtx_REG (V4SFmode, REGNO (op0));
>    rtx op1_di = simplify_gen_subreg (DImode, op1, SFmode, 0);
>
>    /* Move SF value to upper 32-bits for xscvspdpn.  */
>    emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
> -  emit_move_insn (op0_di, op2);
> -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0_di));
> +  emit_insn (gen_p8_mtvsrd (op0_df, op2));
> +  emit_insn (gen_vsx_xscvspdpn (op0_df, op0_v4sf));
>    DONE;
>  }
>    [(set_attr "length" "8")
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 997ff31..2d2f137 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -1518,15 +1518,6 @@
>    "xscvdpspn %x0,%x1"
>    [(set_attr "type" "fp")])
>
> -;; Used by direct move to move a SFmode value from GPR to VSX register
> -(define_insn "vsx_xscvspdpn_directmove"
> -  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
> -       (unspec:SF [(match_operand:DI 1 "vsx_register_operand" "wa")]
> -                  UNSPEC_VSX_CVSPDPN))]
> -  "TARGET_XSCVSPDPN"
> -  "xscvspdpn %x0,%x1"
> -  [(set_attr "type" "fp")])
> -
>  ;; Convert and scale (used by vec_ctf, vec_cts, vec_ctu for double/long long)
>
>  (define_expand "vsx_xvcvsxddp_scale"
>
> --
> Alan Modra
> Australia Development Lab, IBM
Michael Meissner Feb. 11, 2016, 9:53 p.m. UTC | #2
On Thu, Feb 11, 2016 at 09:34:29AM -0800, David Edelsohn wrote:
> On Thu, Feb 11, 2016 at 6:04 AM, Alan Modra <amodra@gmail.com> wrote:
> > This is PR68973 part 2, the failure of a boost test, caused by a
> > splitter emitting an invalid move in reload_vsx_from_gprsf:
> >   emit_move_insn (op0_di, op2);
> >
> > op0 can be any vsx reg, but the mtvsrd destination constraint in
> > movdi_internal64 is "wj", which only allows fp regs.  I'm not sure why
> > we have that constraint so left movdi_internal64 alone and used a
> > special p8_mtvsrd instead, similar to p8_mtvsrd_1 and p8_mtvsrd_2 used
> > by reload_vsx_from_gpr<mode>.  When looking at those, I noticed we're
> > restricted to fprs there too so fixed that as well.  (We can't use %L
> > in asm operands that must accept vsx.)
> 
> Michael, please review the "wj" constraint in these patterns.
> 
> Alan, the explanation says that it uses a special p8_mtvsrd similar to
> p8_mtvsrd_[12], but does not explain why the two similar patterns are
> removed.  The incorrect use of %L implies those patterns, but the
> change is to p8_xxpermdi_<mode> that is not mentioned in the
> ChangeLog.
> 
> I also would appreciate Uli's comments on this direction because of
> his reload expertise.

Since the mail was sent to my Lotus Notes account (mrmeissn@us.ibm.com), I
originally tried to reply through that system, but I got bounce errors.

At the present time, we do not allow DImode to go into Altivec
registers. Mostly it was due to reload complications in the power8 time frame,
where we didn't have d-form addressing for the Altivec registers and
interference with the other DImode reload patterns, but also I felt I would
need to go through the main rs6000.md to look for the various DImode patterns
to see if they needed to be updated.  At some I hoped to extend this, so I
added the "wi" and "wj" constraints to be used whenever the mode is DImode.
"wi" is the constraint for the VSX register class DImode can go in
(i.e. currently FLOAT=5FREGS), and "wj" is the VSX register class DImode can go
in if we have 64-bit direct move.

The TFmode thing was a hack.  It was the only way I could see getting two
registers without a lot of hair.  Since TFmode is also restricted to FPR_REGS,
you could use %L on it.  When I was writing it, I really wished we had a reload
pattern to get more than one temporary, but we didn't, so I went for TFmode to
get 2 FPR registers.  Given the replacement pattern no longer uses a TFmode
temporary, it can go in any appropriate register.

If you are using DFmode, the right constraint is "wk" (appropriate register to
use with the double type when direct moves are available) or "ws" (appropriate
register to use for DFtype with VSX instructions).

We don't have a constraint for appropriate register class to use with SFmode
when direct moves are available, so I would suggest using "wy" for a register
class you can use ISA 2.07 (power8) ops (the float scalar ops in the upper
registers).  The "ww" constraint would be the appropriate register to use ISA
2.06 (power7) operations on SFmode.
Michael Meissner Feb. 11, 2016, 9:55 p.m. UTC | #3
On Thu, Feb 11, 2016 at 07:38:15PM +0100, Ulrich Weigand wrote:
> For the most part, this patch doesn't really change anything in the
> interaction with reload as far as I can see.  The changes introduced
> by the patch do make sense to me.  In particular, replacing the two
> patterns p8_mtvsrd_1 and p8_mtvsrd_2 used to fill high and low parts
> of a TFmode register pair with a single pattern p8_mtvsrd that just
> works on any DFmode register (leaving the split into high/low to the
> caller if necessary) seems to simplify things.
> 
> > > +  /* You might think that we could use op0 as one temp and a DF clobber
> > > +     as the other, but you'd be wrong.  These secondary_reload patterns
> > > +     don't check that the clobber is different to the destination, which
> > > +     is probably a reload bug.  */
> 
> It's not a bug, it's deliberate behavior.  The reload registers allocated
> for secondary reload clobbers may overlap the destination, since in many
> cases you simply move the input to the reload register, and then the
> reload register to the destination (where the latter move can be a no-op
> if it is possible to allocate the reload register and the destination
> into the same physical register).  If you need two separate intermediate
> values, you need to allocate a secondary reload register of a larger
> mode (as is already done in the pattern).

This is one of the cases I wished the reload support had the ability to
allocate 2 scratch temporaries instead of 1.  As I said in my other message,
TFmode was a hack to get two registers to use.

> While this was not introduced by this patch, I'm a little bit concerned
> about the hard-coded use of REGNO here, which will crash if op0 at this
> point happens to be a SUBREG instead of a REG.  This is extremely unlikely
> at this point in reload, but not 100% impossible, e.g. if op0 for some
> reason is one of the "magic" registers like the stack pointer.
> 
> That's why it is in general better to use simplify_gen_subreg or one of
> gen_highpart/gen_lowpart, which will handle SUBREG correctly as well.
> I'm not sure why it matters whether this is "conceptually a separate
> value" as the comment argues ...

Good catch.
Alan Modra Feb. 11, 2016, 10:24 p.m. UTC | #4
On Thu, Feb 11, 2016 at 04:55:58PM -0500, Michael Meissner wrote:
> This is one of the cases I wished the reload support had the ability to
> allocate 2 scratch temporaries instead of 1.  As I said in my other message,
> TFmode was a hack to get two registers to use.

Another concern I had about this, besides using %L in asm output (what
forces TFmode to use just fprs?), is what happens when we're using
IEEE 128-bit floats?  In that case it looks like we'd get just one reg.
Michael Meissner Feb. 11, 2016, 10:42 p.m. UTC | #5
On Fri, Feb 12, 2016 at 08:54:19AM +1030, Alan Modra wrote:
> On Thu, Feb 11, 2016 at 04:55:58PM -0500, Michael Meissner wrote:
> > This is one of the cases I wished the reload support had the ability to
> > allocate 2 scratch temporaries instead of 1.  As I said in my other message,
> > TFmode was a hack to get two registers to use.
> 
> Another concern I had about this, besides using %L in asm output (what
> forces TFmode to use just fprs?), is what happens when we're using
> IEEE 128-bit floats?  In that case it looks like we'd get just one reg.

The code in rs6000_hard_regno_mode_ok only allows TFmode (IBM extended double)
in GPRs and FPRs.  In theory, TFmode could go in VSX registers, just it hasn't
been done.

Good point that it breaks if the default long double (TFmode) type is IEEE
128-bit floating point.  We would need to have two patterns, one that uses
TFmode and one that uses IFmode.  I wrote the power8 direct move stuff before
going down the road of IEEE 128-bit floating point.
David Edelsohn Feb. 12, 2016, 12:21 a.m. UTC | #6
On Thu, Feb 11, 2016 at 10:38 AM, Ulrich Weigand <uweigand@de.ibm.com> wrote:
> David Edelsohn wrote:
>> On Thu, Feb 11, 2016 at 6:04 AM, Alan Modra <amodra@gmail.com> wrote:
>> > This is PR68973 part 2, the failure of a boost test, caused by a
>> > splitter emitting an invalid move in reload_vsx_from_gprsf:
>> >   emit_move_insn (op0_di, op2);
>> >
>> > op0 can be any vsx reg, but the mtvsrd destination constraint in
>> > movdi_internal64 is "wj", which only allows fp regs.  I'm not sure why
>> > we have that constraint so left movdi_internal64 alone and used a
>> > special p8_mtvsrd instead, similar to p8_mtvsrd_1 and p8_mtvsrd_2 used
>> > by reload_vsx_from_gpr<mode>.  When looking at those, I noticed we're
>> > restricted to fprs there too so fixed that as well.  (We can't use %L
>> > in asm operands that must accept vsx.)
>>
>> Michael, please review the "wj" constraint in these patterns.
>>
>> Alan, the explanation says that it uses a special p8_mtvsrd similar to
>> p8_mtvsrd_[12], but does not explain why the two similar patterns are
>> removed.  The incorrect use of %L implies those patterns, but the
>> change is to p8_xxpermdi_<mode> that is not mentioned in the
>> ChangeLog.
>>
>> I also would appreciate Uli's comments on this direction because of
>> his reload expertise.
>
> For the most part, this patch doesn't really change anything in the
> interaction with reload as far as I can see.  The changes introduced
> by the patch do make sense to me.  In particular, replacing the two
> patterns p8_mtvsrd_1 and p8_mtvsrd_2 used to fill high and low parts
> of a TFmode register pair with a single pattern p8_mtvsrd that just
> works on any DFmode register (leaving the split into high/low to the
> caller if necessary) seems to simplify things.
>
>> > +  /* You might think that we could use op0 as one temp and a DF clobber
>> > +     as the other, but you'd be wrong.  These secondary_reload patterns
>> > +     don't check that the clobber is different to the destination, which
>> > +     is probably a reload bug.  */
>
> It's not a bug, it's deliberate behavior.  The reload registers allocated
> for secondary reload clobbers may overlap the destination, since in many
> cases you simply move the input to the reload register, and then the
> reload register to the destination (where the latter move can be a no-op
> if it is possible to allocate the reload register and the destination
> into the same physical register).  If you need two separate intermediate
> values, you need to allocate a secondary reload register of a larger
> mode (as is already done in the pattern).
>
>> >    /* Also use the destination register to hold the unconverted DImode value.
>> >       This is conceptually a separate value from OP0, so we use gen_rtx_REG
>> >       rather than simplify_gen_subreg.  */
>> > -  rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
>> > +  rtx op0_df = gen_rtx_REG (DFmode, REGNO (op0));
>> > +  rtx op0_v4sf = gen_rtx_REG (V4SFmode, REGNO (op0));
>
> While this was not introduced by this patch, I'm a little bit concerned
> about the hard-coded use of REGNO here, which will crash if op0 at this
> point happens to be a SUBREG instead of a REG.  This is extremely unlikely
> at this point in reload, but not 100% impossible, e.g. if op0 for some
> reason is one of the "magic" registers like the stack pointer.
>
> That's why it is in general better to use simplify_gen_subreg or one of
> gen_highpart/gen_lowpart, which will handle SUBREG correctly as well.
> I'm not sure why it matters whether this is "conceptually a separate
> value" as the comment argues ...
>
>> >    /* Move SF value to upper 32-bits for xscvspdpn.  */
>> >    emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
>> > -  emit_move_insn (op0_di, op2);
>> > -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0_di));
>> > +  emit_insn (gen_p8_mtvsrd (op0_df, op2));
>> > +  emit_insn (gen_vsx_xscvspdpn (op0_df, op0_v4sf));
>
> The sequence of modes used for op0 here is a bit weird.  First,
> op0 is loaded as DFmode by mtvsrd, then it is silently re-
> interpreted as V4SFmode when used as input to xscvspdpn, which
> gets a DFmode output that is again silently re-interpreted as
> SFmode.
>
> This isn't really wrong as such, just maybe a bit confusing.
> Maybe instead have p8_mtvsrd use DImode as output (instead of
> DFmode), which shouldn't be any harder to use in the
> reload_vsx_from_gpr<mode> splitter, and keep using the
> vsx_xscvspdpn_directmove pattern?
>
> [ This of course reinforces the question why we have p8_mtvsrd
> in the first place, instead of just allowing this use directly
> in movdi_internal64 itself.  ]

Good question: is p8_mtvsrd really necessary if the movdi_internal64
is updated to use the correct constraints?

The patch definitely is going in the right direction.  Can we remove
more unnecessary bits?

Thanks, David
diff mbox

Patch

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index cdbf873..745293b 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -7543,29 +7543,22 @@ 
    (set_attr "type" "three")])
 
 ;; Move 128 bit values from GPRs to VSX registers in 64-bit mode
-(define_insn "p8_mtvsrd_1"
-  [(set (match_operand:TF 0 "register_operand" "=ws")
-	(unspec:TF [(match_operand:DI 1 "register_operand" "r")]
+(define_insn "p8_mtvsrd"
+  [(set (match_operand:DF 0 "register_operand" "=ws")
+	(unspec:DF [(match_operand:DI 1 "register_operand" "r")]
 		   UNSPEC_P8V_MTVSRD))]
   "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "mtvsrd %0,%1"
-  [(set_attr "type" "mftgpr")])
-
-(define_insn "p8_mtvsrd_2"
-  [(set (match_operand:TF 0 "register_operand" "+ws")
-	(unspec:TF [(match_dup 0)
-		    (match_operand:DI 1 "register_operand" "r")]
-		   UNSPEC_P8V_MTVSRD))]
-  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "mtvsrd %L0,%1"
+  "mtvsrd %x0,%1"
   [(set_attr "type" "mftgpr")])
 
 (define_insn "p8_xxpermdi_<mode>"
   [(set (match_operand:FMOVE128_GPR 0 "register_operand" "=wa")
-	(unspec:FMOVE128_GPR [(match_operand:TF 1 "register_operand" "ws")]
-			     UNSPEC_P8V_XXPERMDI))]
+	(unspec:FMOVE128_GPR [
+		(match_operand:DF 1 "register_operand" "ws")
+		(match_operand:DF 2 "register_operand" "ws")]
+		UNSPEC_P8V_XXPERMDI))]
   "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
-  "xxpermdi %x0,%1,%L1,0"
+  "xxpermdi %x0,%x1,%x2,0"
   [(set_attr "type" "vecperm")])
 
 (define_insn_and_split "reload_vsx_from_gpr<mode>"
@@ -7581,13 +7574,18 @@ 
 {
   rtx dest = operands[0];
   rtx src = operands[1];
-  rtx tmp = operands[2];
+  /* You might think that we could use op0 as one temp and a DF clobber
+     as the other, but you'd be wrong.  These secondary_reload patterns
+     don't check that the clobber is different to the destination, which
+     is probably a reload bug.  */
+  rtx tmp_hi = gen_rtx_REG (DFmode, REGNO (operands[2]));
+  rtx tmp_lo = gen_rtx_REG (DFmode, REGNO (operands[2]) + 1);
   rtx gpr_hi_reg = gen_highpart (DImode, src);
   rtx gpr_lo_reg = gen_lowpart (DImode, src);
 
-  emit_insn (gen_p8_mtvsrd_1 (tmp, gpr_hi_reg));
-  emit_insn (gen_p8_mtvsrd_2 (tmp, gpr_lo_reg));
-  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp));
+  emit_insn (gen_p8_mtvsrd (tmp_hi, gpr_hi_reg));
+  emit_insn (gen_p8_mtvsrd (tmp_lo, gpr_lo_reg));
+  emit_insn (gen_p8_xxpermdi_<mode> (dest, tmp_hi, tmp_lo));
   DONE;
 }
   [(set_attr "length" "12")
@@ -7622,16 +7620,18 @@ 
   rtx op0 = operands[0];
   rtx op1 = operands[1];
   rtx op2 = operands[2];
+
   /* Also use the destination register to hold the unconverted DImode value.
      This is conceptually a separate value from OP0, so we use gen_rtx_REG
      rather than simplify_gen_subreg.  */
-  rtx op0_di = gen_rtx_REG (DImode, REGNO (op0));
+  rtx op0_df = gen_rtx_REG (DFmode, REGNO (op0));
+  rtx op0_v4sf = gen_rtx_REG (V4SFmode, REGNO (op0));
   rtx op1_di = simplify_gen_subreg (DImode, op1, SFmode, 0);
 
   /* Move SF value to upper 32-bits for xscvspdpn.  */
   emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
-  emit_move_insn (op0_di, op2);
-  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0_di));
+  emit_insn (gen_p8_mtvsrd (op0_df, op2));
+  emit_insn (gen_vsx_xscvspdpn (op0_df, op0_v4sf));
   DONE;
 }
   [(set_attr "length" "8")
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 997ff31..2d2f137 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -1518,15 +1518,6 @@ 
   "xscvdpspn %x0,%x1"
   [(set_attr "type" "fp")])
 
-;; Used by direct move to move a SFmode value from GPR to VSX register
-(define_insn "vsx_xscvspdpn_directmove"
-  [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
-	(unspec:SF [(match_operand:DI 1 "vsx_register_operand" "wa")]
-		   UNSPEC_VSX_CVSPDPN))]
-  "TARGET_XSCVSPDPN"
-  "xscvspdpn %x0,%x1"
-  [(set_attr "type" "fp")])
-
 ;; Convert and scale (used by vec_ctf, vec_cts, vec_ctu for double/long long)
 
 (define_expand "vsx_xvcvsxddp_scale"