disable use_vector_fp_converts for m_CORE_ALL

Message ID 0EFAB2BDD0F67E4FB6CCC8B9F87D7569427A8C52@IRSMSX101.ger.corp.intel.com
State New

Commit Message

Zamyatin, Igor Sept. 18, 2013, 1:45 p.m. UTC
Ccing Uros. Changes in i386.md could be related to the fix for PR57954.

Thanks,
Igor

-----Original Message-----
From: Wei Mi [mailto:wmi@google.com] 
Sent: Thursday, September 12, 2013 2:51 AM
To: GCC Patches
Cc: David Li; Zamyatin, Igor
Subject: [PATCH] disable use_vector_fp_converts for m_CORE_ALL

For the following testcase 1.c, on Westmere and Sandy Bridge, performance with the option -mtune-ctrl=^use_vector_fp_converts is better (it improves from 3.46s to 2.83s). This means cvtss2sd is often better than
unpcklps+cvtps2pd on recent x86 platforms.

1.c:
float total = 0.2;
int k = 5;

int main() {
 int i;

 for (i = 0; i < 1000000000; i++) {
   total += (0.5 + k);
 }

 return total == 0.3;
}

assembly generated by gcc-r201963 without -mtune-ctrl=^use_vector_fp_converts
.L2:
        unpcklps        %xmm0, %xmm0
        subl    $1, %eax
        cvtps2pd        %xmm0, %xmm0
        addsd   %xmm1, %xmm0
        unpcklpd        %xmm0, %xmm0
        cvtpd2ps        %xmm0, %xmm0
        jne     .L2

assembly generated by gcc-r201963 with -mtune-ctrl=^use_vector_fp_converts
.L2:
        cvtss2sd        %xmm0, %xmm0
        subl    $1, %eax
        addsd   %xmm1, %xmm0
        cvtsd2ss        %xmm0, %xmm0
        jne     .L2

But for testcase 2.c (thanks to Igor Zamyatin for the testcase), performance with the option -mtune-ctrl=^use_vector_fp_converts is worse.
Analysis of the assembly shows that the performance degradation comes from a partial register stall caused by cvtsd2ss: the instruction writes only the low 32 bits of its destination and preserves the upper bits, so it carries a false dependence on the previous value of %xmm0. Adding pxor %xmm0, %xmm0 before cvtsd2ss b(,%rdx,8), %xmm0 breaks that dependence and gets the performance back.

2.c:
double b[1024];

float a[1024];

int main()
{
    int i;
    for(i = 0 ; i < 1024 * 1024 * 256; i++)
      a[i & 1023] = a[i & 1023] * (float)b[i & 1023];
    return (int)a[512];
}

without -mtune-ctrl=^use_vector_fp_converts
.L2:
        movl    %eax, %edx
        addl    $1, %eax
        andl    $1023, %edx
        cmpl    $268435456, %eax
        movsd   b(,%rdx,8), %xmm0
        cvtpd2ps        %xmm0, %xmm0    ==> no partial reg stall here,
                                            because the movsd load writes
                                            the full register.
        mulss   a(,%rdx,4), %xmm0
        movss   %xmm0, a(,%rdx,4)
        jne     .L2

with -mtune-ctrl=^use_vector_fp_converts
.L2:
        movl    %eax, %edx
        addl    $1, %eax
        andl    $1023, %edx
        cmpl    $268435456, %eax
        cvtsd2ss        b(,%rdx,8), %xmm0   ==> partial reg stall: needs
                                                "pxor %xmm0, %xmm0" inserted
                                                before this insn.
        mulss   a(,%rdx,4), %xmm0
        movss   %xmm0, a(,%rdx,4)
        jne     .L2
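
With the patch applied, the inner loop of 2.c should instead look roughly like the following (a sketch for illustration only; exact register allocation and scheduling may differ):

.L2:
        movl    %eax, %edx
        addl    $1, %eax
        andl    $1023, %edx
        cmpl    $268435456, %eax
        pxor    %xmm0, %xmm0              ==> zeroing %xmm0 breaks the
                                              false dependence; pxor does
                                              not clobber the flags set
                                              by cmpl.
        cvtsd2ss        b(,%rdx,8), %xmm0
        mulss   a(,%rdx,4), %xmm0
        movss   %xmm0, a(,%rdx,4)
        jne     .L2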

So the patch turns off use_vector_fp_converts for m_CORE_ALL so that cvtss2sd/cvtsd2ss are used directly, and adds "pxor %xmmreg, %xmmreg" before cvtss2sd/cvtsd2ss to break the partial reg stall (similar to what r201308 does for cvtsi2ss/cvtsi2sd). Bootstrap and regression tests pass. OK for trunk?

Thanks,
Wei Mi.

2013-09-11  Wei Mi  <wmi@google.com>

        * config/i386/x86-tune.def (X86_TUNE_USE_VECTOR_FP_CONVERTS):
        Remove m_CORE_ALL.
        * config/i386/i386.md: Add define_peephole2 patterns to
        break the partial reg stall for cvtss2sd/cvtsd2ss.

Comments

Wei Mi Sept. 20, 2013, 10:12 p.m. UTC | #1
Ping.

> -----Original Message-----
> From: Wei Mi [mailto:wmi@google.com]
> Sent: Thursday, September 12, 2013 2:51 AM
> Subject: [PATCH] disable use_vector_fp_converts for m_CORE_ALL
>
> [...]
Uros Bizjak Sept. 22, 2013, 9:29 a.m. UTC | #2
On Wed, Sep 18, 2013 at 3:45 PM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> Ccing Uros. Changes in i386.md could be related to the fix for PR57954.

> [...]

You don't need reload_completed in peephole2 patterns.

Otherwise the patch is OK.

Thanks,
Uros.
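
(Background on the review comment: the peephole2 pass runs after register allocation, so reload_completed always holds at that point and the test is redundant. With it dropped, the insn condition of the first pattern would read as follows; this is a sketch, with the rest of the pattern unchanged from the patch above.)

  "TARGET_SSE2 && TARGET_SSE_MATH
   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
   && optimize_function_for_speed_p (cfun)
   && SSE_REG_P (operands[0])
   && peep2_reg_dead_p (0, operands[0])
   && (!SSE_REG_P (operands[1])
       || REGNO (operands[0]) != REGNO (operands[1]))"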
H.J. Lu Oct. 1, 2013, 5:58 p.m. UTC | #3
On Sun, Sep 22, 2013 at 2:29 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Sep 18, 2013 at 3:45 PM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> Ccing Uros. Changes in i386.md could be related to the fix for PR57954.
>
>> [...]
>
> You don't need reload_completed in peephole2 patterns.
>
> Otherwise the patch is OK.
>
> Thanks,
> Uros.

Hi Wei Mi,

Have you checked in your patch?
Wei Mi Oct. 1, 2013, 8:30 p.m. UTC | #4
> Hi Wei Mi,
>
> Have you checked in your patch?
>
> --
> H.J.

No, I haven't. Honza wants me to wait for his testing on AMD hardware.
http://gcc.gnu.org/ml/gcc-patches/2013-09/msg01603.html
Jan Hubicka Oct. 1, 2013, 10:50 p.m. UTC | #5
> > Hi Wei Mi,
> >
> > Have you checked in your patch?
> >
> > --
> > H.J.
> 
> No, I haven't. Honza wants me to wait for his testing on AMD hardware.
> http://gcc.gnu.org/ml/gcc-patches/2013-09/msg01603.html
I only wanted to separate it from the changes in generic so the regular testers
can pick it up separately.  So just go ahead and check it in.

Honza
Wei Mi Oct. 1, 2013, 11:35 p.m. UTC | #6
On Tue, Oct 1, 2013 at 3:50 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> > Hi Wei Mi,
>> >
>> > Have you checked in your patch?
>> >
>> > --
>> > H.J.
>>
>> No, I haven't. Honza wants me to wait for his testing on AMD hardware.
>> http://gcc.gnu.org/ml/gcc-patches/2013-09/msg01603.html
> I only wanted to separate it from the changes in generic so the regular testers
> can pick it up separately.  So just go ahead and check it in.
>
> Honza

Thanks, checked in as r203095.

Wei Mi.

Patch

Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def    (revision 201963)
+++ config/i386/x86-tune.def    (working copy)
@@ -189,7 +189,7 @@  DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_
 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
    from FP to FP. */
 DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
-          m_CORE_ALL | m_AMDFAM10 | m_GENERIC)
+          m_AMDFAM10 | m_GENERIC)
 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
    from integer to FP. */
 DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 201963)
+++ config/i386/i386.md (working copy)
@@ -5075,6 +5075,63 @@ 
   emit_move_insn (operands[0], CONST0_RTX (<ssevecmode>mode));
 })

+;; Break partial reg stall for cvtsd2ss.
+
+(define_peephole2
+  [(set (match_operand:SF 0 "register_operand")
+        (float_truncate:SF
+         (match_operand:DF 1 "nonimmediate_operand")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+   && optimize_function_for_speed_p (cfun)
+   && reload_completed && SSE_REG_P (operands[0])
+   && peep2_reg_dead_p (0, operands[0])
+   && (!SSE_REG_P (operands[1])
+       || REGNO (operands[0]) != REGNO (operands[1]))"
+  [(set (match_dup 0)
+       (vec_merge:V4SF
+         (vec_duplicate:V4SF
+           (float_truncate:V2SF
+             (match_dup 1)))
+         (match_dup 0)
+         (const_int 1)))]
+{
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0],
+                                    SFmode, 0);
+  operands[1] = simplify_gen_subreg (V2DFmode, operands[1],
+                                    DFmode, 0);
+  emit_move_insn (operands[0], CONST0_RTX (V4SFmode));
+})
+
+;; Break partial reg stall for cvtss2sd.
+
+(define_peephole2
+  [(set (match_operand:DF 0 "register_operand")
+        (float_extend:DF
+          (match_operand:SF 1 "nonimmediate_operand")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+   && optimize_function_for_speed_p (cfun)
+   && reload_completed && SSE_REG_P (operands[0])
+   && peep2_reg_dead_p (0, operands[0])
+   && (!SSE_REG_P (operands[1])
+       || REGNO (operands[0]) != REGNO (operands[1]))"
+  [(set (match_dup 0)
+        (vec_merge:V2DF
+          (float_extend:V2DF
+            (vec_select:V2SF
+              (match_dup 1)
+              (parallel [(const_int 0) (const_int 1)])))
+          (match_dup 0)
+          (const_int 1)))]
+{
+  operands[0] = simplify_gen_subreg (V2DFmode, operands[0],
+                                    DFmode, 0);
+  operands[1] = simplify_gen_subreg (V4SFmode, operands[1],
+                                    SFmode, 0);
+  emit_move_insn (operands[0], CONST0_RTX (V2DFmode));
+})
+
 ;; Avoid store forwarding (partial memory) stall penalty
 ;; by passing DImode value through XMM registers.  */
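
A minimal testcase exercising the second peephole (the cvtss2sd, float-to-double direction) would mirror 2.c. The example below is hypothetical, not taken from the original thread:

/* Hypothetical mirror of 2.c: at -O2 with the patch applied, the
   float-to-double conversion from memory should be emitted as
   "pxor %xmm0, %xmm0" followed by cvtss2sd once the peephole2 fires.  */
float c[1024];

double d[1024];

int main()
{
    int i;
    for(i = 0 ; i < 1024 * 1024 * 256; i++)
      d[i & 1023] = d[i & 1023] + (double)c[i & 1023];
    return (int)d[512];
}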