diff mbox series

i386: Fix up copysign/xorsign expansion [PR104612]

Message ID YhPBxUGyLxmOBSO4@tucnak
State New
Headers show
Series i386: Fix up copysign/xorsign expansion [PR104612] | expand

Commit Message

Jakub Jelinek Feb. 21, 2022, 4:45 p.m. UTC
Hi!

We ICE on the following testcase for -m32 since r12-3435. because
operands[2] is (subreg:SF (reg:DI ...) 0) and
lowpart_subreg (V4SFmode, operands[2], SFmode)
returns NULL, and that is what we use in AND etc. insns we emit.

The following patch (non-attached) fixes that by calling force_reg for the
input operands, to make sure they are really REGs and so lowpart_subreg
will succeed on them - even for theoretical MEMs using REGs there seems
desirable, we don't want to read following memory slots for the paradoxical
subreg.  For the outputs, I thought we'd get better code by always computing
result into a new pseudo and them move lowpart of that pseudo into dest.

I've bootstrapped/regtested this version on x86_64-linux and i686-linux,
unfortunately it regressed
FAIL: gcc.target/i386/pr89984-2.c scan-assembler-not vmovaps
on which the patch changes:
 	vandps	.LC0(%rip), %xmm1, %xmm1
-	vxorps	%xmm0, %xmm1, %xmm0
+	vxorps	%xmm0, %xmm1, %xmm1
+	vmovaps	%xmm1, %xmm0
 	ret
The RA sees:
(insn 8 4 9 2 (set (reg:V4SF 85)
        (and:V4SF (subreg:V4SF (reg:SF 90) 0)
            (mem/u/c:V4SF (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0  S16 A128]))) "pr89984-2.c":7:12 2838 {*andv4sf3}
     (expr_list:REG_DEAD (reg:SF 90)
        (nil)))
(insn 9 8 10 2 (set (reg:V4SF 87)
        (xor:V4SF (reg:V4SF 85)
            (subreg:V4SF (reg:SF 89) 0))) "pr89984-2.c":7:12 2842 {*xorv4sf3}
     (expr_list:REG_DEAD (reg:SF 89)
        (expr_list:REG_DEAD (reg:V4SF 85)
            (nil))))
(insn 10 9 14 2 (set (reg:SF 82 [ <retval> ])
        (subreg:SF (reg:V4SF 87) 0)) "pr89984-2.c":7:12 142 {*movsf_internal}
     (expr_list:REG_DEAD (reg:V4SF 87)
        (nil)))
(insn 14 10 15 2 (set (reg/i:SF 20 xmm0)
        (reg:SF 82 [ <retval> ])) "pr89984-2.c":8:1 142 {*movsf_internal}
     (expr_list:REG_DEAD (reg:SF 82 [ <retval> ])
        (nil)))
(insn 15 14 0 2 (use (reg/i:SF 20 xmm0)) "pr89984-2.c":8:1 -1
     (nil))
and doesn't know that if it would use xmm0 not just for pseudo 82
but also for pseudo 87, it could create a noop move in insn 10 and
so could avoid an extra register copy and nothing later on is able
to figure that out either.  I don't know how the RA should know
that though.

Anyway, so that we don't regress, I have an alternative patch in attachment,
which will do this stuff (i.e. use fresh vector pseudo as destination and
then move lowpart of that to dest) over what it used before (i.e.
use paradoxical subreg of the dest) only if lowpart_subreg returns NULL.

Ok for trunk if the attached version passes bootstrap/regtest?

2022-02-21  Jakub Jelinek  <jakub@redhat.com>

	PR target/104612
	* config/i386/i386-expand.cc (ix86_expand_copysign): Call force_reg
	on input operands before calling lowpart_subreg on it.  For output
	operand, use a vmode pseudo as destination and then move its lowpart
	subreg into operands[0].
	(ix86_expand_xorsign): Likewise.

	* gcc.dg/pr104612.c: New test.


	Jakub
2022-02-21  Jakub Jelinek  <jakub@redhat.com>

	PR target/104612
	* config/i386/i386-expand.cc (ix86_expand_copysign): Call force_reg
	on input operands before calling lowpart_subreg on it.  For output
	operand, use a vmode pseudo as destination and then move its lowpart
	subreg into operands[0] if lowpart_subreg fails on dest.
	(ix86_expand_xorsign): Likewise.

	* gcc.dg/pr104612.c: New test.

--- gcc/config/i386/i386-expand.cc.jj	2022-02-21 16:51:36.639411090 +0100
+++ gcc/config/i386/i386-expand.cc	2022-02-21 17:20:11.655150129 +0100
@@ -2153,7 +2153,7 @@ void
 ix86_expand_copysign (rtx operands[])
 {
   machine_mode mode, vmode;
-  rtx dest, op0, op1, mask, op2, op3;
+  rtx dest, vdest, op0, op1, mask, op2, op3;
 
   mode = GET_MODE (operands[0]);
 
@@ -2174,8 +2174,13 @@ ix86_expand_copysign (rtx operands[])
       return;
     }
 
-  dest = lowpart_subreg (vmode, operands[0], mode);
-  op1 = lowpart_subreg (vmode, operands[2], mode);
+  dest = operands[0];
+  vdest = lowpart_subreg (vmode, dest, mode);
+  if (vdest == NULL_RTX)
+    vdest = gen_reg_rtx (vmode);
+  else
+    dest = NULL_RTX;
+  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
   mask = ix86_build_signbit_mask (vmode, 0, 0);
 
   if (CONST_DOUBLE_P (operands[1]))
@@ -2184,7 +2189,9 @@ ix86_expand_copysign (rtx operands[])
       /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a.  */
       if (op0 == CONST0_RTX (mode))
 	{
-	  emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1));
+	  emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
+	  if (dest)
+	    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
 	  return;
 	}
 
@@ -2193,7 +2200,7 @@ ix86_expand_copysign (rtx operands[])
       op0 = force_reg (vmode, op0);
     }
   else
-    op0 = lowpart_subreg (vmode, operands[1], mode);
+    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
 
   op2 = gen_reg_rtx (vmode);
   op3 = gen_reg_rtx (vmode);
@@ -2201,7 +2208,9 @@ ix86_expand_copysign (rtx operands[])
 				    gen_rtx_NOT (vmode, mask),
 				    op0));
   emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
-  emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3));
+  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
+  if (dest)
+    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
 }
 
 /* Expand an xorsign operation.  */
@@ -2210,7 +2219,7 @@ void
 ix86_expand_xorsign (rtx operands[])
 {
   machine_mode mode, vmode;
-  rtx dest, op0, op1, mask, x, temp;
+  rtx dest, vdest, op0, op1, mask, x, temp;
 
   dest = operands[0];
   op0 = operands[1];
@@ -2230,15 +2239,22 @@ ix86_expand_xorsign (rtx operands[])
   temp = gen_reg_rtx (vmode);
   mask = ix86_build_signbit_mask (vmode, 0, 0);
 
-  op1 = lowpart_subreg (vmode, op1, mode);
+  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
   x = gen_rtx_AND (vmode, op1, mask);
   emit_insn (gen_rtx_SET (temp, x));
 
-  op0 = lowpart_subreg (vmode, op0, mode);
+  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
   x = gen_rtx_XOR (vmode, temp, op0);
 
-  dest = lowpart_subreg (vmode, dest, mode);
-  emit_insn (gen_rtx_SET (dest, x));
+  vdest = lowpart_subreg (vmode, dest, mode);
+  if (vdest == NULL_RTX)
+    vdest = gen_reg_rtx (vmode);
+  else
+    dest = NULL_RTX;
+  emit_insn (gen_rtx_SET (vdest, x));
+
+  if (dest)
+    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
 }
 
 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
--- gcc/testsuite/gcc.dg/pr104612.c.jj	2022-02-21 17:16:32.947140573 +0100
+++ gcc/testsuite/gcc.dg/pr104612.c	2022-02-21 17:16:32.947140573 +0100
@@ -0,0 +1,27 @@
+/* PR target/104612 */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-msse2 -mfpmath=sse" { target i?86-*-* x86_64-*-* } } */
+
+struct V { float x, y; };
+
+struct V
+foo (struct V v)
+{
+  struct V ret;
+  ret.x = __builtin_copysignf (1.0e+0, v.x);
+  ret.y = __builtin_copysignf (1.0e+0, v.y);
+  return ret;
+}
+
+float
+bar (struct V v)
+{
+  return __builtin_copysignf (v.x, v.y);
+}
+
+float
+baz (struct V v)
+{
+  return v.x * __builtin_copysignf (1.0f, v.y);
+}

Comments

Uros Bizjak Feb. 21, 2022, 5:01 p.m. UTC | #1
On Mon, Feb 21, 2022 at 5:46 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> We ICE on the following testcase for -m32 since r12-3435. because
> operands[2] is (subreg:SF (reg:DI ...) 0) and
> lowpart_subreg (V4SFmode, operands[2], SFmode)
> returns NULL, and that is what we use in AND etc. insns we emit.
>
> The following patch (non-attached) fixes that by calling force_reg for the
> input operands, to make sure they are really REGs and so lowpart_subreg
> will succeed on them - even for theoretical MEMs using REGs there seems
> desirable, we don't want to read following memory slots for the paradoxical
> subreg.  For the outputs, I thought we'd get better code by always computing
> result into a new pseudo and them move lowpart of that pseudo into dest.
>
> I've bootstrapped/regtested this version on x86_64-linux and i686-linux,
> unfortunately it regressed
> FAIL: gcc.target/i386/pr89984-2.c scan-assembler-not vmovaps
> on which the patch changes:
>         vandps  .LC0(%rip), %xmm1, %xmm1
> -       vxorps  %xmm0, %xmm1, %xmm0
> +       vxorps  %xmm0, %xmm1, %xmm1
> +       vmovaps %xmm1, %xmm0
>         ret
> The RA sees:
> (insn 8 4 9 2 (set (reg:V4SF 85)
>         (and:V4SF (subreg:V4SF (reg:SF 90) 0)
>             (mem/u/c:V4SF (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0  S16 A128]))) "pr89984-2.c":7:12 2838 {*andv4sf3}
>      (expr_list:REG_DEAD (reg:SF 90)
>         (nil)))
> (insn 9 8 10 2 (set (reg:V4SF 87)
>         (xor:V4SF (reg:V4SF 85)
>             (subreg:V4SF (reg:SF 89) 0))) "pr89984-2.c":7:12 2842 {*xorv4sf3}
>      (expr_list:REG_DEAD (reg:SF 89)
>         (expr_list:REG_DEAD (reg:V4SF 85)
>             (nil))))
> (insn 10 9 14 2 (set (reg:SF 82 [ <retval> ])
>         (subreg:SF (reg:V4SF 87) 0)) "pr89984-2.c":7:12 142 {*movsf_internal}
>      (expr_list:REG_DEAD (reg:V4SF 87)
>         (nil)))
> (insn 14 10 15 2 (set (reg/i:SF 20 xmm0)
>         (reg:SF 82 [ <retval> ])) "pr89984-2.c":8:1 142 {*movsf_internal}
>      (expr_list:REG_DEAD (reg:SF 82 [ <retval> ])
>         (nil)))
> (insn 15 14 0 2 (use (reg/i:SF 20 xmm0)) "pr89984-2.c":8:1 -1
>      (nil))
> and doesn't know that if it would use xmm0 not just for pseudo 82
> but also for pseudo 87, it could create a noop move in insn 10 and
> so could avoid an extra register copy and nothing later on is able
> to figure that out either.  I don't know how the RA should know
> that though.
>
> Anyway, so that we don't regress, I have an alternative patch in attachment,
> which will do this stuff (i.e. use fresh vector pseudo as destination and
> then move lowpart of that to dest) over what it used before (i.e.
> use paradoxical subreg of the dest) only if lowpart_subreg returns NULL.

I remember the same issue in the past, so it looks like the fresh
pseudo as destination gives RA some more freedom to do its magic. So,
it is better to do:

      emit_move_insn (dest, gen_lowpart (wmode, t3));

then play with subregs on a destination, like:

      dest = lowpart_subreg (vmode, dest, mode);

The attached patch is OK.

Thanks,
Uros.

>
> Ok for trunk if the attached version passes bootstrap/regtest?
>
> 2022-02-21  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/104612
>         * config/i386/i386-expand.cc (ix86_expand_copysign): Call force_reg
>         on input operands before calling lowpart_subreg on it.  For output
>         operand, use a vmode pseudo as destination and then move its lowpart
>         subreg into operands[0].
>         (ix86_expand_xorsign): Likewise.
>
>         * gcc.dg/pr104612.c: New test.
>
> --- gcc/config/i386/i386-expand.cc.jj   2022-02-09 20:45:03.463499205 +0100
> +++ gcc/config/i386/i386-expand.cc      2022-02-21 13:14:31.756657743 +0100
> @@ -2153,7 +2153,7 @@ void
>  ix86_expand_copysign (rtx operands[])
>  {
>    machine_mode mode, vmode;
> -  rtx dest, op0, op1, mask, op2, op3;
> +  rtx dest, vdest, op0, op1, mask, op2, op3;
>
>    mode = GET_MODE (operands[0]);
>
> @@ -2174,8 +2174,9 @@ ix86_expand_copysign (rtx operands[])
>        return;
>      }
>
> -  dest = lowpart_subreg (vmode, operands[0], mode);
> -  op1 = lowpart_subreg (vmode, operands[2], mode);
> +  dest = operands[0];
> +  vdest = gen_reg_rtx (vmode);
> +  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
>    mask = ix86_build_signbit_mask (vmode, 0, 0);
>
>    if (CONST_DOUBLE_P (operands[1]))
> @@ -2184,7 +2185,8 @@ ix86_expand_copysign (rtx operands[])
>        /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a.  */
>        if (op0 == CONST0_RTX (mode))
>         {
> -         emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1));
> +         emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
> +         emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
>           return;
>         }
>
> @@ -2193,7 +2195,7 @@ ix86_expand_copysign (rtx operands[])
>        op0 = force_reg (vmode, op0);
>      }
>    else
> -    op0 = lowpart_subreg (vmode, operands[1], mode);
> +    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
>
>    op2 = gen_reg_rtx (vmode);
>    op3 = gen_reg_rtx (vmode);
> @@ -2201,7 +2203,8 @@ ix86_expand_copysign (rtx operands[])
>                                     gen_rtx_NOT (vmode, mask),
>                                     op0));
>    emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
> -  emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3));
> +  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
> +  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
>  }
>
>  /* Expand an xorsign operation.  */
> @@ -2210,7 +2213,7 @@ void
>  ix86_expand_xorsign (rtx operands[])
>  {
>    machine_mode mode, vmode;
> -  rtx dest, op0, op1, mask, x, temp;
> +  rtx dest, vdest, op0, op1, mask, x, temp;
>
>    dest = operands[0];
>    op0 = operands[1];
> @@ -2230,15 +2233,17 @@ ix86_expand_xorsign (rtx operands[])
>    temp = gen_reg_rtx (vmode);
>    mask = ix86_build_signbit_mask (vmode, 0, 0);
>
> -  op1 = lowpart_subreg (vmode, op1, mode);
> +  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
>    x = gen_rtx_AND (vmode, op1, mask);
>    emit_insn (gen_rtx_SET (temp, x));
>
> -  op0 = lowpart_subreg (vmode, op0, mode);
> +  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
>    x = gen_rtx_XOR (vmode, temp, op0);
>
> -  dest = lowpart_subreg (vmode, dest, mode);
> -  emit_insn (gen_rtx_SET (dest, x));
> +  vdest = gen_reg_rtx (vmode);
> +  emit_insn (gen_rtx_SET (vdest, x));
> +
> +  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
>  }
>
>  static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
> --- gcc/testsuite/gcc.dg/pr104612.c.jj  2022-02-21 13:26:41.134606451 +0100
> +++ gcc/testsuite/gcc.dg/pr104612.c     2022-02-21 13:26:18.247922789 +0100
> @@ -0,0 +1,27 @@
> +/* PR target/104612 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-msse2 -mfpmath=sse" { target i?86-*-* x86_64-*-* } } */
> +
> +struct V { float x, y; };
> +
> +struct V
> +foo (struct V v)
> +{
> +  struct V ret;
> +  ret.x = __builtin_copysignf (1.0e+0, v.x);
> +  ret.y = __builtin_copysignf (1.0e+0, v.y);
> +  return ret;
> +}
> +
> +float
> +bar (struct V v)
> +{
> +  return __builtin_copysignf (v.x, v.y);
> +}
> +
> +float
> +baz (struct V v)
> +{
> +  return v.x * __builtin_copysignf (1.0f, v.y);
> +}
>
>         Jakub
Jakub Jelinek Feb. 21, 2022, 5:33 p.m. UTC | #2
On Mon, Feb 21, 2022 at 06:01:00PM +0100, Uros Bizjak wrote:
> I remember the same issue in the past, so it looks like the fresh
> pseudo as destination gives RA some more freedom to do its magic. So,
> it is better to do:
> 
>       emit_move_insn (dest, gen_lowpart (wmode, t3));
> 
> then play with subregs on a destination, like:
> 
>       dest = lowpart_subreg (vmode, dest, mode);

That is what I assumed too, but unfortunately on the pr89984-2.c
testcase that extra freedom resulted in worse code.
The inlined patch gave it that freedom, the attached patch does not
(unless dest is already SUBREG).
I think it might be useful to open a PR for GCC 13 and change the
attached patch to the inlined one once we can deal with it in the RA.

	Jakub
Uros Bizjak Feb. 21, 2022, 7:34 p.m. UTC | #3
On Mon, Feb 21, 2022 at 6:33 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Mon, Feb 21, 2022 at 06:01:00PM +0100, Uros Bizjak wrote:
> > I remember the same issue in the past, so it looks like the fresh
> > pseudo as destination gives RA some more freedom to do its magic. So,
> > it is better to do:
> >
> >       emit_move_insn (dest, gen_lowpart (wmode, t3));
> >
> > then play with subregs on a destination, like:
> >
> >       dest = lowpart_subreg (vmode, dest, mode);
>
> That is what I assumed too, but unfortunately on the pr89984-2.c
> testcase that extra freedom resulted in worse code.
> The inlined patch gave it that freedom, the attached patch does not
> (unless dest is already SUBREG).
> I think it might be useful to open a PR for GCC 13 and change the
> attached patch to the inlined one once we can deal with it in the RA.

Agreed.

Uros.
Hongtao Liu Feb. 22, 2022, 3:43 a.m. UTC | #4
On Tue, Feb 22, 2022 at 12:46 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> We ICE on the following testcase for -m32 since r12-3435. because
> operands[2] is (subreg:SF (reg:DI ...) 0) and
According to validate_subreg, (subreg:V4SF (reg:DI ...) 0) should be
valid(but not sure if it really works )

For -m64 part:
I saw operands[2] is (subreg:SF (reg:SI ..) 0).
It's a bit weird that (subreg:V4SF (reg:SF ..) 0) and (subreg:SF
(reg:SI 0)) are legal but (subreg:V4SF (reg:SI ..) not, and last time
I tried to remove those weird codes in validate_subreg but caused a
lot of regression[1].
And it seems we need to be aware of all pre_reload paradoxical subregs
of V*{SF,HF}mode, op, {SF,HF}mode).

 940  /* ??? x86 sse code makes heavy use of *paradoxical* vector subregs,
 941     i.e. (subreg:V4SF (reg:SF) 0) or (subreg:V4SF (reg:V2SF) 0).  This
 942     surely isn't the cleanest way to represent this.  It's questionable
 943     if this ought to be represented at all -- why can't this all be hidden
 944     in post-reload splitters that make arbitrarily mode changes to the
 945     registers themselves.  */
 946  else if (VECTOR_MODE_P (omode)
 947           && GET_MODE_INNER (omode) == GET_MODE_INNER (imode))

Also i;m thinking about(too risky at stage4, leave it to GCC13 stage
1) adjusting GET_MODE_INNER (omode) == GET_MODE_INNER (imode) from the
same mode to the same size.
The same size looks a little more logically coherent.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-August/578186.html

> lowpart_subreg (V4SFmode, operands[2], SFmode)
> returns NULL, and that is what we use in AND etc. insns we emit.
>
> The following patch (non-attached) fixes that by calling force_reg for the
> input operands, to make sure they are really REGs and so lowpart_subreg
> will succeed on them - even for theoretical MEMs using REGs there seems
> desirable, we don't want to read following memory slots for the paradoxical
> subreg.  For the outputs, I thought we'd get better code by always computing
> result into a new pseudo and them move lowpart of that pseudo into dest.
>
> I've bootstrapped/regtested this version on x86_64-linux and i686-linux,
> unfortunately it regressed
> FAIL: gcc.target/i386/pr89984-2.c scan-assembler-not vmovaps
> on which the patch changes:
>         vandps  .LC0(%rip), %xmm1, %xmm1
> -       vxorps  %xmm0, %xmm1, %xmm0
> +       vxorps  %xmm0, %xmm1, %xmm1
> +       vmovaps %xmm1, %xmm0
>         ret
> The RA sees:
> (insn 8 4 9 2 (set (reg:V4SF 85)
>         (and:V4SF (subreg:V4SF (reg:SF 90) 0)
>             (mem/u/c:V4SF (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0  S16 A128]))) "pr89984-2.c":7:12 2838 {*andv4sf3}
>      (expr_list:REG_DEAD (reg:SF 90)
>         (nil)))
> (insn 9 8 10 2 (set (reg:V4SF 87)
>         (xor:V4SF (reg:V4SF 85)
>             (subreg:V4SF (reg:SF 89) 0))) "pr89984-2.c":7:12 2842 {*xorv4sf3}
>      (expr_list:REG_DEAD (reg:SF 89)
>         (expr_list:REG_DEAD (reg:V4SF 85)
>             (nil))))
> (insn 10 9 14 2 (set (reg:SF 82 [ <retval> ])
>         (subreg:SF (reg:V4SF 87) 0)) "pr89984-2.c":7:12 142 {*movsf_internal}
>      (expr_list:REG_DEAD (reg:V4SF 87)
>         (nil)))
> (insn 14 10 15 2 (set (reg/i:SF 20 xmm0)
>         (reg:SF 82 [ <retval> ])) "pr89984-2.c":8:1 142 {*movsf_internal}
>      (expr_list:REG_DEAD (reg:SF 82 [ <retval> ])
>         (nil)))
> (insn 15 14 0 2 (use (reg/i:SF 20 xmm0)) "pr89984-2.c":8:1 -1
>      (nil))
> and doesn't know that if it would use xmm0 not just for pseudo 82
> but also for pseudo 87, it could create a noop move in insn 10 and
> so could avoid an extra register copy and nothing later on is able
> to figure that out either.  I don't know how the RA should know
> that though.
>
> Anyway, so that we don't regress, I have an alternative patch in attachment,
> which will do this stuff (i.e. use fresh vector pseudo as destination and
> then move lowpart of that to dest) over what it used before (i.e.
> use paradoxical subreg of the dest) only if lowpart_subreg returns NULL.
>
> Ok for trunk if the attached version passes bootstrap/regtest?
>
> 2022-02-21  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/104612
>         * config/i386/i386-expand.cc (ix86_expand_copysign): Call force_reg
>         on input operands before calling lowpart_subreg on it.  For output
>         operand, use a vmode pseudo as destination and then move its lowpart
>         subreg into operands[0].
>         (ix86_expand_xorsign): Likewise.
>
>         * gcc.dg/pr104612.c: New test.
>
> --- gcc/config/i386/i386-expand.cc.jj   2022-02-09 20:45:03.463499205 +0100
> +++ gcc/config/i386/i386-expand.cc      2022-02-21 13:14:31.756657743 +0100
> @@ -2153,7 +2153,7 @@ void
>  ix86_expand_copysign (rtx operands[])
>  {
>    machine_mode mode, vmode;
> -  rtx dest, op0, op1, mask, op2, op3;
> +  rtx dest, vdest, op0, op1, mask, op2, op3;
>
>    mode = GET_MODE (operands[0]);
>
> @@ -2174,8 +2174,9 @@ ix86_expand_copysign (rtx operands[])
>        return;
>      }
>
> -  dest = lowpart_subreg (vmode, operands[0], mode);
> -  op1 = lowpart_subreg (vmode, operands[2], mode);
> +  dest = operands[0];
> +  vdest = gen_reg_rtx (vmode);
> +  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
>    mask = ix86_build_signbit_mask (vmode, 0, 0);
>
>    if (CONST_DOUBLE_P (operands[1]))
> @@ -2184,7 +2185,8 @@ ix86_expand_copysign (rtx operands[])
>        /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a.  */
>        if (op0 == CONST0_RTX (mode))
>         {
> -         emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1));
> +         emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
> +         emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
>           return;
>         }
>
> @@ -2193,7 +2195,7 @@ ix86_expand_copysign (rtx operands[])
>        op0 = force_reg (vmode, op0);
>      }
>    else
> -    op0 = lowpart_subreg (vmode, operands[1], mode);
> +    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
>
>    op2 = gen_reg_rtx (vmode);
>    op3 = gen_reg_rtx (vmode);
> @@ -2201,7 +2203,8 @@ ix86_expand_copysign (rtx operands[])
>                                     gen_rtx_NOT (vmode, mask),
>                                     op0));
>    emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
> -  emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3));
> +  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
> +  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
>  }
>
>  /* Expand an xorsign operation.  */
> @@ -2210,7 +2213,7 @@ void
>  ix86_expand_xorsign (rtx operands[])
>  {
>    machine_mode mode, vmode;
> -  rtx dest, op0, op1, mask, x, temp;
> +  rtx dest, vdest, op0, op1, mask, x, temp;
>
>    dest = operands[0];
>    op0 = operands[1];
> @@ -2230,15 +2233,17 @@ ix86_expand_xorsign (rtx operands[])
>    temp = gen_reg_rtx (vmode);
>    mask = ix86_build_signbit_mask (vmode, 0, 0);
>
> -  op1 = lowpart_subreg (vmode, op1, mode);
> +  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
>    x = gen_rtx_AND (vmode, op1, mask);
>    emit_insn (gen_rtx_SET (temp, x));
>
> -  op0 = lowpart_subreg (vmode, op0, mode);
> +  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
>    x = gen_rtx_XOR (vmode, temp, op0);
>
> -  dest = lowpart_subreg (vmode, dest, mode);
> -  emit_insn (gen_rtx_SET (dest, x));
> +  vdest = gen_reg_rtx (vmode);
> +  emit_insn (gen_rtx_SET (vdest, x));
> +
> +  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
>  }
>
>  static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
> --- gcc/testsuite/gcc.dg/pr104612.c.jj  2022-02-21 13:26:41.134606451 +0100
> +++ gcc/testsuite/gcc.dg/pr104612.c     2022-02-21 13:26:18.247922789 +0100
> @@ -0,0 +1,27 @@
> +/* PR target/104612 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-msse2 -mfpmath=sse" { target i?86-*-* x86_64-*-* } } */
> +
> +struct V { float x, y; };
> +
> +struct V
> +foo (struct V v)
> +{
> +  struct V ret;
> +  ret.x = __builtin_copysignf (1.0e+0, v.x);
> +  ret.y = __builtin_copysignf (1.0e+0, v.y);
> +  return ret;
> +}
> +
> +float
> +bar (struct V v)
> +{
> +  return __builtin_copysignf (v.x, v.y);
> +}
> +
> +float
> +baz (struct V v)
> +{
> +  return v.x * __builtin_copysignf (1.0f, v.y);
> +}
>
>         Jakub



--
BR,
Hongtao
diff mbox series

Patch

--- gcc/config/i386/i386-expand.cc.jj	2022-02-09 20:45:03.463499205 +0100
+++ gcc/config/i386/i386-expand.cc	2022-02-21 13:14:31.756657743 +0100
@@ -2153,7 +2153,7 @@  void
 ix86_expand_copysign (rtx operands[])
 {
   machine_mode mode, vmode;
-  rtx dest, op0, op1, mask, op2, op3;
+  rtx dest, vdest, op0, op1, mask, op2, op3;
 
   mode = GET_MODE (operands[0]);
 
@@ -2174,8 +2174,9 @@  ix86_expand_copysign (rtx operands[])
       return;
     }
 
-  dest = lowpart_subreg (vmode, operands[0], mode);
-  op1 = lowpart_subreg (vmode, operands[2], mode);
+  dest = operands[0];
+  vdest = gen_reg_rtx (vmode);
+  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
   mask = ix86_build_signbit_mask (vmode, 0, 0);
 
   if (CONST_DOUBLE_P (operands[1]))
@@ -2184,7 +2185,8 @@  ix86_expand_copysign (rtx operands[])
       /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a.  */
       if (op0 == CONST0_RTX (mode))
 	{
-	  emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1));
+	  emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
+	  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
 	  return;
 	}
 
@@ -2193,7 +2195,7 @@  ix86_expand_copysign (rtx operands[])
       op0 = force_reg (vmode, op0);
     }
   else
-    op0 = lowpart_subreg (vmode, operands[1], mode);
+    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
 
   op2 = gen_reg_rtx (vmode);
   op3 = gen_reg_rtx (vmode);
@@ -2201,7 +2203,8 @@  ix86_expand_copysign (rtx operands[])
 				    gen_rtx_NOT (vmode, mask),
 				    op0));
   emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
-  emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3));
+  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
+  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
 }
 
 /* Expand an xorsign operation.  */
@@ -2210,7 +2213,7 @@  void
 ix86_expand_xorsign (rtx operands[])
 {
   machine_mode mode, vmode;
-  rtx dest, op0, op1, mask, x, temp;
+  rtx dest, vdest, op0, op1, mask, x, temp;
 
   dest = operands[0];
   op0 = operands[1];
@@ -2230,15 +2233,17 @@  ix86_expand_xorsign (rtx operands[])
   temp = gen_reg_rtx (vmode);
   mask = ix86_build_signbit_mask (vmode, 0, 0);
 
-  op1 = lowpart_subreg (vmode, op1, mode);
+  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
   x = gen_rtx_AND (vmode, op1, mask);
   emit_insn (gen_rtx_SET (temp, x));
 
-  op0 = lowpart_subreg (vmode, op0, mode);
+  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
   x = gen_rtx_XOR (vmode, temp, op0);
 
-  dest = lowpart_subreg (vmode, dest, mode);
-  emit_insn (gen_rtx_SET (dest, x));
+  vdest = gen_reg_rtx (vmode);
+  emit_insn (gen_rtx_SET (vdest, x));
+
+  emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
 }
 
 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
--- gcc/testsuite/gcc.dg/pr104612.c.jj	2022-02-21 13:26:41.134606451 +0100
+++ gcc/testsuite/gcc.dg/pr104612.c	2022-02-21 13:26:18.247922789 +0100
@@ -0,0 +1,27 @@ 
+/* PR target/104612 */
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-msse2 -mfpmath=sse" { target i?86-*-* x86_64-*-* } } */
+
+struct V { float x, y; };
+
+struct V
+foo (struct V v)
+{
+  struct V ret;
+  ret.x = __builtin_copysignf (1.0e+0, v.x);
+  ret.y = __builtin_copysignf (1.0e+0, v.y);
+  return ret;
+}
+
+float
+bar (struct V v)
+{
+  return __builtin_copysignf (v.x, v.y);
+}
+
+float
+baz (struct V v)
+{
+  return v.x * __builtin_copysignf (1.0f, v.y);
+}