diff mbox series

[V3] PR target/70243 - Do not generate vmaddfp or vnmsubdp

Message ID ZDFte7XxGH3P2fpq@toto.the-meissners.org
State New
Headers show
Series [V3] PR target/70243 - Do not generate vmaddfp or vnmsubdp | expand

Commit Message

Michael Meissner April 8, 2023, 1:34 p.m. UTC
This is version 3 of the patch.  This is essentially version 1 with the removal
of changes to altivec.md, and cleanup of the comments.

Version 2 generated the vmaddfp and vnmsubfp instructions if -Ofast was used,
and those changes are deleted in this patch.

The Altivec instructions vmaddfp and vnmsubfp have different rounding behaviors
than the VSX xvmaddsp and xvnmsubsp instructions.  In particular, generating
these instructions seems to break Eigen on big endian systems.

I have done bootstrap builds on power9 little endian (with both IEEE long
double and IBM long double).  I have also done the builds and test on a power8
big endian system (testing both 32-bit and 64-bit code generation).  Chip has
verified that it fixes the problem that Eigen encountered.  Can I check this
into the master GCC branch?  After a burn-in period, can I check this patch
into the active GCC branches?

Thanks in advance.

2023-04-07   Michael Meissner  <meissner@linux.ibm.com>

gcc/

	PR target/70243
	* config/rs6000/rs6000.md (vsx_fmav4sf4): Do not generate vmaddfp.
	(vsx_nfmsv4sf4): Do not generate vnmsubfp.

gcc/testsuite/

	PR target/70243
	* gcc.target/powerpc/pr70243.c: New test.
---
 gcc/config/rs6000/vsx.md                   | 31 ++++++++--------
 gcc/testsuite/gcc.target/powerpc/pr70243.c | 41 ++++++++++++++++++++++
 2 files changed, 55 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr70243.c

Comments

Segher Boessenkool April 8, 2023, 2:01 p.m. UTC | #1
Hi!

On Sat, Apr 08, 2023 at 09:34:51AM -0400, Michael Meissner wrote:
> The Altivec instructions vmaddfp and vnmsubfp have different rounding behaviors
> than the VSX xvmaddsp and xvnmsubsp instructions.  In particular, generating
> these instructions seems to break Eigen on big endian systems.

What actually breaks Eigen is not the rounding behaviour (it runs with
RN=0b00, like most things do, so round-to-nearest-ties-to-even, the only
rounding mode supported by the VMX float insns, no big shocking surprise
there).  What break Eigen and many other unsuspecting programs
unknowingly using VMX is that on Linux programs are started with
VSCR[NJ]=1, "Non-Java mode", which means all numbers with unbiased
exponent 0 get the mantissa forced to 0 as well, both on input and
output (all denormals are flushed to zero of the same sign).

This is counter to the various ABIs.  I'll submit a patch to Linux soon.
But since many people run older kernels, at least for a while more, we
need to fix this in GCC.  Like your patch does.

> 	PR target/70243
> 	* config/rs6000/rs6000.md (vsx_fmav4sf4): Do not generate vmaddfp.
> 	(vsx_nfmsv4sf4): Do not generate vnmsubfp.

> -;; Fused vector multiply/add instructions. Support the classical Altivec
> -;; versions of fma, which allows the target to be a separate register from the
> -;; 3 inputs.  Under VSX, the target must be either the addend or the first
> -;; multiply.
> -
> +;; Fused vector multiply/add instructions. Do not generate the Altivec versions
> +;; of fma (vmaddfp and vnmsubfp).  These instructions allows the target to be a
> +;; separate register from the 3 inputs, but they have different rounding
> +;; behaviors than the VSX instructions.

Please mention the VSCR[NJ] thing here as well?  Just something very
short, just mentioning "NJ" or "Non-Java" is enough.

With that: okay for trunk, thank you!  Also okay for all backports.


Segher
diff mbox series

Patch

diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 0865608f94a..c4c503cacad 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -2009,22 +2009,20 @@  (define_insn "*vsx_tsqrt<mode>2_internal"
   "x<VSv>tsqrt<sd>p %0,%x1"
   [(set_attr "type" "<VStype_simple>")])
 
-;; Fused vector multiply/add instructions. Support the classical Altivec
-;; versions of fma, which allows the target to be a separate register from the
-;; 3 inputs.  Under VSX, the target must be either the addend or the first
-;; multiply.
-
+;; Fused vector multiply/add instructions. Do not generate the Altivec versions
+;; of fma (vmaddfp and vnmsubfp).  These instructions allows the target to be a
+;; separate register from the 3 inputs, but they have different rounding
+;; behaviors than the VSX instructions.
 (define_insn "*vsx_fmav4sf4"
-  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,wa,v")
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,wa")
 	(fma:V4SF
-	  (match_operand:V4SF 1 "vsx_register_operand" "%wa,wa,v")
-	  (match_operand:V4SF 2 "vsx_register_operand" "wa,0,v")
-	  (match_operand:V4SF 3 "vsx_register_operand" "0,wa,v")))]
+	  (match_operand:V4SF 1 "vsx_register_operand" "%wa,wa")
+	  (match_operand:V4SF 2 "vsx_register_operand" "wa,0")
+	  (match_operand:V4SF 3 "vsx_register_operand" "0,wa")))]
   "VECTOR_UNIT_VSX_P (V4SFmode)"
   "@
    xvmaddasp %x0,%x1,%x2
-   xvmaddmsp %x0,%x1,%x3
-   vmaddfp %0,%1,%2,%3"
+   xvmaddmsp %x0,%x1,%x3"
   [(set_attr "type" "vecfloat")])
 
 (define_insn "*vsx_fmav2df4"
@@ -2066,18 +2064,17 @@  (define_insn "*vsx_nfma<mode>4"
   [(set_attr "type" "<VStype_mul>")])
 
 (define_insn "*vsx_nfmsv4sf4"
-  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,wa,v")
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,wa")
 	(neg:V4SF
 	 (fma:V4SF
-	   (match_operand:V4SF 1 "vsx_register_operand" "%wa,wa,v")
-	   (match_operand:V4SF 2 "vsx_register_operand" "wa,0,v")
+	   (match_operand:V4SF 1 "vsx_register_operand" "%wa,wa")
+	   (match_operand:V4SF 2 "vsx_register_operand" "wa,0")
 	   (neg:V4SF
-	     (match_operand:V4SF 3 "vsx_register_operand" "0,wa,v")))))]
+	     (match_operand:V4SF 3 "vsx_register_operand" "0,wa")))))]
   "VECTOR_UNIT_VSX_P (V4SFmode)"
   "@
    xvnmsubasp %x0,%x1,%x2
-   xvnmsubmsp %x0,%x1,%x3
-   vnmsubfp %0,%1,%2,%3"
+   xvnmsubmsp %x0,%x1,%x3"
   [(set_attr "type" "vecfloat")])
 
 (define_insn "*vsx_nfmsv2df4"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr70243.c b/gcc/testsuite/gcc.target/powerpc/pr70243.c
new file mode 100644
index 00000000000..18a5ce78792
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr70243.c
@@ -0,0 +1,41 @@ 
+/* { dg-do compile */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+/* PR 70423, Make sure we don't generate vmaddfp or vnmsubfp.  These
+   instructions have different rounding modes than the VSX instructions
+   xvmaddsp and xvnmsubsp.  These tests are written where the 3 inputs and
+   target are all separate registers.  Because vmaddfp and vnmsubfp are no
+   longer generated the compiler will have to generate an xsmaddsp or xsnmsubsp
+   instruction followed by a move operation.  */
+
+#include <altivec.h>
+
+vector float
+do_add1 (vector float dummy, vector float a, vector float b, vector float c)
+{
+  return (a * b) + c;
+}
+
+vector float
+do_nsub1 (vector float dummy, vector float a, vector float b, vector float c)
+{
+  return -((a * b) - c);
+}
+
+vector float
+do_add2 (vector float dummy, vector float a, vector float b, vector float c)
+{
+  return vec_madd (a, b, c);
+}
+
+vector float
+do_nsub2 (vector float dummy, vector float a, vector float b, vector float c)
+{
+  return vec_nmsub (a, b, c);
+}
+
+/* { dg-final { scan-assembler     {\mxvmadd[am]sp\M}  } } */
+/* { dg-final { scan-assembler     {\mxvnmsub[am]sp\M} } } */
+/* { dg-final { scan-assembler-not {\mvmaddfp\M}       } } */
+/* { dg-final { scan-assembler-not {\mvnmsubfp\M}      } } */