diff mbox

[ARM] Don't use NEON for autovectorization in big-endian mode

Message ID 20140616124236.10b09c2c@octopus
State New
Headers show

Commit Message

Julian Brown June 16, 2014, 11:42 a.m. UTC
Hi,

As discussed several times previously, support for NEON in ARM
big-endian mode is quite broken because of differing assumptions about
lane ordering made by the ARM EABI and the set of NEON intrinsics on
the one hand, and the vectorizer on the other.

Fixing this "properly" would involve quite a large overhaul of the NEON
backend implementation, and such an overhaul does not appear to be
forthcoming. Unfortunately this leaves big-endian mode with a problem:
even if the user is not explicitly using NEON intrinsics, compiling
with NEON and the vectorizer enabled (i.e. -O3) can quite easily lead
to incorrect code being generated.

This is the patch we've been using internally for a while to work
around the problem. When applied:

* We do not allow Neon vectors to be used for autovectorization.
  Vectorization is not disabled completely: ARM core registers (e.g.
  four chars packed into a core register) can still be used to vectorize
  loops in limited circumstances. I think this is mildly preferable to
  forcing -ftree-vectorize to be off entirely for big-endian NEON.

* Intrinsics are not touched. Those which attempt to mix generic vector
  operations with the ABI-defined vector types (i.e. those which are
  implemented with __builtin_shuffle) are, I think, technically
  incorrect -- but in the sense of two wrongs making a right, so the
  end result appears to work.

* Generic vectors (i.e. direct use of __attribute__((vector_size(foo)))
  types) will continue to behave strangely in big-endian mode.

This of course continues to be suboptimal, but at least in *the
common case* we stop generating bad code.

Testing in big-endian mode on user-space QEMU (ARMv7-A, NEON, softfp)
shows (apart from some noise) test diffs as attached. Notice the large
number of removed execution failures, in particular.

OK to apply?

Thanks,

Julian

ChangeLog

    gcc/
    * config/arm/arm.c (arm_array_mode_supported_p): No array modes for
    big-endian NEON.
    (arm_preferred_simd_mode): Don't use NEON vectors for
    autovectorization in big-endian mode.
    (arm_autovectorize_vector_sizes): Don't iterate over other vector
    sizes for big-endian NEON.

    gcc/testsuite/
    * lib/target-supports.exp (check_vect_support_and_set_flags): Don't
    run vect tests for big-endian ARM NEON.
    * gcc.target/arm/neon/vect-vcvt.c: XFAIL for !arm_little_endian.
    * gcc.target/arm/neon/vect-vcvtq.c: Likewise.
    * gcc.target/arm/neon-vshl-imm-1.c: Likewise.
    * gcc.target/arm/neon-vshr-imm-1.c: Likewise.
    * gcc.target/arm/neon-vmls-1.c: Likewise.
    * gcc.target/arm/neon-vmla-1.c: Likewise.
    * gcc.target/arm/neon-vfma-1.c: Likewise.
    * gcc.target/arm/neon-vfms-1.c: Likewise.
    * gcc.target/arm/neon-vorn-vbic.c: Likewise.
    * gcc.target/arm/neon-vlshr-imm-1.c: Likewise.
    * gcc.target/arm/neon-vcond-ltgt.c: Likewise.
    * gcc.target/arm/neon-vcond-gt.c: Likewise.
    * gcc.target/arm/neon-vcond-unordered.c: Likewise.

Comments

Julian Brown June 24, 2014, 11:26 a.m. UTC | #1
On Mon, 16 Jun 2014 12:42:36 +0100
Julian Brown <julian@codesourcery.com> wrote:

> Hi,
> 
> As discussed several times previously, support for NEON in ARM
> big-endian mode is quite broken because of differing assumptions about
> lane ordering made by the ARM EABI and the set of NEON intrinsics on
> the one hand, and the vectorizer on the other.
> 
> Fixing this "properly" would involve quite a large overhaul of the
> NEON backend implementation, and such an overhaul does not appear to
> be forthcoming. Unfortunately this leaves big-endian mode with a
> problem: even if the user is not explicitly using NEON intrinsics,
> compiling with NEON and the vectorizer enabled (i.e. -O3) can quite
> easily lead to incorrect code being generated.
> 
> This is the patch we've been using internally for a while to work
> around the problem. When applied:

Ping?

Julian
diff mbox

Patch

Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	(revision 210209)
+++ gcc/config/arm/arm.c	(working copy)
@@ -28813,7 +28813,7 @@  static bool
 arm_array_mode_supported_p (enum machine_mode mode,
 			    unsigned HOST_WIDE_INT nelems)
 {
-  if (TARGET_NEON
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN
       && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))
       && (nelems >= 2 && nelems <= 4))
     return true;
@@ -28828,7 +28828,7 @@  arm_array_mode_supported_p (enum machine
 static enum machine_mode
 arm_preferred_simd_mode (enum machine_mode mode)
 {
-  if (TARGET_NEON)
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
     switch (mode)
       {
       case SFmode:
@@ -29845,7 +29845,8 @@  arm_vector_alignment (const_tree type)
 static unsigned int
 arm_autovectorize_vector_sizes (void)
 {
-  return TARGET_NEON_VECTORIZE_DOUBLE ? 0 : (16 | 8);
+  return (TARGET_NEON_VECTORIZE_DOUBLE || (TARGET_NEON && BYTES_BIG_ENDIAN))
+	 ? 0 : (16 | 8);
 }
 
 static bool
Index: gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon/vect-vcvtq.c	(working copy)
@@ -24,5 +24,5 @@  int convert()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { xfail { ! arm_little_endian } } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
Index: gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon/vect-vcvt.c	(working copy)
@@ -24,5 +24,5 @@  int convert()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { xfail { ! arm_little_endian } } } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
Index: gcc/testsuite/gcc.target/arm/neon-vshl-imm-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vshl-imm-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vshl-imm-1.c	(working copy)
@@ -1,7 +1,7 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_neon_ok } */
 /* { dg-options "-O2 -mfpu=neon -mfloat-abi=softfp -ftree-vectorize" } */
-/* { dg-final { scan-assembler "vshl\.i32.*#3" } } */
+/* { dg-final { scan-assembler "vshl\.i32.*#3" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VSHR immediate is used.  */
 void f1(int n, int x[], int y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vshr-imm-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vshr-imm-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vshr-imm-1.c	(working copy)
@@ -1,7 +1,7 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_neon_ok } */
 /* { dg-options "-O2 -mfpu=neon -mfloat-abi=softfp -ftree-vectorize" } */
-/* { dg-final { scan-assembler "vshr\.s32.*#3" } } */
+/* { dg-final { scan-assembler "vshr\.s32.*#3" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VSHR immediate is used.  */
 void f1(int n, int x[], int y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vfma-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vfma-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vfma-1.c	(working copy)
@@ -2,7 +2,7 @@ 
 /* { dg-require-effective-target arm_neonv2_ok } */
 /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neonv2 } */
-/* { dg-final { scan-assembler "vfma\\.f32\[	\]+\[dDqQ]" } } */
+/* { dg-final { scan-assembler "vfma\\.f32\[	\]+\[dDqQ]" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VFMA is used.  */
 void f1(int n, float a, float x[], float y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vorn-vbic.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vorn-vbic.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vorn-vbic.c	(working copy)
@@ -16,5 +16,5 @@  void bic (int *__restrict__ c, int *__re
     c[i] = b[i] & (~a[i]);
 }
 
-/* { dg-final { scan-assembler "vorn\\t" } } */
-/* { dg-final { scan-assembler "vbic\\t" } } */
+/* { dg-final { scan-assembler "vorn\\t" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vbic\\t" { xfail { ! arm_little_endian } } } } */
Index: gcc/testsuite/gcc.target/arm/neon-vmla-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vmla-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vmla-1.c	(working copy)
@@ -1,7 +1,7 @@ 
 /* { dg-require-effective-target arm_neon_hw } */
 /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neon } */
-/* { dg-final { scan-assembler "vmla\\.i32" } } */
+/* { dg-final { scan-assembler "vmla\\.i32" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VMLA is used.  */
 void f1(int n, int a, int x[], int y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vcond-ltgt.c	(working copy)
@@ -13,6 +13,6 @@  void foo (int ilast,float* w, float* w2)
   }
 }
 
-/* { dg-final { scan-assembler-times "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" 2 } } */
-/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler-times "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" 2 { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
Index: gcc/testsuite/gcc.target/arm/neon-vfms-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vfms-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vfms-1.c	(working copy)
@@ -2,7 +2,7 @@ 
 /* { dg-require-effective-target arm_neonv2_ok } */
 /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neonv2 } */
-/* { dg-final { scan-assembler "vfms\\.f32\[	\]+\[dDqQ]" } } */
+/* { dg-final { scan-assembler "vfms\\.f32\[	\]+\[dDqQ]" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VFMS is used.  */
 void f1(int n, float a, float x[], float y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vcond-gt.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vcond-gt.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vcond-gt.c	(working copy)
@@ -13,5 +13,5 @@  void foo (int ilast,float* w, float* w2)
   }
 }
 
-/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
Index: gcc/testsuite/gcc.target/arm/neon-vlshr-imm-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vlshr-imm-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vlshr-imm-1.c	(working copy)
@@ -1,7 +1,7 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target arm_neon_ok } */
 /* { dg-options "-O2 -mfpu=neon -mfloat-abi=softfp -ftree-vectorize" } */
-/* { dg-final { scan-assembler "vshr\.u32.*#3" } } */
+/* { dg-final { scan-assembler "vshr\.u32.*#3" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VSHR immediate is used.  */
 void f1(int n, unsigned int x[], unsigned int y[]) {
Index: gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vcond-unordered.c	(working copy)
@@ -13,7 +13,7 @@  void foo (int ilast,float* w, float* w2)
   }
 }
 
-/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vcge\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
-/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" } } */
+/* { dg-final { scan-assembler "vcgt\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vcge\\.f32\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vorr\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
+/* { dg-final { scan-assembler "vbsl|vbit|vbif\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+,\[\\t \]*q\[0-9\]+" { xfail { ! arm_little_endian } } } } */
Index: gcc/testsuite/gcc.target/arm/neon-vmls-1.c
===================================================================
--- gcc/testsuite/gcc.target/arm/neon-vmls-1.c	(revision 210209)
+++ gcc/testsuite/gcc.target/arm/neon-vmls-1.c	(working copy)
@@ -1,7 +1,7 @@ 
 /* { dg-require-effective-target arm_neon_hw } */
 /* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
 /* { dg-add-options arm_neon } */
-/* { dg-final { scan-assembler "vmls\\.i32" } } */
+/* { dg-final { scan-assembler "vmls\\.i32" { xfail { ! arm_little_endian } } } } */
 
 /* Verify that VMLS is used.  */
 void f1(int n, int a, int x[], int y[]) {
Index: gcc/testsuite/lib/target-supports.exp
===================================================================
--- gcc/testsuite/lib/target-supports.exp	(revision 210209)
+++ gcc/testsuite/lib/target-supports.exp	(working copy)
@@ -5636,6 +5636,14 @@  proc check_vect_support_and_set_flags { 
     } elseif [istarget ia64-*-*] {
         set dg-do-what-default run
     } elseif [is-effective-target arm_neon_ok] {
+	# NEON is not used for vectorization in big-endian mode at present.
+	# Some vect tests still pass without NEON support (i.e. using
+	# core registers), but there are too many failures (missed
+	# vectorization opportunities) to make test results meaningful.
+	if ![check_effective_target_arm_little_endian] {
+	    return 0
+	}
+
         eval lappend DEFAULT_VECTCFLAGS [add_options_for_arm_neon ""]
         # NEON does not support denormals, so is not used for vectorization by
         # default to avoid loss of precision.  We must pass -ffast-math to test