Patchwork PATCH: Disable double precision vectorizer for Atom

login
register
mail settings
Submitter H.J. Lu
Date Sept. 13, 2010, 1:47 p.m.
Message ID <20100913134740.GA20178@intel.com>
Download mbox | patch
Permalink /patch/64603/
State New
Headers show

Comments

H.J. Lu - Sept. 13, 2010, 1:47 p.m.
Hi,

Double precision vector instructions are much slower than double
precision scalar instructions on Atom.  This patch disables double
precision vectorizer for Atom.  It improves SPEC CPU 2K FP geomean by
7% on 64bit and 3% on 32bit.  OK for trunk?

Thanks.


H.J.
----
gcc/

2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>

	* config/i386/i386.c (initial_ix86_tune_features): Add
	X86_TUNE_VECTORIZE_DOUBLE.
	* config/i386/i386.h (ix86_tune_indices): Likewise.
	(TARGET_VECTORIZE_DOUBLE): New.
	(UNITS_PER_SIMD_WORD): Return UNITS_PER_WORD for DFmode if
	TARGET_VECTORIZE_DOUBLE is false.

gcc/testsuite/

2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>

	* gcc.target/i386/fma4-256-vector.c: Add -mtune=generic.
	* gcc.target/i386/fma4-vector.c: Likewise.
	* gcc.target/i386/vectorize2.c: Likewise.
	* gcc.target/i386/vectorize4.c: Likewise.
	* gcc.target/i386/vectorize5.c: Likewise.
	* gcc.target/i386/vectorize6.c: Likewise.
	* gcc.target/i386/vectorize8.c: Likewise.

	* gcc.target/i386/vect-double-1.c: New.
	* gcc.target/i386/vect-double-1a.c: Likewise.
	* gcc.target/i386/vect-double-2.c: Likewise.
	* gcc.target/i386/vect-double-2a.c: Likewise.

	* lib/target-supports.exp (check_effective_target_vect_double):
	Set et_vect_double_saved to 0 when tuning for Atom.
Uros Bizjak - Sept. 13, 2010, 6:51 p.m.
On Mon, Sep 13, 2010 at 3:47 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:

> Double precision vector instructions are much slower than double
> precision scalar instructions on Atom.  This patch disables double
> precision vectorizer for Atom.  It improves SPEC CPU 2K FP geomean by
> 7% on 64bit and 3% on 32bit.  OK for trunk?
>
> Thanks.
>
>
> H.J.
> ----
> gcc/
>
> 2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>
>
>        * config/i386/i386.c (initial_ix86_tune_features): Add
>        X86_TUNE_VECTORIZE_DOUBLE.
>        * config/i386/i386.h (ix86_tune_indices): Likewise.
>        (TARGET_VECTORIZE_DOUBLE): New.
>        (UNITS_PER_SIMD_WORD): Return UNITS_PER_WORD for DFmode if
>        TARGET_VECTORIZE_DOUBLE is false.
>
> gcc/testsuite/
>
> 2010-09-13  H.J. Lu  <hongjiu.lu@intel.com>
>
>        * gcc.target/i386/fma4-256-vector.c: Add -mtune=generic.
>        * gcc.target/i386/fma4-vector.c: Likewise.
>        * gcc.target/i386/vectorize2.c: Likewise.
>        * gcc.target/i386/vectorize4.c: Likewise.
>        * gcc.target/i386/vectorize5.c: Likewise.
>        * gcc.target/i386/vectorize6.c: Likewise.
>        * gcc.target/i386/vectorize8.c: Likewise.
>
>        * gcc.target/i386/vect-double-1.c: New.
>        * gcc.target/i386/vect-double-1a.c: Likewise.
>        * gcc.target/i386/vect-double-2.c: Likewise.
>        * gcc.target/i386/vect-double-2a.c: Likewise.
>
>        * lib/target-supports.exp (check_effective_target_vect_double):
>        Set et_vect_double_saved to 0 when tuning for Atom.

OK, but see comments below ...

> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 1d79a18..7d165bb 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -1627,6 +1627,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
>   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
>      will impact LEA instruction selection. */
>   m_ATOM,
> +
> +  /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
> +     instructions.  */
> +  ~m_ATOM,
>  };
>
>  /* Feature tests against the various architecture variations.  */
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 91238d5..2acf60a 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -312,6 +312,7 @@ enum ix86_tune_indices {
>   X86_TUNE_USE_VECTOR_CONVERTS,
>   X86_TUNE_FUSE_CMP_AND_BRANCH,
>   X86_TUNE_OPT_AGU,
> +  X86_TUNE_VECTORIZE_DOUBLE,
>
>   X86_TUNE_LAST
>  };
> @@ -404,6 +405,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
>  #define TARGET_FUSE_CMP_AND_BRANCH \
>        ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH]
>  #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
> +#define TARGET_VECTORIZE_DOUBLE \
> +       ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE]
>
>  /* Feature tests against the various architecture variations.  */
>  enum ix86_arch_indices {
> @@ -1037,8 +1040,10 @@ enum target_cpu_default
>    different sizes for integer and floating point vectors.  We limit
>    vector size to 16byte.  */
>  #define UNITS_PER_SIMD_WORD(MODE)                                      \
> -  (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)     \
> -             : (TARGET_SSE ? 16 : UNITS_PER_WORD))
> +  ((MODE) == DFmode && !TARGET_VECTORIZE_DOUBLE                                \
> +   ? UNITS_PER_WORD                                                    \
> +   : (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)  \
> +                : (TARGET_SSE ? 16 : UNITS_PER_WORD)))

Please rewrite this macro as a helper function using a switch
statement. I must admit I'm not able to parse this mess.

Thanks,
Uros.

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 1d79a18..7d165bb 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1627,6 +1627,10 @@  static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
      will impact LEA instruction selection. */
   m_ATOM,
+
+  /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
+     instructions.  */
+  ~m_ATOM,
 };
 
 /* Feature tests against the various architecture variations.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 91238d5..2acf60a 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -312,6 +312,7 @@  enum ix86_tune_indices {
   X86_TUNE_USE_VECTOR_CONVERTS,
   X86_TUNE_FUSE_CMP_AND_BRANCH,
   X86_TUNE_OPT_AGU,
+  X86_TUNE_VECTORIZE_DOUBLE,
 
   X86_TUNE_LAST
 };
@@ -404,6 +405,8 @@  extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_FUSE_CMP_AND_BRANCH \
 	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
+#define TARGET_VECTORIZE_DOUBLE \
+	ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
@@ -1037,8 +1040,10 @@  enum target_cpu_default
    different sizes for integer and floating point vectors.  We limit
    vector size to 16byte.  */
 #define UNITS_PER_SIMD_WORD(MODE)					\
-  (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)	\
-   	      : (TARGET_SSE ? 16 : UNITS_PER_WORD))
+  ((MODE) == DFmode && !TARGET_VECTORIZE_DOUBLE				\
+   ? UNITS_PER_WORD							\
+   : (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)	\
+		 : (TARGET_SSE ? 16 : UNITS_PER_WORD)))
 
 #define VALID_DFP_MODE_P(MODE) \
   ((MODE) == SDmode || (MODE) == DDmode || (MODE) == TDmode)
diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-vector.c b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c
index 714b743..1bd2ce4 100644
--- a/gcc/testsuite/gcc.target/i386/fma4-256-vector.c
+++ b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c
@@ -3,7 +3,7 @@ 
 
 /* { dg-do compile } */
 /* { dg-require-effective-target lp64 } */
-/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */
+/* { dg-options "-O2 -mfma4 -ftree-vectorize -mtune=generic" } */
 
 extern void exit (int);
 
diff --git a/gcc/testsuite/gcc.target/i386/fma4-vector.c b/gcc/testsuite/gcc.target/i386/fma4-vector.c
index df8463e..da12780 100644
--- a/gcc/testsuite/gcc.target/i386/fma4-vector.c
+++ b/gcc/testsuite/gcc.target/i386/fma4-vector.c
@@ -3,7 +3,7 @@ 
 
 /* { dg-do compile } */
 /* { dg-require-effective-target lp64 } */
-/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */
+/* { dg-options "-O2 -mfma4 -ftree-vectorize -mtune=generic" } */
 
 extern void exit (int);
 
diff --git a/gcc/testsuite/gcc.target/i386/vect-double-1.c b/gcc/testsuite/gcc.target/i386/vect-double-1.c
new file mode 100644
index 0000000..87e5fe9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-double-1.c
@@ -0,0 +1,35 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -march=core2 -fdump-tree-vect-stats" } */
+
+extern void abort (void);
+
+#ifndef STATIC
+#define STATIC
+#endif
+
+#define N 16
+ 
+double cb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+double ca[N];
+
+STATIC void
+__attribute__ ((noinline))
+sse2_test (void)
+{  
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      ca[i] = cb[i];
+    }
+
+  /* check results:  */
+  for (i = 0; i < N; i++)
+    {
+      if (ca[i] != cb[i])
+        abort ();
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-double-1a.c b/gcc/testsuite/gcc.target/i386/vect-double-1a.c
new file mode 100644
index 0000000..a62c939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-double-1a.c
@@ -0,0 +1,8 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -msse2 -mtune=core2" } */
+
+#define STATIC static
+
+#include "vect-double-1.c"
+#include "sse2-check.h"
diff --git a/gcc/testsuite/gcc.target/i386/vect-double-2.c b/gcc/testsuite/gcc.target/i386/vect-double-2.c
new file mode 100644
index 0000000..a76dcb4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-double-2.c
@@ -0,0 +1,35 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -msse2 -mtune=atom -fdump-tree-vect-stats" } */
+
+extern void abort (void);
+
+#ifndef STATIC
+#define STATIC
+#endif
+
+#define N 16
+ 
+double cb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
+double ca[N];
+
+STATIC void
+__attribute__ ((noinline))
+sse2_test (void)
+{  
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      ca[i] = cb[i];
+    }
+
+  /* check results:  */
+  for (i = 0; i < N; i++)
+    {
+      if (ca[i] != cb[i])
+        abort ();
+    }
+}
+
+/* { dg-final { scan-tree-dump-not "vectorized 1 loops" "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-double-2a.c b/gcc/testsuite/gcc.target/i386/vect-double-2a.c
new file mode 100644
index 0000000..94f8062
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-double-2a.c
@@ -0,0 +1,8 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -msse2 -mtune=atom" } */
+
+#define STATIC static
+
+#include "vect-double-2.c"
+#include "sse2-check.h"
diff --git a/gcc/testsuite/gcc.target/i386/vectorize2.c b/gcc/testsuite/gcc.target/i386/vectorize2.c
index 4196487..427e2d4 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize2.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize2.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target ilp32 } */
-/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse" } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mtune=generic" } */
 
 double a[256];
 int b[256];
diff --git a/gcc/testsuite/gcc.target/i386/vectorize4.c b/gcc/testsuite/gcc.target/i386/vectorize4.c
index f3d605e..557d0a2 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize4.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize4.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target ilp32 } */
-/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 --param ggc-min-expand=0 --param ggc-min-heapsize=0" } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mtune=generic --param ggc-min-expand=0 --param ggc-min-heapsize=0" } */
 /* This test, tests two thing, we vectorize square root and also we don't crash due to a GC issue.  */
 
 
diff --git a/gcc/testsuite/gcc.target/i386/vectorize5.c b/gcc/testsuite/gcc.target/i386/vectorize5.c
index 3894240..04f044f 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize5.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize5.c
@@ -1,6 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-require-effective-target lp64 } */
-/* { dg-options "-O2 -ftree-vectorize -mveclibabi=acml -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -mveclibabi=acml -ffast-math -mtune=generic" } */
 
 double x[256];
 
diff --git a/gcc/testsuite/gcc.target/i386/vectorize6.c b/gcc/testsuite/gcc.target/i386/vectorize6.c
index 78ec53d..d299a15 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize6.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize6.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2 -ftree-vectorize -mveclibabi=svml -ffast-math" } */
+/* { dg-options "-O2 -msse2 -ftree-vectorize -mveclibabi=svml -ffast-math -mtune=generic" } */
 
 double x[256];
 
diff --git a/gcc/testsuite/gcc.target/i386/vectorize8.c b/gcc/testsuite/gcc.target/i386/vectorize8.c
index ed1517b..a194bb0 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize8.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize8.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -msse2" } */
+/* { dg-options "-O2 -ftree-vectorize -msse2 -mtune=generic" } */
 
 unsigned int a[256];
 double b[256];
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index fc24b78..de9f21e 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2309,8 +2309,17 @@  proc check_effective_target_vect_double { } {
     } else {
 	set et_vect_double_saved 0
 	if { [istarget i?86-*-*]
-	      || [istarget x86_64-*-*] 
-	      || [istarget spu-*-*] } {
+	      || [istarget x86_64-*-*] } {
+	   if { [check_no_compiler_messages vect_double assembly {
+		 #ifdef __tune_atom__
+		 # error No double vectorizer support.
+		 #endif
+		}] } {
+		set et_vect_double_saved 1
+	    } else {
+		set et_vect_double_saved 0
+	    }
+	} elseif { [istarget spu-*-*] } {
 	   set et_vect_double_saved 1
 	}
     }