From patchwork Mon Sep 13 19:29:38 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: "H.J. Lu" X-Patchwork-Id: 64628 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) by ozlabs.org (Postfix) with SMTP id E307AB70AB for ; Tue, 14 Sep 2010 05:30:02 +1000 (EST) Received: (qmail 2187 invoked by alias); 13 Sep 2010 19:29:56 -0000 Received: (qmail 2167 invoked by uid 22791); 13 Sep 2010 19:29:49 -0000 X-SWARE-Spam-Status: No, hits=-1.8 required=5.0 tests=AWL, BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, FREEMAIL_FROM, RCVD_IN_DNSWL_NONE, TW_ZJ, T_FRT_BELOW2 X-Spam-Check-By: sourceware.org Received: from mail-vw0-f47.google.com (HELO mail-vw0-f47.google.com) (209.85.212.47) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Mon, 13 Sep 2010 19:29:41 +0000 Received: by vws9 with SMTP id 9so5183288vws.20 for ; Mon, 13 Sep 2010 12:29:39 -0700 (PDT) MIME-Version: 1.0 Received: by 10.220.125.1 with SMTP id w1mr2833097vcr.239.1284406178805; Mon, 13 Sep 2010 12:29:38 -0700 (PDT) Received: by 10.220.78.193 with HTTP; Mon, 13 Sep 2010 12:29:38 -0700 (PDT) In-Reply-To: References: <20100913134740.GA20178@intel.com> Date: Mon, 13 Sep 2010 12:29:38 -0700 Message-ID: Subject: Re: PATCH: Disable double precision vectorizer for Atom From: "H.J. Lu" To: Uros Bizjak Cc: gcc-patches@gcc.gnu.org X-IsSubscribed: yes Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org On Mon, Sep 13, 2010 at 11:51 AM, Uros Bizjak wrote: > On Mon, Sep 13, 2010 at 3:47 PM, H.J. Lu wrote: > >> Double precision vector instructions are much slower than double >> precision scalar instructions on Atom.  
This patch disables double >> precision vectorizer for Atom.  It improves SPEC CPU 2K FP geomean by >> 7% on 64bit and 3% on 32bit.  OK for trunk? >> >> Thanks. >> >> >> H.J. >> ---- >> gcc/ >> >> 2010-09-13  H.J. Lu   >> >>        * config/i386/i386.c (initial_ix86_tune_features): Add >>        X86_TUNE_VECTORIZE_DOUBLE. >>        * config/i386/i386.h (ix86_tune_indices): Likewise. >>        (TARGET_VECTORIZE_DOUBLE): New. >>        (UNITS_PER_SIMD_WORD): Return UNITS_PER_WORD for DFmode if >>        TARGET_VECTORIZE_DOUBLE is false. >> >> gcc/testsuite/ >> >> 2010-09-13  H.J. Lu   >> >>        * gcc.target/i386/fma4-256-vector.c: Add -mtune=generic. >>        * gcc.target/i386/fma4-vector.c: Likewise. >>        * gcc.target/i386/vectorize2.c: Likewise. >>        * gcc.target/i386/vectorize4.c: Likewise. >>        * gcc.target/i386/vectorize5.c: Likewise. >>        * gcc.target/i386/vectorize6.c: Likewise. >>        * gcc.target/i386/vectorize8.c: Likewise. >> >>        * gcc.target/i386/vect-double-1.c: New. >>        * gcc.target/i386/vect-double-1a.c: Likewise. >>        * gcc.target/i386/vect-double-2.c: Likewise. >>        * gcc.target/i386/vect-double-2a.c: Likewise. >> >>        * lib/target-supports.exp (check_effective_target_vect_double): >>        Set et_vect_double_saved to 0 when tuning for Atom. > > OK, but see comments below ... > >> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c >> index 1d79a18..7d165bb 100644 >> --- a/gcc/config/i386/i386.c >> +++ b/gcc/config/i386/i386.c >> @@ -1627,6 +1627,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { >>   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag >>      will impact LEA instruction selection. */ >>   m_ATOM, >> + >> +  /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector >> +     instructions.  */ >> +  ~m_ATOM, >>  }; >> >>  /* Feature tests against the various architecture variations.  
*/ >> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h >> index 91238d5..2acf60a 100644 >> --- a/gcc/config/i386/i386.h >> +++ b/gcc/config/i386/i386.h >> @@ -312,6 +312,7 @@ enum ix86_tune_indices { >>   X86_TUNE_USE_VECTOR_CONVERTS, >>   X86_TUNE_FUSE_CMP_AND_BRANCH, >>   X86_TUNE_OPT_AGU, >> +  X86_TUNE_VECTORIZE_DOUBLE, >> >>   X86_TUNE_LAST >>  }; >> @@ -404,6 +405,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; >>  #define TARGET_FUSE_CMP_AND_BRANCH \ >>        ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH] >>  #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU] >> +#define TARGET_VECTORIZE_DOUBLE \ >> +       ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE] >> >>  /* Feature tests against the various architecture variations.  */ >>  enum ix86_arch_indices { >> @@ -1037,8 +1040,10 @@ enum target_cpu_default >>    different sizes for integer and floating point vectors.  We limit >>    vector size to 16byte.  */ >>  #define UNITS_PER_SIMD_WORD(MODE)                                      \ >> -  (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)     \ >> -             : (TARGET_SSE ? 16 : UNITS_PER_WORD)) >> +  ((MODE) == DFmode && !TARGET_VECTORIZE_DOUBLE                                \ >> +   ? UNITS_PER_WORD                                                    \ >> +   : (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16)  \ >> +                : (TARGET_SSE ? 16 : UNITS_PER_WORD))) > > Please rewrite this function to a helper function using switch > statement. I must admit I'm not able to parse this mess. > This is the patch I checked in. Thanks. 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 900b424..c4c61f7 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -50,6 +50,7 @@ extern bool x86_extended_QIreg_mentioned_p (rtx); extern bool x86_extended_reg_mentioned_p (rtx); extern bool x86_maybe_negate_const_int (rtx *, enum machine_mode); extern enum machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx); +extern unsigned int ix86_units_per_simd_word (enum machine_mode); extern int avx_vpermilp_parallel (rtx par, enum machine_mode mode); extern int avx_vperm2f128_parallel (rtx par, enum machine_mode mode); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 1d79a18..a1bc205 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1627,6 +1627,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag will impact LEA instruction selection. */ m_ATOM, + + /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector + instructions. */ + ~m_ATOM, }; /* Feature tests against the various architecture variations. */ @@ -32334,6 +32338,28 @@ has_dispatch (rtx insn, int action) return false; } +/* ??? No autovectorization into MMX or 3DNOW until we can reliably + place emms and femms instructions. */ + +unsigned int +ix86_units_per_simd_word (enum machine_mode mode) +{ + /* Disable double precision vectorizer if needed. */ + if (mode == DFmode && !TARGET_VECTORIZE_DOUBLE) + return UNITS_PER_WORD; + +#if 0 + /* FIXME: AVX has 32byte floating point vector operations and 16byte + integer vector operations. But vectorizer doesn't support + different sizes for integer and floating point vectors. We limit + vector size to 16byte. */ + if (TARGET_AVX) + return (mode == DFmode || mode == SFmode) ? 32 : 16; + else +#endif + return TARGET_SSE ? 16 : UNITS_PER_WORD; +} + /* Initialize the GCC target structure. 
*/ #undef TARGET_RETURN_IN_MEMORY #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 91238d5..6fc2249 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -312,6 +312,7 @@ enum ix86_tune_indices { X86_TUNE_USE_VECTOR_CONVERTS, X86_TUNE_FUSE_CMP_AND_BRANCH, X86_TUNE_OPT_AGU, + X86_TUNE_VECTORIZE_DOUBLE, X86_TUNE_LAST }; @@ -404,6 +405,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_FUSE_CMP_AND_BRANCH \ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH] #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU] +#define TARGET_VECTORIZE_DOUBLE \ + ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { @@ -1030,15 +1033,7 @@ enum target_cpu_default || (MODE) == V2SImode || (MODE) == SImode \ || (MODE) == V4HImode || (MODE) == V8QImode) -/* ??? No autovectorization into MMX or 3DNOW until we can reliably - place emms and femms instructions. - FIXME: AVX has 32byte floating point vector operations and 16byte - integer vector operations. But vectorizer doesn't support - different sizes for integer and floating point vectors. We limit - vector size to 16byte. */ -#define UNITS_PER_SIMD_WORD(MODE) \ - (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16) \ - : (TARGET_SSE ? 
16 : UNITS_PER_WORD)) +#define UNITS_PER_SIMD_WORD(MODE) ix86_units_per_simd_word (MODE) #define VALID_DFP_MODE_P(MODE) \ ((MODE) == SDmode || (MODE) == DDmode || (MODE) == TDmode) diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-vector.c b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c index 714b743..1bd2ce4 100644 --- a/gcc/testsuite/gcc.target/i386/fma4-256-vector.c +++ b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c @@ -3,7 +3,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target lp64 } */ -/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */ +/* { dg-options "-O2 -mfma4 -ftree-vectorize -mtune=generic" } */ extern void exit (int); diff --git a/gcc/testsuite/gcc.target/i386/fma4-vector.c b/gcc/testsuite/gcc.target/i386/fma4-vector.c index df8463e..da12780 100644 --- a/gcc/testsuite/gcc.target/i386/fma4-vector.c +++ b/gcc/testsuite/gcc.target/i386/fma4-vector.c @@ -3,7 +3,7 @@ /* { dg-do compile } */ /* { dg-require-effective-target lp64 } */ -/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */ +/* { dg-options "-O2 -mfma4 -ftree-vectorize -mtune=generic" } */ extern void exit (int); diff --git a/gcc/testsuite/gcc.target/i386/vect-double-1.c b/gcc/testsuite/gcc.target/i386/vect-double-1.c new file mode 100644 index 0000000..87e5fe9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-double-1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -march=core2 -fdump-tree-vect-stats" } */ + +extern void abort (void); + +#ifndef STATIC +#define STATIC +#endif + +#define N 16 + +double cb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; +double ca[N]; + +STATIC void +__attribute__ ((noinline)) +sse2_test (void) +{ + int i; + + for (i = 0; i < N; i++) + { + ca[i] = cb[i]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (ca[i] != cb[i]) + abort (); + } +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git 
a/gcc/testsuite/gcc.target/i386/vect-double-1a.c b/gcc/testsuite/gcc.target/i386/vect-double-1a.c new file mode 100644 index 0000000..a62c939 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-double-1a.c @@ -0,0 +1,8 @@ +/* { dg-do run } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -msse2 -mtune=core2" } */ + +#define STATIC static + +#include "vect-double-1.c" +#include "sse2-check.h" diff --git a/gcc/testsuite/gcc.target/i386/vect-double-2.c b/gcc/testsuite/gcc.target/i386/vect-double-2.c new file mode 100644 index 0000000..a76dcb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-double-2.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -msse2 -mtune=atom -fdump-tree-vect-stats" } */ + +extern void abort (void); + +#ifndef STATIC +#define STATIC +#endif + +#define N 16 + +double cb[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45}; +double ca[N]; + +STATIC void +__attribute__ ((noinline)) +sse2_test (void) +{ + int i; + + for (i = 0; i < N; i++) + { + ca[i] = cb[i]; + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (ca[i] != cb[i]) + abort (); + } +} + +/* { dg-final { scan-tree-dump-not "vectorized 1 loops" "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ diff --git a/gcc/testsuite/gcc.target/i386/vect-double-2a.c b/gcc/testsuite/gcc.target/i386/vect-double-2a.c new file mode 100644 index 0000000..94f8062 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-double-2a.c @@ -0,0 +1,8 @@ +/* { dg-do run } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-options "-O2 -ftree-vectorize -mfpmath=sse -msse2 -mtune=atom" } */ + +#define STATIC static + +#include "vect-double-2.c" +#include "sse2-check.h" diff --git a/gcc/testsuite/gcc.target/i386/vectorize2.c b/gcc/testsuite/gcc.target/i386/vectorize2.c index 4196487..427e2d4 100644 --- a/gcc/testsuite/gcc.target/i386/vectorize2.c +++ 
b/gcc/testsuite/gcc.target/i386/vectorize2.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-require-effective-target ilp32 } */ -/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse" } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mtune=generic" } */ double a[256]; int b[256]; diff --git a/gcc/testsuite/gcc.target/i386/vectorize4.c b/gcc/testsuite/gcc.target/i386/vectorize4.c index f3d605e..557d0a2 100644 --- a/gcc/testsuite/gcc.target/i386/vectorize4.c +++ b/gcc/testsuite/gcc.target/i386/vectorize4.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-require-effective-target ilp32 } */ -/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 --param ggc-min-expand=0 --param ggc-min-heapsize=0" } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mtune=generic --param ggc-min-expand=0 --param ggc-min-heapsize=0" } */ /* This test, tests two thing, we vectorize square root and also we don't crash due to a GC issue. */ diff --git a/gcc/testsuite/gcc.target/i386/vectorize5.c b/gcc/testsuite/gcc.target/i386/vectorize5.c index 3894240..04f044f 100644 --- a/gcc/testsuite/gcc.target/i386/vectorize5.c +++ b/gcc/testsuite/gcc.target/i386/vectorize5.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-require-effective-target lp64 } */ -/* { dg-options "-O2 -ftree-vectorize -mveclibabi=acml -ffast-math" } */ +/* { dg-options "-O2 -ftree-vectorize -mveclibabi=acml -ffast-math -mtune=generic" } */ double x[256]; diff --git a/gcc/testsuite/gcc.target/i386/vectorize6.c b/gcc/testsuite/gcc.target/i386/vectorize6.c index 78ec53d..d299a15 100644 --- a/gcc/testsuite/gcc.target/i386/vectorize6.c +++ b/gcc/testsuite/gcc.target/i386/vectorize6.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -msse2 -ftree-vectorize -mveclibabi=svml -ffast-math" } */ +/* { dg-options "-O2 -msse2 -ftree-vectorize -mveclibabi=svml -ffast-math -mtune=generic" } */ double x[256]; diff --git a/gcc/testsuite/gcc.target/i386/vectorize8.c 
b/gcc/testsuite/gcc.target/i386/vectorize8.c index ed1517b..a194bb0 100644 --- a/gcc/testsuite/gcc.target/i386/vectorize8.c +++ b/gcc/testsuite/gcc.target/i386/vectorize8.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-vectorize -msse2" } */ +/* { dg-options "-O2 -ftree-vectorize -msse2 -mtune=generic" } */ unsigned int a[256]; double b[256]; diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index fc24b78..de9f21e 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -2309,8 +2309,17 @@ proc check_effective_target_vect_double { } { } else { set et_vect_double_saved 0 if { [istarget i?86-*-*] - || [istarget x86_64-*-*] - || [istarget spu-*-*] } { + || [istarget x86_64-*-*] } { + if { [check_no_compiler_messages vect_double assembly { + #ifdef __tune_atom__ + # error No double vectorizer support. + #endif + }] } { + set et_vect_double_saved 1 + } else { + set et_vect_double_saved 0 + } + } elseif { [istarget spu-*-*] } { set et_vect_double_saved 1 } }