From patchwork Tue Nov 15 19:23:15 2011
X-Patchwork-Submitter: Uros Bizjak
X-Patchwork-Id: 125857
From: Uros Bizjak
To: gcc-patches@gcc.gnu.org
Date: Tue, 15 Nov 2011 20:23:15 +0100
Subject: [PATCH, i386]: Optimize v2df (x2) -> v4sf, v4si conversion sequences for AVX.

Hello!

The attached patch optimizes v2df (x2) -> v4sf,v4si conversion
sequences for AVX from:

	vcvtpd2psx	48(%rsp), %xmm1
	vcvtpd2psx	64(%rsp), %xmm0
	vmovlhps	%xmm0, %xmm1, %xmm0
	vmovaps	%xmm0, 32(%rsp)

to:

	vmovapd	64(%rsp), %xmm0
	vinsertf128	$0x1, 80(%rsp), %ymm0, %ymm0
	vcvtpd2psy	%ymm0, %xmm0
	vmovaps	%xmm0, 32(%rsp)

Note that only one conversion instruction is needed.

In a similar way, the patch optimizes floor/ceil/round from:

	vroundpd	$1, 32(%rsp), %xmm1
	vroundpd	$1, 48(%rsp), %xmm0
	vcvttpd2dqx	%xmm1, %xmm1
	vcvttpd2dqx	%xmm0, %xmm0
	vpunpcklqdq	%xmm0, %xmm1, %xmm0
	vmovdqa	%xmm0, 16(%rsp)

to:

	vroundpd	$1, 64(%rsp), %xmm1
	vroundpd	$1, 80(%rsp), %xmm0
	vinsertf128	$0x1, %xmm0, %ymm1, %ymm0
	vcvttpd2dqy	%ymm0, %xmm0
	vmovdqa	%xmm0, 32(%rsp)

Ideally, this would be just "vcvtpd2psy 64(%rsp), %xmm0" or
"vroundpd $1, 64(%rsp), %ymm1", but the vectorizer does not (yet)
support mixed vectorization factors.

The patch also changes a couple of patterns to use simpler SSE
patterns combined with a vec_concat pattern to generate equivalent
code.

2011-11-15  Uros Bizjak

	* config/i386/sse.md (vec_pack_trunc_v2df): Optimize sequence for AVX.
	(vec_pack_sfix_trunc_v2df): Ditto.
	(vec_pack_sfix_v2df): Ditto.
	(vec_pack_sfix_trunc_v4df): Generate fix_truncv4dfv4si2 and
	avx_vec_concatv8si patterns.
	(vec_pack_sfix_v4df): Generate avx_cvtpd2dq256 and
	avx_vec_concatv8si patterns.

testsuite/ChangeLog:

2011-11-15  Uros Bizjak

	* gcc.target/i386/avx-cvt-2-vec.c: New test.
	* gcc.target/i386/avx-floor-sfix-2-vec.c: Ditto.
	* gcc.target/i386/avx-ceil-sfix-2-vec.c: Ditto.
	* gcc.target/i386/avx-rint-sfix-2-vec.c: Ditto.
	* gcc.target/i386/avx-round-sfix-2-vec.c: Ditto.

Tested on x86_64-pc-linux-gnu {,-m32} AVX target, committed to
mainline SVN.

Uros.
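[Editorial aside, not part of the submitted patch: a minimal sketch of
the kind of kernel these expanders target.  The function names are
illustrative only; assuming the options used by the new tests below
(-O2 -ffast-math -ftree-vectorize -mavx), loops of this shape should
let the vectorizer emit the single-conversion sequences shown above.]

extern double floor (double);

#define NUM 4

/* v2df (x2) -> v4sf packing; with AVX this should go through
   vec_pack_trunc_v2df and end up as a single vcvtpd2psy.  */
void
pack_trunc (const double *src, float *dst)
{
  int i;
  for (i = 0; i < NUM; i++)
    dst[i] = (float) src[i];
}

/* floor followed by v2df (x2) -> v4si packing; with AVX this should
   become two 128-bit vroundpd plus one 256-bit conversion, as in the
   "after" sequence above.  */
void
pack_floor_sfix (const double *src, int *dst)
{
  int i;
  for (i = 0; i < NUM; i++)
    dst[i] = (int) floor (src[i]);
}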
Index: gcc.target/i386/avx-ceil-sfix-2-vec.c
===================================================================
--- gcc.target/i386/avx-ceil-sfix-2-vec.c	(revision 0)
+++ gcc.target/i386/avx-ceil-sfix-2-vec.c	(revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double ceil (double);
+
+#define NUM 4
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+  int i, sign = 1;
+  double f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+	  if ((i % 6) == 0)
+	    f = f * src[i];
+	}
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+TEST (void)
+{
+  double a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) ceil (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) ceil (a[i]))
+      abort();
+}
Index: gcc.target/i386/avx-rint-sfix-2-vec.c
===================================================================
--- gcc.target/i386/avx-rint-sfix-2-vec.c	(revision 0)
+++ gcc.target/i386/avx-rint-sfix-2-vec.c	(revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double rint (double);
+
+#define NUM 4
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+  int i, sign = 1;
+  double f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+	  if ((i % 6) == 0)
+	    f = f * src[i];
+	}
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+TEST (void)
+{
+  double a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) rint (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) rint (a[i]))
+      abort();
+}
Index: gcc.target/i386/avx-round-sfix-2-vec.c
===================================================================
--- gcc.target/i386/avx-round-sfix-2-vec.c	(revision 0)
+++ gcc.target/i386/avx-round-sfix-2-vec.c	(revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double round (double);
+
+#define NUM 4
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+  int i, sign = 1;
+  double f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+	  if ((i % 6) == 0)
+	    f = f * src[i];
+	}
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+TEST (void)
+{
+  double a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) round (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) round (a[i]))
+      abort();
+}
Index: gcc.target/i386/avx-floor-sfix-2-vec.c
===================================================================
--- gcc.target/i386/avx-floor-sfix-2-vec.c	(revision 0)
+++ gcc.target/i386/avx-floor-sfix-2-vec.c	(revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double floor (double);
+
+#define NUM 4
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+  int i, sign = 1;
+  double f = rand ();
+
+  for (i = 0; i < NUM; i++)
+    {
+      src[i] = (i + 1) * f * M_PI * sign;
+      if (i < (NUM / 2))
+	{
+	  if ((i % 6) == 0)
+	    f = f * src[i];
+	}
+      else if (i == (NUM / 2))
+	f = rand ();
+      else if ((i % 6) == 0)
+	f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+      sign = -sign;
+    }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+TEST (void)
+{
+  double a[NUM];
+  int r[NUM];
+  int i;
+
+  init_src (a);
+
+  for (i = 0; i < NUM; i++)
+    r[i] = (int) floor (a[i]);
+
+  /* check results:  */
+  for (i = 0; i < NUM; i++)
+    if (r[i] != (int) floor (a[i]))
+      abort();
+}
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 181378)
+++ config/i386/sse.md	(working copy)
@@ -3038,14 +3038,25 @@
    (match_operand:V2DF 2 "nonimmediate_operand" "")]
   "TARGET_SSE2"
 {
-  rtx r1, r2;
+  rtx tmp0, tmp1;
 
-  r1 = gen_reg_rtx (V4SFmode);
-  r2 = gen_reg_rtx (V4SFmode);
+  if (TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
 
-  emit_insn (gen_sse2_cvtpd2ps (r1, operands[1]));
-  emit_insn (gen_sse2_cvtpd2ps (r2, operands[2]));
-  emit_insn (gen_sse_movlhps (operands[0], r1, r2));
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_avx_cvtpd2ps256 (operands[0], tmp0));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (V4SFmode);
+      tmp1 = gen_reg_rtx (V4SFmode);
+
+      emit_insn (gen_sse2_cvtpd2ps (tmp0, operands[1]));
+      emit_insn (gen_sse2_cvtpd2ps (tmp1, operands[2]));
+      emit_insn (gen_sse_movlhps (operands[0], tmp0, tmp1));
+    }
   DONE;
 })
 
@@ -3057,12 +3068,12 @@
 {
   rtx r1, r2;
 
-  r1 = gen_reg_rtx (V8SImode);
-  r2 = gen_reg_rtx (V8SImode);
+  r1 = gen_reg_rtx (V4SImode);
+  r2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_avx_cvttpd2dq256_2 (r1, operands[1]));
-  emit_insn (gen_avx_cvttpd2dq256_2 (r2, operands[2]));
-  emit_insn (gen_avx_vperm2f128v8si3 (operands[0], r1, r2, GEN_INT (0x20)));
+  emit_insn (gen_fix_truncv4dfv4si2 (r1, operands[1]));
+  emit_insn (gen_fix_truncv4dfv4si2 (r2, operands[2]));
+  emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2));
   DONE;
 })
 
@@ -3072,16 +3083,28 @@
    (match_operand:V2DF 2 "nonimmediate_operand" "")]
   "TARGET_SSE2"
 {
-  rtx r1, r2;
+  rtx tmp0, tmp1;
 
-  r1 = gen_reg_rtx (V4SImode);
-  r2 = gen_reg_rtx (V4SImode);
+  if (TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
 
-  emit_insn (gen_sse2_cvttpd2dq (r1, operands[1]));
-  emit_insn (gen_sse2_cvttpd2dq (r2, operands[2]));
-  emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-					 gen_lowpart (V2DImode, r1),
-					 gen_lowpart (V2DImode, r2)));
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp0));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (V4SImode);
+      tmp1 = gen_reg_rtx (V4SImode);
+
+      emit_insn (gen_sse2_cvttpd2dq (tmp0, operands[1]));
+      emit_insn (gen_sse2_cvttpd2dq (tmp1, operands[2]));
+      emit_insn
+	(gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
+				     gen_lowpart (V2DImode, tmp0),
+				     gen_lowpart (V2DImode, tmp1)));
+    }
   DONE;
 })
 
@@ -3126,12 +3149,12 @@
 {
   rtx r1, r2;
 
-  r1 = gen_reg_rtx (V8SImode);
-  r2 = gen_reg_rtx (V8SImode);
+  r1 = gen_reg_rtx (V4SImode);
+  r2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_avx_cvtpd2dq256_2 (r1, operands[1]));
-  emit_insn (gen_avx_cvtpd2dq256_2 (r2, operands[2]));
-  emit_insn (gen_avx_vperm2f128v8si3 (operands[0], r1, r2, GEN_INT (0x20)));
+  emit_insn (gen_avx_cvtpd2dq256 (r1, operands[1]));
+  emit_insn (gen_avx_cvtpd2dq256 (r2, operands[2]));
+  emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2));
   DONE;
 })
 
@@ -3141,16 +3164,28 @@
    (match_operand:V2DF 2 "nonimmediate_operand" "")]
   "TARGET_SSE2"
 {
-  rtx r1, r2;
+  rtx tmp0, tmp1;
 
-  r1 = gen_reg_rtx (V4SImode);
-  r2 = gen_reg_rtx (V4SImode);
+  if (TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
 
-  emit_insn (gen_sse2_cvtpd2dq (r1, operands[1]));
-  emit_insn (gen_sse2_cvtpd2dq (r2, operands[2]));
-  emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-					 gen_lowpart (V2DImode, r1),
-					 gen_lowpart (V2DImode, r2)));
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_avx_cvtpd2dq256 (operands[0], tmp0));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (V4SImode);
+      tmp1 = gen_reg_rtx (V4SImode);
+
+      emit_insn (gen_sse2_cvtpd2dq (tmp0, operands[1]));
+      emit_insn (gen_sse2_cvtpd2dq (tmp1, operands[2]));
+      emit_insn
+	(gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
+				     gen_lowpart (V2DImode, tmp0),
+				     gen_lowpart (V2DImode, tmp1)));
+    }
   DONE;
 })
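[Editorial aside, not part of the submitted patch: the v4df expanders
changed above (vec_pack_sfix_trunc_v4df, vec_pack_sfix_v4df) are used
when the vectorizer works with 256-bit vectors, e.g. when packing
eight doubles into eight ints.  A sketch with an illustrative function
name; whether the v4df path is actually chosen depends on the
vectorization factor the compiler picks.]

#define N 8

void
pack_sfix_trunc_256 (const double *src, int *dst)
{
  int i;
  for (i = 0; i < N; i++)
    dst[i] = (int) src[i];	/* truncating double -> int conversion */
}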