diff mbox

Improve AVX512F sse4_1_round* patterns

Message ID 20160427213433.GI26501@tucnak.zalov.cz
State New
Headers show

Commit Message

Jakub Jelinek April 27, 2016, 9:34 p.m. UTC
Hi!

While AVX512F doesn't contain EVEX-encoded vround{ss,sd,ps,pd} instructions,
it contains vrndscale* which performs the same thing if bits [4:7] of the
immediate are zero.

For _mm*_round_{ps,pd} we actually already emit vrndscale* for -mavx512f
instead of vround* unconditionally (because
<avx512>_rndscale<mode><mask_name><round_saeonly_name>
instruction has the same RTL as <sse4_1>_round<ssemodesuffix><avxsizesuffix>
and the former, enabled for TARGET_AVX512F, comes first); for the scalar
cases (thus __builtin_round* or _mm*_round_s{s,d}) the patterns we have
don't allow extended registers and thus we end up with unnecessary moves
if the inputs and/or outputs are or could be most effectively allocated
in the xmm16+ registers.

Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
trunk?

2016-04-27  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.md (sse4_1_round<mode>2): Add avx512f alternative.
	* config/i386/sse.md (sse4_1_round<ssescalarmodesuffix>): Likewise.

	* gcc.target/i386/avx-vround-1.c: New test.
	* gcc.target/i386/avx-vround-2.c: New test.
	* gcc.target/i386/avx512vl-vround-1.c: New test.
	* gcc.target/i386/avx512vl-vround-2.c: New test.


	Jakub

Comments

Kirill Yukhin April 28, 2016, 2:27 p.m. UTC | #1
Hi Jakub,
On 27 Apr 23:34, Jakub Jelinek wrote:
> Hi!
> 
> While AVX512F doesn't contain EVEX encoded vround{ss,sd,ps,pd} instructions,
> it contains vrndscale* which performs the same thing if bits [4:7] of the
> immediate are zero.
> 
> For _mm*_round_{ps,pd} we actually already emit vrndscale* for -mavx512f
> instead of vround* unconditionally (because
> <avx512>_rndscale<mode><mask_name><round_saeonly_name>
> instruction has the same RTL as <sse4_1>_round<ssemodesuffix><avxsizesuffix>
> and the former, enabled for TARGET_AVX512F, comes first), for the scalar
> cases (thus __builtin_round* or _mm*_round_s{s,d}) the patterns we have
> don't allow extended registers and thus we end up with unnecessary moves
> if the inputs and/or outputs are or could be most effectively allocated
> in the xmm16+ registers.
> 
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
> trunk?
Your patch is OK.
> 
> 2016-04-27  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* config/i386/i386.md (sse4_1_round<mode>2): Add avx512f alternative.
> 	* config/i386/sse.md (sse4_1_round<ssescalarmodesuffix>): Likewise.
> 
> 	* gcc.target/i386/avx-vround-1.c: New test.
> 	* gcc.target/i386/avx-vround-2.c: New test.
> 	* gcc.target/i386/avx512vl-vround-1.c: New test.
> 	* gcc.target/i386/avx512vl-vround-2.c: New test.

--
Thanks, K
diff mbox

Patch

--- gcc/config/i386/i386.md.jj	2016-04-27 14:34:43.897064531 +0200
+++ gcc/config/i386/i386.md	2016-04-27 14:34:52.402950392 +0200
@@ -15510,15 +15510,19 @@  (define_expand "significand<mode>2"
 
 
 (define_insn "sse4_1_round<mode>2"
-  [(set (match_operand:MODEF 0 "register_operand" "=x")
-	(unspec:MODEF [(match_operand:MODEF 1 "register_operand" "x")
-		       (match_operand:SI 2 "const_0_to_15_operand" "n")]
+  [(set (match_operand:MODEF 0 "register_operand" "=x,v")
+	(unspec:MODEF [(match_operand:MODEF 1 "register_operand" "x,v")
+		       (match_operand:SI 2 "const_0_to_15_operand" "n,n")]
 		      UNSPEC_ROUND))]
   "TARGET_ROUND"
-  "%vround<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}"
+  "@
+   %vround<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}
+   vrndscale<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}"
   [(set_attr "type" "ssecvt")
-   (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "maybe_vex")
+   (set_attr "prefix_extra" "1,*")
+   (set_attr "length_immediate" "*,1")
+   (set_attr "prefix" "maybe_vex,evex")
+   (set_attr "isa" "noavx512f,avx512f")
    (set_attr "mode" "<MODE>")])
 
 (define_insn "rintxf2"
--- gcc/config/i386/sse.md.jj	2016-04-27 14:34:43.903064451 +0200
+++ gcc/config/i386/sse.md	2016-04-27 14:34:52.407950325 +0200
@@ -14867,25 +14867,26 @@  (define_expand "<sse4_1>_round<ssemodesu
 })
 
 (define_insn "sse4_1_round<ssescalarmodesuffix>"
-  [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x")
+  [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x,v")
 	(vec_merge:VF_128
 	  (unspec:VF_128
-	    [(match_operand:VF_128 2 "register_operand" "Yr,*x,x")
-	     (match_operand:SI 3 "const_0_to_15_operand" "n,n,n")]
+	    [(match_operand:VF_128 2 "register_operand" "Yr,*x,x,v")
+	     (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")]
 	    UNSPEC_ROUND)
-	  (match_operand:VF_128 1 "register_operand" "0,0,x")
+	  (match_operand:VF_128 1 "register_operand" "0,0,x,v")
 	  (const_int 1)))]
   "TARGET_ROUND"
   "@
    round<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}
    round<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vround<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "isa" "noavx,noavx,avx")
+   vround<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vrndscale<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,noavx,avx,avx512f")
    (set_attr "type" "ssecvt")
    (set_attr "length_immediate" "1")
-   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_data16" "1,1,*,*")
    (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "prefix" "orig,orig,vex,evex")
    (set_attr "mode" "<MODE>")])
 
 (define_expand "round<mode>2"
--- gcc/testsuite/gcc.target/i386/avx-vround-1.c.jj	2016-04-27 14:34:12.785482013 +0200
+++ gcc/testsuite/gcc.target/i386/avx-vround-1.c	2016-04-27 11:49:20.282759808 +0200
@@ -0,0 +1,59 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx -mno-avx2" } */
+
+#include <x86intrin.h>
+
+__attribute__((noinline, noclone)) double
+f1 (double x)
+{
+  return __builtin_round (x);
+}
+
+__attribute__((noinline, noclone)) float
+f2 (float x)
+{
+  return __builtin_roundf (x);
+}
+
+__attribute__((noinline, noclone)) __m128d
+f3 (__m128d x, __m128d y)
+{
+  return _mm_round_sd (x, y, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m128
+f4 (__m128 x, __m128 y)
+{
+  return _mm_round_ss (x, y, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m128d
+f5 (__m128d x)
+{
+  return _mm_round_pd (x, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m128
+f6 (__m128 x)
+{
+  return _mm_round_ps (x, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m256d
+f7 (__m256d x)
+{
+  return _mm256_round_pd (x, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m256
+f8 (__m256 x)
+{
+  return _mm256_round_ps (x, _MM_FROUND_NINT);
+}
+
+/* { dg-final { scan-assembler-times "vroundsd\[^\n\r\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vroundss\[^\n\r\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vroundpd\[^\n\r\]*xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vroundps\[^\n\r\]*xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vroundpd\[^\n\r\]*ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vroundps\[^\n\r\]*ymm" 1 } } */
--- gcc/testsuite/gcc.target/i386/avx-vround-2.c.jj	2016-04-27 14:34:18.815401099 +0200
+++ gcc/testsuite/gcc.target/i386/avx-vround-2.c	2016-04-27 14:51:51.912286022 +0200
@@ -0,0 +1,77 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-Ofast -mavx -mno-avx2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+#define SRC "avx-vround-1.c"
+#endif
+
+#include CHECK_H
+#include SRC
+
+static void
+TEST (void)
+{
+  union128d a, ae;
+  union128 b, be;
+  union256d c, ce;
+  union256 d, de;
+  if (f1 (0.5) != 1.0 || f1 (1.5) != 2.0 || f1 (-0.5) != -1.0 || f1 (-1.5) != -2.0)
+    abort ();
+  if (f2 (0.5f) != 1.0f || f2 (1.5f) != 2.0f || f2 (-0.5f) != -1.0f || f2 (-1.5f) != -2.0f)
+    abort ();
+  a.x = f3 (_mm_set1_pd (7.0), _mm_set1_pd (0.5));
+  ae.x = _mm_set_pd (7.0, 0.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  a.x = f3 (_mm_set1_pd (7.0), _mm_set1_pd (1.5));
+  ae.x = _mm_set_pd (7.0, 2.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  a.x = f3 (_mm_set1_pd (7.0), _mm_set1_pd (-0.5));
+  ae.x = _mm_set_pd (7.0, 0.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  a.x = f3 (_mm_set1_pd (7.0), _mm_set1_pd (-1.5));
+  ae.x = _mm_set_pd (7.0, -2.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  b.x = f4 (_mm_set1_ps (7.0f), _mm_set1_ps (0.5f));
+  be.x = _mm_set_ps (7.0f, 7.0f, 7.0f, 0.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  b.x = f4 (_mm_set1_ps (7.0f), _mm_set1_ps (1.5f));
+  be.x = _mm_set_ps (7.0f, 7.0f, 7.0f, 2.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  b.x = f4 (_mm_set1_ps (7.0f), _mm_set1_ps (-0.5f));
+  be.x = _mm_set_ps (7.0f, 7.0f, 7.0f, 0.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  b.x = f4 (_mm_set1_ps (7.0f), _mm_set1_ps (-1.5f));
+  be.x = _mm_set_ps (7.0f, 7.0f, 7.0f, -2.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  a.x = f5 (_mm_set_pd (0.5, 1.5));
+  ae.x = _mm_set_pd (0.0, 2.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  a.x = f5 (_mm_set_pd (-0.5, -1.5));
+  ae.x = _mm_set_pd (0.0, -2.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  b.x = f6 (_mm_set_ps (0.5f, 1.5f, -0.5f, -1.5f));
+  be.x = _mm_set_ps (0.0f, 2.0f, 0.0f, -2.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  c.x = f7 (_mm256_set_pd (0.5, 1.5, -0.5, -1.5));
+  ce.x = _mm256_set_pd (0.0, 2.0, 0.0, -2.0);
+  if (check_union256d (c, ce.a))
+    abort ();
+  d.x = f8 (_mm256_set_ps (0.5f, 1.5f, -0.5f, -1.5f, 0.25f, 1.0f, -16.5f, 0.75f));
+  de.x = _mm256_set_ps (0.0f, 2.0f, 0.0f, -2.0f, 0.0f, 1.0f, -16.0f, 1.0f);
+  if (check_union256 (d, de.a))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512vl-vround-1.c.jj	2016-04-27 14:35:59.146054776 +0200
+++ gcc/testsuite/gcc.target/i386/avx512vl-vround-1.c	2016-04-27 11:52:05.876501110 +0200
@@ -0,0 +1,96 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Ofast -mavx512vl" } */
+
+#include <x86intrin.h>
+
+__attribute__((noinline, noclone)) double
+f1 (double x)
+{
+  register double a __asm__ ("xmm16") = __builtin_round (x);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) float
+f2 (float x)
+{
+  register float a __asm__ ("xmm16") = __builtin_roundf (x);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m128d
+f3 (__m128d x, __m128d y)
+{
+  register __m128d a __asm__ ("xmm16") = x, b __asm__ ("xmm17") = y;
+  __asm__ ("" : "+v" (a), "+v" (b));
+  a = _mm_round_sd (a, b, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m128
+f4 (__m128 x, __m128 y)
+{
+  register __m128 a __asm__ ("xmm16") = x, b __asm__ ("xmm17") = y;
+  __asm__ ("" : "+v" (a), "+v" (b));
+  a = _mm_round_ss (a, b, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m128d
+f5 (__m128d x)
+{
+  register __m128d a __asm__ ("xmm16") = x;
+  __asm__ ("" : "+v" (a));
+  a = _mm_round_pd (a, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m128
+f6 (__m128 x)
+{
+  register __m128 a __asm__ ("xmm16") = x;
+  __asm__ ("" : "+v" (a));
+  a = _mm_round_ps (a, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m256d
+f7 (__m256d x)
+{
+  register __m256d a __asm__ ("xmm16") = x;
+  __asm__ ("" : "+v" (a));
+  a = _mm256_round_pd (a, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m256
+f8 (__m256 x)
+{
+  register __m256 a __asm__ ("xmm16") = x;
+  __asm__ ("" : "+v" (a));
+  a = _mm256_round_ps (a, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+/* Instead of vround{sd,ss,pd,ps} this should use vrndscale{sd,ss,pd,ps}
+   counterparts, so that [xy]mm1[67] can be referenced directly in the
+   instructions.  */
+/* { dg-final { scan-assembler-times "vrndscalesd\[^\n\r\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vrndscaless\[^\n\r\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vrndscalepd\[^\n\r\]*xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vrndscaleps\[^\n\r\]*xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vrndscalepd\[^\n\r\]*ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vrndscaleps\[^\n\r\]*ymm" 1 } } */
+/* { dg-final { scan-assembler-not "vroundsd\[^\n\r\]*xmm" } } */
+/* { dg-final { scan-assembler-not "vroundss\[^\n\r\]*xmm" } } */
+/* { dg-final { scan-assembler-not "vroundpd\[^\n\r\]*xmm" } } */
+/* { dg-final { scan-assembler-not "vroundps\[^\n\r\]*xmm" } } */
+/* { dg-final { scan-assembler-not "vroundpd\[^\n\r\]*ymm" } } */
+/* { dg-final { scan-assembler-not "vroundps\[^\n\r\]*ymm" } } */
--- gcc/testsuite/gcc.target/i386/avx512vl-vround-2.c.jj	2016-04-27 14:37:33.405789918 +0200
+++ gcc/testsuite/gcc.target/i386/avx512vl-vround-2.c	2016-04-27 14:41:57.432251717 +0200
@@ -0,0 +1,9 @@ 
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-Ofast -mavx512vl" } */
+
+#define CHECK_H "avx512vl-check.h"
+#define TEST avx512vl_test
+#define SRC "avx512vl-vround-1.c"
+
+#include "avx-vround-2.c"