diff mbox

Improve AVX512F sse4_1_round* patterns

Message ID 20160427213433.GI26501@tucnak.zalov.cz
State New
Headers show

Commit Message

Jakub Jelinek April 27, 2016, 9:34 p.m. UTC
Hi!

While AVX512F doesn't contain EVEX-encoded vround{ss,sd,ps,pd} instructions,
it contains vrndscale* which performs the same thing if bits [4:7] of the
immediate are zero.

For _mm*_round_{ps,pd} we actually already emit vrndscale* for -mavx512f
instead of vround* unconditionally (because
<avx512>_rndscale<mode><mask_name><round_saeonly_name>
instruction has the same RTL as <sse4_1>_round<ssemodesuffix><avxsizesuffix>
and the former, enabled for TARGET_AVX512F, comes first); for the scalar
cases (thus __builtin_round* or _mm*_round_s{s,d}) the patterns we have
don't allow extended registers and thus we end up with unnecessary moves
if the inputs and/or outputs are or could be most effectively allocated
in the xmm16+ registers.

Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
trunk?

2016-04-27  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.md (sse4_1_round<mode>2): Add avx512f alternative.
	* config/i386/sse.md (sse4_1_round<ssescalarmodesuffix>): Likewise.

	* gcc.target/i386/avx-vround-1.c: New test.
	* gcc.target/i386/avx-vround-2.c: New test.
	* gcc.target/i386/avx512vl-vround-1.c: New test.
	* gcc.target/i386/avx512vl-vround-2.c: New test.


	Jakub

Comments

Kirill Yukhin April 28, 2016, 2:27 p.m. UTC | #1
Hi Jakub,
On 27 Apr 23:34, Jakub Jelinek wrote:
> Hi!
> 
> While AVX512F doesn't contain EVEX encoded vround{ss,sd,ps,pd} instructions,
> it contains vrndscale* which performs the same thing if bits [4:7] of the
> immediate are zero.
> 
> For _mm*_round_{ps,pd} we actually already emit vrndscale* for -mavx512f
> instead of vround* unconditionally (because
> <avx512>_rndscale<mode><mask_name><round_saeonly_name>
> instruction has the same RTL as <sse4_1>_round<ssemodesuffix><avxsizesuffix>
> and the former, enabled for TARGET_AVX512F, comes first), for the scalar
> cases (thus __builtin_round* or _mm*_round_s{s,d}) the patterns we have
> don't allow extended registers and thus we end up with unnecessary moves
> if the inputs and/or outputs are or could be most effectively allocated
> in the xmm16+ registers.
> 
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
> trunk?
Your patch is OK.
> 
> 2016-04-27  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* config/i386/i386.md (sse4_1_round<mode>2): Add avx512f alternative.
> 	* config/i386/sse.md (sse4_1_round<ssescalarmodesuffix>): Likewise.
> 
> 	* gcc.target/i386/avx-vround-1.c: New test.
> 	* gcc.target/i386/avx-vround-2.c: New test.
> 	* gcc.target/i386/avx512vl-vround-1.c: New test.
> 	* gcc.target/i386/avx512vl-vround-2.c: New test.

--
Thanks, K
diff mbox

Patch

--- gcc/config/i386/i386.md.jj	2016-04-27 14:34:43.897064531 +0200
+++ gcc/config/i386/i386.md	2016-04-27 14:34:52.402950392 +0200
@@ -15510,15 +15510,19 @@  (define_expand "significand<mode>2"
 
 
 (define_insn "sse4_1_round<mode>2"
-  [(set (match_operand:MODEF 0 "register_operand" "=x")
-	(unspec:MODEF [(match_operand:MODEF 1 "register_operand" "x")
-		       (match_operand:SI 2 "const_0_to_15_operand" "n")]
+  [(set (match_operand:MODEF 0 "register_operand" "=x,v")
+	(unspec:MODEF [(match_operand:MODEF 1 "register_operand" "x,v")
+		       (match_operand:SI 2 "const_0_to_15_operand" "n,n")]
 		      UNSPEC_ROUND))]
   "TARGET_ROUND"
-  "%vround<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}"
+  "@
+   %vround<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}
+   vrndscale<ssemodesuffix>\t{%2, %1, %d0|%d0, %1, %2}"
   [(set_attr "type" "ssecvt")
-   (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "maybe_vex")
+   (set_attr "prefix_extra" "1,*")
+   (set_attr "length_immediate" "*,1")
+   (set_attr "prefix" "maybe_vex,evex")
+   (set_attr "isa" "noavx512f,avx512f")
    (set_attr "mode" "<MODE>")])
 
 (define_insn "rintxf2"
--- gcc/config/i386/sse.md.jj	2016-04-27 14:34:43.903064451 +0200
+++ gcc/config/i386/sse.md	2016-04-27 14:34:52.407950325 +0200
@@ -14867,25 +14867,26 @@  (define_expand "<sse4_1>_round<ssemodesu
 })
 
 (define_insn "sse4_1_round<ssescalarmodesuffix>"
-  [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x")
+  [(set (match_operand:VF_128 0 "register_operand" "=Yr,*x,x,v")
 	(vec_merge:VF_128
 	  (unspec:VF_128
-	    [(match_operand:VF_128 2 "register_operand" "Yr,*x,x")
-	     (match_operand:SI 3 "const_0_to_15_operand" "n,n,n")]
+	    [(match_operand:VF_128 2 "register_operand" "Yr,*x,x,v")
+	     (match_operand:SI 3 "const_0_to_15_operand" "n,n,n,n")]
 	    UNSPEC_ROUND)
-	  (match_operand:VF_128 1 "register_operand" "0,0,x")
+	  (match_operand:VF_128 1 "register_operand" "0,0,x,v")
 	  (const_int 1)))]
   "TARGET_ROUND"
   "@
    round<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}
    round<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vround<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "isa" "noavx,noavx,avx")
+   vround<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vrndscale<ssescalarmodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,noavx,avx,avx512f")
    (set_attr "type" "ssecvt")
    (set_attr "length_immediate" "1")
-   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_data16" "1,1,*,*")
    (set_attr "prefix_extra" "1")
-   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "prefix" "orig,orig,vex,evex")
    (set_attr "mode" "<MODE>")])
 
 (define_expand "round<mode>2"
--- gcc/testsuite/gcc.target/i386/avx-vround-1.c.jj	2016-04-27 14:34:12.785482013 +0200
+++ gcc/testsuite/gcc.target/i386/avx-vround-1.c	2016-04-27 11:49:20.282759808 +0200
@@ -0,0 +1,59 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx -mno-avx2" } */
+
+#include <x86intrin.h>
+
+__attribute__((noinline, noclone)) double
+f1 (double x)
+{
+  return __builtin_round (x);
+}
+
+__attribute__((noinline, noclone)) float
+f2 (float x)
+{
+  return __builtin_roundf (x);
+}
+
+__attribute__((noinline, noclone)) __m128d
+f3 (__m128d x, __m128d y)
+{
+  return _mm_round_sd (x, y, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m128
+f4 (__m128 x, __m128 y)
+{
+  return _mm_round_ss (x, y, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m128d
+f5 (__m128d x)
+{
+  return _mm_round_pd (x, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m128
+f6 (__m128 x)
+{
+  return _mm_round_ps (x, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m256d
+f7 (__m256d x)
+{
+  return _mm256_round_pd (x, _MM_FROUND_NINT);
+}
+
+__attribute__((noinline, noclone)) __m256
+f8 (__m256 x)
+{
+  return _mm256_round_ps (x, _MM_FROUND_NINT);
+}
+
+/* { dg-final { scan-assembler-times "vroundsd\[^\n\r\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vroundss\[^\n\r\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vroundpd\[^\n\r\]*xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vroundps\[^\n\r\]*xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vroundpd\[^\n\r\]*ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vroundps\[^\n\r\]*ymm" 1 } } */
--- gcc/testsuite/gcc.target/i386/avx-vround-2.c.jj	2016-04-27 14:34:18.815401099 +0200
+++ gcc/testsuite/gcc.target/i386/avx-vround-2.c	2016-04-27 14:51:51.912286022 +0200
@@ -0,0 +1,77 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-Ofast -mavx -mno-avx2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+#define SRC "avx-vround-1.c"
+#endif
+
+#include CHECK_H
+#include SRC
+
+static void
+TEST (void)
+{
+  union128d a, ae;
+  union128 b, be;
+  union256d c, ce;
+  union256 d, de;
+  if (f1 (0.5) != 1.0 || f1 (1.5) != 2.0 || f1 (-0.5) != -1.0 || f1 (-1.5) != -2.0)
+    abort ();
+  if (f2 (0.5f) != 1.0f || f2 (1.5f) != 2.0f || f2 (-0.5f) != -1.0f || f2 (-1.5f) != -2.0f)
+    abort ();
+  a.x = f3 (_mm_set1_pd (7.0), _mm_set1_pd (0.5));
+  ae.x = _mm_set_pd (7.0, 0.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  a.x = f3 (_mm_set1_pd (7.0), _mm_set1_pd (1.5));
+  ae.x = _mm_set_pd (7.0, 2.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  a.x = f3 (_mm_set1_pd (7.0), _mm_set1_pd (-0.5));
+  ae.x = _mm_set_pd (7.0, 0.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  a.x = f3 (_mm_set1_pd (7.0), _mm_set1_pd (-1.5));
+  ae.x = _mm_set_pd (7.0, -2.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  b.x = f4 (_mm_set1_ps (7.0f), _mm_set1_ps (0.5f));
+  be.x = _mm_set_ps (7.0f, 7.0f, 7.0f, 0.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  b.x = f4 (_mm_set1_ps (7.0f), _mm_set1_ps (1.5f));
+  be.x = _mm_set_ps (7.0f, 7.0f, 7.0f, 2.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  b.x = f4 (_mm_set1_ps (7.0f), _mm_set1_ps (-0.5f));
+  be.x = _mm_set_ps (7.0f, 7.0f, 7.0f, 0.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  b.x = f4 (_mm_set1_ps (7.0f), _mm_set1_ps (-1.5f));
+  be.x = _mm_set_ps (7.0f, 7.0f, 7.0f, -2.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  a.x = f5 (_mm_set_pd (0.5, 1.5));
+  ae.x = _mm_set_pd (0.0, 2.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  a.x = f5 (_mm_set_pd (-0.5, -1.5));
+  ae.x = _mm_set_pd (0.0, -2.0);
+  if (check_union128d (a, ae.a))
+    abort ();
+  b.x = f6 (_mm_set_ps (0.5f, 1.5f, -0.5f, -1.5f));
+  be.x = _mm_set_ps (0.0f, 2.0f, 0.0f, -2.0f);
+  if (check_union128 (b, be.a))
+    abort ();
+  c.x = f7 (_mm256_set_pd (0.5, 1.5, -0.5, -1.5));
+  ce.x = _mm256_set_pd (0.0, 2.0, 0.0, -2.0);
+  if (check_union256d (c, ce.a))
+    abort ();
+  d.x = f8 (_mm256_set_ps (0.5f, 1.5f, -0.5f, -1.5f, 0.25f, 1.0f, -16.5f, 0.75f));
+  de.x = _mm256_set_ps (0.0f, 2.0f, 0.0f, -2.0f, 0.0f, 1.0f, -16.0f, 1.0f);
+  if (check_union256 (d, de.a))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx512vl-vround-1.c.jj	2016-04-27 14:35:59.146054776 +0200
+++ gcc/testsuite/gcc.target/i386/avx512vl-vround-1.c	2016-04-27 11:52:05.876501110 +0200
@@ -0,0 +1,96 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Ofast -mavx512vl" } */
+
+#include <x86intrin.h>
+
+__attribute__((noinline, noclone)) double
+f1 (double x)
+{
+  register double a __asm__ ("xmm16") = __builtin_round (x);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) float
+f2 (float x)
+{
+  register float a __asm__ ("xmm16") = __builtin_roundf (x);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m128d
+f3 (__m128d x, __m128d y)
+{
+  register __m128d a __asm__ ("xmm16") = x, b __asm__ ("xmm17") = y;
+  __asm__ ("" : "+v" (a), "+v" (b));
+  a = _mm_round_sd (a, b, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m128
+f4 (__m128 x, __m128 y)
+{
+  register __m128 a __asm__ ("xmm16") = x, b __asm__ ("xmm17") = y;
+  __asm__ ("" : "+v" (a), "+v" (b));
+  a = _mm_round_ss (a, b, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m128d
+f5 (__m128d x)
+{
+  register __m128d a __asm__ ("xmm16") = x;
+  __asm__ ("" : "+v" (a));
+  a = _mm_round_pd (a, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m128
+f6 (__m128 x)
+{
+  register __m128 a __asm__ ("xmm16") = x;
+  __asm__ ("" : "+v" (a));
+  a = _mm_round_ps (a, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m256d
+f7 (__m256d x)
+{
+  register __m256d a __asm__ ("xmm16") = x;
+  __asm__ ("" : "+v" (a));
+  a = _mm256_round_pd (a, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+__attribute__((noinline, noclone)) __m256
+f8 (__m256 x)
+{
+  register __m256 a __asm__ ("xmm16") = x;
+  __asm__ ("" : "+v" (a));
+  a = _mm256_round_ps (a, _MM_FROUND_NINT);
+  __asm__ ("" : "+v" (a));
+  return a;
+}
+
+/* Instead of vround{sd,ss,pd,ps} this should use vrndscale{sd,ss,pd,ps}
+   counterparts, so that [xy]mm1[67] can be referenced directly in the
+   instructions.  */
+/* { dg-final { scan-assembler-times "vrndscalesd\[^\n\r\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vrndscaless\[^\n\r\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vrndscalepd\[^\n\r\]*xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vrndscaleps\[^\n\r\]*xmm" 1 } } */
+/* { dg-final { scan-assembler-times "vrndscalepd\[^\n\r\]*ymm" 1 } } */
+/* { dg-final { scan-assembler-times "vrndscaleps\[^\n\r\]*ymm" 1 } } */
+/* { dg-final { scan-assembler-not "vroundsd\[^\n\r\]*xmm" } } */
+/* { dg-final { scan-assembler-not "vroundss\[^\n\r\]*xmm" } } */
+/* { dg-final { scan-assembler-not "vroundpd\[^\n\r\]*xmm" } } */
+/* { dg-final { scan-assembler-not "vroundps\[^\n\r\]*xmm" } } */
+/* { dg-final { scan-assembler-not "vroundpd\[^\n\r\]*ymm" } } */
+/* { dg-final { scan-assembler-not "vroundps\[^\n\r\]*ymm" } } */
--- gcc/testsuite/gcc.target/i386/avx512vl-vround-2.c.jj	2016-04-27 14:37:33.405789918 +0200
+++ gcc/testsuite/gcc.target/i386/avx512vl-vround-2.c	2016-04-27 14:41:57.432251717 +0200
@@ -0,0 +1,9 @@ 
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-Ofast -mavx512vl" } */
+
+#define CHECK_H "avx512vl-check.h"
+#define TEST avx512vl_test
+#define SRC "avx512vl-vround-1.c"
+
+#include "avx-vround-2.c"