diff mbox

[i386,AVX-512ER] vrsqrt28ps auto generation

Message ID 20160620193512.GB21416@msticlxl57.ims.intel.com
State New
Headers show

Commit Message

Ilya Verbin June 20, 2016, 7:35 p.m. UTC
Hi!

This patch emits the vrsqrt28ps instruction in ix86_emit_swsqrtsf for the recip
case and vrcp28ps(vrsqrt28ps(a)) for the !recip case.
Regtested using various benchmarks on an AVX-512ER machine.  OK for trunk?


gcc/
	* config/i386/i386.c (ix86_emit_swsqrtsf): Emit vrsqrt28ps.
	* config/i386/sse.md (define_expand "rsqrtv16sf2"): New.
gcc/testsuite/
	* gcc.target/i386/avx512er-vrsqrt28ps-3.c: New test.
	* gcc.target/i386/avx512er-vrsqrt28ps-4.c: New test.
	* gcc.target/i386/avx512er-vrsqrt28ps-5.c: New test.
	* gcc.target/i386/avx512er-vrsqrt28ps-6.c: New test.




  -- Ilya

Comments

Uros Bizjak June 20, 2016, 7:43 p.m. UTC | #1
On Mon, Jun 20, 2016 at 9:35 PM, Ilya Verbin <iverbin@gmail.com> wrote:
> Hi!
>
> This patch emits vrsqrt28ps instruction in ix86_emit_swsqrtsf for recip case and
> vrcp28ps(vrsqrt28ps(a)) for !recip.
> Regtested using various benchmarks on a AVX-512ER machine.  OK for trunk?
>
>
> gcc/
>         * config/i386/i386.c (ix86_emit_swsqrtsf): Emit vrsqrt28ps.
>         * config/i386/sse.md (define_expand "rsqrtv16sf2"): New.
> gcc/testsuite/
>         * gcc.target/i386/avx512er-vrsqrt28ps-3.c: New test.
>         * gcc.target/i386/avx512er-vrsqrt28ps-4.c: New test.
>         * gcc.target/i386/avx512er-vrsqrt28ps-5.c: New test.
>         * gcc.target/i386/avx512er-vrsqrt28ps-6.c: New test.

OK.

Thanks,
Uros.

>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 8e0bf26..edd3d23 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -48722,6 +48722,24 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
>    e2 = gen_reg_rtx (mode);
>    e3 = gen_reg_rtx (mode);
>
> +  if (TARGET_AVX512ER && mode == V16SFmode)
> +    {
> +      if (recip)
> +       /* res = rsqrt28(a) estimate */
> +       emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
> +                                                    UNSPEC_RSQRT28)));
> +      else
> +       {
> +         /* x0 = rsqrt28(a) estimate */
> +         emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
> +                                                     UNSPEC_RSQRT28)));
> +         /* res = rcp28(x0) estimate */
> +         emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
> +                                                      UNSPEC_RCP28)));
> +       }
> +      return;
> +    }
> +
>    real_from_integer (&r, VOIDmode, -3, SIGNED);
>    mthree = const_double_from_real_value (r, SFmode);
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 6056ddc..c1ea04f 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1559,6 +1559,17 @@
>    DONE;
>  })
>
> +(define_expand "rsqrtv16sf2"
> +  [(set (match_operand:V16SF 0 "register_operand")
> +       (unspec:V16SF
> +         [(match_operand:V16SF 1 "vector_operand")]
> +         UNSPEC_RSQRT28))]
> +  "TARGET_SSE_MATH && TARGET_AVX512ER"
> +{
> +  ix86_emit_swsqrtsf (operands[0], operands[1], V16SFmode, true);
> +  DONE;
> +})
> +
>  (define_insn "<sse>_rsqrt<mode>2"
>    [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
>         (unspec:VF1_128_256
> diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c
> new file mode 100644
> index 0000000..1ba8172
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c
> @@ -0,0 +1,47 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target avx512er } */
> +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
> +
> +#include <math.h>
> +#include "avx512er-check.h"
> +
> +#define MAX 1000
> +#define EPS 0.00001
> +
> +__attribute__ ((noinline, optimize (1)))
> +void static
> +compute_rsqrt_ref (float *a, float *r)
> +{
> +  for (int i = 0; i < MAX; i++)
> +    r[i] = 1.0 / sqrtf (a[i]);
> +}
> +
> +__attribute__ ((noinline))
> +void static
> +compute_rsqrt_exp (float *a, float *r)
> +{
> +  for (int i = 0; i < MAX; i++)
> +    r[i] = 1.0 / sqrtf (a[i]);
> +}
> +
> +void static
> +avx512er_test (void)
> +{
> +  float in[MAX];
> +  float ref[MAX];
> +  float exp[MAX];
> +
> +  for (int i = 0; i < MAX; i++)
> +    in[i] = 8765.987 - 8.6756 * i;
> +
> +  compute_rsqrt_ref (in, ref);
> +  compute_rsqrt_exp (in, exp);
> +
> +  for (int i = 0; i < MAX; i++)
> +    {
> +      float rel_err = (ref[i] - exp[i]) / ref[i];
> +      rel_err = rel_err > 0.0 ? rel_err : -rel_err;
> +      if (rel_err > EPS)
> +       abort ();
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c
> new file mode 100644
> index 0000000..2f5f73f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
> +
> +#include "avx512er-vrsqrt28ps-3.c"
> +
> +/* { dg-final { scan-assembler-times "vrsqrt28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-not "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c
> new file mode 100644
> index 0000000..e067a81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c
> @@ -0,0 +1,47 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target avx512er } */
> +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
> +
> +#include <math.h>
> +#include "avx512er-check.h"
> +
> +#define MAX 1000
> +#define EPS 0.00001
> +
> +__attribute__ ((noinline, optimize (1)))
> +void static
> +compute_sqrt_ref (float *a, float *r)
> +{
> +  for (int i = 0; i < MAX; i++)
> +    r[i] = sqrtf (a[i]);
> +}
> +
> +__attribute__ ((noinline))
> +void static
> +compute_sqrt_exp (float *a, float *r)
> +{
> +  for (int i = 0; i < MAX; i++)
> +    r[i] = sqrtf (a[i]);
> +}
> +
> +void static
> +avx512er_test (void)
> +{
> +  float in[MAX];
> +  float ref[MAX];
> +  float exp[MAX];
> +
> +  for (int i = 0; i < MAX; i++)
> +    in[i] = 8765.987 - 8.6756 * i;
> +
> +  compute_sqrt_ref (in, ref);
> +  compute_sqrt_exp (in, exp);
> +
> +  for (int i = 0; i < MAX; i++)
> +    {
> +      float rel_err = (ref[i] - exp[i]) / ref[i];
> +      rel_err = rel_err > 0.0 ? rel_err : -rel_err;
> +      if (rel_err > EPS)
> +       abort ();
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c
> new file mode 100644
> index 0000000..77c5cba
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
> +
> +#include "avx512er-vrsqrt28ps-5.c"
> +
> +/* { dg-final { scan-assembler-times "vrsqrt28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
>
>
>   -- Ilya
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 8e0bf26..edd3d23 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -48722,6 +48722,24 @@  void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
   e2 = gen_reg_rtx (mode);
   e3 = gen_reg_rtx (mode);
 
+  if (TARGET_AVX512ER && mode == V16SFmode)
+    {
+      if (recip)
+	/* res = rsqrt28(a) estimate */
+	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+						     UNSPEC_RSQRT28)));
+      else
+	{
+	  /* x0 = rsqrt28(a) estimate */
+	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+						      UNSPEC_RSQRT28)));
+	  /* res = rcp28(x0) estimate */
+	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
+						       UNSPEC_RCP28)));
+	}
+      return;
+    }
+
   real_from_integer (&r, VOIDmode, -3, SIGNED);
   mthree = const_double_from_real_value (r, SFmode);
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6056ddc..c1ea04f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1559,6 +1559,17 @@ 
   DONE;
 })
 
+(define_expand "rsqrtv16sf2"
+  [(set (match_operand:V16SF 0 "register_operand")
+	(unspec:V16SF
+	  [(match_operand:V16SF 1 "vector_operand")]
+	  UNSPEC_RSQRT28))]
+  "TARGET_SSE_MATH && TARGET_AVX512ER"
+{
+  ix86_emit_swsqrtsf (operands[0], operands[1], V16SFmode, true);
+  DONE;
+})
+
 (define_insn "<sse>_rsqrt<mode>2"
   [(set (match_operand:VF1_128_256 0 "register_operand" "=x")
 	(unspec:VF1_128_256
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c
new file mode 100644
index 0000000..1ba8172
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c
@@ -0,0 +1,47 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx512er } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
+
+#include <math.h>
+#include "avx512er-check.h"
+
+#define MAX 1000
+#define EPS 0.00001
+
+__attribute__ ((noinline, optimize (1)))
+void static
+compute_rsqrt_ref (float *a, float *r)
+{
+  for (int i = 0; i < MAX; i++)
+    r[i] = 1.0 / sqrtf (a[i]);
+}
+
+__attribute__ ((noinline))
+void static
+compute_rsqrt_exp (float *a, float *r)
+{
+  for (int i = 0; i < MAX; i++)
+    r[i] = 1.0 / sqrtf (a[i]);
+}
+
+void static
+avx512er_test (void)
+{
+  float in[MAX];
+  float ref[MAX];
+  float exp[MAX];
+
+  for (int i = 0; i < MAX; i++)
+    in[i] = 8765.987 - 8.6756 * i;
+
+  compute_rsqrt_ref (in, ref);
+  compute_rsqrt_exp (in, exp);
+
+  for (int i = 0; i < MAX; i++)
+    {
+      float rel_err = (ref[i] - exp[i]) / ref[i];
+      rel_err = rel_err > 0.0 ? rel_err : -rel_err;
+      if (rel_err > EPS)
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c
new file mode 100644
index 0000000..2f5f73f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
+
+#include "avx512er-vrsqrt28ps-3.c"
+
+/* { dg-final { scan-assembler-times "vrsqrt28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-not "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c
new file mode 100644
index 0000000..e067a81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c
@@ -0,0 +1,47 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx512er } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
+
+#include <math.h>
+#include "avx512er-check.h"
+
+#define MAX 1000
+#define EPS 0.00001
+
+__attribute__ ((noinline, optimize (1)))
+void static
+compute_sqrt_ref (float *a, float *r)
+{
+  for (int i = 0; i < MAX; i++)
+    r[i] = sqrtf (a[i]);
+}
+
+__attribute__ ((noinline))
+void static
+compute_sqrt_exp (float *a, float *r)
+{
+  for (int i = 0; i < MAX; i++)
+    r[i] = sqrtf (a[i]);
+}
+
+void static
+avx512er_test (void)
+{
+  float in[MAX];
+  float ref[MAX];
+  float exp[MAX];
+
+  for (int i = 0; i < MAX; i++)
+    in[i] = 8765.987 - 8.6756 * i;
+
+  compute_sqrt_ref (in, ref);
+  compute_sqrt_exp (in, exp);
+
+  for (int i = 0; i < MAX; i++)
+    {
+      float rel_err = (ref[i] - exp[i]) / ref[i];
+      rel_err = rel_err > 0.0 ? rel_err : -rel_err;
+      if (rel_err > EPS)
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c
new file mode 100644
index 0000000..77c5cba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c
@@ -0,0 +1,7 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */
+
+#include "avx512er-vrsqrt28ps-5.c"
+
+/* { dg-final { scan-assembler-times "vrsqrt28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */