Message ID | 20160620193512.GB21416@msticlxl57.ims.intel.com |
---|---|
State | New |
Headers | show |
On Mon, Jun 20, 2016 at 9:35 PM, Ilya Verbin <iverbin@gmail.com> wrote: > Hi! > > This patch emits the vrsqrt28ps instruction in ix86_emit_swsqrtsf for the recip case and > vrcp28ps(vrsqrt28ps(a)) for !recip. > Regtested using various benchmarks on an AVX-512ER machine. OK for trunk? > > > gcc/ > * config/i386/i386.c (ix86_emit_swsqrtsf): Emit vrsqrt28ps. > * config/i386/sse.md (define_expand "rsqrtv16sf2"): New. > gcc/testsuite/ > * gcc.target/i386/avx512er-vrsqrt28ps-3.c: New test. > * gcc.target/i386/avx512er-vrsqrt28ps-4.c: New test. > * gcc.target/i386/avx512er-vrsqrt28ps-5.c: New test. > * gcc.target/i386/avx512er-vrsqrt28ps-6.c: New test. OK. Thanks, Uros. > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 8e0bf26..edd3d23 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -48722,6 +48722,24 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) > e2 = gen_reg_rtx (mode); > e3 = gen_reg_rtx (mode); > > + if (TARGET_AVX512ER && mode == V16SFmode) > + { > + if (recip) > + /* res = rsqrt28(a) estimate */ > + emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), > + UNSPEC_RSQRT28))); > + else > + { > + /* x0 = rsqrt28(a) estimate */ > + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), > + UNSPEC_RSQRT28))); > + /* res = rcp28(x0) estimate */ > + emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0), > + UNSPEC_RCP28))); > + } > + return; > + } > + > real_from_integer (&r, VOIDmode, -3, SIGNED); > mthree = const_double_from_real_value (r, SFmode); > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index 6056ddc..c1ea04f 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -1559,6 +1559,17 @@ > DONE; > }) > > +(define_expand "rsqrtv16sf2" > + [(set (match_operand:V16SF 0 "register_operand") > + (unspec:V16SF > + [(match_operand:V16SF 1 "vector_operand")] > + UNSPEC_RSQRT28))] > + "TARGET_SSE_MATH && TARGET_AVX512ER" > +{ > 
+ ix86_emit_swsqrtsf (operands[0], operands[1], V16SFmode, true); > + DONE; > +}) > + > (define_insn "<sse>_rsqrt<mode>2" > [(set (match_operand:VF1_128_256 0 "register_operand" "=x") > (unspec:VF1_128_256 > diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c > new file mode 100644 > index 0000000..1ba8172 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c > @@ -0,0 +1,47 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target avx512er } */ > +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ > + > +#include <math.h> > +#include "avx512er-check.h" > + > +#define MAX 1000 > +#define EPS 0.00001 > + > +__attribute__ ((noinline, optimize (1))) > +void static > +compute_rsqrt_ref (float *a, float *r) > +{ > + for (int i = 0; i < MAX; i++) > + r[i] = 1.0 / sqrtf (a[i]); > +} > + > +__attribute__ ((noinline)) > +void static > +compute_rsqrt_exp (float *a, float *r) > +{ > + for (int i = 0; i < MAX; i++) > + r[i] = 1.0 / sqrtf (a[i]); > +} > + > +void static > +avx512er_test (void) > +{ > + float in[MAX]; > + float ref[MAX]; > + float exp[MAX]; > + > + for (int i = 0; i < MAX; i++) > + in[i] = 8765.987 - 8.6756 * i; > + > + compute_rsqrt_ref (in, ref); > + compute_rsqrt_exp (in, exp); > + > + for (int i = 0; i < MAX; i++) > + { > + float rel_err = (ref[i] - exp[i]) / ref[i]; > + rel_err = rel_err > 0.0 ? 
rel_err : -rel_err; > + if (rel_err > EPS) > + abort (); > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c > new file mode 100644 > index 0000000..2f5f73f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c > @@ -0,0 +1,7 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ > + > +#include "avx512er-vrsqrt28ps-3.c" > + > +/* { dg-final { scan-assembler-times "vrsqrt28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > +/* { dg-final { scan-assembler-not "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c > new file mode 100644 > index 0000000..e067a81 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c > @@ -0,0 +1,47 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target avx512er } */ > +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ > + > +#include <math.h> > +#include "avx512er-check.h" > + > +#define MAX 1000 > +#define EPS 0.00001 > + > +__attribute__ ((noinline, optimize (1))) > +void static > +compute_sqrt_ref (float *a, float *r) > +{ > + for (int i = 0; i < MAX; i++) > + r[i] = sqrtf (a[i]); > +} > + > +__attribute__ ((noinline)) > +void static > +compute_sqrt_exp (float *a, float *r) > +{ > + for (int i = 0; i < MAX; i++) > + r[i] = sqrtf (a[i]); > +} > + > +void static > +avx512er_test (void) > +{ > + float in[MAX]; > + float ref[MAX]; > + float exp[MAX]; > + > + for (int i = 0; i < MAX; i++) > + in[i] = 8765.987 - 8.6756 * i; > + > + compute_sqrt_ref (in, ref); > + compute_sqrt_exp (in, exp); > + > + for (int i = 0; i < MAX; i++) > + { > + float rel_err = (ref[i] - exp[i]) / ref[i]; > + rel_err = rel_err > 0.0 ? 
rel_err : -rel_err; > + if (rel_err > EPS) > + abort (); > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c > new file mode 100644 > index 0000000..77c5cba > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c > @@ -0,0 +1,7 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ > + > +#include "avx512er-vrsqrt28ps-5.c" > + > +/* { dg-final { scan-assembler-times "vrsqrt28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > +/* { dg-final { scan-assembler-times "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > > > -- Ilya
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 8e0bf26..edd3d23 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -48722,6 +48722,24 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) e2 = gen_reg_rtx (mode); e3 = gen_reg_rtx (mode); + if (TARGET_AVX512ER && mode == V16SFmode) + { + if (recip) + /* res = rsqrt28(a) estimate */ + emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + UNSPEC_RSQRT28))); + else + { + /* x0 = rsqrt28(a) estimate */ + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + UNSPEC_RSQRT28))); + /* res = rcp28(x0) estimate */ + emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0), + UNSPEC_RCP28))); + } + return; + } + real_from_integer (&r, VOIDmode, -3, SIGNED); mthree = const_double_from_real_value (r, SFmode); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 6056ddc..c1ea04f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1559,6 +1559,17 @@ DONE; }) +(define_expand "rsqrtv16sf2" + [(set (match_operand:V16SF 0 "register_operand") + (unspec:V16SF + [(match_operand:V16SF 1 "vector_operand")] + UNSPEC_RSQRT28))] + "TARGET_SSE_MATH && TARGET_AVX512ER" +{ + ix86_emit_swsqrtsf (operands[0], operands[1], V16SFmode, true); + DONE; +}) + (define_insn "<sse>_rsqrt<mode>2" [(set (match_operand:VF1_128_256 0 "register_operand" "=x") (unspec:VF1_128_256 diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c new file mode 100644 index 0000000..1ba8172 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-3.c @@ -0,0 +1,47 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx512er } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ + +#include <math.h> +#include "avx512er-check.h" + +#define MAX 1000 +#define EPS 0.00001 + +__attribute__ ((noinline, optimize (1))) +void static +compute_rsqrt_ref 
(float *a, float *r) +{ + for (int i = 0; i < MAX; i++) + r[i] = 1.0 / sqrtf (a[i]); +} + +__attribute__ ((noinline)) +void static +compute_rsqrt_exp (float *a, float *r) +{ + for (int i = 0; i < MAX; i++) + r[i] = 1.0 / sqrtf (a[i]); +} + +void static +avx512er_test (void) +{ + float in[MAX]; + float ref[MAX]; + float exp[MAX]; + + for (int i = 0; i < MAX; i++) + in[i] = 8765.987 - 8.6756 * i; + + compute_rsqrt_ref (in, ref); + compute_rsqrt_exp (in, exp); + + for (int i = 0; i < MAX; i++) + { + float rel_err = (ref[i] - exp[i]) / ref[i]; + rel_err = rel_err > 0.0 ? rel_err : -rel_err; + if (rel_err > EPS) + abort (); + } +} diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c new file mode 100644 index 0000000..2f5f73f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-4.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ + +#include "avx512er-vrsqrt28ps-3.c" + +/* { dg-final { scan-assembler-times "vrsqrt28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-not "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c new file mode 100644 index 0000000..e067a81 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-5.c @@ -0,0 +1,47 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx512er } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ + +#include <math.h> +#include "avx512er-check.h" + +#define MAX 1000 +#define EPS 0.00001 + +__attribute__ ((noinline, optimize (1))) +void static +compute_sqrt_ref (float *a, float *r) +{ + for (int i = 0; i < MAX; i++) + r[i] = sqrtf (a[i]); +} + +__attribute__ ((noinline)) +void static +compute_sqrt_exp (float *a, float *r) +{ + for (int i = 0; i < MAX; i++) + r[i] = sqrtf (a[i]); +} 
+ +void static +avx512er_test (void) +{ + float in[MAX]; + float ref[MAX]; + float exp[MAX]; + + for (int i = 0; i < MAX; i++) + in[i] = 8765.987 - 8.6756 * i; + + compute_sqrt_ref (in, ref); + compute_sqrt_exp (in, exp); + + for (int i = 0; i < MAX; i++) + { + float rel_err = (ref[i] - exp[i]) / ref[i]; + rel_err = rel_err > 0.0 ? rel_err : -rel_err; + if (rel_err > EPS) + abort (); + } +} diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c new file mode 100644 index 0000000..77c5cba --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrsqrt28ps-6.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ + +#include "avx512er-vrsqrt28ps-5.c" + +/* { dg-final { scan-assembler-times "vrsqrt28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ +/* { dg-final { scan-assembler-times "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */