Message ID | 20160620170919.GA21416@msticlxl57.ims.intel.com |
---|---|
State | New |
Headers | show |
On Mon, Jun 20, 2016 at 7:09 PM, Ilya Verbin <iverbin@gmail.com> wrote: > Hi! > > This patch emits vrcp28ps and vmulps instructions for ix86_emit_swdivsf. > The relative error is < 2^-23, so no additional iteration is necessary. > Regtested using various benchmarks on an AVX-512ER machine. OK for trunk? > > > gcc/ > * config/i386/i386.c (ix86_emit_swdivsf): Emit vrcp28ps. > gcc/testsuite/ > * gcc.target/i386/avx512er-vrcp28ps-3.c: New test. > * gcc.target/i386/avx512er-vrcp28ps-4.c: New test. OK. Thanks, Uros. > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 56a5b9c..8e0bf26 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -48674,8 +48674,19 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) > > /* x0 = rcp(b) estimate */ > if (mode == V16SFmode || mode == V8DFmode) > - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > - UNSPEC_RCP14))); > + { > + if (TARGET_AVX512ER) > + { > + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > + UNSPEC_RCP28))); > + /* res = a * x0 */ > + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); > + return; > + } > + else > + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > + UNSPEC_RCP14))); > + } > else > emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > UNSPEC_RCP))); > diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c > new file mode 100644 > index 0000000..e08bea4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c > @@ -0,0 +1,50 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target avx512er } */ > +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ > + > +#include "avx512er-check.h" > + > +#define MAX 1000 > +#define EPS 0.00001 > + > +__attribute__ ((noinline, optimize (0))) > +void static > +compute_rcp_ref (float *a, float *b, float *r) > +{ > + for (int i = 0; i 
< MAX; i++) > + r[i] = a[i] / b[i]; > +} > + > +__attribute__ ((noinline)) > +void static > +compute_rcp_exp (float *a, float *b, float *r) > +{ > + for (int i = 0; i < MAX; i++) > + r[i] = a[i] / b[i]; > +} > + > +void static > +avx512er_test (void) > +{ > + float a[MAX]; > + float b[MAX]; > + float ref[MAX]; > + float exp[MAX]; > + > + for (int i = 0; i < MAX; i++) > + { > + a[i] = 179.345 - 6.5645 * i; > + b[i] = 8765.987 - 8.6756 * i; > + } > + > + compute_rcp_ref (a, b, ref); > + compute_rcp_exp (a, b, exp); > + > + for (int i = 0; i < MAX; i++) > + { > + float rel_err = (ref[i] - exp[i]) / ref[i]; > + rel_err = rel_err > 0.0 ? rel_err : -rel_err; > + if (rel_err > EPS) > + abort (); > + } > +} > diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c > new file mode 100644 > index 0000000..2c76d96 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c > @@ -0,0 +1,6 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ > + > +#include "avx512er-vrcp28ps-3.c" > + > +/* { dg-final { scan-assembler-times "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */ > > > -- Ilya
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 56a5b9c..8e0bf26 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -48674,8 +48674,19 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) /* x0 = rcp(b) estimate */ if (mode == V16SFmode || mode == V8DFmode) - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP14))); + { + if (TARGET_AVX512ER) + { + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP28))); + /* res = a * x0 */ + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); + return; + } + else + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP14))); + } else emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), UNSPEC_RCP))); diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c new file mode 100644 index 0000000..e08bea4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-3.c @@ -0,0 +1,50 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx512er } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ + +#include "avx512er-check.h" + +#define MAX 1000 +#define EPS 0.00001 + +__attribute__ ((noinline, optimize (0))) +void static +compute_rcp_ref (float *a, float *b, float *r) +{ + for (int i = 0; i < MAX; i++) + r[i] = a[i] / b[i]; +} + +__attribute__ ((noinline)) +void static +compute_rcp_exp (float *a, float *b, float *r) +{ + for (int i = 0; i < MAX; i++) + r[i] = a[i] / b[i]; +} + +void static +avx512er_test (void) +{ + float a[MAX]; + float b[MAX]; + float ref[MAX]; + float exp[MAX]; + + for (int i = 0; i < MAX; i++) + { + a[i] = 179.345 - 6.5645 * i; + b[i] = 8765.987 - 8.6756 * i; + } + + compute_rcp_ref (a, b, ref); + compute_rcp_exp (a, b, exp); + + for (int i = 0; i < MAX; i++) + { + float rel_err = (ref[i] - exp[i]) / ref[i]; + rel_err = rel_err > 0.0 ? 
rel_err : -rel_err; + if (rel_err > EPS) + abort (); + } +} diff --git a/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c new file mode 100644 index 0000000..2c76d96 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512er-vrcp28ps-4.c @@ -0,0 +1,6 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx512er" } */ + +#include "avx512er-vrcp28ps-3.c" + +/* { dg-final { scan-assembler-times "vrcp28ps\[^\n\r\]*zmm\[0-9\]+(?:\n|\[ \\t\]+#)" 1 } } */