diff mbox series

Support 128/256/512-bit vector _Float16 plus/smin/smax reduce.

Message ID 20210927085405.3420838-1-hongtao.liu@intel.com
State New
Headers show
Series Support 128/256/512-bit vector _Float16 plus/smin/smax reduce. | expand

Commit Message

liuhongt Sept. 27, 2021, 8:54 a.m. UTC
Hi:
  Add expanders for reduc_{smin,smax,plus}_scal_{v8hf,v16hf,v32hf}.
  Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
  
gcc/ChangeLog:

	* config/i386/i386-expand.c (emit_reduc_half): Handle
	V8HF/V16HF/V32HFmode.
	* config/i386/sse.md (REDUC_SSE_PLUS_MODE): Add V8HF.
	(REDUC_SSE_SMINMAX_MODE): Ditto.
	(REDUC_PLUS_MODE): Add V16HF and V32HF.
	(REDUC_SMINMAX_MODE): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16-reduce-op-2.c: New test.
	* gcc.target/i386/avx512fp16-reduce-op-3.c: New test.
---
 gcc/config/i386/i386-expand.c                 |  3 +
 gcc/config/i386/sse.md                        | 10 +-
 .../gcc.target/i386/avx512fp16-reduce-op-2.c  | 96 +++++++++++++++++++
 .../gcc.target/i386/avx512fp16-reduce-op-3.c  | 91 ++++++++++++++++++
 4 files changed, 198 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 94ac303585e..4780b993917 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -16045,6 +16045,7 @@  emit_reduc_half (rtx dest, rtx src, int i)
       break;
     case E_V16QImode:
     case E_V8HImode:
+    case E_V8HFmode:
     case E_V4SImode:
     case E_V2DImode:
       d = gen_reg_rtx (V1TImode);
@@ -16066,6 +16067,7 @@  emit_reduc_half (rtx dest, rtx src, int i)
       break;
     case E_V32QImode:
     case E_V16HImode:
+    case E_V16HFmode:
     case E_V8SImode:
     case E_V4DImode:
       if (i == 256)
@@ -16085,6 +16087,7 @@  emit_reduc_half (rtx dest, rtx src, int i)
       break;
     case E_V64QImode:
     case E_V32HImode:
+    case E_V32HFmode:
       if (i < 64)
 	{
 	  d = gen_reg_rtx (V4TImode);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb7600edbab..4559b0ce9c9 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3157,7 +3157,8 @@  (define_insn "sse3_h<insn>v4sf3"
    (set_attr "mode" "V4SF")])
 
 (define_mode_iterator REDUC_SSE_PLUS_MODE
- [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE")])
+ [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE")
+  (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")])
 
 (define_expand "reduc_plus_scal_<mode>"
  [(plus:REDUC_SSE_PLUS_MODE
@@ -3194,7 +3195,9 @@  (define_expand "reduc_plus_scal_v16qi"
 
 (define_mode_iterator REDUC_PLUS_MODE
  [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX")
+  (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
   (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+  (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
   (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")])
 
 (define_expand "reduc_plus_scal_<mode>"
@@ -3214,7 +3217,8 @@  (define_expand "reduc_plus_scal_<mode>"
 
 ;; Modes handled by reduc_sm{in,ax}* patterns.
 (define_mode_iterator REDUC_SSE_SMINMAX_MODE
-  [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE")
+  [(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V4SF "TARGET_SSE") (V2DF "TARGET_SSE")
    (V4SI "TARGET_SSE2") (V8HI "TARGET_SSE2") (V16QI "TARGET_SSE2")
    (V2DI "TARGET_SSE4_2")])
 
@@ -3233,9 +3237,11 @@  (define_expand "reduc_<code>_scal_<mode>"
 
 (define_mode_iterator REDUC_SMINMAX_MODE
   [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
+   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
    (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
    (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")
    (V64QI "TARGET_AVX512BW")
+   (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
    (V32HI "TARGET_AVX512BW") (V16SI "TARGET_AVX512F")
    (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
    (V8DF "TARGET_AVX512F")])
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c
new file mode 100644
index 00000000000..593340e4afa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c
@@ -0,0 +1,96 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mprefer-vector-width=512 -fdump-tree-optimized" } */
+
+/* { dg-final { scan-tree-dump-times "\.REDUC_PLUS" 3 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.REDUC_MIN" 3 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.REDUC_MAX" 3 "optimized" } } */
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_add_128 (_Float16* p)
+{
+  _Float16 sum = 0;
+  for (int i = 0; i != 8; i++)
+    sum += p[i];
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_add_256 (_Float16* p)
+{
+  _Float16 sum = 0;
+  for (int i = 0; i != 16; i++)
+    sum += p[i];
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_add_512 (_Float16* p)
+{
+  _Float16 sum = 0;
+  for (int i = 0; i != 32; i++)
+    sum += p[i];
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_min_128 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 8; i++)
+    sum = sum > p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_min_256 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 16; i++)
+    sum = sum > p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_min_512 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 32; i++)
+    sum = sum > p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_max_128 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 8; i++)
+    sum = sum < p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_max_256 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 16; i++)
+    sum = sum < p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_max_512 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 32; i++)
+    sum = sum < p[i] ? p[i] : sum;
+  return sum;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c
new file mode 100644
index 00000000000..9281a3be248
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c
@@ -0,0 +1,91 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512FP16
+#define AVX512VL
+
+#include "avx512f-helper.h"
+
+#include "avx512fp16-reduce-op-2.c"
+
+void
+test_256 (void)
+{
+  _Float16 a[32];
+  int sign = 1;
+  _Float16 res1 = 0, exp1;
+  _Float16 res2 = 0, exp2;
+  _Float16 res3 = 0, exp3;
+
+  for (int i = 0; i != 32; i++)
+    {
+      a[i] = sign * (4.0 * i);
+      sign *= -1;
+      if (i < 8)
+	res1 += a[i];
+      if (i < 16)
+	res2 += a[i];
+      res3 += a[i];
+    }
+
+  exp1 = reduc_add_128 (a);
+  exp2 = reduc_add_256 (a);
+  exp3 = reduc_add_512 (a);
+  if (exp1 != res1 || exp2 != res2 || exp3 != res3)
+    abort();
+}
+
+#define MAX(A, B) ((A) > (B) ? (A) : (B))
+#define MIN(A, B) ((A) < (B) ? (A) : (B))
+
+void
+test_128 ()
+{
+  _Float16 a[32];
+  int sign = 1;
+  _Float16 min_res1, min_exp1, max_res1, max_exp1;
+  _Float16 min_res2, min_exp2, max_res2, max_exp2;
+  _Float16 min_res3, min_exp3, max_res3, max_exp3;
+
+  for (int i = 0; i != 32; i++)
+    {
+      a[i] = sign * (4.9 * i * i - 8.3 * i + 14.8);
+      sign *= -1;
+    }
+
+  min_res1 = max_res1 = a[0];
+  for (int i = 0 ; i != 8; i++)
+    {
+      min_res1 = MIN (min_res1, a[i]);
+      max_res1 = MAX (max_res1, a[i]);
+    }
+
+  min_res2 = min_res1;
+  max_res2 = max_res1;
+  for (int i = 8 ; i != 16; i++)
+    {
+      min_res2 = MIN (min_res2, a[i]);
+      max_res2 = MAX (max_res2, a[i]);
+    }
+
+  min_res3 = min_res2;
+  max_res3 = max_res2;
+  for (int i = 16 ; i != 32; i++)
+    {
+      min_res3 = MIN (min_res3, a[i]);
+      max_res3 = MAX (max_res3, a[i]);
+    }
+
+  min_exp1 = reduc_min_128 (a);
+  min_exp2 = reduc_min_256 (a);
+  min_exp3 = reduc_min_512 (a);
+  max_exp1 = reduc_max_128 (a);
+  max_exp2 = reduc_max_256 (a);
+  max_exp3 = reduc_max_512 (a);
+
+  if (min_exp1 != min_res1 || min_exp2 != min_res2 || min_exp3 != min_res3
+      || max_exp1 != max_res1 || max_exp2 != max_res2 || max_exp3 != max_res3)
+    abort();
+}