diff mbox

[rs6000,V2] Add support for vec_extract_fp_from_shorth() and vec_extract_fp_from_shortl()

Message ID 1500053886.474.27.camel@us.ibm.com
State New
Headers show

Commit Message

Carl Love July 14, 2017, 5:38 p.m. UTC
GCC Maintainers:

The following patch, which adds support for the vec_extract_fp_from_shorth()
and vec_extract_fp_from_shortl() builtin functions, has been updated per
Segher's comments. The patch has been re-tested on
powerpc64le-unknown-linux-gnu (Power 8 LE) and
powerpc64le-unknown-linux-gnu (Power 9 LE).  The test reports 1
unsupported test on Power 8 and 2 passing tests on Power 9.

Note, I verified that "dg-require-effective-target powerpc_p9vector_ok"
is not required in the test case.

Also, per the note from Segher I could use xxperm instead of vperm, I
stayed with the vperm instruction.  I don't see any functional or
performance advantage of the xxperm over vperm.

Please let me know if the following patch is acceptable.  Thanks.

                        Carl Love

----------------------------------------------------

gcc/ChangeLog:

2017-07-14  Carl Love  <cel@us.ibm.com>

	* config/rs6000/rs6000-c.c: Add support for built-in functions
	vector float vec_extract_fp_from_shorth (vector unsigned short);
	vector float vec_extract_fp_from_shortl (vector unsigned short);
	* config/rs6000/altivec.h (vec_extract_fp_from_shorth,
	vec_extract_fp_from_shortl): Add defines for the two builtins.
	* config/rs6000/rs6000-builtin.def (VEXTRACT_FP_FROM_SHORTH,
	VEXTRACT_FP_FROM_SHORTL): Add BU_P9V_OVERLOAD_1 and BU_P9V_VSX_1
	new builtins.
	* config/rs6000/vsx.md (vsx_xvcvhpsp): Add define_insn.
	(vextract_fp_from_shorth, vextract_fp_from_shortl): Add define_expands.
	* doc/extend.texi: Update the built-in documentation file for the
	new built-in function.

gcc/testsuite/ChangeLog:
2017-07-14  Carl Love  <cel@us.ibm.com>

	* gcc.target/powerpc/builtins-3-p9-runnable.c: Add new test file for
	the new built-ins.
---
 gcc/config/rs6000/altivec.h                        |  3 +
 gcc/config/rs6000/rs6000-builtin.def               |  5 ++
 gcc/config/rs6000/rs6000-c.c                       |  5 ++
 gcc/config/rs6000/vsx.md                           | 70 +++++++++++++++++++++-
 gcc/doc/extend.texi                                |  3 +
 .../gcc.target/powerpc/builtins-3-p9-runnable.c    | 35 +++++++++++
 6 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/builtins-3-p9-runnable.c

Comments

Segher Boessenkool July 14, 2017, 7:51 p.m. UTC | #1
Hi Carl,

> Also, per the note from Segher I could use xxperm instead of vperm, I
> stayed with the vperm instruction.  I don't see any functional or
> performance advantage of the xxperm over vperm.

xxperm does the same as vperm, but for all vector registers instead
of only v0..v31.  Since xvcvhpsp allows all registers as well, there
is no real reason to restrict to "v" (instead of "wa") as far as I see.

> +;; Generate vector extract four float 32 values from left four elements
> +;; of eight element vector of float 16 values.
> +(define_expand "vextract_fp_from_shorth"
> +  [(set (match_operand:V4SF 0 "register_operand" "=v")
> +	(unspec:V4SF [(match_operand:V8HI 1 "register_operand" "v")]
> +	UNSPEC_VSX_VEXTRACT_FP_FROM_SHORTH))]

Nit: this last line should be indented the same as the [ on the line
above (both are arguments to the unspec).

Okay for trunk with those fixed.  Thanks,


Segher
diff mbox

Patch

diff --git a/gcc/config/rs6000/altivec.h b/gcc/config/rs6000/altivec.h
index 71cdca5..4d34a97 100644
--- a/gcc/config/rs6000/altivec.h
+++ b/gcc/config/rs6000/altivec.h
@@ -449,6 +449,9 @@ 
 #define vec_insert_exp __builtin_vec_insert_exp
 #define vec_test_data_class __builtin_vec_test_data_class
 
+#define vec_extract_fp_from_shorth __builtin_vec_vextract_fp_from_shorth
+#define vec_extract_fp_from_shortl __builtin_vec_vextract_fp_from_shortl
+
 #define scalar_extract_exp __builtin_vec_scalar_extract_exp
 #define scalar_extract_sig __builtin_vec_scalar_extract_sig
 #define scalar_insert_exp __builtin_vec_scalar_insert_exp
diff --git a/gcc/config/rs6000/rs6000-builtin.def b/gcc/config/rs6000/rs6000-builtin.def
index e098e1c..400189e 100644
--- a/gcc/config/rs6000/rs6000-builtin.def
+++ b/gcc/config/rs6000/rs6000-builtin.def
@@ -2057,6 +2057,9 @@  BU_P9V_OVERLOAD_1 (VSTDCNSP,	"scalar_test_neg_sp")
 
 BU_P9V_OVERLOAD_1 (REVB,	"revb")
 
+BU_P9V_OVERLOAD_1 (VEXTRACT_FP_FROM_SHORTH, "vextract_fp_from_shorth")
+BU_P9V_OVERLOAD_1 (VEXTRACT_FP_FROM_SHORTL, "vextract_fp_from_shortl")
+
 /* ISA 3.0 vector scalar overloaded 2 argument functions.  */
 BU_P9V_OVERLOAD_2 (VSIEDP,	"scalar_insert_exp")
 
@@ -2074,6 +2077,8 @@  BU_P9V_VSX_1 (VEEDP, "extract_exp_dp", CONST, xvxexpdp)
 BU_P9V_VSX_1 (VEESP, "extract_exp_sp", CONST, xvxexpsp)
 BU_P9V_VSX_1 (VESDP, "extract_sig_dp", CONST, xvxsigdp)
 BU_P9V_VSX_1 (VESSP, "extract_sig_sp", CONST, xvxsigsp)
+BU_P9V_VSX_1 (VEXTRACT_FP_FROM_SHORTH, "vextract_fp_from_shorth", CONST, vextract_fp_from_shorth)
+BU_P9V_VSX_1 (VEXTRACT_FP_FROM_SHORTL, "vextract_fp_from_shortl", CONST, vextract_fp_from_shortl)
 
 /* 2 argument vsx vector functions added in ISA 3.0 (power9).  */
 BU_P9V_VSX_2 (VIEDP, "insert_exp_dp", CONST, xviexpdp)
diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index c769442..a1d09ba 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -5164,6 +5164,11 @@  const struct altivec_builtin_types altivec_overloaded_builtins[] = {
   { P9V_BUILTIN_VEC_VEXTRACT4B, P9V_BUILTIN_VEXTRACT4B,
     RS6000_BTI_INTDI, RS6000_BTI_unsigned_V16QI, RS6000_BTI_UINTSI, 0 },
 
+  { P9V_BUILTIN_VEC_VEXTRACT_FP_FROM_SHORTH, P9V_BUILTIN_VEXTRACT_FP_FROM_SHORTH,
+    RS6000_BTI_V4SF, RS6000_BTI_unsigned_V8HI, 0, 0 },
+  { P9V_BUILTIN_VEC_VEXTRACT_FP_FROM_SHORTL, P9V_BUILTIN_VEXTRACT_FP_FROM_SHORTL,
+    RS6000_BTI_V4SF, RS6000_BTI_unsigned_V8HI, 0, 0 },
+
   { P9V_BUILTIN_VEC_VEXTULX, P9V_BUILTIN_VEXTUBLX,
     RS6000_BTI_INTQI, RS6000_BTI_UINTSI,
     RS6000_BTI_V16QI, 0 },
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 2ddfae5..7fac5f9 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -326,6 +326,7 @@ 
    UNSPEC_VSX_CVDPSXWS
    UNSPEC_VSX_CVDPUXWS
    UNSPEC_VSX_CVSPDP
+   UNSPEC_VSX_CVHPSP
    UNSPEC_VSX_CVSPDPN
    UNSPEC_VSX_CVDPSPN
    UNSPEC_VSX_CVSXWDP
@@ -367,6 +368,8 @@ 
    UNSPEC_VSX_SIEXPDP
    UNSPEC_VSX_SCMPEXPDP
    UNSPEC_VSX_STSTDC
+   UNSPEC_VSX_VEXTRACT_FP_FROM_SHORTH
+   UNSPEC_VSX_VEXTRACT_FP_FROM_SHORTL
    UNSPEC_VSX_VXEXP
    UNSPEC_VSX_VXSIG
    UNSPEC_VSX_VIEXP
@@ -1745,6 +1748,15 @@ 
   "xscvspdp %x0,%x1"
   [(set_attr "type" "fp")])
 
+;; Generate xvcvhpsp instruction
+(define_insn "vsx_xvcvhpsp"
+  [(set (match_operand:V4SF 0 "vsx_register_operand" "=wa")
+	(unspec:V4SF [(match_operand: V8HI 1 "vsx_register_operand" "wa")]
+		     UNSPEC_VSX_CVHPSP))]
+  "TARGET_P9_VECTOR"
+  "xvcvhpsp %x0,%x1"
+  [(set_attr "type" "vecfloat")])
+
 ;; xscvdpsp used for splat'ing a scalar to V4SF, knowing that the internal SF
 ;; format of scalars is actually DF.
 (define_insn "vsx_xscvdpsp_scalar"
@@ -4419,7 +4431,63 @@ 
   "xxinsertw %x0,%x1,%3"
   [(set_attr "type" "vecperm")])
 
-
+;; Generate vector extract four float 32 values from left four elements
+;; of eight element vector of float 16 values.
+(define_expand "vextract_fp_from_shorth"
+  [(set (match_operand:V4SF 0 "register_operand" "=v")
+	(unspec:V4SF [(match_operand:V8HI 1 "register_operand" "v")]
+	UNSPEC_VSX_VEXTRACT_FP_FROM_SHORTH))]
+  "TARGET_P9_VECTOR"
+{
+  int vals[16] = {0, 1, 0, 0, 2, 3, 0, 0, 4, 5, 0, 0, 6, 7, 0, 0};
+  int i;
+
+  rtx rtx_tmp = gen_reg_rtx (V8HImode);
+  rtx rvals[16];
+  rtx mask = gen_reg_rtx (V16QImode);
+  rtvec v;
+
+  for (i = 0; i < 16; i++)
+    rvals[i] = GEN_INT (vals[i]);
+
+  /* xvcvhpsp - vector convert F16 to vector F32 requires the four F16
+     inputs in half words 1,3,5,7 (IBM numbering).  Use vperm to move
+     src half words 0,1,2,3 into place for the conversion instruction.  */
+  v = gen_rtvec_v (16, rvals);
+  emit_insn (gen_vec_initv16qi (mask, gen_rtx_PARALLEL (V16QImode, v)));
+  emit_insn (gen_altivec_vperm_v8hi (rtx_tmp, operands[1], operands[1], mask));
+  emit_insn (gen_vsx_xvcvhpsp (operands[0], rtx_tmp));
+  DONE;
+})
+
+;; Generate vector extract four float 32 values from right four elements
+;; of eight element vector of float 16 values.
+(define_expand "vextract_fp_from_shortl"
+  [(set (match_operand:V4SF 0 "register_operand" "=v")
+	(unspec:V4SF [(match_operand:V8HI 1 "register_operand" "v")]
+	UNSPEC_VSX_VEXTRACT_FP_FROM_SHORTL))]
+  "TARGET_P9_VECTOR"
+{
+  int vals[16] = {8, 9, 0, 0, 10, 11, 0, 0, 12, 13, 0, 0, 14, 15, 0, 0};
+  int i;
+  rtx rtx_tmp = gen_reg_rtx (V8HImode);
+  rtx rvals[16];
+  rtx mask = gen_reg_rtx (V16QImode);
+  rtvec v;
+
+  for (i = 0; i < 16; i++)
+    rvals[i] = GEN_INT (vals[i]);
+
+  /* xvcvhpsp - vector convert F16 to vector F32 requires the four F16
+     inputs in half words 1,3,5,7 (IBM numbering).  Use vperm to move
+     src half words 4,5,6,7 into place for the conversion instruction.  */
+  v = gen_rtvec_v (16, rvals);
+  emit_insn (gen_vec_initv16qi (mask, gen_rtx_PARALLEL (V16QImode, v)));
+  emit_insn (gen_altivec_vperm_v8hi (rtx_tmp, operands[1], operands[1], mask));
+  emit_insn (gen_vsx_xvcvhpsp (operands[0], rtx_tmp));
+  DONE;
+})
+
 ;; Support for ISA 3.0 vector byte reverse
 
 ;; Swap all bytes with in a vector
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 530a82d..0135fc7 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -18258,6 +18258,9 @@  vector bool short vec_cmpne (vector bool short, vector bool short);
 vector bool int vec_cmpne (vector bool int, vector bool int);
 vector bool long long vec_cmpne (vector bool long long, vector bool long long);
 
+vector float vec_extract_fp_from_shorth (vector unsigned short);
+vector float vec_extract_fp_from_shortl (vector unsigned short);
+
 vector long long vec_vctz (vector long long);
 vector unsigned long long vec_vctz (vector unsigned long long);
 vector int vec_vctz (vector int);
diff --git a/gcc/testsuite/gcc.target/powerpc/builtins-3-p9-runnable.c b/gcc/testsuite/gcc.target/powerpc/builtins-3-p9-runnable.c
new file mode 100644
index 0000000..24589b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/builtins-3-p9-runnable.c
@@ -0,0 +1,35 @@ 
+/* { dg-do run { target { powerpc64*-*-* && { lp64 && p9vector_hw } } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-options "-mcpu=power9 -O2 -mupper-regs-di" } */
+
+#include <altivec.h> // vector
+
+void abort (void);
+
+int main() {
+   int i;
+   vector float vfr, vfexpt;
+   vector unsigned short vusha;
+
+   /* 1.0, -2.0, 0.0, 8.5, 1.5, 0.5, 1.25, -0.25 */
+   vusha = (vector unsigned short){0B011110000000000, 0B1100000000000000,
+                                   0B000000000000000, 0B0100100001000000,
+                                   0B011111000000000, 0B0011100000000000,
+                                   0B011110100000000, 0B1011010000000000};
+   
+   vfexpt = (vector float){1.0, -2.0, 0.0, 8.5};
+   vfr = vec_extract_fp_from_shorth(vusha);
+
+   for (i=0; i<4; i++) {
+      if (vfr[i] != vfexpt[i])
+         abort();
+   }
+
+   vfexpt = (vector float){1.5, 0.5, 1.25, -0.25};
+   vfr = vec_extract_fp_from_shortl(vusha);
+
+   for (i=0; i<4; i++) {
+      if (vfr[i] != vfexpt[i])
+         abort();
+   }
+}