diff mbox series

[PATCHv3,rs6000] Optimize vector construction with two vector doubleword loads [PR103568]

Message ID 0da5f7f3-2cb4-41b9-b7ac-3c88354af377@linux.ibm.com
State New
Headers show
Series [PATCHv3,rs6000] Optimize vector construction with two vector doubleword loads [PR103568] | expand

Commit Message

HAO CHEN GUI July 25, 2024, 6:09 a.m. UTC
Hi,
  This patch optimizes the construction of a vector from two doublewords
loaded from memory. It generates a better insn sequence, since "xxlor"
has lower latency than "mtvsrdd" on Power10.

  Compared with the previous version, the main change is the addition of
new patterns for LE platforms. Also, the lxsd[x] patterns are guarded by
TARGET_POWER10 because dword1 of the result is undefined before ISA 3.1.
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/653180.html

  Bootstrapped and tested on powerpc64-linux BE and LE with no
regressions. OK for the trunk?

Thanks
Gui Haochen

ChangeLog
rs6000: Optimize vector construction with two vector doubleword loads

When constructing a vector from two doublewords in memory, GCC currently
generates
	ld 10,0(3)
	ld 9,0(4)
	mtvsrdd 34,9,10

An optimal sequence on Power10 should be
	lxsd 0,0(4)
	lxvrdx 1,0,3
	xxlor 34,1,32

This patch does this optimization by insn combine and split.

gcc/
	PR target/103568
	* config/rs6000/vsx.md (lxsd_<mode>_be, lxsd_<mode>_le,
	lxvrdx_<mode>_be, lxvrdx_<mode>_le): New insn patterns.
	(*vsx_concat_mem_<mode>_be, *vsx_concat_mem_<mode>_le): New
	insn_and_split patterns.

gcc/testsuite/
	PR target/103568
	* gcc.target/powerpc/pr103568.c: New test.

patch.diff
diff mbox series

Patch

diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index f135fa079bd..9182d824d25 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -1395,6 +1395,49 @@  (define_insn "vsx_ld_elemrev_v2di"
   "lxvd2x %x0,%y1"
   [(set_attr "type" "vecload")])

+;; lxsd[x] leaves dword1 undefined before ISA 3.1, so require TARGET_POWER10.
+(define_insn "lxsd_<mode>_be"  ; BE: memory doubleword first, zero second
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
+	(vec_concat:VSX_D
+	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
+	  (match_operand:<VEC_base> 2 "zero_constant" "j,j")))]
+  "TARGET_POWER10 && BYTES_BIG_ENDIAN"
+  "@
+   lxsd %0,%1
+   lxsdx %x0,%y1"
+  [(set_attr "type" "vecload,vecload")
+   (set_attr "prefixed" "yes,no")])
+
+(define_insn "lxsd_<mode>_le"  ; LE: zero first, memory doubleword second
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
+	(vec_concat:VSX_D
+	  (match_operand:<VEC_base> 1 "zero_constant" "j,j")
+	  (match_operand:<VEC_base> 2 "memory_operand" "wY,Z")))]
+  "TARGET_POWER10 && !BYTES_BIG_ENDIAN"
+  "@
+   lxsd %0,%2
+   lxsdx %x0,%y2"
+  [(set_attr "type" "vecload,vecload")
+   (set_attr "prefixed" "yes,no")])
+
+(define_insn "lxvrdx_<mode>_be"  ; BE: zero first, memory doubleword second
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+	(vec_concat:VSX_D
+	  (match_operand:<VEC_base> 1 "zero_constant" "j")
+	  (match_operand:<VEC_base> 2 "memory_operand" "Z")))]
+  "TARGET_POWER10 && BYTES_BIG_ENDIAN"
+  "lxvrdx %x0,%y2"
+  [(set_attr "type" "vecload")])
+
+(define_insn "lxvrdx_<mode>_le"  ; LE: memory doubleword first, zero second
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=wa")
+	(vec_concat:VSX_D
+	  (match_operand:<VEC_base> 1 "memory_operand" "Z")
+	  (match_operand:<VEC_base> 2 "zero_constant" "j")))]
+  "TARGET_POWER10 && !BYTES_BIG_ENDIAN"
+  "lxvrdx %x0,%y1"
+  [(set_attr "type" "vecload")])
+
 (define_insn "vsx_ld_elemrev_v1ti"
   [(set (match_operand:V1TI 0 "vsx_register_operand" "=wa")
         (vec_select:V1TI
@@ -3063,6 +3106,48 @@  (define_insn "vsx_concat_<mode>"
 }
   [(set_attr "type" "vecperm,vecmove")])

+(define_insn_and_split "*vsx_concat_mem_<mode>_be"  ; BE: two loads, then OR
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
+	(vec_concat:VSX_D
+	  (match_operand:<VEC_base> 1 "memory_operand" "wY,Z")
+	  (match_operand:<VEC_base> 2 "memory_operand" "Z,Z")))]
+  "TARGET_POWER10 && BYTES_BIG_ENDIAN
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx tmp1 = gen_reg_rtx (<MODE>mode);  /* lxsd:   { operands[1], 0 } */
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);  /* lxvrdx: { 0, operands[2] } */
+  emit_insn (gen_lxsd_<mode>_be (tmp1, operands[1],
+				 CONST0_RTX (<VEC_base>mode)));
+  emit_insn (gen_lxvrdx_<mode>_be (tmp2, CONST0_RTX (<VEC_base>mode),
+				   operands[2]));
+  emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2));
+  DONE;
+})
+
+(define_insn_and_split "*vsx_concat_mem_<mode>_le"  ; LE: two loads, then OR
+  [(set (match_operand:VSX_D 0 "vsx_register_operand" "=v,wa")
+	(vec_concat:VSX_D
+	  (match_operand:<VEC_base> 1 "memory_operand" "Z,Z")
+	  (match_operand:<VEC_base> 2 "memory_operand" "wY,Z")))]
+  "TARGET_POWER10 && !BYTES_BIG_ENDIAN
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx tmp1 = gen_reg_rtx (<MODE>mode);  /* lxsd:   { 0, operands[2] } */
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);  /* lxvrdx: { operands[1], 0 } */
+  emit_insn (gen_lxsd_<mode>_le (tmp1, CONST0_RTX (<VEC_base>mode),
+				 operands[2]));
+  emit_insn (gen_lxvrdx_<mode>_le (tmp2, operands[1],
+				   CONST0_RTX (<VEC_base>mode)));
+  emit_insn (gen_ior<mode>3 (operands[0], tmp1, tmp2));
+  DONE;
+})
+
 ;; Combiner patterns to allow creating XXPERMDI's to access either double
 ;; word element in a vector register.
 (define_insn "*vsx_concat_<mode>_1"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr103568.c b/gcc/testsuite/gcc.target/powerpc/pr103568.c
new file mode 100644
index 00000000000..106fad7c8c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr103568.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+vector double test (double *a, double *b)
+{
+  return (vector double) {*a, *b};  /* both elements loaded from memory */
+}
+
+vector long long test1 (long long *a, long long *b)
+{
+  return (vector long long) {*a, *b};  /* both elements loaded from memory */
+}
+
+/* { dg-final { scan-assembler-times {\mp?lxsd} 2 } } */
+/* { dg-final { scan-assembler-times {\mlxvrdx\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxlor\M} 2 } } */
+