diff mbox series

[1/5] IBM Z: Get rid of vec merge unspec

Message ID 20210729073730.23208-2-krebbel@linux.ibm.com
State New
Headers show
Series IBM Z: Implement TARGET_VECTORIZE_VEC_PERM_CONST | expand

Commit Message

Andreas Krebbel July 29, 2021, 7:37 a.m. UTC
This patch gets rid of the unspecs we were using for the vector merge
instructions and replaces them with generic RTL (vec_select of a vec_concat).

gcc/ChangeLog:

	* config/s390/s390-modes.def: Add more vector modes to support
	concatenation of two vectors.
	* config/s390/s390-protos.h (s390_expand_merge_perm_const): Add
	prototype.
	(s390_expand_merge): Likewise.
	* config/s390/s390.c (s390_expand_merge_perm_const): New function.
	(s390_expand_merge): New function.
	* config/s390/s390.md (UNSPEC_VEC_MERGEH, UNSPEC_VEC_MERGEL):
	Remove constant definitions.
	* config/s390/vector.md (V_HW_2): Add mode iterators.
	(VI_HW_4, V_HW_4): Rename VI_HW_4 to V_HW_4.
	(vec_2x_nelts, vec_2x_wide): New mode attributes.
	(*vmrhb, *vmrlb, *vmrhh, *vmrlh, *vmrhf, *vmrlf, *vmrhg, *vmrlg):
	New pattern definitions.
	(vec_widen_umult_lo_<mode>, vec_widen_umult_hi_<mode>)
	(vec_widen_smult_lo_<mode>, vec_widen_smult_hi_<mode>)
	(vec_unpacks_lo_v4sf, vec_unpacks_hi_v4sf, vec_unpacks_lo_v2df)
	(vec_unpacks_hi_v2df): Adjust expanders to emit non-unspec RTX for
	vec merge.
	* config/s390/vx-builtins.md (V_HW_4): Remove mode iterator. Now
	in vector.md.
	(vec_mergeh<mode>, vec_mergel<mode>): Use s390_expand_merge to
	emit vec merge pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c:
	vmrhg and vmrlg are now used instead of vpdi with 0 and 5.
	* gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c: Likewise.
	* gcc.target/s390/zvector/vec-types.h: New test.
	* gcc.target/s390/zvector/vec_merge.c: New test.
---
 gcc/config/s390/s390-modes.def                |  11 +-
 gcc/config/s390/s390-protos.h                 |   2 +
 gcc/config/s390/s390.c                        |  36 ++++
 gcc/config/s390/s390.md                       |   2 -
 gcc/config/s390/vector.md                     | 204 +++++++++++++++---
 gcc/config/s390/vx-builtins.md                |  35 ++-
 .../long-double-asm-in-out-hard-fp-reg.c      |   8 +-
 .../long-double-asm-inout-hard-fp-reg.c       |   6 +-
 .../gcc.target/s390/zvector/vec-types.h       |  37 ++++
 .../gcc.target/s390/zvector/vec_merge.c       |  88 ++++++++
 10 files changed, 367 insertions(+), 62 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/zvector/vec-types.h
 create mode 100644 gcc/testsuite/gcc.target/s390/zvector/vec_merge.c
diff mbox series

Patch

diff --git a/gcc/config/s390/s390-modes.def b/gcc/config/s390/s390-modes.def
index 6d814fc490c..245c2b811d4 100644
--- a/gcc/config/s390/s390-modes.def
+++ b/gcc/config/s390/s390-modes.def
@@ -259,14 +259,17 @@  CC_MODE (CCVFANY);
 
 /* Vector modes.  */
 
-VECTOR_MODES (INT, 2);        /*                 V2QI */
-VECTOR_MODES (INT, 4);        /*            V4QI V2HI */
-VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI */
-VECTOR_MODES (INT, 16);       /* V16QI V8HI V4SI V2DI */
+VECTOR_MODES (INT, 2);        /*                       V2QI */
+VECTOR_MODES (INT, 4);        /*                  V4QI V2HI */
+VECTOR_MODES (INT, 8);        /*             V8QI V4HI V2SI */
+VECTOR_MODES (INT, 16);       /*       V16QI V8HI V4SI V2DI */
+VECTOR_MODES (INT, 32);       /* V32QI V16HI V8SI V4DI V2TI */
 
 VECTOR_MODE (FLOAT, SF, 2);   /* V2SF */
 VECTOR_MODE (FLOAT, SF, 4);   /* V4SF */
+VECTOR_MODE (FLOAT, SF, 8);   /* V8SF */
 VECTOR_MODE (FLOAT, DF, 2);   /* V2DF */
+VECTOR_MODE (FLOAT, DF, 4);   /* V4DF */
 
 VECTOR_MODE (INT, QI, 1);     /* V1QI */
 VECTOR_MODE (INT, HI, 1);     /* V1HI */
diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index 289e018cf0f..4b03c6e99f5 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -122,6 +122,8 @@  extern void s390_expand_vec_compare_cc (rtx, enum rtx_code, rtx, rtx, bool);
 extern enum rtx_code s390_reverse_condition (machine_mode, enum rtx_code);
 extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx);
 extern void s390_expand_vec_init (rtx, rtx);
+extern rtx s390_expand_merge_perm_const (machine_mode, bool);
+extern void s390_expand_merge (rtx, rtx, rtx, bool);
 extern rtx s390_build_signbit_mask (machine_mode);
 extern rtx s390_return_addr_rtx (int, rtx);
 extern rtx s390_back_chain_rtx (void);
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index b1d3b99784d..b1a9ca9d8aa 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -7014,6 +7014,42 @@  s390_expand_vec_init (rtx target, rtx vals)
     }
 }
 
+/* Return a PARALLEL of const_ints selecting, from the concatenation of
+   two MODE vectors, their interleaved left-most halves if HIGH_P is
+   true and their interleaved right-most halves otherwise.  */
+rtx
+s390_expand_merge_perm_const (machine_mode mode, bool high_p)
+{
+  int nelts = GET_MODE_NUNITS (mode);
+  rtx perm[16];
+  int addend = high_p ? 0 : nelts;
+
+  /* A mode with more elements than PERM has slots would overflow it.  */
+  gcc_assert (nelts <= (int) (sizeof (perm) / sizeof (perm[0])));
+  for (int i = 0; i < nelts; i++)
+    perm[i] = GEN_INT ((i + addend) / 2 + (i % 2) * nelts);
+
+  return gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelts, perm));
+}
+
+/* Expand a vector merge of SRC1 and SRC2 into TARGET as a VEC_SELECT
+   applied to the VEC_CONCAT of both (register-forced) sources.  A
+   merge high is emitted if HIGH_P is true, a merge low otherwise.  */
+void
+s390_expand_merge (rtx target, rtx src1, rtx src2, bool high_p)
+{
+  machine_mode vmode = GET_MODE (target);
+  opt_machine_mode wide = mode_for_vector (GET_MODE_INNER (vmode),
+					   2 * GET_MODE_NUNITS (vmode));
+  gcc_assert (wide.exists ());
+  rtx sel = s390_expand_merge_perm_const (vmode, high_p);
+  rtx lhs = force_reg (GET_MODE (src1), src1);
+  rtx rhs = force_reg (GET_MODE (src2), src2);
+  rtx concat = gen_rtx_VEC_CONCAT (wide.require (), lhs, rhs);
+  rtx merged = gen_rtx_VEC_SELECT (vmode, concat, sel);
+
+  emit_insn (gen_rtx_SET (target, merged));
+}
+
 /* Emit a vector constant that contains 1s in each element's sign bit position
    and 0s in other positions.  MODE is the desired constant's mode.  */
 extern rtx
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 8ad21b0f4f7..d896faee0fb 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -158,8 +158,6 @@  (define_c_enum "unspec" [
    UNSPEC_VEC_LOAD_BNDRY
    UNSPEC_VEC_LOAD_LEN
    UNSPEC_VEC_LOAD_LEN_R
-   UNSPEC_VEC_MERGEH
-   UNSPEC_VEC_MERGEL
    UNSPEC_VEC_PACK
    UNSPEC_VEC_PACK_SATURATE
    UNSPEC_VEC_PACK_SATURATE_CC
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index ab605b3d2cf..7507aec1c8e 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -50,7 +50,10 @@  (define_mode_iterator VI_HW_QHS [V16QI V8HI V4SI])
 (define_mode_iterator VI_HW_HSD [V8HI  V4SI V2DI])
 (define_mode_iterator VI_HW_HS  [V8HI  V4SI])
 (define_mode_iterator VI_HW_QH  [V16QI V8HI])
-(define_mode_iterator VI_HW_4   [V4SI V4SF])
+
+; Directly supported vector modes with a certain number of elements
+(define_mode_iterator V_HW_2   [V2DI V2DF])
+(define_mode_iterator V_HW_4   [V4SI V4SF])
 
 ; All integer vector modes supported in a vector register + TImode
 (define_mode_iterator VIT [V1QI V2QI V4QI V8QI V16QI V1HI V2HI V4HI V8HI V1SI V2SI V4SI V1DI V2DI V1TI TI])
@@ -163,14 +166,14 @@  (define_mode_attr sdx [(SF "s") (V1SF "s") (V2SF "s") (V4SF "s")
 		       (DF "d") (V1DF "d") (V2DF "d")
 		       (TF "x") (V1TF "x")])
 
-; Vector with doubled element size.
+; Vector with widened element size but half the number of elements.
 (define_mode_attr vec_double [(V1QI "V1HI") (V2QI "V1HI") (V4QI "V2HI") (V8QI "V4HI") (V16QI "V8HI")
 			      (V1HI "V1SI") (V2HI "V1SI") (V4HI "V2SI") (V8HI "V4SI")
 			      (V1SI "V1DI") (V2SI "V1DI") (V4SI "V2DI")
 			      (V1DI "V1TI") (V2DI "V1TI")
 			      (V1SF "V1DF") (V2SF "V1DF") (V4SF "V2DF")])
 
-; Vector with half the element size.
+; Vector with half the element size but twice the number of elements.
 (define_mode_attr vec_half [(V1HI "V2QI") (V2HI "V4QI") (V4HI "V8QI") (V8HI "V16QI")
 			    (V1SI "V2HI") (V2SI "V4HI") (V4SI "V8HI")
 			    (V1DI "V2SI") (V2DI "V4SI")
@@ -178,6 +181,22 @@  (define_mode_attr vec_half [(V1HI "V2QI") (V2HI "V4QI") (V4HI "V8QI") (V8HI "V16
 			    (V1DF "V2SF") (V2DF "V4SF")
 			    (V1TF "V1DF")])
 
+; Vector with twice the number of elements but same element size.
+(define_mode_attr vec_2x_nelts [(V1QI "V2QI") (V2QI "V4QI") (V4QI "V8QI") (V8QI "V16QI") (V16QI "V32QI")
+				(V1HI "V2HI") (V2HI "V4HI") (V4HI "V8HI") (V8HI "V16HI")
+				(V1SI "V2SI") (V2SI "V4SI") (V4SI "V8SI")
+				(V1DI "V2DI") (V2DI "V4DI")
+				(V1SF "V2SF") (V2SF "V4SF") (V4SF "V8SF")
+				(V1DF "V2DF") (V2DF "V4DF")])
+
+; Vector with widened element size and the same number of elements.
+(define_mode_attr vec_2x_wide [(V1QI "V1HI") (V2QI "V2HI") (V4QI "V4HI") (V8QI "V8HI") (V16QI "V16HI")
+			       (V1HI "V1SI") (V2HI "V2SI") (V4HI "V4SI") (V8HI "V8SI")
+			       (V1SI "V1DI") (V2SI "V2DI") (V4SI "V4DI")
+			       (V1DI "V1TI") (V2DI "V2TI")
+			       (V1SF "V1DF") (V2SF "V2DF") (V4SF "V4DF")
+			       (V1DF "V1TF") (V2DF "V2TF")])
+
 ; Vector with half the element size AND half the number of elements.
 (define_mode_attr vec_halfhalf
   [(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
@@ -748,6 +767,109 @@  (define_insn "*vec_perm<mode>"
   "vperm\t%v0,%v1,%v2,%v3"
   [(set_attr "op_type" "VRR")])
 
+(define_insn "*vmrhb" ; Merge high: interleave elements 0-7 of operands 1 and 2.
+  [(set (match_operand:V16QI                     0 "register_operand" "=v")
+        (vec_select:V16QI
+	  (vec_concat:V32QI (match_operand:V16QI 1 "register_operand"  "v")
+			    (match_operand:V16QI 2 "register_operand"  "v"))
+	  (parallel [(const_int 0) (const_int 16)
+		     (const_int 1) (const_int 17)
+		     (const_int 2) (const_int 18)
+		     (const_int 3) (const_int 19)
+		     (const_int 4) (const_int 20)
+		     (const_int 5) (const_int 21)
+		     (const_int 6) (const_int 22)
+		     (const_int 7) (const_int 23)])))]
+  "TARGET_VX"
+  "vmrhb\t%0,%1,%2";
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrlb" ; Merge low: interleave elements 8-15 of operands 1 and 2.
+  [(set (match_operand:V16QI                     0 "register_operand" "=v")
+        (vec_select:V16QI
+	  (vec_concat:V32QI (match_operand:V16QI 1 "register_operand"  "v")
+			    (match_operand:V16QI 2 "register_operand"  "v"))
+	  (parallel [(const_int  8) (const_int 24)
+		     (const_int  9) (const_int 25)
+		     (const_int 10) (const_int 26)
+		     (const_int 11) (const_int 27)
+		     (const_int 12) (const_int 28)
+		     (const_int 13) (const_int 29)
+		     (const_int 14) (const_int 30)
+		     (const_int 15) (const_int 31)])))]
+  "TARGET_VX"
+  "vmrlb\t%0,%1,%2";
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrhh" ; Merge high: interleave elements 0-3 of operands 1 and 2.
+  [(set (match_operand:V8HI                     0 "register_operand" "=v")
+        (vec_select:V8HI
+	  (vec_concat:V16HI (match_operand:V8HI 1 "register_operand"  "v")
+			    (match_operand:V8HI 2 "register_operand"  "v"))
+	  (parallel [(const_int 0) (const_int 8)
+		     (const_int 1) (const_int 9)
+		     (const_int 2) (const_int 10)
+		     (const_int 3) (const_int 11)])))]
+  "TARGET_VX"
+  "vmrhh\t%0,%1,%2";
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrlh" ; Merge low: interleave elements 4-7 of operands 1 and 2.
+  [(set (match_operand:V8HI                     0 "register_operand" "=v")
+        (vec_select:V8HI
+	  (vec_concat:V16HI (match_operand:V8HI 1 "register_operand"  "v")
+			    (match_operand:V8HI 2 "register_operand"  "v"))
+	  (parallel [(const_int 4) (const_int 12)
+		     (const_int 5) (const_int 13)
+		     (const_int 6) (const_int 14)
+		     (const_int 7) (const_int 15)])))]
+  "TARGET_VX"
+  "vmrlh\t%0,%1,%2";
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrhf" ; Merge high: interleave elements 0-1 of operands 1 and 2.
+  [(set (match_operand:V_HW_4                              0 "register_operand" "=v")
+        (vec_select:V_HW_4
+	  (vec_concat:<vec_2x_nelts> (match_operand:V_HW_4 1 "register_operand"  "v")
+				     (match_operand:V_HW_4 2 "register_operand"  "v"))
+	  (parallel [(const_int 0) (const_int 4)
+		     (const_int 1) (const_int 5)])))]
+  "TARGET_VX"
+  "vmrhf\t%0,%1,%2";
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrlf" ; Merge low: interleave elements 2-3 of operands 1 and 2.
+  [(set (match_operand:V_HW_4                              0 "register_operand" "=v")
+        (vec_select:V_HW_4
+	  (vec_concat:<vec_2x_nelts> (match_operand:V_HW_4 1 "register_operand"  "v")
+				     (match_operand:V_HW_4 2 "register_operand"  "v"))
+	  (parallel [(const_int 2) (const_int 6)
+		     (const_int 3) (const_int 7)])))]
+  "TARGET_VX"
+  "vmrlf\t%0,%1,%2";
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrhg" ; Merge high: element 0 of operand 1, then element 0 of operand 2.
+  [(set (match_operand:V_HW_2                              0 "register_operand" "=v")
+        (vec_select:V_HW_2
+	  (vec_concat:<vec_2x_nelts> (match_operand:V_HW_2 1 "register_operand"  "v")
+				     (match_operand:V_HW_2 2 "register_operand"  "v"))
+	  (parallel [(const_int 0) (const_int 2)])))]
+  "TARGET_VX"
+  "vmrhg\t%0,%1,%2";
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrlg" ; Merge low: element 1 of operand 1, then element 1 of operand 2.
+  [(set (match_operand:V_HW_2                              0 "register_operand" "=v")
+        (vec_select:V_HW_2
+	  (vec_concat:<vec_2x_nelts> (match_operand:V_HW_2 1 "register_operand"  "v")
+				     (match_operand:V_HW_2 2 "register_operand"  "v"))
+	  (parallel [(const_int 1) (const_int 3)])))]
+  "TARGET_VX"
+  "vmrlg\t%0,%1,%2";
+  [(set_attr "op_type" "VRR")])
+
+
 (define_insn "*tf_to_fprx2_0"
   [(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0)
 	(subreg:DF (match_operand:TF    1 "general_operand"       "v") 0))]
@@ -1271,12 +1393,14 @@  (define_expand "vec_widen_umult_lo_<mode>"
 	(unspec:<vec_double> [(match_dup 1) (match_dup 2)]
 			     UNSPEC_VEC_UMULT_ODD))
    (set (match_operand:<vec_double>                 0 "register_operand" "")
-	(unspec:<vec_double> [(match_dup 3) (match_dup 4)]
-			     UNSPEC_VEC_MERGEL))]
+        (vec_select:<vec_double>
+	 (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4))
+	 (match_dup 5)))]
   "TARGET_VX"
  {
    operands[3] = gen_reg_rtx (<vec_double>mode);
    operands[4] = gen_reg_rtx (<vec_double>mode);
+   operands[5] = s390_expand_merge_perm_const (<vec_double>mode, false);
  })
 
 (define_expand "vec_widen_umult_hi_<mode>"
@@ -1288,12 +1412,14 @@  (define_expand "vec_widen_umult_hi_<mode>"
 	(unspec:<vec_double> [(match_dup 1) (match_dup 2)]
 			     UNSPEC_VEC_UMULT_ODD))
    (set (match_operand:<vec_double>                 0 "register_operand" "")
-	(unspec:<vec_double> [(match_dup 3) (match_dup 4)]
-			     UNSPEC_VEC_MERGEH))]
+        (vec_select:<vec_double>
+	 (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4))
+	 (match_dup 5)))]
   "TARGET_VX"
  {
    operands[3] = gen_reg_rtx (<vec_double>mode);
    operands[4] = gen_reg_rtx (<vec_double>mode);
+   operands[5] = s390_expand_merge_perm_const (<vec_double>mode, true);
  })
 
 (define_expand "vec_widen_smult_lo_<mode>"
@@ -1305,12 +1431,14 @@  (define_expand "vec_widen_smult_lo_<mode>"
 	(unspec:<vec_double> [(match_dup 1) (match_dup 2)]
 			     UNSPEC_VEC_SMULT_ODD))
    (set (match_operand:<vec_double>                 0 "register_operand" "")
-	(unspec:<vec_double> [(match_dup 3) (match_dup 4)]
-			     UNSPEC_VEC_MERGEL))]
+        (vec_select:<vec_double>
+	 (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4))
+	 (match_dup 5)))]
   "TARGET_VX"
  {
    operands[3] = gen_reg_rtx (<vec_double>mode);
    operands[4] = gen_reg_rtx (<vec_double>mode);
+   operands[5] = s390_expand_merge_perm_const (<vec_double>mode, false);
  })
 
 (define_expand "vec_widen_smult_hi_<mode>"
@@ -1322,12 +1450,14 @@  (define_expand "vec_widen_smult_hi_<mode>"
 	(unspec:<vec_double> [(match_dup 1) (match_dup 2)]
 			     UNSPEC_VEC_SMULT_ODD))
    (set (match_operand:<vec_double>                 0 "register_operand" "")
-	(unspec:<vec_double> [(match_dup 3) (match_dup 4)]
-			     UNSPEC_VEC_MERGEH))]
+        (vec_select:<vec_double>
+	 (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4))
+	 (match_dup 5)))]
   "TARGET_VX"
  {
    operands[3] = gen_reg_rtx (<vec_double>mode);
    operands[4] = gen_reg_rtx (<vec_double>mode);
+   operands[5] = s390_expand_merge_perm_const (<vec_double>mode, true);
  })
 
 ; vec_widen_ushiftl_hi
@@ -2166,29 +2296,35 @@  (define_insn "*vec_extendv4sf"
 
 (define_expand "vec_unpacks_lo_v4sf"
   [(set (match_dup 2)
-	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")
-		      (match_dup 1)]
-		     UNSPEC_VEC_MERGEL))
-   (set (match_operand:V2DF               0 "register_operand" "=v")
+        (vec_select:V4SF
+	 (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1))
+	 (match_dup 3)))
+   (set (match_operand:V2DF                   0 "register_operand" "")
 	(float_extend:V2DF
 	 (vec_select:V2SF
 	  (match_dup 2)
 	  (parallel [(const_int 0) (const_int 2)]))))]
   "TARGET_VX"
-{ operands[2] = gen_reg_rtx(V4SFmode); })
+{
+  operands[2] = gen_reg_rtx(V4SFmode);
+  operands[3] = s390_expand_merge_perm_const (V4SFmode, false);
+})
 
 (define_expand "vec_unpacks_hi_v4sf"
   [(set (match_dup 2)
-	(unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")
-		      (match_dup 1)]
-		     UNSPEC_VEC_MERGEH))
-   (set (match_operand:V2DF               0 "register_operand" "=v")
+        (vec_select:V4SF
+	 (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1))
+	 (match_dup 3)))
+   (set (match_operand:V2DF                   0 "register_operand" "")
 	(float_extend:V2DF
 	 (vec_select:V2SF
 	  (match_dup 2)
 	  (parallel [(const_int 0) (const_int 2)]))))]
   "TARGET_VX"
-{ operands[2] = gen_reg_rtx(V4SFmode); })
+{
+  operands[2] = gen_reg_rtx(V4SFmode);
+  operands[3] = s390_expand_merge_perm_const (V4SFmode, true);
+})
 
 
 ; double -> long double
@@ -2204,29 +2340,35 @@  (define_insn "*vec_extendv2df"
 
 (define_expand "vec_unpacks_lo_v2df"
   [(set (match_dup 2)
-	(unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v")
-		      (match_dup 1)]
-		     UNSPEC_VEC_MERGEL))
-   (set (match_operand:V1TF               0 "register_operand" "=v")
+        (vec_select:V2DF
+	 (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1))
+	 (match_dup 3)))
+   (set (match_operand:V1TF                   0 "register_operand" "")
 	(float_extend:V1TF
 	 (vec_select:V1DF
 	  (match_dup 2)
 	  (parallel [(const_int 0)]))))]
   "TARGET_VXE"
-{ operands[2] = gen_reg_rtx (V2DFmode); })
+{
+  operands[2] = gen_reg_rtx (V2DFmode);
+  operands[3] = s390_expand_merge_perm_const (V2DFmode, false);
+})
 
 (define_expand "vec_unpacks_hi_v2df"
   [(set (match_dup 2)
-	(unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v")
-		      (match_dup 1)]
-		     UNSPEC_VEC_MERGEH))
-   (set (match_operand:V1TF               0 "register_operand" "=v")
+        (vec_select:V2DF
+	 (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1))
+	 (match_dup 3)))
+   (set (match_operand:V1TF                   0 "register_operand" "")
 	(float_extend:V1TF
 	 (vec_select:V1DF
 	  (match_dup 2)
 	  (parallel [(const_int 0)]))))]
   "TARGET_VXE"
-{ operands[2] = gen_reg_rtx (V2DFmode); })
+{
+  operands[2] = gen_reg_rtx (V2DFmode);
+  operands[3] = s390_expand_merge_perm_const (V2DFmode, true);
+})
 
 
 ; 2 x v2df -> 1 x v4sf
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index 3df501b562c..5abe43b9e53 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -22,7 +22,7 @@ 
 
 (define_mode_iterator V_HW_32_64 [V4SI V2DI V2DF (V4SF "TARGET_VXE")])
 (define_mode_iterator VI_HW_SD [V4SI V2DI])
-(define_mode_iterator V_HW_4 [V4SI V4SF])
+
 ; Full size vector modes with more than one element which are directly supported in vector registers by the hardware.
 (define_mode_iterator VEC_HW  [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE")])
 (define_mode_iterator VECF_HW [(V4SF "TARGET_VXE") V2DF])
@@ -232,28 +232,27 @@  (define_insn "*vlrlrv16qi"
   [(set_attr "op_type" "VRS,VRX,VSI")])
 
 
-; FIXME: The following two patterns might using vec_merge. But what is
-; the canonical form: (vec_select (vec_merge op0 op1)) or (vec_merge
-; (vec_select op0) (vec_select op1)
 ; vmrhb, vmrhh, vmrhf, vmrhg
-(define_insn "vec_mergeh<mode>"
-  [(set (match_operand:V_128_NOSINGLE                         0 "register_operand" "=v")
-	(unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand"  "v")
-			(match_operand:V_128_NOSINGLE         2 "register_operand"  "v")]
-		       UNSPEC_VEC_MERGEH))]
+(define_expand "vec_mergeh<mode>"
+  [(match_operand:V_128_NOSINGLE 0 "register_operand" "")
+   (match_operand:V_128_NOSINGLE 1 "register_operand" "")
+   (match_operand:V_128_NOSINGLE 2 "register_operand" "")]
   "TARGET_VX"
-  "vmrh<bhfgq>\t%v0,%1,%2"
-  [(set_attr "op_type" "VRR")])
+{
+  s390_expand_merge (operands[0], operands[1], operands[2], true);
+  DONE;
+})
 
 ; vmrlb, vmrlh, vmrlf, vmrlg
-(define_insn "vec_mergel<mode>"
-  [(set (match_operand:V_128_NOSINGLE                         0 "register_operand" "=v")
-	(unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand"  "v")
-			(match_operand:V_128_NOSINGLE         2 "register_operand"  "v")]
-		     UNSPEC_VEC_MERGEL))]
+(define_expand "vec_mergel<mode>"
+  [(match_operand:V_128_NOSINGLE 0 "register_operand" "")
+   (match_operand:V_128_NOSINGLE 1 "register_operand" "")
+   (match_operand:V_128_NOSINGLE 2 "register_operand" "")]
   "TARGET_VX"
-  "vmrl<bhfgq>\t%v0,%1,%2"
-  [(set_attr "op_type" "VRR")])
+{
+  s390_expand_merge (operands[0], operands[1], operands[2], false);
+  DONE;
+})
 
 
 ; Vector pack
diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c
index 2dcaf08f00b..a89dd460c69 100644
--- a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c
+++ b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-in-out-hard-fp-reg.c
@@ -16,13 +16,13 @@  sqxbr (long double x)
   return out;
 }
 
-/* Ideally `vpdi %v3,%v1,%v3,5` should be optimized away, but the compiler
+/* Ideally `vmrlg %v3,%v1,%v3` should be optimized away, but the compiler
  * can't do it, because the UNSPEC pattern operates on the whole register.
  * Using the SUBREG pattern solves this problem, but it's fragile.
  */
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v2,%v0,%v2,5\n} 1 } } */
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v1,%v1,%v3,0\n} 2 } } */
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v3,%v1,%v3,5\n} 1 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v2,%v0,%v2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrhg\t%v1,%v1,%v3\n} 2 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v3,%v1,%v3\n} 1 } } */
 
 int
 main (void)
diff --git a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c
index 6c5f88d8652..dd894c8136b 100644
--- a/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c
+++ b/gcc/testsuite/gcc.target/s390/vector/long-double-asm-inout-hard-fp-reg.c
@@ -15,12 +15,12 @@  sqxbr (long double x)
   return inout;
 }
 
-/* Ideally there should be just one `vpdi %v6,%v4,%v6,5`, but the compiler
+/* Ideally there should be just one `vmrlg %v6,%v4,%v6`, but the compiler
  * can't optimize it away, because the UNSPEC pattern operates on the whole
  * register.  Using the SUBREG pattern solves this problem, but it's fragile.
  */
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v6,%v4,%v6,5\n} 2 } } */
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v4,%v4,%v6,0\n} 2 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v6,%v4,%v6\n} 2 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrhg\t%v4,%v4,%v6\n} 2 } } */
 
 int
 main (void)
diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec-types.h b/gcc/testsuite/gcc.target/s390/zvector/vec-types.h
new file mode 100644
index 00000000000..35bd2a5b7b1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/zvector/vec-types.h
@@ -0,0 +1,37 @@ 
+#ifndef VEC_TYPES_H
+#define VEC_TYPES_H 1
+
+#include <vecintrin.h>
+
+/* Integer vector types; names prefixed with 'u' are the unsigned ones.  */
+typedef __vector signed char v16qi;
+typedef __vector unsigned char uv16qi;
+typedef __vector signed short v8hi;
+typedef __vector unsigned short uv8hi;
+typedef __vector signed int v4si;
+typedef __vector unsigned int uv4si;
+typedef __vector signed long long v2di;
+typedef __vector unsigned long long uv2di;
+
+#if __SIZEOF_INT128__ == 16
+typedef __vector __int128_t v1ti;
+#endif
+
+typedef __vector double v2df;
+typedef __vector long double v1tf;
+
+#if __ARCH__ >= 12
+typedef __vector float v4sf;
+#endif
+
+/* Yield a VEC_TYPE value whose element I is I + ADDEND (converted to the
+   element type), assembled in a local 8-byte aligned element array.  */
+#define GEN_SEQ_VEC(VEC_TYPE, ADDEND)					\
+  ({ VEC_TYPE proto;							\
+    const int nelts = sizeof(VEC_TYPE) / sizeof(proto[0]);		\
+    typeof(proto[0]) __attribute__((aligned(8))) seq[nelts];		\
+    for (int i = 0; i < nelts; i++)					\
+      seq[i] = (typeof(proto[0]))(i + (ADDEND));			\
+    *(VEC_TYPE*)seq;})
+
+#endif /* VEC_TYPES_H */
diff --git a/gcc/testsuite/gcc.target/s390/zvector/vec_merge.c b/gcc/testsuite/gcc.target/s390/zvector/vec_merge.c
new file mode 100644
index 00000000000..348d1f68f80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/zvector/vec_merge.c
@@ -0,0 +1,88 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */
+/* { dg-do run { target { s390_z14_hw } } } */
+
+/* { dg-final { scan-assembler-times "\tvmrhb\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrlb\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrhh\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrlh\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrhf\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrlf\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */
+
+#include "vec-types.h"
+#include <vecintrin.h>
+
+#define GEN_MERGE(VEC_TYPE, HILO)					\
+  VEC_TYPE __attribute__((noinline))					\
+  merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) {			\
+    return vec_merge##HILO (a, b); }
+
+GEN_MERGE(v16qi, l)
+GEN_MERGE(v16qi, h)
+GEN_MERGE(uv16qi, l)
+GEN_MERGE(uv16qi, h)
+
+GEN_MERGE(v8hi, l)
+GEN_MERGE(v8hi, h)
+GEN_MERGE(uv8hi, l)
+GEN_MERGE(uv8hi, h)
+
+GEN_MERGE(v4si, l)
+GEN_MERGE(v4si, h)
+GEN_MERGE(uv4si, l)
+GEN_MERGE(uv4si, h)
+
+GEN_MERGE(v4sf, l)
+GEN_MERGE(v4sf, h)
+
+GEN_MERGE(v2di, l)
+GEN_MERGE(v2di, h)
+GEN_MERGE(uv2di, l)
+GEN_MERGE(uv2di, h)
+
+GEN_MERGE(v2df, l)
+GEN_MERGE(v2df, h)
+
+
+#define CHECK_MERGE_LO(VEC_TYPE, SRC1, SRC2)				\
+  {									\
+    VEC_TYPE v = merge_l_##VEC_TYPE ((SRC1), (SRC2));			\
+    int elts = sizeof(v) / sizeof(v[0]);				\
+    for (int i = 0; i < elts; i++)					\
+      if (v[i] != (i + elts) / 2 + (i % 2) * elts)			\
+	__builtin_abort();						\
+  }
+
+#define CHECK_MERGE_HI(VEC_TYPE, SRC1, SRC2)				\
+  {									\
+    VEC_TYPE v = merge_h_##VEC_TYPE ((SRC1), (SRC2));			\
+    int elts = sizeof(v) / sizeof(v[0]);				\
+    for (int i = 0; i < elts; i++)					\
+      if (v[i] != i / 2 + (i % 2) * elts)				\
+	__builtin_abort();						\
+  }
+
+#define CHECK_MERGE(VEC_TYPE)						\
+  {									\
+    VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0);				\
+    VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, sizeof(VEC_TYPE) / sizeof(a[0])); \
+    CHECK_MERGE_LO (VEC_TYPE, a, b);					\
+    CHECK_MERGE_HI (VEC_TYPE, a, b);					\
+  }
+
+int
+main ()
+{
+  CHECK_MERGE(v16qi);
+  CHECK_MERGE(uv16qi);
+  CHECK_MERGE(v8hi);
+  CHECK_MERGE(uv8hi);
+  CHECK_MERGE(v4si);
+  CHECK_MERGE(uv4si);
+  CHECK_MERGE(v4sf);
+  CHECK_MERGE(v2di);
+  CHECK_MERGE(uv2di);
+  CHECK_MERGE(v2df);
+}