@@ -259,14 +259,17 @@ CC_MODE (CCVFANY);
/* Vector modes. */
-VECTOR_MODES (INT, 2); /* V2QI */
-VECTOR_MODES (INT, 4); /* V4QI V2HI */
-VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */
-VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */
+VECTOR_MODES (INT, 2); /* V2QI */
+VECTOR_MODES (INT, 4); /* V4QI V2HI */
+VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */
+VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */
+VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI V2TI */
VECTOR_MODE (FLOAT, SF, 2); /* V2SF */
VECTOR_MODE (FLOAT, SF, 4); /* V4SF */
+VECTOR_MODE (FLOAT, SF, 8); /* V8SF */
VECTOR_MODE (FLOAT, DF, 2); /* V2DF */
+VECTOR_MODE (FLOAT, DF, 4); /* V4DF */
VECTOR_MODE (INT, QI, 1); /* V1QI */
VECTOR_MODE (INT, HI, 1); /* V1HI */
@@ -122,6 +122,8 @@ extern void s390_expand_vec_compare_cc (rtx, enum rtx_code, rtx, rtx, bool);
extern enum rtx_code s390_reverse_condition (machine_mode, enum rtx_code);
extern void s390_expand_vcond (rtx, rtx, rtx, enum rtx_code, rtx, rtx);
extern void s390_expand_vec_init (rtx, rtx);
+extern rtx s390_expand_merge_perm_const (machine_mode, bool);
+extern void s390_expand_merge (rtx, rtx, rtx, bool);
extern rtx s390_build_signbit_mask (machine_mode);
extern rtx s390_return_addr_rtx (int, rtx);
extern rtx s390_back_chain_rtx (void);
@@ -7014,6 +7014,42 @@ s390_expand_vec_init (rtx target, rtx vals)
}
}
+/* Return a parallel of constant integers to be used as permutation
+ vector for a vector merge operation in MODE. If HIGH_P is true the
+ left-most elements of the source vectors are merged otherwise the
+ right-most elements. */
+rtx
+s390_expand_merge_perm_const (machine_mode mode, bool high_p)
+{
+  int nelts = GET_MODE_NUNITS (mode);
+  rtx perm[16]; /* Sized for the widest supported mode (V16QI) -- NOTE(review): confirm no caller passes a mode with more than 16 elements.  */
+  int addend = high_p ? 0 : nelts; /* Merge-lo starts selecting at the vector middle.  */
+
+  for (int i = 0; i < nelts; i++)
+    perm[i] = GEN_INT ((i + addend) / 2 + (i % 2) * nelts); /* Interleave: even slots from src1, odd slots from src2.  */
+
+  return gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelts, perm));
+}
+
+/* Emit RTL to implement a vector merge operation of SRC1 and SRC2
+ which creates the result in TARGET. HIGH_P determines whether a
+ merge hi or lo will be generated. */
+void
+s390_expand_merge (rtx target, rtx src1, rtx src2, bool high_p)
+{
+  machine_mode mode = GET_MODE (target);
+  opt_machine_mode opt_mode_2x = mode_for_vector (GET_MODE_INNER (mode),
+						  2 * GET_MODE_NUNITS (mode));
+  gcc_assert (opt_mode_2x.exists ()); /* Doubled-element-count mode must exist (see VECTOR_MODES additions).  */
+  machine_mode mode_double_nelts = opt_mode_2x.require (); /* e.g. V4SI -> V8SI.  */
+  rtx constv = s390_expand_merge_perm_const (mode, high_p); /* Element selection indices.  */
+  src1 = force_reg (GET_MODE (src1), src1);
+  src2 = force_reg (GET_MODE (src2), src2);
+  rtx x = gen_rtx_VEC_CONCAT (mode_double_nelts, src1, src2); /* 2n-element concat of both sources.  */
+  x = gen_rtx_VEC_SELECT (mode, x, constv); /* Pick the merged lanes out of the concat.  */
+  emit_insn (gen_rtx_SET (target, x));
+}
+
/* Emit a vector constant that contains 1s in each element's sign bit position
and 0s in other positions. MODE is the desired constant's mode. */
extern rtx
@@ -158,8 +158,6 @@ (define_c_enum "unspec" [
UNSPEC_VEC_LOAD_BNDRY
UNSPEC_VEC_LOAD_LEN
UNSPEC_VEC_LOAD_LEN_R
- UNSPEC_VEC_MERGEH
- UNSPEC_VEC_MERGEL
UNSPEC_VEC_PACK
UNSPEC_VEC_PACK_SATURATE
UNSPEC_VEC_PACK_SATURATE_CC
@@ -50,7 +50,10 @@ (define_mode_iterator VI_HW_QHS [V16QI V8HI V4SI])
(define_mode_iterator VI_HW_HSD [V8HI V4SI V2DI])
(define_mode_iterator VI_HW_HS [V8HI V4SI])
(define_mode_iterator VI_HW_QH [V16QI V8HI])
-(define_mode_iterator VI_HW_4 [V4SI V4SF])
+
+; Directly supported vector modes with a certain number of elements
+(define_mode_iterator V_HW_2 [V2DI V2DF])
+(define_mode_iterator V_HW_4 [V4SI V4SF])
; All integer vector modes supported in a vector register + TImode
(define_mode_iterator VIT [V1QI V2QI V4QI V8QI V16QI V1HI V2HI V4HI V8HI V1SI V2SI V4SI V1DI V2DI V1TI TI])
@@ -163,14 +166,14 @@ (define_mode_attr sdx [(SF "s") (V1SF "s") (V2SF "s") (V4SF "s")
(DF "d") (V1DF "d") (V2DF "d")
(TF "x") (V1TF "x")])
-; Vector with doubled element size.
+; Vector with widened element size but half the number of elements.
(define_mode_attr vec_double [(V1QI "V1HI") (V2QI "V1HI") (V4QI "V2HI") (V8QI "V4HI") (V16QI "V8HI")
(V1HI "V1SI") (V2HI "V1SI") (V4HI "V2SI") (V8HI "V4SI")
(V1SI "V1DI") (V2SI "V1DI") (V4SI "V2DI")
(V1DI "V1TI") (V2DI "V1TI")
(V1SF "V1DF") (V2SF "V1DF") (V4SF "V2DF")])
-; Vector with half the element size.
+; Vector with halved element size but twice the number of elements.
(define_mode_attr vec_half [(V1HI "V2QI") (V2HI "V4QI") (V4HI "V8QI") (V8HI "V16QI")
(V1SI "V2HI") (V2SI "V4HI") (V4SI "V8HI")
(V1DI "V2SI") (V2DI "V4SI")
@@ -178,6 +181,22 @@ (define_mode_attr vec_half [(V1HI "V2QI") (V2HI "V4QI") (V4HI "V8QI") (V8HI "V16
(V1DF "V2SF") (V2DF "V4SF")
(V1TF "V1DF")])
+; Vector with twice the number of elements but same element size.
+(define_mode_attr vec_2x_nelts [(V1QI "V2QI") (V2QI "V4QI") (V4QI "V8QI") (V8QI "V16QI") (V16QI "V32QI")
+ (V1HI "V2HI") (V2HI "V4HI") (V4HI "V8HI") (V8HI "V16HI")
+ (V1SI "V2SI") (V2SI "V4SI") (V4SI "V8SI")
+ (V1DI "V2DI") (V2DI "V4DI")
+ (V1SF "V2SF") (V2SF "V4SF") (V4SF "V8SF")
+ (V1DF "V2DF") (V2DF "V4DF")])
+
+; Vector with widened element size and the same number of elements.
+(define_mode_attr vec_2x_wide [(V1QI "V1HI") (V2QI "V2HI") (V4QI "V4HI") (V8QI "V8HI") (V16QI "V16HI")
+ (V1HI "V1SI") (V2HI "V2SI") (V4HI "V4SI") (V8HI "V8SI")
+ (V1SI "V1DI") (V2SI "V2DI") (V4SI "V4DI")
+ (V1DI "V1TI") (V2DI "V2TI")
+ (V1SF "V1DF") (V2SF "V2DF") (V4SF "V4DF")
+ (V1DF "V1TF") (V2DF "V2TF")])
+
; Vector with half the element size AND half the number of elements.
(define_mode_attr vec_halfhalf
[(V2HI "V2QI") (V4HI "V4QI") (V8HI "V8QI")
@@ -748,6 +767,109 @@ (define_insn "*vec_perm<mode>"
"vperm\t%v0,%v1,%v2,%v3"
[(set_attr "op_type" "VRR")])
+(define_insn "*vmrhb"
+  [(set (match_operand:V16QI 0 "register_operand" "=v")
+    (vec_select:V16QI
+      (vec_concat:V32QI (match_operand:V16QI 1 "register_operand" "v")
+                        (match_operand:V16QI 2 "register_operand" "v"))
+      (parallel [(const_int 0) (const_int 16)
+                 (const_int 1) (const_int 17)
+                 (const_int 2) (const_int 18)
+                 (const_int 3) (const_int 19)
+                 (const_int 4) (const_int 20)
+                 (const_int 5) (const_int 21)
+                 (const_int 6) (const_int 22)
+                 (const_int 7) (const_int 23)])))]
+  "TARGET_VX"
+  "vmrhb\t%0,%1,%2"
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrlb"
+  [(set (match_operand:V16QI 0 "register_operand" "=v")
+    (vec_select:V16QI
+      (vec_concat:V32QI (match_operand:V16QI 1 "register_operand" "v")
+                        (match_operand:V16QI 2 "register_operand" "v"))
+      (parallel [(const_int 8) (const_int 24)
+                 (const_int 9) (const_int 25)
+                 (const_int 10) (const_int 26)
+                 (const_int 11) (const_int 27)
+                 (const_int 12) (const_int 28)
+                 (const_int 13) (const_int 29)
+                 (const_int 14) (const_int 30)
+                 (const_int 15) (const_int 31)])))]
+  "TARGET_VX"
+  "vmrlb\t%0,%1,%2"
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrhh"
+  [(set (match_operand:V8HI 0 "register_operand" "=v")
+    (vec_select:V8HI
+      (vec_concat:V16HI (match_operand:V8HI 1 "register_operand" "v")
+                        (match_operand:V8HI 2 "register_operand" "v"))
+      (parallel [(const_int 0) (const_int 8)
+                 (const_int 1) (const_int 9)
+                 (const_int 2) (const_int 10)
+                 (const_int 3) (const_int 11)])))]
+  "TARGET_VX"
+  "vmrhh\t%0,%1,%2"
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrlh"
+  [(set (match_operand:V8HI 0 "register_operand" "=v")
+    (vec_select:V8HI
+      (vec_concat:V16HI (match_operand:V8HI 1 "register_operand" "v")
+                        (match_operand:V8HI 2 "register_operand" "v"))
+      (parallel [(const_int 4) (const_int 12)
+                 (const_int 5) (const_int 13)
+                 (const_int 6) (const_int 14)
+                 (const_int 7) (const_int 15)])))]
+  "TARGET_VX"
+  "vmrlh\t%0,%1,%2"
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrhf"
+  [(set (match_operand:V_HW_4 0 "register_operand" "=v")
+    (vec_select:V_HW_4
+      (vec_concat:<vec_2x_nelts> (match_operand:V_HW_4 1 "register_operand" "v")
+                                 (match_operand:V_HW_4 2 "register_operand" "v"))
+      (parallel [(const_int 0) (const_int 4)
+                 (const_int 1) (const_int 5)])))]
+  "TARGET_VX"
+  "vmrhf\t%0,%1,%2"
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrlf"
+  [(set (match_operand:V_HW_4 0 "register_operand" "=v")
+    (vec_select:V_HW_4
+      (vec_concat:<vec_2x_nelts> (match_operand:V_HW_4 1 "register_operand" "v")
+                                 (match_operand:V_HW_4 2 "register_operand" "v"))
+      (parallel [(const_int 2) (const_int 6)
+                 (const_int 3) (const_int 7)])))]
+  "TARGET_VX"
+  "vmrlf\t%0,%1,%2"
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrhg"
+  [(set (match_operand:V_HW_2 0 "register_operand" "=v")
+    (vec_select:V_HW_2
+      (vec_concat:<vec_2x_nelts> (match_operand:V_HW_2 1 "register_operand" "v")
+                                 (match_operand:V_HW_2 2 "register_operand" "v"))
+      (parallel [(const_int 0) (const_int 2)])))]
+  "TARGET_VX"
+  "vmrhg\t%0,%1,%2"
+  [(set_attr "op_type" "VRR")])
+
+(define_insn "*vmrlg"
+  [(set (match_operand:V_HW_2 0 "register_operand" "=v")
+    (vec_select:V_HW_2
+      (vec_concat:<vec_2x_nelts> (match_operand:V_HW_2 1 "register_operand" "v")
+                                 (match_operand:V_HW_2 2 "register_operand" "v"))
+      (parallel [(const_int 1) (const_int 3)])))]
+  "TARGET_VX"
+  "vmrlg\t%0,%1,%2"
+  [(set_attr "op_type" "VRR")])
+
+
(define_insn "*tf_to_fprx2_0"
[(set (subreg:DF (match_operand:FPRX2 0 "nonimmediate_operand" "+f") 0)
(subreg:DF (match_operand:TF 1 "general_operand" "v") 0))]
@@ -1271,12 +1393,14 @@ (define_expand "vec_widen_umult_lo_<mode>"
(unspec:<vec_double> [(match_dup 1) (match_dup 2)]
UNSPEC_VEC_UMULT_ODD))
(set (match_operand:<vec_double> 0 "register_operand" "")
- (unspec:<vec_double> [(match_dup 3) (match_dup 4)]
- UNSPEC_VEC_MERGEL))]
+ (vec_select:<vec_double>
+ (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4))
+ (match_dup 5)))]
"TARGET_VX"
{
operands[3] = gen_reg_rtx (<vec_double>mode);
operands[4] = gen_reg_rtx (<vec_double>mode);
+ operands[5] = s390_expand_merge_perm_const (<vec_double>mode, false);
})
(define_expand "vec_widen_umult_hi_<mode>"
@@ -1288,12 +1412,14 @@ (define_expand "vec_widen_umult_hi_<mode>"
(unspec:<vec_double> [(match_dup 1) (match_dup 2)]
UNSPEC_VEC_UMULT_ODD))
(set (match_operand:<vec_double> 0 "register_operand" "")
- (unspec:<vec_double> [(match_dup 3) (match_dup 4)]
- UNSPEC_VEC_MERGEH))]
+ (vec_select:<vec_double>
+ (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4))
+ (match_dup 5)))]
"TARGET_VX"
{
operands[3] = gen_reg_rtx (<vec_double>mode);
operands[4] = gen_reg_rtx (<vec_double>mode);
+ operands[5] = s390_expand_merge_perm_const (<vec_double>mode, true);
})
(define_expand "vec_widen_smult_lo_<mode>"
@@ -1305,12 +1431,14 @@ (define_expand "vec_widen_smult_lo_<mode>"
(unspec:<vec_double> [(match_dup 1) (match_dup 2)]
UNSPEC_VEC_SMULT_ODD))
(set (match_operand:<vec_double> 0 "register_operand" "")
- (unspec:<vec_double> [(match_dup 3) (match_dup 4)]
- UNSPEC_VEC_MERGEL))]
+ (vec_select:<vec_double>
+ (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4))
+ (match_dup 5)))]
"TARGET_VX"
{
operands[3] = gen_reg_rtx (<vec_double>mode);
operands[4] = gen_reg_rtx (<vec_double>mode);
+ operands[5] = s390_expand_merge_perm_const (<vec_double>mode, false);
})
(define_expand "vec_widen_smult_hi_<mode>"
@@ -1322,12 +1450,14 @@ (define_expand "vec_widen_smult_hi_<mode>"
(unspec:<vec_double> [(match_dup 1) (match_dup 2)]
UNSPEC_VEC_SMULT_ODD))
(set (match_operand:<vec_double> 0 "register_operand" "")
- (unspec:<vec_double> [(match_dup 3) (match_dup 4)]
- UNSPEC_VEC_MERGEH))]
+ (vec_select:<vec_double>
+ (vec_concat:<vec_2x_wide> (match_dup 3) (match_dup 4))
+ (match_dup 5)))]
"TARGET_VX"
{
operands[3] = gen_reg_rtx (<vec_double>mode);
operands[4] = gen_reg_rtx (<vec_double>mode);
+ operands[5] = s390_expand_merge_perm_const (<vec_double>mode, true);
})
; vec_widen_ushiftl_hi
@@ -2166,29 +2296,35 @@ (define_insn "*vec_extendv4sf"
(define_expand "vec_unpacks_lo_v4sf"
[(set (match_dup 2)
- (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")
- (match_dup 1)]
- UNSPEC_VEC_MERGEL))
- (set (match_operand:V2DF 0 "register_operand" "=v")
+ (vec_select:V4SF
+ (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1))
+ (match_dup 3)))
+ (set (match_operand:V2DF 0 "register_operand" "")
(float_extend:V2DF
(vec_select:V2SF
(match_dup 2)
(parallel [(const_int 0) (const_int 2)]))))]
"TARGET_VX"
-{ operands[2] = gen_reg_rtx(V4SFmode); })
+{
+ operands[2] = gen_reg_rtx(V4SFmode);
+ operands[3] = s390_expand_merge_perm_const (V4SFmode, false);
+})
(define_expand "vec_unpacks_hi_v4sf"
[(set (match_dup 2)
- (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "v")
- (match_dup 1)]
- UNSPEC_VEC_MERGEH))
- (set (match_operand:V2DF 0 "register_operand" "=v")
+ (vec_select:V4SF
+ (vec_concat:V8SF (match_operand:V4SF 1 "register_operand" "") (match_dup 1))
+ (match_dup 3)))
+ (set (match_operand:V2DF 0 "register_operand" "")
(float_extend:V2DF
(vec_select:V2SF
(match_dup 2)
(parallel [(const_int 0) (const_int 2)]))))]
"TARGET_VX"
-{ operands[2] = gen_reg_rtx(V4SFmode); })
+{
+ operands[2] = gen_reg_rtx(V4SFmode);
+ operands[3] = s390_expand_merge_perm_const (V4SFmode, true);
+})
; double -> long double
@@ -2204,29 +2340,35 @@ (define_insn "*vec_extendv2df"
(define_expand "vec_unpacks_lo_v2df"
[(set (match_dup 2)
- (unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v")
- (match_dup 1)]
- UNSPEC_VEC_MERGEL))
- (set (match_operand:V1TF 0 "register_operand" "=v")
+ (vec_select:V2DF
+ (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1))
+ (match_dup 3)))
+ (set (match_operand:V1TF 0 "register_operand" "")
(float_extend:V1TF
(vec_select:V1DF
(match_dup 2)
(parallel [(const_int 0)]))))]
"TARGET_VXE"
-{ operands[2] = gen_reg_rtx (V2DFmode); })
+{
+ operands[2] = gen_reg_rtx (V2DFmode);
+ operands[3] = s390_expand_merge_perm_const (V2DFmode, false);
+})
(define_expand "vec_unpacks_hi_v2df"
[(set (match_dup 2)
- (unspec:V2DF [(match_operand:V2DF 1 "register_operand" "v")
- (match_dup 1)]
- UNSPEC_VEC_MERGEH))
- (set (match_operand:V1TF 0 "register_operand" "=v")
+ (vec_select:V2DF
+ (vec_concat:V4DF (match_operand:V2DF 1 "register_operand" "") (match_dup 1))
+ (match_dup 3)))
+ (set (match_operand:V1TF 0 "register_operand" "")
(float_extend:V1TF
(vec_select:V1DF
(match_dup 2)
(parallel [(const_int 0)]))))]
"TARGET_VXE"
-{ operands[2] = gen_reg_rtx (V2DFmode); })
+{
+ operands[2] = gen_reg_rtx (V2DFmode);
+ operands[3] = s390_expand_merge_perm_const (V2DFmode, true);
+})
; 2 x v2df -> 1 x v4sf
@@ -22,7 +22,7 @@
(define_mode_iterator V_HW_32_64 [V4SI V2DI V2DF (V4SF "TARGET_VXE")])
(define_mode_iterator VI_HW_SD [V4SI V2DI])
-(define_mode_iterator V_HW_4 [V4SI V4SF])
+
; Full size vector modes with more than one element which are directly supported in vector registers by the hardware.
(define_mode_iterator VEC_HW [V16QI V8HI V4SI V2DI V2DF (V4SF "TARGET_VXE")])
(define_mode_iterator VECF_HW [(V4SF "TARGET_VXE") V2DF])
@@ -232,28 +232,27 @@ (define_insn "*vlrlrv16qi"
[(set_attr "op_type" "VRS,VRX,VSI")])
-; FIXME: The following two patterns might using vec_merge. But what is
-; the canonical form: (vec_select (vec_merge op0 op1)) or (vec_merge
-; (vec_select op0) (vec_select op1)
; vmrhb, vmrhh, vmrhf, vmrhg
-(define_insn "vec_mergeh<mode>"
- [(set (match_operand:V_128_NOSINGLE 0 "register_operand" "=v")
- (unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand" "v")
- (match_operand:V_128_NOSINGLE 2 "register_operand" "v")]
- UNSPEC_VEC_MERGEH))]
+(define_expand "vec_mergeh<mode>"
+ [(match_operand:V_128_NOSINGLE 0 "register_operand" "")
+ (match_operand:V_128_NOSINGLE 1 "register_operand" "")
+ (match_operand:V_128_NOSINGLE 2 "register_operand" "")]
"TARGET_VX"
- "vmrh<bhfgq>\t%v0,%1,%2"
- [(set_attr "op_type" "VRR")])
+{
+ s390_expand_merge (operands[0], operands[1], operands[2], true);
+ DONE;
+})
; vmrlb, vmrlh, vmrlf, vmrlg
-(define_insn "vec_mergel<mode>"
- [(set (match_operand:V_128_NOSINGLE 0 "register_operand" "=v")
- (unspec:V_128_NOSINGLE [(match_operand:V_128_NOSINGLE 1 "register_operand" "v")
- (match_operand:V_128_NOSINGLE 2 "register_operand" "v")]
- UNSPEC_VEC_MERGEL))]
+(define_expand "vec_mergel<mode>"
+ [(match_operand:V_128_NOSINGLE 0 "register_operand" "")
+ (match_operand:V_128_NOSINGLE 1 "register_operand" "")
+ (match_operand:V_128_NOSINGLE 2 "register_operand" "")]
"TARGET_VX"
- "vmrl<bhfgq>\t%v0,%1,%2"
- [(set_attr "op_type" "VRR")])
+{
+ s390_expand_merge (operands[0], operands[1], operands[2], false);
+ DONE;
+})
; Vector pack
@@ -16,13 +16,13 @@ sqxbr (long double x)
return out;
}
-/* Ideally `vpdi %v3,%v1,%v3,5` should be optimized away, but the compiler
+/* Ideally `vmrlg %v3,%v1,%v3` should be optimized away, but the compiler
* can't do it, because the UNSPEC pattern operates on the whole register.
* Using the SUBREG pattern solves this problem, but it's fragile.
*/
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v2,%v0,%v2,5\n} 1 } } */
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v1,%v1,%v3,0\n} 2 } } */
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v3,%v1,%v3,5\n} 1 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v2,%v0,%v2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrhg\t%v1,%v1,%v3\n} 2 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v3,%v1,%v3\n} 1 } } */
int
main (void)
@@ -15,12 +15,12 @@ sqxbr (long double x)
return inout;
}
-/* Ideally there should be just one `vpdi %v6,%v4,%v6,5`, but the compiler
+/* Ideally there should be just one `vmrlg %v6,%v4,%v6`, but the compiler
* can't optimize it away, because the UNSPEC pattern operates on the whole
* register. Using the SUBREG pattern solves this problem, but it's fragile.
*/
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v6,%v4,%v6,5\n} 2 } } */
-/* { dg-final { scan-assembler-times {\n\tvpdi\t%v4,%v4,%v6,0\n} 2 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrlg\t%v6,%v4,%v6\n} 2 } } */
+/* { dg-final { scan-assembler-times {\n\tvmrhg\t%v4,%v4,%v6\n} 2 } } */
int
main (void)
new file mode 100644
@@ -0,0 +1,37 @@
+#ifndef VEC_TYPES_H
+#define VEC_TYPES_H 1
+
+#include <vecintrin.h>
+
+typedef __vector signed char v16qi;
+typedef __vector unsigned char uv16qi;
+
+typedef __vector signed short v8hi;
+typedef __vector unsigned short uv8hi;
+
+typedef __vector signed int v4si;
+typedef __vector unsigned int uv4si;
+
+typedef __vector signed long long v2di;
+typedef __vector unsigned long long uv2di;
+
+#if __SIZEOF_INT128__ == 16
+typedef __vector __int128_t v1ti;
+#endif
+
+typedef __vector double v2df;
+typedef __vector long double v1tf;
+
+#if __ARCH__ >= 12
+typedef __vector float v4sf;
+#endif
+
+#define GEN_SEQ_VEC(VEC_TYPE, ADDEND) \
+ ({ VEC_TYPE dummy; \
+ const int elts = sizeof(VEC_TYPE) / sizeof(dummy[0]); \
+ typeof(dummy[0]) __attribute__((aligned(8))) ar[elts]; \
+ for (int i = 0; i < elts; i++) \
+ ar[i] = (typeof(dummy[0]))(i + (ADDEND)); \
+ *(VEC_TYPE*)ar;})
+
+#endif
new file mode 100644
@@ -0,0 +1,88 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z14 -mzvector --save-temps" } */
+/* { dg-do run { target { s390_z14_hw } } } */
+
+/* { dg-final { scan-assembler-times "\tvmrhb\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrlb\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrhh\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrlh\t" 2 } } */
+/* { dg-final { scan-assembler-times "\tvmrhf\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrlf\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrhg\t" 3 } } */
+/* { dg-final { scan-assembler-times "\tvmrlg\t" 3 } } */
+
+#include "vec-types.h"
+#include <vecintrin.h>
+
+#define GEN_MERGE(VEC_TYPE, HILO) \
+ VEC_TYPE __attribute__((noinline)) \
+ merge_##HILO##_##VEC_TYPE(VEC_TYPE a, VEC_TYPE b) { \
+ return vec_merge##HILO (a, b); }
+
+GEN_MERGE(v16qi, l)
+GEN_MERGE(v16qi, h)
+GEN_MERGE(uv16qi, l)
+GEN_MERGE(uv16qi, h)
+
+GEN_MERGE(v8hi, l)
+GEN_MERGE(v8hi, h)
+GEN_MERGE(uv8hi, l)
+GEN_MERGE(uv8hi, h)
+
+GEN_MERGE(v4si, l)
+GEN_MERGE(v4si, h)
+GEN_MERGE(uv4si, l)
+GEN_MERGE(uv4si, h)
+
+GEN_MERGE(v4sf, l)
+GEN_MERGE(v4sf, h)
+
+GEN_MERGE(v2di, l)
+GEN_MERGE(v2di, h)
+GEN_MERGE(uv2di, l)
+GEN_MERGE(uv2di, h)
+
+GEN_MERGE(v2df, l)
+GEN_MERGE(v2df, h)
+
+
+#define CHECK_MERGE_LO(VEC_TYPE, SRC1, SRC2) \
+ { \
+ VEC_TYPE v = merge_l_##VEC_TYPE ((SRC1), (SRC2)); \
+ int elts = sizeof(v) / sizeof(v[0]); \
+ for (int i = 0; i < elts; i++) \
+ if (v[i] != (i + elts) / 2 + (i % 2) * elts) \
+ __builtin_abort(); \
+ }
+
+#define CHECK_MERGE_HI(VEC_TYPE, SRC1, SRC2) \
+ { \
+ VEC_TYPE v = merge_h_##VEC_TYPE ((SRC1), (SRC2)); \
+ int elts = sizeof(v) / sizeof(v[0]); \
+ for (int i = 0; i < elts; i++) \
+ if (v[i] != i / 2 + (i % 2) * elts) \
+ __builtin_abort(); \
+ }
+
+#define CHECK_MERGE(VEC_TYPE) \
+ { \
+ VEC_TYPE a = GEN_SEQ_VEC (VEC_TYPE, 0); \
+ VEC_TYPE b = GEN_SEQ_VEC (VEC_TYPE, sizeof(VEC_TYPE) / sizeof(a[0])); \
+ CHECK_MERGE_LO (VEC_TYPE, a, b); \
+ CHECK_MERGE_HI (VEC_TYPE, a, b); \
+ }
+
+int
+main ()
+{
+ CHECK_MERGE(v16qi);
+ CHECK_MERGE(uv16qi);
+ CHECK_MERGE(v8hi);
+ CHECK_MERGE(uv8hi);
+ CHECK_MERGE(v4si);
+ CHECK_MERGE(uv4si);
+ CHECK_MERGE(v4sf);
+ CHECK_MERGE(v2di);
+ CHECK_MERGE(uv2di);
+ CHECK_MERGE(v2df);
+}