diff mbox series

i386: Add init pattern for V2HI vectors [PR100637]

Message ID CAFULd4ZNmDhbMA4zbePjf-qtwR-s9hMqhZS7_wKLNzWud8VEow@mail.gmail.com
State New
Headers show
Series i386: Add init pattern for V2HI vectors [PR100637] | expand

Commit Message

Uros Bizjak June 4, 2021, 3:40 p.m. UTC
2021-06-03  Uroš Bizjak  <ubizjak@gmail.com>

gcc/
    PR target/100637
    * config/i386/i386-expand.c (ix86_expand_vector_init_duplicate):
    Handle V2HI mode.
    (ix86_expand_vector_init_general): Ditto.
    Use SImode instead of word_mode for logic operations
    when GET_MODE_SIZE (mode) < UNITS_PER_WORD.
    (expand_vec_perm_even_odd_1): Assert that V2HI mode should be
    implemented by expand_vec_perm_1.
    (expand_vec_perm_broadcast_1): Assert that V2HI and V4HI modes
    should be implemented using standard shuffle patterns.
    (ix86_vectorize_vec_perm_const): Handle V2HImode.  Add V4HI and
    V2HI modes to the modes implementable with a one-operand shuffle.
    * config/i386/mmx.md (*punpckwd): New insn_and_split pattern.
    (*pshufw_1): New insn pattern.
    (*vec_dupv2hi): Ditto.
    (vec_initv2hihi): New expander.

gcc/testsuite/
    PR target/100637
    * gcc.dg/vect/slp-perm-9.c (dg-final): Adjust dumps for vect32 targets.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Pushed to master.

Uros.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index eb7cdb0c14f..661d91abe4e 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -13723,6 +13723,19 @@  ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
 	}
       goto widen;
 
+    case E_V2HImode:
+      if (TARGET_SSE2)
+	{
+	  rtx x;
+
+	  val = gen_lowpart (SImode, val);
+	  x = gen_rtx_TRUNCATE (HImode, val);
+	  x = gen_rtx_VEC_DUPLICATE (mode, x);
+	  emit_insn (gen_rtx_SET (target, x));
+	  return true;
+	}
+      return false;
+
     case E_V8QImode:
       if (!mmx_ok)
 	return false;
@@ -14524,6 +14537,8 @@  quarter:
 
     case E_V4HImode:
     case E_V8QImode:
+
+    case E_V2HImode:
       break;
 
     default:
@@ -14532,12 +14547,14 @@  quarter:
 
     {
       int i, j, n_elts, n_words, n_elt_per_word;
-      machine_mode inner_mode;
+      machine_mode tmp_mode, inner_mode;
       rtx words[4], shift;
 
+      tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
+
       inner_mode = GET_MODE_INNER (mode);
       n_elts = GET_MODE_NUNITS (mode);
-      n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+      n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
       n_elt_per_word = n_elts / n_words;
       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
 
@@ -14548,15 +14565,15 @@  quarter:
 	  for (j = 0; j < n_elt_per_word; ++j)
 	    {
 	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
-	      elt = convert_modes (word_mode, inner_mode, elt, true);
+	      elt = convert_modes (tmp_mode, inner_mode, elt, true);
 
 	      if (j == 0)
 		word = elt;
 	      else
 		{
-		  word = expand_simple_binop (word_mode, ASHIFT, word, shift,
+		  word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
 					      word, 1, OPTAB_LIB_WIDEN);
-		  word = expand_simple_binop (word_mode, IOR, word, elt,
+		  word = expand_simple_binop (tmp_mode, IOR, word, elt,
 					      word, 1, OPTAB_LIB_WIDEN);
 		}
 	    }
@@ -14570,14 +14587,14 @@  quarter:
 	{
 	  rtx tmp = gen_reg_rtx (mode);
 	  emit_clobber (tmp);
-	  emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
-	  emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
+	  emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
+	  emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
 	  emit_move_insn (target, tmp);
 	}
       else if (n_words == 4)
 	{
 	  rtx tmp = gen_reg_rtx (V4SImode);
-	  gcc_assert (word_mode == SImode);
+	  gcc_assert (tmp_mode == SImode);
 	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
 	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
 	  emit_move_insn (target, gen_lowpart (mode, tmp));
@@ -19544,6 +19561,7 @@  expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
     case E_V2DImode:
     case E_V2SImode:
     case E_V4SImode:
+    case E_V2HImode:
       /* These are always directly implementable by expand_vec_perm_1.  */
       gcc_unreachable ();
 
@@ -19754,6 +19772,8 @@  expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
     case E_V2DImode:
     case E_V2SImode:
     case E_V4SImode:
+    case E_V2HImode:
+    case E_V4HImode:
       /* These are always implementable using standard shuffle patterns.  */
       gcc_unreachable ();
 
@@ -20263,6 +20283,10 @@  ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
       if (!TARGET_MMX_WITH_SSE)
 	return false;
       break;
+    case E_V2HImode:
+	if (!TARGET_SSE2)
+	  return false;
+	break;
     case E_V2DImode:
     case E_V2DFmode:
       if (!TARGET_SSE)
@@ -20294,10 +20318,11 @@  ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
       /* Check whether the mask can be applied to the vector type.  */
       d.one_operand_p = (which != 3);
 
-      /* Implementable with shufps or pshufd.  */
+      /* Implementable with shufps, pshufd or pshuflw.  */
       if (d.one_operand_p
 	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
-	      || d.vmode == V4SImode || d.vmode == V2SImode))
+	      || d.vmode == V4SImode || d.vmode == V2SImode
+	      || d.vmode == V4HImode || d.vmode == V2HImode))
 	return true;
 
       /* Otherwise we have to go through the motions and see if we can
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 914e5e91e90..c3fd2805f25 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -3292,6 +3292,88 @@  (define_expand "vec_extractv4qiqi"
   DONE;
 })
 
+(define_insn_and_split "*punpckwd"
+  [(set (match_operand:V2HI 0 "register_operand" "=x,Yw")
+	(vec_select:V2HI
+	  (vec_concat:V4HI
+	    (match_operand:V2HI 1 "register_operand" "0,Yw")
+	    (match_operand:V2HI 2 "register_operand" "x,Yw"))
+          (parallel [(match_operand 3 "const_0_to_3_operand")
+                     (match_operand 4 "const_0_to_3_operand")])))]
+  "TARGET_SSE2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 5)
+        (vec_select:V4HI
+	  (match_dup 5)
+          (parallel [(match_dup 3) (match_dup 4)
+                     (const_int 0) (const_int 0)])))]
+{
+  rtx dest = lowpart_subreg (V8HImode, operands[0], V2HImode);
+  rtx op1 = lowpart_subreg (V8HImode, operands[1], V2HImode);
+  rtx op2 = lowpart_subreg (V8HImode, operands[2], V2HImode);
+
+  emit_insn (gen_vec_interleave_lowv8hi (dest, op1, op2));
+
+  static const int map[4] = { 0, 2, 1, 3 };
+
+  int sel0 = map[INTVAL (operands[3])];
+  int sel1 = map[INTVAL (operands[4])];
+
+  if (sel0 == 0 && sel1 == 1)
+    DONE;
+
+  operands[3] = GEN_INT (sel0);
+  operands[4] = GEN_INT (sel1);
+
+  operands[5] = lowpart_subreg (V4HImode, dest, V8HImode);
+}
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "mode" "TI")])
+
+(define_insn "*pshufw_1"
+  [(set (match_operand:V2HI 0 "register_operand" "=Yw")
+        (vec_select:V2HI
+          (match_operand:V2HI 1 "register_operand" "Yw")
+          (parallel [(match_operand 2 "const_0_to_1_operand")
+                     (match_operand 3 "const_0_to_1_operand")])))]
+  "TARGET_SSE2"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= 2 << 4;
+  mask |= 3 << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "%vpshuflw\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog1")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*vec_dupv2hi"
+  [(set (match_operand:V2HI 0 "register_operand" "=Yw")
+	(vec_duplicate:V2HI
+	  (truncate:HI
+	    (match_operand:SI 1 "register_operand" "Yw"))))]
+  "TARGET_SSE2"
+  "%vpshuflw\t{$0, %1, %0|%0, %1, 0}"
+  [(set_attr "type" "sselog1")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "TI")])
+
+(define_expand "vec_initv2hihi"
+  [(match_operand:V2HI 0 "register_operand")
+   (match_operand 1)]
+  "TARGET_SSE2"
+{
+  ix86_expand_vector_init (false, operands[0],
+			   operands[1]);
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Miscellaneous
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
index ab75f44eb75..873eddf223e 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c
@@ -57,13 +57,13 @@  int main (int argc, const char* argv[])
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 2 "vect" { target { ! { vect_perm_short || vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_perm_short || vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 2 "vect" { target { ! { { vect_perm_short || vect32 } || vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { { vect_perm_short || vect32 } || vect_load_lanes } } } } */
 /* We don't try permutes with a group size of 3 for variable-length
    vectors.  */
 /* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target { vect_perm_short && { { ! vect_perm3_short } && { ! vect_partial_vectors_usage_1 } } } xfail vect_variable_length } } } */
 /* Try to vectorize the epilogue using partial vectors.  */
 /* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 2 "vect" { target { vect_perm_short && { { ! vect_perm3_short } && vect_partial_vectors_usage_1 } } xfail vect_variable_length } } } */
 /* { dg-final { scan-tree-dump-not "permutation requires at least three vectors" "vect" { target vect_perm3_short } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { ! vect_perm3_short } || vect_load_lanes } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_perm3_short && { ! vect_load_lanes } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { { ! { vect_perm3_short || vect32 } } || vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { { vect_perm3_short || vect32 } && { ! vect_load_lanes } } } } } */