diff mbox series

Use pblendw instead of pand to clear upper 16 bits.

Message ID 20240517013813.493642-1-hongtao.liu@intel.com
State New
Headers show
Series Use pblendw instead of pand to clear upper 16 bits. | expand

Commit Message

liuhongt May 17, 2024, 1:38 a.m. UTC
For vec_pack_truncv8si/v4si without AVX512,
(const_vector:v4si (const_int 0xffff) x4) is used as a mask to clear
the upper 16 bits of each element.  vpblendw with a zero vector can be
used instead, and a zero vector is cheaper than
(const_vector:v4si (const_int 0xffff) x4).

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:
	PR target/114427
	* config/i386/i386-expand.cc (expand_vec_perm_even_odd_pack):
	Use pblendw instead of pand to clear upper bits.

gcc/testsuite/ChangeLog:
	* gcc.target/i386/pr114427.c: New test.
---
 gcc/config/i386/i386-expand.cc           | 34 +++++++++++++++++++++---
 gcc/testsuite/gcc.target/i386/pr114427.c | 18 +++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114427.c
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4e16aedc5c1..231e9321d81 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22918,6 +22918,7 @@  expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
 {
   rtx op, dop0, dop1, t;
   unsigned i, odd, c, s, nelt = d->nelt;
+  int pblendw_i = 0;
   bool end_perm = false;
   machine_mode half_mode;
   rtx (*gen_and) (rtx, rtx, rtx);
@@ -22939,6 +22940,7 @@  expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
       gen_and = gen_andv2si3;
       gen_pack = gen_mmx_packusdw;
       gen_shift = gen_lshrv2si3;
+      pblendw_i = 0x5;
       break;
     case E_V8HImode:
       /* Required for "pack".  */
@@ -22950,6 +22952,7 @@  expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
       gen_and = gen_andv4si3;
       gen_pack = gen_sse4_1_packusdw;
       gen_shift = gen_lshrv4si3;
+      pblendw_i = 0x55;
       break;
     case E_V8QImode:
       /* No check as all instructions are SSE2.  */
@@ -22978,6 +22981,7 @@  expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
       gen_and = gen_andv8si3;
       gen_pack = gen_avx2_packusdw;
       gen_shift = gen_lshrv8si3;
+      pblendw_i = 0x5555;
       end_perm = true;
       break;
     case E_V32QImode:
@@ -23013,10 +23017,32 @@  expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
   dop1 = gen_reg_rtx (half_mode);
   if (odd == 0)
     {
-      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
-      t = force_reg (half_mode, t);
-      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
-      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+      /* Use pblendw since const_vector 0 should be cheaper than
+	 const_vector 0xffff.  */
+      if (d->vmode == V4HImode
+	  || d->vmode == E_V8HImode
+	  || d->vmode == E_V16HImode)
+	{
+	  rtx dop0_t = gen_reg_rtx (d->vmode);
+	  rtx dop1_t = gen_reg_rtx (d->vmode);
+	  t = gen_reg_rtx (d->vmode);
+	  emit_move_insn (t, CONST0_RTX (d->vmode));
+
+	  emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
+						     GEN_INT (pblendw_i)));
+	  emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
+						     GEN_INT (pblendw_i)));
+
+	  emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
+	  emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
+	}
+      else
+	{
+	  t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
+	  t = force_reg (half_mode, t);
+	  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+	  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+	}
     }
   else
     {
diff --git a/gcc/testsuite/gcc.target/i386/pr114427.c b/gcc/testsuite/gcc.target/i386/pr114427.c
new file mode 100644
index 00000000000..58b66db7fff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114427.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "vpand" } } */
+/* { dg-final { scan-assembler-not "65535" } } */
+
+void
+foo (int* a, short* __restrict b, int* c)
+{
+    for (int i = 0; i != 16; i++)
+      b[i] = c[i] + a[i];
+}
+
+void
+foo1 (int* a, short* __restrict b, int* c)
+{
+    for (int i = 0; i != 8; i++)
+      b[i] = c[i] + a[i];
+}