diff mbox

Fix sse4_1_insertps (PR target/48605)

Message ID 20110414191113.GQ17079@tyan-ft48-01.lab.bos.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek April 14, 2011, 7:11 p.m. UTC
Hi!

The insertps and vinsertps insns work differently if %2 is a register
resp. memory.  The register is 128-bit XMM register and the upper 2 bits
of the 8 bit immediate then select which of the 4 parts of that register to
choose.  If it is a memory though, the upper 2 bits of the 8 bit immediate
are ignored and the memory is 32-bit rather than 128-bit.
The following patch fixes two problems - with -masm=intel any
sse4_1_insertps emitted insn when operands[2] is a MEM wouldn't assemble,
as it was using XMMWORD instead of DWORD.  And the second problem is
that if the top 2 bits are non-zero, the address needs to be adjusted
(and there is no reason not to clear the upper two bits of the immediate
at the same time).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk/4.6?

2011-04-14  Jakub Jelinek  <jakub@redhat.com>

	PR target/48605
	* config/i386/sse.md (sse4_1_insertps): If operands[2] is a MEM,
	offset it as needed based on top 2 bits in operands[3], change
	MEM mode to SFmode and mask those 2 bits away from operands[3].

	* gcc.target/i386/sse4_1-insertps-3.c: New test.
	* gcc.target/i386/sse4_1-insertps-4.c: New test.
	* gcc.target/i386/avx-insertps-3.c: New test.
	* gcc.target/i386/avx-insertps-4.c: New test.


	Jakub

Comments

Uros Bizjak April 14, 2011, 8:30 p.m. UTC | #1
On Thu, Apr 14, 2011 at 9:11 PM, Jakub Jelinek <jakub@redhat.com> wrote:

> The insertps and vinsertps insns work differently if %2 is a register
> resp. memory.  The register is 128-bit XMM register and the upper 2 bits
> of the 8 bit immediate then select which of the 4 parts of that register to
> choose.  If it is a memory though, the upper 2 bits of the 8 bit immediate
> are ignored and the memory is 32-bit rather than 128-bit.
> The following patch fixes two problems - with -masm=intel any
> sse4_1_insertps emitted insn when operands[2] is a MEM wouldn't assemble,
> as it was using XMMWORD instead of DWORD.  And the second problem is
> that if the top 2 bits are non-zero, the address needs to be adjusted
> (and there is no reason not to clear the upper two bits of the immediate
> at the same time).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk/4.6?
>
> 2011-04-14  Jakub Jelinek  <jakub@redhat.com>
>
>        PR target/48605
>        * config/i386/sse.md (sse4_1_insertps): If operands[2] is a MEM,
>        offset it as needed based on top 2 bits in operands[3], change
>        MEM mode to SFmode and mask those 2 bits away from operands[3].
>
>        * gcc.target/i386/sse4_1-insertps-3.c: New test.
>        * gcc.target/i386/sse4_1-insertps-4.c: New test.
>        * gcc.target/i386/avx-insertps-3.c: New test.
>        * gcc.target/i386/avx-insertps-4.c: New test.

OK.

Thanks,
Uros.
diff mbox

Patch

--- gcc/config/i386/sse.md.jj	2011-04-12 09:37:54.000000000 +0200
+++ gcc/config/i386/sse.md	2011-04-14 15:39:34.000000000 +0200
@@ -3571,9 +3571,24 @@  (define_insn "sse4_1_insertps"
 		      (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
 		     UNSPEC_INSERTPS))]
   "TARGET_SSE4_1"
-  "@
-   insertps\t{%3, %2, %0|%0, %2, %3}
-   vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+{
+  if (MEM_P (operands[2]))
+    {
+      unsigned count_s = INTVAL (operands[3]) >> 6;
+      if (count_s)
+	operands[3] = GEN_INT (INTVAL (operands[3]) & 0x3f);
+      operands[2] = adjust_address_nv (operands[2], SFmode, count_s * 4);
+    }
+  switch (which_alternative)
+    {
+    case 0:
+      return "insertps\t{%3, %2, %0|%0, %2, %3}";
+    case 1:
+      return "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+    default:
+      gcc_unreachable ();
+    }
+}
   [(set_attr "isa" "noavx,avx")
    (set_attr "type" "sselog")
    (set_attr "prefix_data16" "1,*")
--- gcc/testsuite/gcc.target/i386/sse4_1-insertps-3.c.jj	2011-04-14 15:51:24.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse4_1-insertps-3.c	2011-04-14 16:02:16.000000000 +0200
@@ -0,0 +1,5 @@ 
+/* { dg-do run { target ilp32 } } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1 -mtune=geode" } */
+
+#include "sse4_1-insertps-2.c"
--- gcc/testsuite/gcc.target/i386/sse4_1-insertps-4.c.jj	2011-04-14 16:05:45.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse4_1-insertps-4.c	2011-04-14 17:12:02.000000000 +0200
@@ -0,0 +1,92 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define msk0 0x41
+#define msk1 0x90
+#define msk2 0xe9
+#define msk3 0x70
+
+#define msk4 0xFC
+#define msk5 0x05
+#define msk6 0x0A
+#define msk7 0x0F
+
+union
+  {
+    __m128 x;
+    float f[4];
+  } val1;
+
+static void
+TEST (void)
+{
+  union
+    {
+      __m128 x;
+      float f[4];
+    } res[8], val2, tmp;
+  int masks[8];
+  int i, j;
+
+  val2.f[0] = 55.0;
+  val2.f[1] = 55.0;
+  val2.f[2] = 55.0;
+  val2.f[3] = 55.0;
+
+  val1.f[0] = 1.;
+  val1.f[1] = 2.;
+  val1.f[2] = 3.;
+  val1.f[3] = 4.;
+
+  asm volatile ("" : "+m" (val1));
+  res[0].x = _mm_insert_ps (val2.x, val1.x, msk0);
+  asm volatile ("" : "+m" (val1));
+  res[1].x = _mm_insert_ps (val2.x, val1.x, msk1);
+  asm volatile ("" : "+m" (val1));
+  res[2].x = _mm_insert_ps (val2.x, val1.x, msk2);
+  asm volatile ("" : "+m" (val1));
+  res[3].x = _mm_insert_ps (val2.x, val1.x, msk3);
+
+  masks[0] = msk0;
+  masks[1] = msk1;
+  masks[2] = msk2;
+  masks[3] = msk3;
+
+  for (i = 0; i < 4; i++)
+    {
+      asm volatile ("" : "+m" (val1));
+      res[i + 4].x = _mm_insert_ps (val2.x, val1.x, msk4);
+    }
+
+  masks[4] = msk4;
+  masks[5] = msk4;
+  masks[6] = msk4;
+  masks[7] = msk4;
+
+  for (i=0; i < 8; i++)
+    {
+      tmp = val2;
+      tmp.f[(masks[i] & 0x30) >> 4] = val1.f[(masks[i] & 0xC0) >> 6];
+
+      for (j = 0; j < 4; j++)
+	if (masks[i] & (0x1 << j))
+	  tmp.f[j] = 0.f;
+
+      if (memcmp (&res[i], &tmp, sizeof (tmp)))
+	abort ();
+    }
+} 
--- gcc/testsuite/gcc.target/i386/avx-vinsertps-3.c.jj	2011-04-14 15:52:11.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-vinsertps-3.c	2011-04-14 16:01:58.000000000 +0200
@@ -0,0 +1,8 @@ 
+/* { dg-do run { target ilp32 } } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O2 -mfpmath=sse -mavx -mtune=geode" } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include "sse4_1-insertps-3.c"
--- gcc/testsuite/gcc.target/i386/avx-vinsertps-4.c.jj	2011-04-14 16:09:24.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-vinsertps-4.c	2011-04-14 16:17:26.000000000 +0200
@@ -0,0 +1,8 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O2 -mfpmath=sse -mavx" } */
+
+#define CHECK_H "avx-check.h"
+#define TEST avx_test
+
+#include "sse4_1-insertps-4.c"