Patchwork AVX2 vec_widen_[su]mult_{hi,lo}*, sdot_prod* and udot_prod* (take 2)

login
register
mail settings
Submitter Jakub Jelinek
Date Oct. 14, 2011, 6:34 p.m.
Message ID <20111014183412.GB2210@tyan-ft48-01.lab.bos.redhat.com>
Download mbox | patch
Permalink /patch/119899/
State New
Headers show

Comments

Jakub Jelinek - Oct. 14, 2011, 6:34 p.m.
On Fri, Oct 14, 2011 at 09:21:15AM -0700, Richard Henderson wrote:
> So what you're doing here is the low-part permutation:
> 
> 	0 4 1 5 2 6 3 7
> 
> followed by a shift to get
> 
> 	4 . 5 . 6 . 7 .
> 
> But you need to load a 256-bit constant from memory to get it.

Right.

> I wonder if it wouldn't be better to use VPERMQ to handle the lane change:
> 
> 	0   2   1   3
> 	0 1 4 5 2 3 6 7
> 
> shared between the hi/lo, and a VPSHUFD to handle the in-lane ordering:
> 
> 	0 0 1 1 2 2 3 3
> 	4 4 5 5 6 6 7 7
> 
> In the end we get 2+(2+2)=6 insns as setup prior to the VPMULDQs, as compared
> to your 1+2+(0+2)=5 insns, but no need to wait for the constant load.  Of 
> course, if the constant load gets hoisted out of the loop, yours will likely
> win on throughput.

But the constant-load variant increases register pressure, so it is a win
for simpler loops, and might be worse otherwise.

So here is the alternative patch that does it with VPERMQ+VPSHUFD.
Tested using the avx2-mul-1.c testcase with sde.

2011-10-14  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (vec_widen_smult_hi_v8hi,
	vec_widen_smult_lo_v8hi, vec_widen_umult_hi_v8hi,
	vec_widen_umult_lo_v8hi): Macroize using VI2_AVX2
	mode iterator and any_extend code iterator.
	(vec_widen_<s>mult_hi_v8si, vec_widen_<s>mult_lo_v8si): New
	expanders.
	(vec_widen_smult_hi_v4si, vec_widen_smult_lo_v4si): Enable
	also for TARGET_SSE4_1 using pmuldq insn.
	(sdot_prodv8hi): Macroize using VI2_AVX2 iterator.
	(sse2_sse4_1): New code attr.
	(udot_prodv4si): Macroize using any_extend code iterator.
	(<s>dot_prodv8si): New expander.

	* gcc.target/i386/sse2-mul-1.c: New test.
	* gcc.target/i386/sse4_1-mul-1.c: New test.
	* gcc.target/i386/avx-mul-1.c: New test.
	* gcc.target/i386/xop-mul-1.c: New test.
	* gcc.target/i386/avx2-mul-1.c: New test.



	Jakub
Richard Henderson - Oct. 14, 2011, 7:11 p.m.
On 10/14/2011 11:34 AM, Jakub Jelinek wrote:
> 2011-10-14  Jakub Jelinek  <jakub@redhat.com>
> 
> 	* config/i386/sse.md (vec_widen_smult_hi_v8hi,
> 	vec_widen_smult_lo_v8hi, vec_widen_umult_hi_v8hi,
> 	vec_widen_umult_lo_v8hi): Macroize using VI2_AVX2
> 	mode iterator and any_extend code iterator.
> 	(vec_widen_<s>mult_hi_v8si, vec_widen_<s>mult_lo_v8si): New
> 	expanders.
> 	(vec_widen_smult_hi_v4si, vec_widen_smult_lo_v4si): Enable
> 	also for TARGET_SSE4_1 using pmuldq insn.
> 	(sdot_prodv8hi): Macroize using VI2_AVX2 iterator.
> 	(sse2_sse4_1): New code attr.
> 	(udot_prodv4si): Macroize using any_extend code iterator.
> 	(<s>dot_prodv8si): New expander.
> 
> 	* gcc.target/i386/sse2-mul-1.c: New test.
> 	* gcc.target/i386/sse4_1-mul-1.c: New test.
> 	* gcc.target/i386/avx-mul-1.c: New test.
> 	* gcc.target/i386/xop-mul-1.c: New test.
> 	* gcc.target/i386/avx2-mul-1.c: New test.

Ok.


r~

Patch

--- gcc/config/i386/sse.md.jj	2011-10-14 08:38:47.000000000 +0200
+++ gcc/config/i386/sse.md	2011-10-14 13:05:58.000000000 +0200
@@ -5507,83 +5507,97 @@  (define_insn_and_split "mul<mode>3"
   DONE;
 })
 
-(define_expand "vec_widen_smult_hi_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
+(define_expand "vec_widen_<s>mult_hi_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI2_AVX2 1 "register_operand" ""))
+   (match_operand:VI2_AVX2 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, t1, t2, dest;
 
   op1 = operands[1];
   op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  t1 = gen_reg_rtx (<MODE>mode);
+  t2 = gen_reg_rtx (<MODE>mode);
+  dest = gen_lowpart (<MODE>mode, operands[0]);
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_smulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2));
+  emit_insn (gen_mul<mode>3 (t1, op1, op2));
+  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
+  emit_insn (gen_vec_interleave_high<mode> (dest, t1, t2));
   DONE;
 })
 
-(define_expand "vec_widen_smult_lo_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
+(define_expand "vec_widen_<s>mult_lo_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI2_AVX2 1 "register_operand" ""))
+   (match_operand:VI2_AVX2 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, t1, t2, dest;
 
   op1 = operands[1];
   op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  t1 = gen_reg_rtx (<MODE>mode);
+  t2 = gen_reg_rtx (<MODE>mode);
+  dest = gen_lowpart (<MODE>mode, operands[0]);
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_smulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2));
+  emit_insn (gen_mul<mode>3 (t1, op1, op2));
+  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
+  emit_insn (gen_vec_interleave_low<mode> (dest, t1, t2));
   DONE;
 })
 
-(define_expand "vec_widen_umult_hi_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
-  "TARGET_SSE2"
+(define_expand "vec_widen_<s>mult_hi_v8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" ""))
+   (match_operand:V8SI 2 "nonimmediate_operand" "")]
+  "TARGET_AVX2"
 {
-  rtx op1, op2, t1, t2, dest;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  rtx t1, t2, t3, t4;
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_umulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2));
+  t1 = gen_reg_rtx (V4DImode);
+  t2 = gen_reg_rtx (V4DImode);
+  t3 = gen_reg_rtx (V8SImode);
+  t4 = gen_reg_rtx (V8SImode);
+  emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
+				  const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+  emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
+				  const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+  emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
+				GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
+  emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
+				GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6))));
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
   DONE;
 })
 
-(define_expand "vec_widen_umult_lo_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
-  "TARGET_SSE2"
+(define_expand "vec_widen_<s>mult_lo_v8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" ""))
+   (match_operand:V8SI 2 "nonimmediate_operand" "")]
+  "TARGET_AVX2"
 {
-  rtx op1, op2, t1, t2, dest;
+  rtx t1, t2, t3, t4;
 
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
-
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_umulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2));
+  t1 = gen_reg_rtx (V4DImode);
+  t2 = gen_reg_rtx (V4DImode);
+  t3 = gen_reg_rtx (V8SImode);
+  t4 = gen_reg_rtx (V8SImode);
+  emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]),
+				  const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+  emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]),
+				  const0_rtx, const2_rtx,
+				  const1_rtx, GEN_INT (3)));
+  emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1),
+				GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
+  emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2),
+				GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6))));
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
   DONE;
 })
 
@@ -5591,24 +5605,28 @@  (define_expand "vec_widen_smult_hi_v4si"
   [(match_operand:V2DI 0 "register_operand" "")
    (match_operand:V4SI 1 "register_operand" "")
    (match_operand:V4SI 2 "register_operand" "")]
-  "TARGET_XOP"
+  "TARGET_SSE4_1"
 {
-  rtx t1, t2;
+  rtx op1, op2, t1, t2;
 
+  op1 = operands[1];
+  op2 = operands[2];
   t1 = gen_reg_rtx (V4SImode);
   t2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+  if (TARGET_XOP)
+    {
+      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+      DONE;
+    }
+
+  emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
+  emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
+  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5616,24 +5634,28 @@  (define_expand "vec_widen_smult_lo_v4si"
   [(match_operand:V2DI 0 "register_operand" "")
    (match_operand:V4SI 1 "register_operand" "")
    (match_operand:V4SI 2 "register_operand" "")]
-  "TARGET_XOP"
+  "TARGET_SSE4_1"
 {
-  rtx t1, t2;
+  rtx op1, op2, t1, t2;
 
+  op1 = operands[1];
+  op2 = operands[2];
   t1 = gen_reg_rtx (V4SImode);
   t2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+  if (TARGET_XOP)
+    {
+      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+      DONE;
+    }
+
+  emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
+  emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
+  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5675,30 +5697,35 @@  (define_expand "vec_widen_umult_lo_v4si"
   DONE;
 })
 
-(define_expand "sdot_prodv8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")
-   (match_operand:V4SI 3 "register_operand" "")]
+(define_expand "sdot_prod<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (match_operand:VI2_AVX2 1 "register_operand" "")
+   (match_operand:VI2_AVX2 2 "register_operand" "")
+   (match_operand:<sseunpackmode> 3 "register_operand" "")]
   "TARGET_SSE2"
 {
-  rtx t = gen_reg_rtx (V4SImode);
-  emit_insn (gen_sse2_pmaddwd (t, operands[1], operands[2]));
-  emit_insn (gen_addv4si3 (operands[0], operands[3], t));
+  rtx t = gen_reg_rtx (<sseunpackmode>mode);
+  emit_insn (gen_<sse2_avx2>_pmaddwd (t, operands[1], operands[2]));
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_PLUS (<sseunpackmode>mode,
+					operands[3], t)));
   DONE;
 })
 
-(define_expand "udot_prodv4si"
+(define_code_attr sse2_sse4_1
+   [(zero_extend "sse2") (sign_extend "sse4_1")])
+
+(define_expand "<s>dot_prodv4si"
   [(match_operand:V2DI 0 "register_operand" "")
-   (match_operand:V4SI 1 "register_operand" "")
+   (any_extend:V2DI (match_operand:V4SI 1 "register_operand" ""))
    (match_operand:V4SI 2 "register_operand" "")
    (match_operand:V2DI 3 "register_operand" "")]
-  "TARGET_SSE2"
+  "<CODE> == ZERO_EXTEND ? TARGET_SSE2 : TARGET_SSE4_1"
 {
   rtx t1, t2, t3, t4;
 
   t1 = gen_reg_rtx (V2DImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (t1, operands[1], operands[2]));
+  emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t1, operands[1], operands[2]));
   emit_insn (gen_addv2di3 (t1, t1, operands[3]));
 
   t2 = gen_reg_rtx (V4SImode);
@@ -5711,12 +5738,41 @@  (define_expand "udot_prodv4si"
 				 GEN_INT (32)));
 
   t4 = gen_reg_rtx (V2DImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (t4, t2, t3));
+  emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t4, t2, t3));
 
   emit_insn (gen_addv2di3 (operands[0], t1, t4));
   DONE;
 })
 
+(define_expand "<s>dot_prodv8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "register_operand" ""))
+   (match_operand:V8SI 2 "register_operand" "")
+   (match_operand:V4DI 3 "register_operand" "")]
+  "TARGET_AVX2"
+{
+  rtx t1, t2, t3, t4;
+
+  t1 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (t1, operands[1], operands[2]));
+  emit_insn (gen_addv4di3 (t1, t1, operands[3]));
+
+  t2 = gen_reg_rtx (V8SImode);
+  t3 = gen_reg_rtx (V8SImode);
+  emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t2),
+				 gen_lowpart (V2TImode, operands[1]),
+				 GEN_INT (32)));
+  emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t3),
+				 gen_lowpart (V2TImode, operands[2]),
+				 GEN_INT (32)));
+
+  t4 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (t4, t2, t3));
+
+  emit_insn (gen_addv4di3 (operands[0], t1, t4));
+  DONE;
+})
+
 (define_insn "ashr<mode>3"
   [(set (match_operand:VI24_AVX2 0 "register_operand" "=x,x")
 	(ashiftrt:VI24_AVX2
--- gcc/testsuite/gcc.target/i386/sse2-mul-1.c.jj	2011-10-14 10:39:57.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-mul-1.c	2011-10-14 13:32:53.000000000 +0200
@@ -0,0 +1,209 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-options "-O3 -msse2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+#include CHECK_H
+
+#include <stdlib.h>
+
+#define N 512
+static short a1[N], a2[N], a3[N];
+static unsigned short b1[N], b2[N], b3[N];
+static int c1[N], c2[N], c3[N];
+static unsigned int d1[N], d2[N], d3[N];
+static long long e1[N], e2[N], e3[N];
+static unsigned long long g1[N], g2[N], g3[N];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    a1[i] = a2[i] * a3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    b1[i] = b2[i] * b3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    c1[i] = c2[i] * c3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    d1[i] = d2[i] * d3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    e1[i] = e2[i] * e3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    g1[i] = g2[i] * g3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    c1[i] = a2[i] * a3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    d1[i] = (unsigned int) b2[i] * b3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    e1[i] = (long long) c2[i] * (long long) c3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    g1[i] = (unsigned long long) d2[i] * (unsigned long long) d3[i];
+}
+
+__attribute__((noinline, noclone)) int
+f11 (void)
+{
+  int i, r = 0;
+  for (i = 0; i < N; ++i)
+    r += a2[i] * a3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) unsigned int
+f12 (void)
+{
+  int i;
+  unsigned r = 0;
+  for (i = 0; i < N; ++i)
+    r += (unsigned int) b2[i] * b3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) long long
+f13 (void)
+{
+  int i;
+  long long r = 0;
+  for (i = 0; i < N; ++i)
+    r += (long long) c2[i] * (long long) c3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+f14 (void)
+{
+  int i;
+  unsigned long long r = 0;
+  for (i = 0; i < N; ++i)
+    r += (unsigned long long) d2[i] * (unsigned long long) d3[i];
+  return r;
+}
+
+static void
+TEST (void)
+{
+  int i;
+  int s1 = 0;
+  unsigned int s2 = 0;
+  long long s3 = 0;
+  unsigned long long s4 = 0;
+  for (i = 0; i < N; ++i)
+    {
+      asm volatile ("" : : "r" (&s1) : "memory");
+      asm volatile ("" : : "r" (&s2) : "memory");
+      asm volatile ("" : : "r" (&s3) : "memory");
+      asm volatile ("" : : "r" (&s4) : "memory");
+      b2[i] = (int) random ();
+      b3[i] = (int) random ();
+      a2[i] = b2[i];
+      a3[i] = b3[i];
+      d2[i] = (((int) random ()) << 16) | b2[i];
+      d3[i] = (((int) random ()) << 16) | b3[i];
+      c2[i] = d2[i];
+      c3[i] = d3[i];
+      s1 += a2[i] * a3[i];
+      s2 += (unsigned int) b2[i] * b3[i];
+      s3 += (long long) c2[i] * (long long) c3[i];
+      s4 += (unsigned long long) d2[i] * (unsigned long long) d3[i];
+    }
+  f1 ();
+  f2 ();
+  f3 ();
+  f4 ();
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; ++i)
+    {
+      if (a1[i] != (short) (a2[i] * a3[i]))
+	abort ();
+      if (b1[i] != (unsigned short) (b2[i] * b3[i]))
+	abort ();
+      if (c1[i] != c2[i] * c3[i])
+	abort ();
+      if (d1[i] != d2[i] * d3[i])
+	abort ();
+      if (e1[i] != e2[i] * e3[i])
+	abort ();
+      if (g1[i] != g2[i] * g3[i])
+	abort ();
+    }
+  f7 ();
+  f8 ();
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; ++i)
+    {
+      if (c1[i] != a2[i] * a3[i])
+	abort ();
+      if (d1[i] != b2[i] * b3[i])
+	abort ();
+      if (e1[i] != (long long) c2[i] * (long long) c3[i])
+	abort ();
+      if (g1[i] != (unsigned long long) d2[i] * (unsigned long long) d3[i])
+	abort ();
+    }
+  if (f11 () != s1 || f12 () != s2 || f13 () != s3 || f14 () != s4)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-mul-1.c.jj	2011-10-14 10:40:46.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse4_1-mul-1.c	2011-10-14 10:41:27.000000000 +0200
@@ -0,0 +1,13 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O3 -msse4.1" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/avx-mul-1.c.jj	2011-10-14 10:42:35.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-mul-1.c	2011-10-14 10:42:56.000000000 +0200
@@ -0,0 +1,13 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -mavx" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/xop-mul-1.c.jj	2011-10-14 10:44:11.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/xop-mul-1.c	2011-10-14 10:44:25.000000000 +0200
@@ -0,0 +1,13 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target xop } */
+/* { dg-options "-O3 -mxop" } */
+
+#ifndef CHECK_H
+#define CHECK_H "xop-check.h"
+#endif
+
+#ifndef TEST
+#define TEST xop_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/avx2-mul-1.c.jj	2011-10-14 10:43:28.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx2-mul-1.c	2011-10-14 10:43:45.000000000 +0200
@@ -0,0 +1,13 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx2_test
+#endif
+
+#include "sse2-mul-1.c"