Patchwork AVX2 vec_widen_[su]mult_{hi,lo}*, sdot_prod* and udot_prod*

login
register
mail settings
Submitter Jakub Jelinek
Date Oct. 14, 2011, 2:18 p.m.
Message ID <20111014141802.GY2210@tyan-ft48-01.lab.bos.redhat.com>
Download mbox | patch
Permalink /patch/119799/
State New
Headers show

Comments

Jakub Jelinek - Oct. 14, 2011, 2:18 p.m.
Hi!

This patch improves generated code for SSE4.1 and even more
for AVX2 on the attached testcases.  SSE4.1 has pmuldq
(where SSE2 only had pmuludq), so can handle signed widening shifts fine
too.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2011-10-14  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/sse.md (vec_widen_smult_hi_v8hi,
	vec_widen_smult_lo_v8hi, vec_widen_umult_hi_v8hi,
	vec_widen_umult_lo_v8hi): Macroize using VI2_AVX2
	mode iterator and any_extend code iterator.
	(vec_widen_<s>mult_hi_v8si, vec_widen_<s>mult_lo_v8si): New
	expanders.
	(vec_widen_smult_hi_v4si, vec_widen_smult_lo_v4si): Enable
	also for TARGET_SSE4_1 using pmuldq insn.
	(sdot_prodv8hi): Macroize using VI2_AVX2 iterator.
	(sse2_sse4_1): New code attr.
	(udot_prodv4si): Macroize using any_extend code iterator.
	(<s>dot_prodv8si): New expander.

	* gcc.target/i386/sse2-mul-1.c: New test.
	* gcc.target/i386/sse4_1-mul-1.c: New test.
	* gcc.target/i386/avx-mul-1.c: New test.
	* gcc.target/i386/xop-mul-1.c: New test.
	* gcc.target/i386/avx2-mul-1.c: New test.


	Jakub
Richard Henderson - Oct. 14, 2011, 4:21 p.m.
On 10/14/2011 07:18 AM, Jakub Jelinek wrote:
> +  /* This would be 2 insns shorter if
> +     rperm[i] = GEN_INT (((~i & 1) << 2) + i / 2);
> +     has been used instead (both vpslrq insns wouldn't be needed),
> +     but vec_widen_*mult_hi_* is usually used together with
> +     vec_widen_*mult_lo_* and by writing it this way the load
> +     of the constant and the two vpermd instructions (cross-lane)
> +     can be CSEd together.  */
> +  for (i = 0; i < 8; ++i)
> +    rperm[i] = GEN_INT (((i & 1) << 2) + i / 2);
> +  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
> +  vperm = force_reg (V8SImode, vperm);
> +  emit_insn (gen_avx2_permvarv8si (t1, vperm, operands[1]));
> +  emit_insn (gen_avx2_permvarv8si (t2, vperm, operands[2]));
> +  emit_insn (gen_lshrv4di3 (gen_lowpart (V4DImode, t3),
> +			    gen_lowpart (V4DImode, t1), GEN_INT (32)));
> +  emit_insn (gen_lshrv4di3 (gen_lowpart (V4DImode, t4),
> +			    gen_lowpart (V4DImode, t2), GEN_INT (32)));
> +  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));

So what you're doing here is the low-part permutation:

	0 4 1 5 2 6 3 7

followed by a shift to get

	4 . 5 . 6 . 7 .

But you need to load a 256-bit constant from memory to get it.

I wonder if it wouldn't be better to use VPERMQ to handle the lane change:

	0   2   1   3
	0 1 4 5 2 3 6 7

shared between the hi/lo, and a VPSHUFD to handle the in-lane ordering:

	0 0 1 1 2 2 3 3
	4 4 5 5 6 6 7 7

In the end we get 2+(2+2)=6 insns as setup prior to the VPMULDQs, as compared
to your 1+2+(0+2)=5 insns, but no need to wait for the constant load.  Of 
course, if the constant load gets hoisted out of the loop, yours will likely
win on throughput.

Thoughts, Uros and those looking in from Intel?

Otherwise it looks ok.


r~

Patch

--- gcc/config/i386/sse.md.jj	2011-10-14 08:38:47.000000000 +0200
+++ gcc/config/i386/sse.md	2011-10-14 13:05:58.000000000 +0200
@@ -5507,83 +5507,100 @@  (define_insn_and_split "mul<mode>3"
   DONE;
 })
 
-(define_expand "vec_widen_smult_hi_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
+(define_expand "vec_widen_<s>mult_hi_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI2_AVX2 1 "register_operand" ""))
+   (match_operand:VI2_AVX2 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, t1, t2, dest;
 
   op1 = operands[1];
   op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  t1 = gen_reg_rtx (<MODE>mode);
+  t2 = gen_reg_rtx (<MODE>mode);
+  dest = gen_lowpart (<MODE>mode, operands[0]);
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_smulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2));
+  emit_insn (gen_mul<mode>3 (t1, op1, op2));
+  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
+  emit_insn (gen_vec_interleave_high<mode> (dest, t1, t2));
   DONE;
 })
 
-(define_expand "vec_widen_smult_lo_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
+(define_expand "vec_widen_<s>mult_lo_<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (any_extend:<sseunpackmode>
+     (match_operand:VI2_AVX2 1 "register_operand" ""))
+   (match_operand:VI2_AVX2 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, t1, t2, dest;
 
   op1 = operands[1];
   op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  t1 = gen_reg_rtx (<MODE>mode);
+  t2 = gen_reg_rtx (<MODE>mode);
+  dest = gen_lowpart (<MODE>mode, operands[0]);
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_smulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2));
+  emit_insn (gen_mul<mode>3 (t1, op1, op2));
+  emit_insn (gen_<s>mul<mode>3_highpart (t2, op1, op2));
+  emit_insn (gen_vec_interleave_low<mode> (dest, t1, t2));
   DONE;
 })
 
-(define_expand "vec_widen_umult_hi_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
-  "TARGET_SSE2"
+(define_expand "vec_widen_<s>mult_hi_v8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" ""))
+   (match_operand:V8SI 2 "nonimmediate_operand" "")]
+  "TARGET_AVX2"
 {
-  rtx op1, op2, t1, t2, dest;
-
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
+  rtx t1, t2, t3, t4, rperm[8], vperm;
+  int i;
 
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_umulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2));
+  t1 = gen_reg_rtx (V8SImode);
+  t2 = gen_reg_rtx (V8SImode);
+  t3 = gen_reg_rtx (V8SImode);
+  t4 = gen_reg_rtx (V8SImode);
+  /* This would be 2 insns shorter if
+     rperm[i] = GEN_INT (((~i & 1) << 2) + i / 2);
+     has been used instead (both vpslrq insns wouldn't be needed),
+     but vec_widen_*mult_hi_* is usually used together with
+     vec_widen_*mult_lo_* and by writing it this way the load
+     of the constant and the two vpermd instructions (cross-lane)
+     can be CSEd together.  */
+  for (i = 0; i < 8; ++i)
+    rperm[i] = GEN_INT (((i & 1) << 2) + i / 2);
+  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
+  vperm = force_reg (V8SImode, vperm);
+  emit_insn (gen_avx2_permvarv8si (t1, vperm, operands[1]));
+  emit_insn (gen_avx2_permvarv8si (t2, vperm, operands[2]));
+  emit_insn (gen_lshrv4di3 (gen_lowpart (V4DImode, t3),
+			    gen_lowpart (V4DImode, t1), GEN_INT (32)));
+  emit_insn (gen_lshrv4di3 (gen_lowpart (V4DImode, t4),
+			    gen_lowpart (V4DImode, t2), GEN_INT (32)));
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t3, t4));
   DONE;
 })
 
-(define_expand "vec_widen_umult_lo_v8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")]
-  "TARGET_SSE2"
+(define_expand "vec_widen_<s>mult_lo_v8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" ""))
+   (match_operand:V8SI 2 "nonimmediate_operand" "")]
+  "TARGET_AVX2"
 {
-  rtx op1, op2, t1, t2, dest;
+  rtx t1, t2, rperm[8], vperm;
+  int i;
 
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V8HImode);
-  t2 = gen_reg_rtx (V8HImode);
-  dest = gen_lowpart (V8HImode, operands[0]);
-
-  emit_insn (gen_mulv8hi3 (t1, op1, op2));
-  emit_insn (gen_umulv8hi3_highpart (t2, op1, op2));
-  emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2));
+  t1 = gen_reg_rtx (V8SImode);
+  t2 = gen_reg_rtx (V8SImode);
+  for (i = 0; i < 8; ++i)
+    rperm[i] = GEN_INT (((i & 1) << 2) + i / 2);
+  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
+  vperm = force_reg (V8SImode, vperm);
+  emit_insn (gen_avx2_permvarv8si (t1, vperm, operands[1]));
+  emit_insn (gen_avx2_permvarv8si (t2, vperm, operands[2]));
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5591,24 +5608,28 @@  (define_expand "vec_widen_smult_hi_v4si"
   [(match_operand:V2DI 0 "register_operand" "")
    (match_operand:V4SI 1 "register_operand" "")
    (match_operand:V4SI 2 "register_operand" "")]
-  "TARGET_XOP"
+  "TARGET_SSE4_1"
 {
-  rtx t1, t2;
+  rtx op1, op2, t1, t2;
 
+  op1 = operands[1];
+  op2 = operands[2];
   t1 = gen_reg_rtx (V4SImode);
   t2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+  if (TARGET_XOP)
+    {
+      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+      DONE;
+    }
+
+  emit_insn (gen_vec_interleave_highv4si (t1, op1, op1));
+  emit_insn (gen_vec_interleave_highv4si (t2, op2, op2));
+  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5616,24 +5637,28 @@  (define_expand "vec_widen_smult_lo_v4si"
   [(match_operand:V2DI 0 "register_operand" "")
    (match_operand:V4SI 1 "register_operand" "")
    (match_operand:V4SI 2 "register_operand" "")]
-  "TARGET_XOP"
+  "TARGET_SSE4_1"
 {
-  rtx t1, t2;
+  rtx op1, op2, t1, t2;
 
+  op1 = operands[1];
+  op2 = operands[2];
   t1 = gen_reg_rtx (V4SImode);
   t2 = gen_reg_rtx (V4SImode);
 
-  emit_insn (gen_sse2_pshufd_1 (t1, operands[1],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_sse2_pshufd_1 (t2, operands[2],
-				GEN_INT (0),
-				GEN_INT (2),
-				GEN_INT (1),
-				GEN_INT (3)));
-  emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+  if (TARGET_XOP)
+    {
+      emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
+				    GEN_INT (1), GEN_INT (3)));
+      emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+      DONE;
+    }
+
+  emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1));
+  emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2));
+  emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2));
   DONE;
 })
 
@@ -5675,30 +5700,35 @@  (define_expand "vec_widen_umult_lo_v4si"
   DONE;
 })
 
-(define_expand "sdot_prodv8hi"
-  [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "register_operand" "")
-   (match_operand:V8HI 2 "register_operand" "")
-   (match_operand:V4SI 3 "register_operand" "")]
+(define_expand "sdot_prod<mode>"
+  [(match_operand:<sseunpackmode> 0 "register_operand" "")
+   (match_operand:VI2_AVX2 1 "register_operand" "")
+   (match_operand:VI2_AVX2 2 "register_operand" "")
+   (match_operand:<sseunpackmode> 3 "register_operand" "")]
   "TARGET_SSE2"
 {
-  rtx t = gen_reg_rtx (V4SImode);
-  emit_insn (gen_sse2_pmaddwd (t, operands[1], operands[2]));
-  emit_insn (gen_addv4si3 (operands[0], operands[3], t));
+  rtx t = gen_reg_rtx (<sseunpackmode>mode);
+  emit_insn (gen_<sse2_avx2>_pmaddwd (t, operands[1], operands[2]));
+  emit_insn (gen_rtx_SET (VOIDmode, operands[0],
+			  gen_rtx_PLUS (<sseunpackmode>mode,
+					operands[3], t)));
   DONE;
 })
 
-(define_expand "udot_prodv4si"
+(define_code_attr sse2_sse4_1
+   [(zero_extend "sse2") (sign_extend "sse4_1")])
+
+(define_expand "<s>dot_prodv4si"
   [(match_operand:V2DI 0 "register_operand" "")
-   (match_operand:V4SI 1 "register_operand" "")
+   (any_extend:V2DI (match_operand:V4SI 1 "register_operand" ""))
    (match_operand:V4SI 2 "register_operand" "")
    (match_operand:V2DI 3 "register_operand" "")]
-  "TARGET_SSE2"
+  "<CODE> == ZERO_EXTEND ? TARGET_SSE2 : TARGET_SSE4_1"
 {
   rtx t1, t2, t3, t4;
 
   t1 = gen_reg_rtx (V2DImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (t1, operands[1], operands[2]));
+  emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t1, operands[1], operands[2]));
   emit_insn (gen_addv2di3 (t1, t1, operands[3]));
 
   t2 = gen_reg_rtx (V4SImode);
@@ -5711,12 +5741,41 @@  (define_expand "udot_prodv4si"
 				 GEN_INT (32)));
 
   t4 = gen_reg_rtx (V2DImode);
-  emit_insn (gen_sse2_umulv2siv2di3 (t4, t2, t3));
+  emit_insn (gen_<sse2_sse4_1>_<u>mulv2siv2di3 (t4, t2, t3));
 
   emit_insn (gen_addv2di3 (operands[0], t1, t4));
   DONE;
 })
 
+(define_expand "<s>dot_prodv8si"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (any_extend:V4DI (match_operand:V8SI 1 "register_operand" ""))
+   (match_operand:V8SI 2 "register_operand" "")
+   (match_operand:V4DI 3 "register_operand" "")]
+  "TARGET_AVX2"
+{
+  rtx t1, t2, t3, t4;
+
+  t1 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (t1, operands[1], operands[2]));
+  emit_insn (gen_addv4di3 (t1, t1, operands[3]));
+
+  t2 = gen_reg_rtx (V8SImode);
+  t3 = gen_reg_rtx (V8SImode);
+  emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t2),
+				 gen_lowpart (V2TImode, operands[1]),
+				 GEN_INT (32)));
+  emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t3),
+				 gen_lowpart (V2TImode, operands[2]),
+				 GEN_INT (32)));
+
+  t4 = gen_reg_rtx (V4DImode);
+  emit_insn (gen_avx2_<u>mulv4siv4di3 (t4, t2, t3));
+
+  emit_insn (gen_addv4di3 (operands[0], t1, t4));
+  DONE;
+})
+
 (define_insn "ashr<mode>3"
   [(set (match_operand:VI24_AVX2 0 "register_operand" "=x,x")
 	(ashiftrt:VI24_AVX2
--- gcc/testsuite/gcc.target/i386/sse2-mul-1.c.jj	2011-10-14 10:39:57.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse2-mul-1.c	2011-10-14 13:32:53.000000000 +0200
@@ -0,0 +1,209 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse2 } */
+/* { dg-options "-O3 -msse2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse2_test
+#endif
+
+#include CHECK_H
+
+#include <stdlib.h>
+
+#define N 512
+static short a1[N], a2[N], a3[N];
+static unsigned short b1[N], b2[N], b3[N];
+static int c1[N], c2[N], c3[N];
+static unsigned int d1[N], d2[N], d3[N];
+static long long e1[N], e2[N], e3[N];
+static unsigned long long g1[N], g2[N], g3[N];
+
+__attribute__((noinline, noclone)) void
+f1 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    a1[i] = a2[i] * a3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    b1[i] = b2[i] * b3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f3 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    c1[i] = c2[i] * c3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f4 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    d1[i] = d2[i] * d3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f5 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    e1[i] = e2[i] * e3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f6 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    g1[i] = g2[i] * g3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f7 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    c1[i] = a2[i] * a3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f8 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    d1[i] = (unsigned int) b2[i] * b3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f9 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    e1[i] = (long long) c2[i] * (long long) c3[i];
+}
+
+__attribute__((noinline, noclone)) void
+f10 (void)
+{
+  int i;
+  for (i = 0; i < N; ++i)
+    g1[i] = (unsigned long long) d2[i] * (unsigned long long) d3[i];
+}
+
+__attribute__((noinline, noclone)) int
+f11 (void)
+{
+  int i, r = 0;
+  for (i = 0; i < N; ++i)
+    r += a2[i] * a3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) unsigned int
+f12 (void)
+{
+  int i;
+  unsigned r = 0;
+  for (i = 0; i < N; ++i)
+    r += (unsigned int) b2[i] * b3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) long long
+f13 (void)
+{
+  int i;
+  long long r = 0;
+  for (i = 0; i < N; ++i)
+    r += (long long) c2[i] * (long long) c3[i];
+  return r;
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+f14 (void)
+{
+  int i;
+  unsigned long long r = 0;
+  for (i = 0; i < N; ++i)
+    r += (unsigned long long) d2[i] * (unsigned long long) d3[i];
+  return r;
+}
+
+static void
+TEST (void)
+{
+  int i;
+  int s1 = 0;
+  unsigned int s2 = 0;
+  long long s3 = 0;
+  unsigned long long s4 = 0;
+  for (i = 0; i < N; ++i)
+    {
+      asm volatile ("" : : "r" (&s1) : "memory");
+      asm volatile ("" : : "r" (&s2) : "memory");
+      asm volatile ("" : : "r" (&s3) : "memory");
+      asm volatile ("" : : "r" (&s4) : "memory");
+      b2[i] = (int) random ();
+      b3[i] = (int) random ();
+      a2[i] = b2[i];
+      a3[i] = b3[i];
+      d2[i] = (((int) random ()) << 16) | b2[i];
+      d3[i] = (((int) random ()) << 16) | b3[i];
+      c2[i] = d2[i];
+      c3[i] = d3[i];
+      s1 += a2[i] * a3[i];
+      s2 += (unsigned int) b2[i] * b3[i];
+      s3 += (long long) c2[i] * (long long) c3[i];
+      s4 += (unsigned long long) d2[i] * (unsigned long long) d3[i];
+    }
+  f1 ();
+  f2 ();
+  f3 ();
+  f4 ();
+  f5 ();
+  f6 ();
+  for (i = 0; i < N; ++i)
+    {
+      if (a1[i] != (short) (a2[i] * a3[i]))
+	abort ();
+      if (b1[i] != (unsigned short) (b2[i] * b3[i]))
+	abort ();
+      if (c1[i] != c2[i] * c3[i])
+	abort ();
+      if (d1[i] != d2[i] * d3[i])
+	abort ();
+      if (e1[i] != e2[i] * e3[i])
+	abort ();
+      if (g1[i] != g2[i] * g3[i])
+	abort ();
+    }
+  f7 ();
+  f8 ();
+  f9 ();
+  f10 ();
+  for (i = 0; i < N; ++i)
+    {
+      if (c1[i] != a2[i] * a3[i])
+	abort ();
+      if (d1[i] != b2[i] * b3[i])
+	abort ();
+      if (e1[i] != (long long) c2[i] * (long long) c3[i])
+	abort ();
+      if (g1[i] != (unsigned long long) d2[i] * (unsigned long long) d3[i])
+	abort ();
+    }
+  if (f11 () != s1 || f12 () != s2 || f13 () != s3 || f14 () != s4)
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/sse4_1-mul-1.c.jj	2011-10-14 10:40:46.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/sse4_1-mul-1.c	2011-10-14 10:41:27.000000000 +0200
@@ -0,0 +1,13 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O3 -msse4.1" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/avx-mul-1.c.jj	2011-10-14 10:42:35.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-mul-1.c	2011-10-14 10:42:56.000000000 +0200
@@ -0,0 +1,13 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -mavx" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/xop-mul-1.c.jj	2011-10-14 10:44:11.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/xop-mul-1.c	2011-10-14 10:44:25.000000000 +0200
@@ -0,0 +1,13 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target xop } */
+/* { dg-options "-O3 -mxop" } */
+
+#ifndef CHECK_H
+#define CHECK_H "xop-check.h"
+#endif
+
+#ifndef TEST
+#define TEST xop_test
+#endif
+
+#include "sse2-mul-1.c"
--- gcc/testsuite/gcc.target/i386/avx2-mul-1.c.jj	2011-10-14 10:43:28.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx2-mul-1.c	2011-10-14 10:43:45.000000000 +0200
@@ -0,0 +1,13 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O3 -mavx2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx2_test
+#endif
+
+#include "sse2-mul-1.c"