
Add MULT_HIGHPART_EXPR

Message ID 20120628140558.GS20264@tucnak.redhat.com
State New

Commit Message

Jakub Jelinek June 28, 2012, 2:05 p.m. UTC
On Thu, Jun 28, 2012 at 09:17:55AM +0200, Jakub Jelinek wrote:
> I'll look at using MULT_HIGHPART_EXPR in the pattern recognizer and
> vectorizing it as either of the sequences next.

And here is corresponding pattern recognizer and vectorizer patch.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Unfortunately the addition of the builtin_mul_widen_* hooks on i?86 seems
to pessimize the generated code for the gcc.dg/vect/pr51581-3.c
testcase (at least with -O3 -mavx) compared to when the hooks aren't
present, because i?86 has more natural support for widen mult lo/hi
compared to widen mult even/odd, but I assume that on powerpc it is the
other way around.  So, if both VEC_WIDEN_MULT_*_EXPR and
builtin_mul_widen_* are possible for the particular vectype, how should
I find out which one will be cheaper?

2012-06-28  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/51581
	* tree-vect-stmts.c (permute_vec_elements): Add forward decl.
	(vectorizable_operation): Handle vectorization of MULT_HIGHPART_EXPR
	also using VEC_WIDEN_MULT_*_EXPR or builtin_mul_widen_* plus
	VEC_PERM_EXPR if vector MULT_HIGHPART_EXPR isn't supported.
	* tree-vect-patterns.c (vect_recog_divmod_pattern): Use
	MULT_HIGHPART_EXPR instead of VEC_WIDEN_MULT_*_EXPR and shifts.

	* gcc.dg/vect/pr51581-4.c: New test.



	Jakub
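
For context, MULT_HIGHPART_EXPR takes two N-bit operands and yields the
upper N bits of their 2N-bit product.  A minimal scalar sketch of the
shape the pattern recognizer now emits, using the usual magic
multiplier/post-shift pair for an unsigned 16-bit divide by 3 (as in f2
of the new test); the sketch is only illustrative and not part of the
patch:

/* q = x / 3 rewritten as a high-part multiply plus a post-shift,
   instead of a widening multiply followed by shifts.  */
unsigned short
udiv3 (unsigned short x)
{
  unsigned short t1 = ((unsigned int) x * 0xAAAB) >> 16; /* t1 = x h* ml */
  return t1 >> 1;                                        /* q = t1 >> post_shift */
}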

Comments

Richard Henderson June 28, 2012, 3:57 p.m. UTC | #1
On 2012-06-28 07:05, Jakub Jelinek wrote:
> Unfortunately the addition of the builtin_mul_widen_* hooks on i?86 seems
> to pessimize the generated code for gcc.dg/vect/pr51581-3.c
> testcase (at least with -O3 -mavx) compared to when the hooks aren't
> present, because i?86 has more natural support for widen mult lo/hi
> compared to widen mult even/odd, but I assume that on powerpc it is the
> other way around.  So, how should I find out if both VEC_WIDEN_MULT_*_EXPR
> and builtin_mul_widen_* are possible for the particular vectype which one
> will be cheaper?

I would assume that if the builtin exists, then it is cheaper.

I disagree about "x86 has more natural support for hi/lo".  The basic sse2 multiplication is even.  One shift per input is needed to generate odd.  On the other hand, one interleave per input is required for both hi/lo.  So 4 setup insns for hi/lo, and 2 setup insns for even/odd.  And on top of all that, XOP includes multiply odd at least for signed V4SI.
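
As an illustration of the even/odd route (a sketch in SSE2 intrinsics,
not taken from the patch; the lane bookkeeping and the final shuffles
are mine):

#include <emmintrin.h>

/* Unsigned V4SI high-part multiply from even/odd products: pmuludq
   multiplies the even 32-bit lanes into 64-bit results, and one psrlq
   per input exposes the odd lanes for a second pmuludq - two setup
   shifts and two multiplies, then shuffles to gather the high halves.  */
static __m128i
mulhi_u32_evenodd (__m128i a, __m128i b)
{
  __m128i even = _mm_mul_epu32 (a, b);                    /* a0*b0, a2*b2 */
  __m128i odd = _mm_mul_epu32 (_mm_srli_epi64 (a, 32),
                               _mm_srli_epi64 (b, 32));   /* a1*b1, a3*b3 */
  /* The high halves sit in the odd 32-bit lanes of EVEN and ODD;
     pick them out and restore element order.  */
  __m128 hi = _mm_shuffle_ps (_mm_castsi128_ps (even),
                              _mm_castsi128_ps (odd),
                              _MM_SHUFFLE (3, 1, 3, 1));  /* h0 h2 h1 h3 */
  return _mm_shuffle_epi32 (_mm_castps_si128 (hi),
                            _MM_SHUFFLE (3, 1, 2, 0));    /* h0 h1 h2 h3 */
}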

I'll have a look at the test case you mention while I re-look at the patches...


r~
Jakub Jelinek June 28, 2012, 4:20 p.m. UTC | #2
On Thu, Jun 28, 2012 at 08:57:23AM -0700, Richard Henderson wrote:
> On 2012-06-28 07:05, Jakub Jelinek wrote:
> > Unfortunately the addition of the builtin_mul_widen_* hooks on i?86 seems
> > to pessimize the generated code for gcc.dg/vect/pr51581-3.c
> > testcase (at least with -O3 -mavx) compared to when the hooks aren't
> > present, because i?86 has more natural support for widen mult lo/hi
> > compared to widen mult even/odd, but I assume that on powerpc it is the
> > other way around.  So, how should I find out if both VEC_WIDEN_MULT_*_EXPR
> > and builtin_mul_widen_* are possible for the particular vectype which one
> > will be cheaper?
> 
> I would assume that if the builtin exists, then it is cheaper.
> 
> I disagree about "x86 has more natural support for hi/lo".  The basic sse2
> multiplication is even.  One shift per input is needed to generate odd. 
> On the other hand, one interleave per input is required for both hi/lo. 
> So 4 setup insns for hi/lo, and 2 setup insns for even/odd.  And on top of
> all that, XOP includes multiply odd at least for signed V4SI.

Perhaps the problem is then that the permutation is much more expensive
for even/odd.  With even/odd the f2 routine is:
        vmovdqa d(%rip), %xmm2
        vmovdqa .LC1(%rip), %xmm0
        vpsrlq  $32, %xmm2, %xmm4
        vmovdqa d+16(%rip), %xmm1
        vpmuludq        %xmm0, %xmm2, %xmm5
        vpsrlq  $32, %xmm0, %xmm3
        vpmuludq        %xmm3, %xmm4, %xmm4
        vpmuludq        %xmm0, %xmm1, %xmm0
        vmovdqa .LC2(%rip), %xmm2
        vpsrlq  $32, %xmm1, %xmm1
        vpmuludq        %xmm3, %xmm1, %xmm3
        vmovdqa .LC3(%rip), %xmm1
        vpshufb %xmm2, %xmm5, %xmm5
        vpshufb %xmm1, %xmm4, %xmm4
        vpshufb %xmm2, %xmm0, %xmm2
        vpshufb %xmm1, %xmm3, %xmm1
        vpor    %xmm4, %xmm5, %xmm4
        vpor    %xmm1, %xmm2, %xmm1
        vpsrld  $1, %xmm4, %xmm4
        vmovdqa %xmm4, c(%rip)
        vpsrld  $1, %xmm1, %xmm1
        vmovdqa %xmm1, c+16(%rip)
        ret
and with lo/hi it is:
        vmovdqa d(%rip), %xmm2
        vpunpckhdq      %xmm2, %xmm2, %xmm3
        vpunpckldq      %xmm2, %xmm2, %xmm2
        vmovdqa .LC1(%rip), %xmm0
        vpmuludq        %xmm0, %xmm3, %xmm3
        vmovdqa d+16(%rip), %xmm1
        vpmuludq        %xmm0, %xmm2, %xmm2
        vshufps $221, %xmm2, %xmm3, %xmm2
        vpsrld  $1, %xmm2, %xmm2
        vmovdqa %xmm2, c(%rip)
        vpunpckhdq      %xmm1, %xmm1, %xmm2
        vpunpckldq      %xmm1, %xmm1, %xmm1
        vpmuludq        %xmm0, %xmm2, %xmm2
        vpmuludq        %xmm0, %xmm1, %xmm0
        vshufps $221, %xmm0, %xmm2, %xmm0
        vpsrld  $1, %xmm0, %xmm0
        vmovdqa %xmm0, c+16(%rip)
        ret

	Jakub
H.J. Lu June 28, 2012, 4:44 p.m. UTC | #3
On Thu, Jun 28, 2012 at 8:57 AM, Richard Henderson <rth@redhat.com> wrote:
> On 2012-06-28 07:05, Jakub Jelinek wrote:
>> Unfortunately the addition of the builtin_mul_widen_* hooks on i?86 seems
>> to pessimize the generated code for gcc.dg/vect/pr51581-3.c
>> testcase (at least with -O3 -mavx) compared to when the hooks aren't
>> present, because i?86 has more natural support for widen mult lo/hi
>> compared to widen mult even/odd, but I assume that on powerpc it is the
>> other way around.  So, how should I find out if both VEC_WIDEN_MULT_*_EXPR
>> and builtin_mul_widen_* are possible for the particular vectype which one
>> will be cheaper?
>
> I would assume that if the builtin exists, then it is cheaper.
>
> I disagree about "x86 has more natural support for hi/lo".  The basic sse2 multiplication is even.  One shift per input is needed to generate odd.  On the other hand, one interleave per input is required for both hi/lo.  So 4 setup insns for hi/lo, and 2 setup insns for even/odd.  And on top of all that, XOP includes multiply odd at least for signed V4SI.
>
> I'll have a look at the test case you mention while I re-look at the patches...
>

The upper 128 bits of 256-bit AVX instructions aren't a good fit with
the current vectorizer infrastructure.
Richard Henderson June 28, 2012, 5:33 p.m. UTC | #4
On 2012-06-28 09:20, Jakub Jelinek wrote:
> Perhaps the problem is then that the permutation is much more expensive
> for even/odd.  With even/odd the f2 routine is:
...
>         vpshufb %xmm2, %xmm5, %xmm5
>         vpshufb %xmm1, %xmm4, %xmm4
>         vpor    %xmm4, %xmm5, %xmm4
...
> and with lo/hi it is:
>         vshufps $221, %xmm2, %xmm3, %xmm2

Hmm.  That second sequence has a reformatting delay (vshufps is an
FP-domain shuffle operating on integer data).

Last week when I pulled the mulv4si3 routine out to i386.c,
I experimented with a few different options, including that
interleave+shufps sequence seen here for lo/hi.  See the 
comment there discussing options and timing.

This also shows a deficiency in our vec_perm logic:

	0L 0H 2L 2H	1L 1H 3L 3H
	0H 2H 0H 2H	1H 3H 1H 3H	2*pshufd
	0H 1H 2H 3H			punpckldq

without the permutation constants in memory.
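
In intrinsics terms, a sketch of that constant-free gather (using the
same <emmintrin.h> intrinsics as above; EVEN is assumed to hold the
64-bit products of lanes 0 and 2 and ODD those of lanes 1 and 3; the
helper is hypothetical, not part of the patch):

static __m128i
gather_high (__m128i even, __m128i odd)
{
  /* Two pshufd plus one punpckldq collect the high halves in order,
     with no permutation mask loaded from memory.  */
  __m128i e = _mm_shuffle_epi32 (even, _MM_SHUFFLE (3, 1, 3, 1)); /* 0H 2H 0H 2H */
  __m128i o = _mm_shuffle_epi32 (odd, _MM_SHUFFLE (3, 1, 3, 1));  /* 1H 3H 1H 3H */
  return _mm_unpacklo_epi32 (e, o);                               /* 0H 1H 2H 3H */
}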


r~
Richard Henderson June 28, 2012, 5:44 p.m. UTC | #5
On 2012-06-28 07:05, Jakub Jelinek wrote:
> 	PR tree-optimization/51581
> 	* tree-vect-stmts.c (permute_vec_elements): Add forward decl.
> 	(vectorizable_operation): Handle vectorization of MULT_HIGHPART_EXPR
> 	also using VEC_WIDEN_MULT_*_EXPR or builtin_mul_widen_* plus
> 	VEC_PERM_EXPR if vector MULT_HIGHPART_EXPR isn't supported.
> 	* tree-vect-patterns.c (vect_recog_divmod_pattern): Use
> 	MULT_HIGHPART_EXPR instead of VEC_WIDEN_MULT_*_EXPR and shifts.
> 
> 	* gcc.dg/vect/pr51581-4.c: New test.

Ok, except,

> +	      if (0 && can_vec_perm_p (vec_mode, false, sel))
> +		icode = 0;

Testing hack left in.


r~
Bernhard Reutner-Fischer June 28, 2012, 10 p.m. UTC | #6
On Thu, Jun 28, 2012 at 04:05:58PM +0200, Jakub Jelinek wrote:
>On Thu, Jun 28, 2012 at 09:17:55AM +0200, Jakub Jelinek wrote:
>> I'll look at using MULT_HIGHPART_EXPR in the pattern recognizer and
>> vectorizing it as either of the sequences next.
>
>And here is corresponding pattern recognizer and vectorizer patch.
>
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
>Unfortunately the addition of the builtin_mul_widen_* hooks on i?86 seems
>to pessimize the generated code for gcc.dg/vect/pr51581-3.c
>testcase (at least with -O3 -mavx) compared to when the hooks aren't
>present, because i?86 has more natural support for widen mult lo/hi
>compared to widen mult even/odd, but I assume that on powerpc it is the
>other way around.  So, how should I find out if both VEC_WIDEN_MULT_*_EXPR
>and builtin_mul_widen_* are possible for the particular vectype which one
>will be cheaper?
>
>2012-06-28  Jakub Jelinek  <jakub@redhat.com>
>
>	PR tree-optimization/51581
>	* tree-vect-stmts.c (permute_vec_elements): Add forward decl.
>	(vectorizable_operation): Handle vectorization of MULT_HIGHPART_EXPR
>	also using VEC_WIDEN_MULT_*_EXPR or builtin_mul_widen_* plus
>	VEC_PERM_EXPR if vector MULT_HIGHPART_EXPR isn't supported.
>	* tree-vect-patterns.c (vect_recog_divmod_pattern): Use
>	MULT_HIGHPART_EXPR instead of VEC_WIDEN_MULT_*_EXPR and shifts.
>
>	* gcc.dg/vect/pr51581-4.c: New test.
>
>--- gcc/tree-vect-stmts.c.jj	2012-06-26 11:38:28.000000000 +0200
>+++ gcc/tree-vect-stmts.c	2012-06-28 13:27:50.475158271 +0200
>@@ -3300,17 +3304,18 @@ static bool

>+  icode = optab ? (int) optab_handler (optab, vec_mode) : CODE_FOR_nothing;
>+
>+  if (icode == CODE_FOR_nothing
>+      && code == MULT_HIGHPART_EXPR
>+      && VECTOR_MODE_P (vec_mode)
>+      && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN)
>+    {
>+      /* If MULT_HIGHPART_EXPR isn't supported by the backend, see
>+	 if we can emit VEC_WIDEN_MULT_{LO,HI}_EXPR followed by VEC_PERM_EXPR
>+	 or builtin_mul_widen_{even,odd} followed by VEC_PERM_EXPR.  */
>+      unsigned int prec = TYPE_PRECISION (TREE_TYPE (scalar_dest));
>+      unsigned int unsignedp = TYPE_UNSIGNED (TREE_TYPE (scalar_dest));
>+      tree wide_type
>+	= build_nonstandard_integer_type (prec * 2, unsignedp);
>+      wide_vectype
>+        = get_same_sized_vectype (wide_type, vectype);
>+
>+      sel = XALLOCAVEC (unsigned char, nunits_in);
>+      if (VECTOR_MODE_P (TYPE_MODE (wide_vectype))
>+	  && GET_MODE_SIZE (TYPE_MODE (wide_vectype))
>+	     == GET_MODE_SIZE (vec_mode))
>+	{
>+	  if (targetm.vectorize.builtin_mul_widen_even
>+	      && (decl1 = targetm.vectorize.builtin_mul_widen_even (vectype))
>+	      && targetm.vectorize.builtin_mul_widen_odd
>+	      && (decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype))
>+	      && TYPE_MODE (TREE_TYPE (TREE_TYPE (decl1)))
>+		 == TYPE_MODE (wide_vectype))
>+	    {
>+	      for (i = 0; i < nunits_in; i++)
>+		sel[i] = !BYTES_BIG_ENDIAN + (i & ~1)
>+			 + ((i & 1) ? nunits_in : 0);
>+	      if (0 && can_vec_perm_p (vec_mode, false, sel))
>+		icode = 0;
>+	    }
>+	  if (icode == CODE_FOR_nothing)
>+	    {
>+	      decl1 = NULL_TREE;
>+	      decl2 = NULL_TREE;
>+	      optab = optab_for_tree_code (VEC_WIDEN_MULT_HI_EXPR,
>+					   vectype, optab_default);
>+	      optab2 = optab_for_tree_code (VEC_WIDEN_MULT_HI_EXPR,
>+					    vectype, optab_default);

Really both HI?  If so, optab2 could be removed from that fn altogether.

>+	      if (optab != NULL
>+		  && optab2 != NULL
>+		  && optab_handler (optab, vec_mode) != CODE_FOR_nothing
>+		  && optab_handler (optab2, vec_mode) != CODE_FOR_nothing)
>+		{
>+		  for (i = 0; i < nunits_in; i++)
>+		    sel[i] = !BYTES_BIG_ENDIAN + 2 * i;
>+		  if (can_vec_perm_p (vec_mode, false, sel))
>+		    icode = optab_handler (optab, vec_mode);
>+		}
>+	    }
>+	}
>+      if (icode == CODE_FOR_nothing)
>+	{
>+	  if (optab_for_tree_code (code, vectype, optab_default) == NULL)
>+	    {
>+	      if (vect_print_dump_info (REPORT_DETAILS))
>+		fprintf (vect_dump, "no optab.");
>+	      return false;
>+	    }
>+	  wide_vectype = NULL_TREE;
>+	  optab2 = NULL;
>+	}
>+    }
>+
Richard Biener June 29, 2012, 9 a.m. UTC | #7
On Thu, Jun 28, 2012 at 6:44 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, Jun 28, 2012 at 8:57 AM, Richard Henderson <rth@redhat.com> wrote:
>> On 2012-06-28 07:05, Jakub Jelinek wrote:
>>> Unfortunately the addition of the builtin_mul_widen_* hooks on i?86 seems
>>> to pessimize the generated code for gcc.dg/vect/pr51581-3.c
>>> testcase (at least with -O3 -mavx) compared to when the hooks aren't
>>> present, because i?86 has more natural support for widen mult lo/hi
>>> compared to widen mult even/odd, but I assume that on powerpc it is the
>>> other way around.  So, how should I find out if both VEC_WIDEN_MULT_*_EXPR
>>> and builtin_mul_widen_* are possible for the particular vectype which one
>>> will be cheaper?
>>
>> I would assume that if the builtin exists, then it is cheaper.
>>
>> I disagree about "x86 has more natural support for hi/lo".  The basic sse2 multiplication is even.  One shift per input is needed to generate odd.  On the other hand, one interleave per input is required for both hi/lo.  So 4 setup insns for hi/lo, and 2 setup insns for even/odd.  And on top of all that, XOP includes multiply odd at least for signed V4SI.
>>
>> I'll have a look at the test case you mention while I re-look at the patches...
>>
>
> The upper 128-bit of 256-bit AVX instructions aren't a good fit with the
> current vectorizer infrastructure.

Indeed - the lack of cross-sub-128-bit-word operations makes it very
expensive for some vectorizations.  Initially we added the patterns for
vectorizing the hi/lo and interleave stuff because we didn't want
regressions for vectorizing with 256-bit vectors vs. 128-bit vectors in
the vectorizer testsuite.  But now that we have support for vectorizing
with both sizes we could consider not advertising the instructions that
don't really exist for 256-bit vectors, or at least properly model their
cost.

Richard.

>
> --
> H.J.
Jakub Jelinek June 29, 2012, 9:21 a.m. UTC | #8
On Fri, Jun 29, 2012 at 11:00:14AM +0200, Richard Guenther wrote:
> Indeed - the lack of cross-sub-128bit-word operations makes it very much
> expensive for some vectorizations.  Initially we added the patterns for
> vectorization of the hi/lo and interleave stuff because we didn't want
> regressions
> for vectorizing with 256bit vectors vs. 128bit vectors in the
> vectorizer testsuite.
> But now as we have support for vectorizing with both sizes we could consider
> not advertising the really not existing intstructions for 256bit vectors.  Or at
> least properly model their cost.

The code generated for pr51581-3.c (f2) is only shorter with -O3 -mavx
when using hi/lo over even/odd; with -O3 -mavx2 the even/odd sequence is
shorter than hi/lo.
$ ~/timing ./pr51581-3-evenodd
Strip out best and worst realtime result
minimum: 0.110145575 sec real / 0.000071177 sec CPU
maximum: 0.134790162 sec real / 0.000140234 sec CPU
average: 0.113982306 sec real / 0.000113236 sec CPU
stdev  : 0.002545680 sec real / 0.000009365 sec CPU
$ ~/timing ./pr51581-3-hilo
Strip out best and worst realtime result
minimum: 0.098651474 sec real / 0.000069318 sec CPU
maximum: 0.102126514 sec real / 0.000129507 sec CPU
average: 0.100120802 sec real / 0.000104589 sec CPU
stdev  : 0.001008010 sec real / 0.000013241 sec CPU
Can't benchmark -mavx2 though...

	Jakub

Patch

--- gcc/tree-vect-stmts.c.jj	2012-06-26 11:38:28.000000000 +0200
+++ gcc/tree-vect-stmts.c	2012-06-28 13:27:50.475158271 +0200
@@ -3288,6 +3288,10 @@  vectorizable_shift (gimple stmt, gimple_
 }
 
 
+static tree permute_vec_elements (tree, tree, tree, gimple,
+				  gimple_stmt_iterator *);
+
+
 /* Function vectorizable_operation.
 
    Check if STMT performs a binary, unary or ternary operation that can
@@ -3300,17 +3304,18 @@  static bool
 vectorizable_operation (gimple stmt, gimple_stmt_iterator *gsi,
 			gimple *vec_stmt, slp_tree slp_node)
 {
-  tree vec_dest;
+  tree vec_dest, vec_dest2 = NULL_TREE;
+  tree vec_dest3 = NULL_TREE, vec_dest4 = NULL_TREE;
   tree scalar_dest;
   tree op0, op1 = NULL_TREE, op2 = NULL_TREE;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
-  tree vectype;
+  tree vectype, wide_vectype = NULL_TREE;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   enum tree_code code;
   enum machine_mode vec_mode;
   tree new_temp;
   int op_type;
-  optab optab;
+  optab optab, optab2 = NULL;
   int icode;
   tree def;
   gimple def_stmt;
@@ -3327,6 +3332,8 @@  vectorizable_operation (gimple stmt, gim
   tree vop0, vop1, vop2;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   int vf;
+  unsigned char *sel = NULL;
+  tree decl1 = NULL_TREE, decl2 = NULL_TREE, perm_mask = NULL_TREE;
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
@@ -3451,31 +3458,97 @@  vectorizable_operation (gimple stmt, gim
   optab = optab_for_tree_code (code, vectype, optab_default);
 
   /* Supportable by target?  */
-  if (!optab)
+  if (!optab && code != MULT_HIGHPART_EXPR)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
 	fprintf (vect_dump, "no optab.");
       return false;
     }
   vec_mode = TYPE_MODE (vectype);
-  icode = (int) optab_handler (optab, vec_mode);
+  icode = optab ? (int) optab_handler (optab, vec_mode) : CODE_FOR_nothing;
+
+  if (icode == CODE_FOR_nothing
+      && code == MULT_HIGHPART_EXPR
+      && VECTOR_MODE_P (vec_mode)
+      && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN)
+    {
+      /* If MULT_HIGHPART_EXPR isn't supported by the backend, see
+	 if we can emit VEC_WIDEN_MULT_{LO,HI}_EXPR followed by VEC_PERM_EXPR
+	 or builtin_mul_widen_{even,odd} followed by VEC_PERM_EXPR.  */
+      unsigned int prec = TYPE_PRECISION (TREE_TYPE (scalar_dest));
+      unsigned int unsignedp = TYPE_UNSIGNED (TREE_TYPE (scalar_dest));
+      tree wide_type
+	= build_nonstandard_integer_type (prec * 2, unsignedp);
+      wide_vectype
+        = get_same_sized_vectype (wide_type, vectype);
+
+      sel = XALLOCAVEC (unsigned char, nunits_in);
+      if (VECTOR_MODE_P (TYPE_MODE (wide_vectype))
+	  && GET_MODE_SIZE (TYPE_MODE (wide_vectype))
+	     == GET_MODE_SIZE (vec_mode))
+	{
+	  if (targetm.vectorize.builtin_mul_widen_even
+	      && (decl1 = targetm.vectorize.builtin_mul_widen_even (vectype))
+	      && targetm.vectorize.builtin_mul_widen_odd
+	      && (decl2 = targetm.vectorize.builtin_mul_widen_odd (vectype))
+	      && TYPE_MODE (TREE_TYPE (TREE_TYPE (decl1)))
+		 == TYPE_MODE (wide_vectype))
+	    {
+	      for (i = 0; i < nunits_in; i++)
+		sel[i] = !BYTES_BIG_ENDIAN + (i & ~1)
+			 + ((i & 1) ? nunits_in : 0);
+	      if (0 && can_vec_perm_p (vec_mode, false, sel))
+		icode = 0;
+	    }
+	  if (icode == CODE_FOR_nothing)
+	    {
+	      decl1 = NULL_TREE;
+	      decl2 = NULL_TREE;
+	      optab = optab_for_tree_code (VEC_WIDEN_MULT_HI_EXPR,
+					   vectype, optab_default);
+	      optab2 = optab_for_tree_code (VEC_WIDEN_MULT_HI_EXPR,
+					    vectype, optab_default);
+	      if (optab != NULL
+		  && optab2 != NULL
+		  && optab_handler (optab, vec_mode) != CODE_FOR_nothing
+		  && optab_handler (optab2, vec_mode) != CODE_FOR_nothing)
+		{
+		  for (i = 0; i < nunits_in; i++)
+		    sel[i] = !BYTES_BIG_ENDIAN + 2 * i;
+		  if (can_vec_perm_p (vec_mode, false, sel))
+		    icode = optab_handler (optab, vec_mode);
+		}
+	    }
+	}
+      if (icode == CODE_FOR_nothing)
+	{
+	  if (optab_for_tree_code (code, vectype, optab_default) == NULL)
+	    {
+	      if (vect_print_dump_info (REPORT_DETAILS))
+		fprintf (vect_dump, "no optab.");
+	      return false;
+	    }
+	  wide_vectype = NULL_TREE;
+	  optab2 = NULL;
+	}
+    }
+
   if (icode == CODE_FOR_nothing)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
 	fprintf (vect_dump, "op not supported by target.");
       /* Check only during analysis.  */
       if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
-	  || (vf < vect_min_worthwhile_factor (code)
-              && !vec_stmt))
+	  || (!vec_stmt && vf < vect_min_worthwhile_factor (code)))
         return false;
       if (vect_print_dump_info (REPORT_DETAILS))
 	fprintf (vect_dump, "proceeding using word mode.");
     }
 
   /* Worthwhile without SIMD support?  Check only during analysis.  */
-  if (!VECTOR_MODE_P (TYPE_MODE (vectype))
-      && vf < vect_min_worthwhile_factor (code)
-      && !vec_stmt)
+  if (!VECTOR_MODE_P (vec_mode)
+      && !vec_stmt
+      && vf < vect_min_worthwhile_factor (code))
     {
       if (vect_print_dump_info (REPORT_DETAILS))
 	fprintf (vect_dump, "not worthwhile without SIMD support.");
@@ -3497,7 +3570,16 @@  vectorizable_operation (gimple stmt, gim
     fprintf (vect_dump, "transform binary/unary operation.");
 
   /* Handle def.  */
-  vec_dest = vect_create_destination_var (scalar_dest, vectype);
+  if (wide_vectype)
+    {
+      vec_dest = vect_create_destination_var (scalar_dest, wide_vectype);
+      vec_dest2 = vect_create_destination_var (scalar_dest, wide_vectype);
+      vec_dest3 = vect_create_destination_var (scalar_dest, vectype);
+      vec_dest4 = vect_create_destination_var (scalar_dest, vectype);
+      perm_mask = vect_gen_perm_mask (vectype, sel);
+    }
+  else
+    vec_dest = vect_create_destination_var (scalar_dest, vectype);
 
   /* Allocate VECs for vector operands.  In case of SLP, vector operands are
      created in the previous stages of the recursion, so no allocation is
@@ -3606,6 +3688,66 @@  vectorizable_operation (gimple stmt, gim
 		  ? VEC_index (tree, vec_oprnds1, i) : NULL_TREE);
 	  vop2 = ((op_type == ternary_op)
 		  ? VEC_index (tree, vec_oprnds2, i) : NULL_TREE);
+	  if (wide_vectype)
+	    {
+	      tree new_temp2, vce;
+
+	      gcc_assert (code == MULT_HIGHPART_EXPR);
+	      if (decl1 != NULL_TREE)
+		{
+		  new_stmt = gimple_build_call (decl1, 2, vop0, vop1);
+		  new_temp = make_ssa_name (vec_dest, new_stmt);
+		  gimple_call_set_lhs (new_stmt, new_temp);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+		  new_stmt = gimple_build_call (decl2, 2, vop0, vop1);
+		  new_temp2 = make_ssa_name (vec_dest2, new_stmt);
+		  gimple_call_set_lhs (new_stmt, new_temp2);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		}
+	      else
+		{
+		  new_temp = make_ssa_name (vec_dest, NULL);
+		  new_stmt
+		    = gimple_build_assign_with_ops (BYTES_BIG_ENDIAN
+						    ? VEC_WIDEN_MULT_HI_EXPR
+						    : VEC_WIDEN_MULT_LO_EXPR,
+						    new_temp, vop0, vop1);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+		  new_temp2 = make_ssa_name (vec_dest2, NULL);
+		  new_stmt
+		    = gimple_build_assign_with_ops (BYTES_BIG_ENDIAN
+						    ? VEC_WIDEN_MULT_LO_EXPR
+						    : VEC_WIDEN_MULT_HI_EXPR,
+						    new_temp2, vop0, vop1);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		}
+
+	      vce = build1 (VIEW_CONVERT_EXPR, vectype, new_temp);
+	      new_stmt = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
+						       vec_dest3, vce,
+						       NULL_TREE);
+	      new_temp = make_ssa_name (vec_dest3, new_stmt);
+	      gimple_assign_set_lhs (new_stmt, new_temp);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+	      vce = build1 (VIEW_CONVERT_EXPR, vectype, new_temp2);
+	      new_stmt = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
+						       vec_dest4, vce,
+						       NULL_TREE);
+	      new_temp2 = make_ssa_name (vec_dest4, new_stmt);
+	      gimple_assign_set_lhs (new_stmt, new_temp2);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+	      new_temp = permute_vec_elements (new_temp, new_temp2,
+					       perm_mask, stmt, gsi);
+	      new_stmt = SSA_NAME_DEF_STMT (new_temp);
+	      if (slp_node)
+		VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node),
+				new_stmt);
+	      continue;
+	    }
 	  new_stmt = gimple_build_assign_with_ops3 (code, vec_dest,
 						    vop0, vop1, vop2);
 	  new_temp = make_ssa_name (vec_dest, new_stmt);
--- gcc/tree-vect-patterns.c.jj	2012-06-28 08:32:50.000000000 +0200
+++ gcc/tree-vect-patterns.c	2012-06-28 10:17:14.647783541 +0200
@@ -1635,7 +1635,7 @@  vect_recog_divmod_pattern (VEC (gimple,
 			   tree *type_in, tree *type_out)
 {
   gimple last_stmt = VEC_pop (gimple, *stmts);
-  tree oprnd0, oprnd1, vectype, itype, witype, vecwtype, cond;
+  tree oprnd0, oprnd1, vectype, itype, cond;
   gimple pattern_stmt, def_stmt;
   enum tree_code rhs_code;
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
@@ -1814,17 +1814,23 @@  vect_recog_divmod_pattern (VEC (gimple,
       || prec > HOST_BITS_PER_WIDE_INT)
     return NULL;
 
-  witype = build_nonstandard_integer_type (prec * 2,
-					   TYPE_UNSIGNED (itype));
-  vecwtype = get_vectype_for_scalar_type (witype);
-  if (vecwtype == NULL_TREE)
-    return NULL;
+  optab = optab_for_tree_code (MULT_HIGHPART_EXPR, vectype, optab_default);
+  if (optab == NULL
+      || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
+    {
+      tree witype = build_nonstandard_integer_type (prec * 2,
+						    TYPE_UNSIGNED (itype));
+      tree vecwtype = get_vectype_for_scalar_type (witype);
 
-  if (!supportable_widening_operation (WIDEN_MULT_EXPR, last_stmt,
-				       vecwtype, vectype,
-				       &dummy, &dummy, &dummy_code,
-				       &dummy_code, &dummy_int, &dummy_vec))
-    return NULL;
+      if (vecwtype == NULL_TREE)
+	return NULL;
+      if (!supportable_widening_operation (WIDEN_MULT_EXPR, last_stmt,
+					   vecwtype, vectype,
+					   &dummy, &dummy, &dummy_code,
+					   &dummy_code, &dummy_int,
+					   &dummy_vec))
+	return NULL;
+    }
 
   STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) = NULL;
 
@@ -1834,7 +1840,7 @@  vect_recog_divmod_pattern (VEC (gimple,
       int pre_shift, post_shift;
       unsigned HOST_WIDE_INT d = tree_low_cst (oprnd1, 1)
 				 & GET_MODE_MASK (TYPE_MODE (itype));
-      tree t1, t2, t3, t4, t5, t6;
+      tree t1, t2, t3, t4;
 
       if (d >= ((unsigned HOST_WIDE_INT) 1 << (prec - 1)))
 	/* FIXME: Can transform this into oprnd0 >= oprnd1 ? 1 : 0.  */
@@ -1861,65 +1867,46 @@  vect_recog_divmod_pattern (VEC (gimple,
 	  if (post_shift - 1 >= prec)
 	    return NULL;
 
-	  /* t1 = oprnd0 w* ml;
-	     t2 = t1 >> prec;
-	     t3 = (type) t2;
-	     t4 = oprnd0 - t3;
-	     t5 = t4 >> 1;
-	     t6 = t3 + t5;
-	     q = t6 >> (post_shift - 1);  */
-	  t1 = vect_recog_temp_ssa_var (witype, NULL);
+	  /* t1 = oprnd0 h* ml;
+	     t2 = oprnd0 - t1;
+	     t3 = t2 >> 1;
+	     t4 = t1 + t3;
+	     q = t4 >> (post_shift - 1);  */
+	  t1 = vect_recog_temp_ssa_var (itype, NULL);
 	  def_stmt
-	    = gimple_build_assign_with_ops (WIDEN_MULT_EXPR, t1, oprnd0,
+	    = gimple_build_assign_with_ops (MULT_HIGHPART_EXPR, t1, oprnd0,
 					    build_int_cst (itype, ml));
 	  append_pattern_def_seq (stmt_vinfo, def_stmt);
-	  def_stmt_vinfo = new_stmt_vec_info (def_stmt, loop_vinfo, bb_vinfo);
-	  set_vinfo_for_stmt (def_stmt, def_stmt_vinfo);
-	  STMT_VINFO_VECTYPE (def_stmt_vinfo) = vecwtype;
 
-	  t2 = vect_recog_temp_ssa_var (witype, NULL);
+	  t2 = vect_recog_temp_ssa_var (itype, NULL);
 	  def_stmt
-	    = gimple_build_assign_with_ops (RSHIFT_EXPR, t2, t1,
-					    build_int_cst (itype, prec));
+	    = gimple_build_assign_with_ops (MINUS_EXPR, t2, oprnd0, t1);
 	  append_pattern_def_seq (stmt_vinfo, def_stmt);
-	  def_stmt_vinfo = new_stmt_vec_info (def_stmt, loop_vinfo, bb_vinfo);
-	  set_vinfo_for_stmt (def_stmt, def_stmt_vinfo);
-	  STMT_VINFO_VECTYPE (def_stmt_vinfo) = vecwtype;
 
 	  t3 = vect_recog_temp_ssa_var (itype, NULL);
 	  def_stmt
-	    = gimple_build_assign_with_ops (NOP_EXPR, t3, t2, NULL_TREE);
-	  append_pattern_def_seq (stmt_vinfo, def_stmt);
-
-	  t4 = vect_recog_temp_ssa_var (itype, NULL);
-	  def_stmt
-	    = gimple_build_assign_with_ops (MINUS_EXPR, t4, oprnd0, t3);
-	  append_pattern_def_seq (stmt_vinfo, def_stmt);
-
-	  t5 = vect_recog_temp_ssa_var (itype, NULL);
-	  def_stmt
-	    = gimple_build_assign_with_ops (RSHIFT_EXPR, t5, t4,
+	    = gimple_build_assign_with_ops (RSHIFT_EXPR, t3, t2,
 					    integer_one_node);
 	  append_pattern_def_seq (stmt_vinfo, def_stmt);
 
-	  t6 = vect_recog_temp_ssa_var (itype, NULL);
+	  t4 = vect_recog_temp_ssa_var (itype, NULL);
 	  def_stmt
-	    = gimple_build_assign_with_ops (PLUS_EXPR, t6, t3, t5);
+	    = gimple_build_assign_with_ops (PLUS_EXPR, t4, t1, t3);
 
 	  if (post_shift != 1)
 	    {
 	      append_pattern_def_seq (stmt_vinfo, def_stmt);
 
-	      q = vect_recog_temp_ssa_var (witype, NULL);
+	      q = vect_recog_temp_ssa_var (itype, NULL);
 	      pattern_stmt
-		= gimple_build_assign_with_ops (RSHIFT_EXPR, q, t6,
+		= gimple_build_assign_with_ops (RSHIFT_EXPR, q, t4,
 						build_int_cst (itype,
 							       post_shift
 							       - 1));
 	    }
 	  else
 	    {
-	      q = t6;
+	      q = t4;
 	      pattern_stmt = def_stmt;
 	    }
 	}
@@ -1929,9 +1916,8 @@  vect_recog_divmod_pattern (VEC (gimple,
 	    return NULL;
 
 	  /* t1 = oprnd0 >> pre_shift;
-	     t2 = t1 w* ml;
-	     t3 = t2 >> (prec + post_shift);
-	     q = (type) t3;  */
+	     t2 = t1 h* ml;
+	     q = t2 >> post_shift;  */
 	  if (pre_shift)
 	    {
 	      t1 = vect_recog_temp_ssa_var (itype, NULL);
@@ -1944,28 +1930,25 @@  vect_recog_divmod_pattern (VEC (gimple,
 	  else
 	    t1 = oprnd0;
 
-	  t2 = vect_recog_temp_ssa_var (witype, NULL);
+	  t2 = vect_recog_temp_ssa_var (itype, NULL);
 	  def_stmt
-	    = gimple_build_assign_with_ops (WIDEN_MULT_EXPR, t2, t1,
+	    = gimple_build_assign_with_ops (MULT_HIGHPART_EXPR, t2, t1,
 					    build_int_cst (itype, ml));
-	  append_pattern_def_seq (stmt_vinfo, def_stmt);
-	  def_stmt_vinfo = new_stmt_vec_info (def_stmt, loop_vinfo, bb_vinfo);
-	  set_vinfo_for_stmt (def_stmt, def_stmt_vinfo);
-	  STMT_VINFO_VECTYPE (def_stmt_vinfo) = vecwtype;
 
-	  t3 = vect_recog_temp_ssa_var (witype, NULL);
-	  def_stmt
-	    = gimple_build_assign_with_ops (RSHIFT_EXPR, t3, t2,
-					    build_int_cst (itype, post_shift
-								  + prec));
-	  append_pattern_def_seq (stmt_vinfo, def_stmt);
-	  def_stmt_vinfo = new_stmt_vec_info (def_stmt, loop_vinfo, bb_vinfo);
-	  set_vinfo_for_stmt (def_stmt, def_stmt_vinfo);
-	  STMT_VINFO_VECTYPE (def_stmt_vinfo) = vecwtype;
+	  if (post_shift)
+	    {
+	      append_pattern_def_seq (stmt_vinfo, def_stmt);
 
-	  q = vect_recog_temp_ssa_var (itype, NULL);
-	  pattern_stmt
-	    = gimple_build_assign_with_ops (NOP_EXPR, q, t3, NULL_TREE);
+	      q = vect_recog_temp_ssa_var (itype, NULL);
+	      def_stmt
+		= gimple_build_assign_with_ops (RSHIFT_EXPR, q, t2,
+						build_int_cst (itype,
+							       post_shift));
+	    }
+	  else
+	    q = t2;
+
+	  pattern_stmt = def_stmt;
 	}
     }
   else
@@ -1975,21 +1958,12 @@  vect_recog_divmod_pattern (VEC (gimple,
       HOST_WIDE_INT d = tree_low_cst (oprnd1, 0);
       unsigned HOST_WIDE_INT abs_d;
       bool add = false;
-      tree uwitype = NULL, vecuwtype = NULL;
-      tree t1, t2, t3, t4, t5, t6, t7;
+      tree t1, t2, t3, t4;
 
       /* Give up for -1.  */
       if (d == -1)
 	return NULL;
 
-      if (!vect_supportable_shift (RSHIFT_EXPR, witype))
-	{
-	  uwitype = build_nonstandard_integer_type (prec * 2, 1);
-	  vecuwtype = get_vectype_for_scalar_type (uwitype);
-	  if (vecuwtype == NULL_TREE)
-	    return NULL;
-	}
-
       /* Since d might be INT_MIN, we have to cast to
 	 unsigned HOST_WIDE_INT before negating to avoid
 	 undefined signed overflow.  */
@@ -2017,85 +1991,48 @@  vect_recog_divmod_pattern (VEC (gimple,
       if (post_shift >= prec)
 	return NULL;
 
-      /* t1 = oprnd1 w* ml;  */
-      t1 = vect_recog_temp_ssa_var (witype, NULL);
+      /* t1 = oprnd1 h* ml;  */
+      t1 = vect_recog_temp_ssa_var (itype, NULL);
       def_stmt
-	= gimple_build_assign_with_ops (WIDEN_MULT_EXPR, t1, oprnd0,
+	= gimple_build_assign_with_ops (MULT_HIGHPART_EXPR, t1, oprnd0,
 					build_int_cst (itype, ml));
       append_pattern_def_seq (stmt_vinfo, def_stmt);
-      def_stmt_vinfo = new_stmt_vec_info (def_stmt, loop_vinfo, bb_vinfo);
-      set_vinfo_for_stmt (def_stmt, def_stmt_vinfo);
-      STMT_VINFO_VECTYPE (def_stmt_vinfo) = vecwtype;
-
-      if (vecuwtype != NULL)
-	{
-	  /* t2 = (uwtype) t1;  */
-	  t2 = vect_recog_temp_ssa_var (uwitype, NULL);
-	  def_stmt
-	    = gimple_build_assign_with_ops (NOP_EXPR, t2, t1, NULL_TREE);
-	  append_pattern_def_seq (stmt_vinfo, def_stmt);
-	  def_stmt_vinfo = new_stmt_vec_info (def_stmt, loop_vinfo, bb_vinfo);
-	  set_vinfo_for_stmt (def_stmt, def_stmt_vinfo);
-	  STMT_VINFO_VECTYPE (def_stmt_vinfo) = vecuwtype;
-	}
-      else
-	t2 = t1;
-
-      /* t3 = t2 >> prec;  or t3 = t2 >> (prec + post_shift);  */
-      t3 = vect_recog_temp_ssa_var (vecuwtype ? uwitype : witype, NULL);
-      def_stmt
-	= gimple_build_assign_with_ops (RSHIFT_EXPR, t3, t2,
-					build_int_cst (itype,
-						       prec
-						       + (!add
-							  && vecuwtype == NULL
-							  ? post_shift : 0)));
-      append_pattern_def_seq (stmt_vinfo, def_stmt);
-      def_stmt_vinfo = new_stmt_vec_info (def_stmt, loop_vinfo, bb_vinfo);
-      set_vinfo_for_stmt (def_stmt, def_stmt_vinfo);
-      STMT_VINFO_VECTYPE (def_stmt_vinfo) = vecuwtype ? vecuwtype : vecwtype;
-
-      /* t4 = (type) t3;  */
-      t4 = vect_recog_temp_ssa_var (itype, NULL);
-      def_stmt
-	= gimple_build_assign_with_ops (NOP_EXPR, t4, t3, NULL_TREE);
-      append_pattern_def_seq (stmt_vinfo, def_stmt);
 
       if (add)
 	{
-	  /* t5 = t4 + oprnd0;  */
-	  t5 = vect_recog_temp_ssa_var (itype, NULL);
+	  /* t2 = t1 + oprnd0;  */
+	  t2 = vect_recog_temp_ssa_var (itype, NULL);
 	  def_stmt
-	    = gimple_build_assign_with_ops (PLUS_EXPR, t5, t4, oprnd0);
+	    = gimple_build_assign_with_ops (PLUS_EXPR, t2, t1, oprnd0);
 	  append_pattern_def_seq (stmt_vinfo, def_stmt);
 	}
       else
-	t5 = t4;
+	t2 = t1;
 
-      if ((add || vecuwtype != NULL) && post_shift)
+      if (post_shift)
 	{
-	  /* t6 = t5 >> post_shift;  */
-	  t6 = vect_recog_temp_ssa_var (itype, NULL);
+	  /* t3 = t2 >> post_shift;  */
+	  t3 = vect_recog_temp_ssa_var (itype, NULL);
 	  def_stmt
-	    = gimple_build_assign_with_ops (RSHIFT_EXPR, t6, t5,
+	    = gimple_build_assign_with_ops (RSHIFT_EXPR, t3, t2,
 					    build_int_cst (itype, post_shift));
 	  append_pattern_def_seq (stmt_vinfo, def_stmt);
 	}
       else
-	t6 = t5;
+	t3 = t2;
 
-      /* t7 = oprnd0 >> (prec - 1);  */
-      t7 = vect_recog_temp_ssa_var (itype, NULL);
+      /* t4 = oprnd0 >> (prec - 1);  */
+      t4 = vect_recog_temp_ssa_var (itype, NULL);
       def_stmt
-	= gimple_build_assign_with_ops (RSHIFT_EXPR, t7, oprnd0,
+	= gimple_build_assign_with_ops (RSHIFT_EXPR, t4, oprnd0,
 					build_int_cst (itype, prec - 1));
       append_pattern_def_seq (stmt_vinfo, def_stmt);
 
-      /* q = t6 - t7;  or q = t7 - t6;  */
+      /* q = t3 - t4;  or q = t4 - t3;  */
       q = vect_recog_temp_ssa_var (itype, NULL);
       pattern_stmt
-	= gimple_build_assign_with_ops (MINUS_EXPR, q, d < 0 ? t7 : t6,
-					d < 0 ? t6 : t7);
+	= gimple_build_assign_with_ops (MINUS_EXPR, q, d < 0 ? t4 : t3,
+					d < 0 ? t3 : t4);
     }
 
   if (rhs_code == TRUNC_MOD_EXPR)
--- gcc/testsuite/gcc.dg/vect/pr51581-4.c.jj	2012-06-28 13:24:57.356118979 +0200
+++ gcc/testsuite/gcc.dg/vect/pr51581-4.c	2012-06-28 13:21:22.000000000 +0200
@@ -0,0 +1,166 @@ 
+/* PR tree-optimization/51581 */
+
+#include "tree-vect.h"
+
+short int a[16], b[16];
+unsigned short int c[16], d[16];
+
+void
+f1 (void)
+{
+  a[0] = b[0] / 8;
+  a[1] = b[1] / 8;
+  a[2] = b[2] / 8;
+  a[3] = b[3] / 8;
+  a[4] = b[4] / 8;
+  a[5] = b[5] / 8;
+  a[6] = b[6] / 8;
+  a[7] = b[7] / 8;
+  a[8] = b[8] / 8;
+  a[9] = b[9] / 8;
+  a[10] = b[10] / 8;
+  a[11] = b[11] / 8;
+  a[12] = b[12] / 8;
+  a[13] = b[13] / 8;
+  a[14] = b[14] / 8;
+  a[15] = b[15] / 8;
+}
+
+void
+f2 (void)
+{
+  c[0] = d[0] / 3;
+  c[1] = d[1] / 3;
+  c[2] = d[2] / 3;
+  c[3] = d[3] / 3;
+  c[4] = d[4] / 3;
+  c[5] = d[5] / 3;
+  c[6] = d[6] / 3;
+  c[7] = d[7] / 3;
+  c[8] = d[8] / 3;
+  c[9] = d[9] / 3;
+  c[10] = d[10] / 3;
+  c[11] = d[11] / 3;
+  c[12] = d[12] / 3;
+  c[13] = d[13] / 3;
+  c[14] = d[14] / 3;
+  c[15] = d[15] / 3;
+}
+
+void
+f3 (void)
+{
+  a[0] = b[0] / 8;
+  a[1] = b[1] / 4;
+  a[2] = b[2] / 8;
+  a[3] = b[3] / 4;
+  a[4] = b[4] / 8;
+  a[5] = b[5] / 4;
+  a[6] = b[6] / 8;
+  a[7] = b[7] / 4;
+  a[8] = b[8] / 8;
+  a[9] = b[9] / 4;
+  a[10] = b[10] / 8;
+  a[11] = b[11] / 4;
+  a[12] = b[12] / 8;
+  a[13] = b[13] / 4;
+  a[14] = b[14] / 8;
+  a[15] = b[15] / 4;
+}
+
+void
+f4 (void)
+{
+  c[0] = d[0] / 3;
+  c[1] = d[1] / 5;
+  c[2] = d[2] / 3;
+  c[3] = d[3] / 5;
+  c[4] = d[4] / 3;
+  c[5] = d[5] / 5;
+  c[6] = d[6] / 3;
+  c[7] = d[7] / 5;
+  c[8] = d[8] / 3;
+  c[9] = d[9] / 5;
+  c[10] = d[10] / 3;
+  c[11] = d[11] / 5;
+  c[12] = d[12] / 3;
+  c[13] = d[13] / 5;
+  c[14] = d[14] / 3;
+  c[15] = d[15] / 5;
+}
+
+void
+f5 (void)
+{
+  a[0] = b[0] / 14;
+  a[1] = b[1] / 15;
+  a[2] = b[2] / 14;
+  a[3] = b[3] / 15;
+  a[4] = b[4] / 14;
+  a[5] = b[5] / 15;
+  a[6] = b[6] / 14;
+  a[7] = b[7] / 15;
+  a[8] = b[8] / 14;
+  a[9] = b[9] / 15;
+  a[10] = b[10] / 14;
+  a[11] = b[11] / 15;
+  a[12] = b[12] / 14;
+  a[13] = b[13] / 15;
+  a[14] = b[14] / 14;
+  a[15] = b[15] / 15;
+}
+
+void
+f6 (void)
+{
+  c[0] = d[0] / 6;
+  c[1] = d[1] / 5;
+  c[2] = d[2] / 6;
+  c[3] = d[3] / 5;
+  c[4] = d[4] / 6;
+  c[5] = d[5] / 5;
+  c[6] = d[6] / 13;
+  c[7] = d[7] / 5;
+  c[8] = d[8] / 6;
+  c[9] = d[9] / 5;
+  c[10] = d[10] / 6;
+  c[11] = d[11] / 5;
+  c[12] = d[12] / 6;
+  c[13] = d[13] / 5;
+  c[14] = d[14] / 13;
+  c[15] = d[15] / 5;
+}
+
+int
+main ()
+{
+  int i;
+  check_vect ();
+  asm ("");
+  for (i = 0; i < 16; i++)
+    {
+      asm ("");
+      b[i] = i - 8;
+      d[i] = i - 8;
+    }
+  f1 ();
+  f2 ();
+  for (i = 0; i < 16; i++)
+    if (a[i] != b[i] / 8 || c[i] != d[i] / 3)
+      abort ();
+  f3 ();
+  f4 ();
+  for (i = 0; i < 16; i+= 2)
+    if (a[i] != b[i] / 8 || a[i + 1] != b[i + 1] / 4
+	|| c[i] != d[i] / 3 || c[i + 1] != d[i + 1] / 5)
+      abort ();
+  f5 ();
+  f6 ();
+  for (i = 0; i < 16; i+= 2)
+    if (a[i] != b[i] / 14 || a[i + 1] != b[i + 1] / 15
+	|| c[i] != d[i] / ((i & 7) == 6 ? 13 : 6) || c[i + 1] != d[i + 1] / 5)
+      abort ();
+  return 0;
+}
+
+/* { dg-final { cleanup-tree-dump "vect" } } */