Patchwork Vector shuffling

login
register
mail settings
Submitter Artem Shinkarov
Date Sept. 29, 2011, 10:44 a.m.
Message ID <CABYV9SX+SDBg9JK7k25n7XAmcRQ+E8ij9bOOC0_s_2b0eZEvDA@mail.gmail.com>
Download mbox | patch
Permalink /patch/116935/
State New
Headers show

Comments

Artem Shinkarov - Sept. 29, 2011, 10:44 a.m.
Here is a new version of the patch which hopefully fixes all the
formatting issues and uses expand_simple_binop instead of force_reg in
binary operations.

Ok?


On Wed, Sep 28, 2011 at 3:46 PM, Richard Henderson <rth@redhat.com> wrote:
> On 09/28/2011 05:59 AM, Artem Shinkarov wrote:
>> I don't really understand this. As far as I know, expand_normal
>> "converts" tree to rtx. All my computations are happening at the level
>> of rtx and force_reg is needed just to bring an rtx expression to the
>> register of the correct mode. If I am missing something, could you
>> give an example how can I use expand_normal instead of force_reg in
>> this particular code.
>
> Sorry, I meant expand_(simple_)binop.
>
>>> Is ssse3_pshufb why you do the wrong thing in the expander for v0 != v1?
>>
>> My personal feeling is that it may be the case with v0 != v1, that it
>> would be more efficient to perform piecewise shuffling rather than
>> bitwise dances around the masks.
>
> Maybe for V2DI and V2DFmode, but probably not otherwise.
>
> We can perform the double-word shuffle in 12 insns; 10 for SSE 4.1.
> Example assembly attached.
>
>>> It's certainly possible to handle it, though it takes a few more steps,
>>> and might well be more efficient as a libgcc function rather than inline.
>>
>> I don't really understand why it could be more efficient. I thought
>> that inline gives more chances to the final RTL optimisation.
>
> We'll not be able to optimize this at the rtl level.  There are too many
> UNSPEC instructions in the way.  In any case, even if that weren't so we'd
> only be able to do useful optimization for a constant permutation.  And
> we should have been able to prove that at the gimple level.
>
>
> r~
>
Richard Henderson - Sept. 29, 2011, 3:22 p.m.
On 09/29/2011 03:44 AM, Artem Shinkarov wrote:
> Here is a new version of the patch which hopefully fixes all the
> formatting issues and uses expand_simple_binop instead of force_reg in
> binary operations.
> 
> Ok?

Well, it's certainly not perfect by any means.  But I guess I can fix
things up myself once this patch is applied.


r~

Patch

Index: gcc/doc/extend.texi
===================================================================
--- gcc/doc/extend.texi	(revision 178354)
+++ gcc/doc/extend.texi	(working copy)
@@ -6561,6 +6561,32 @@  invoke undefined behavior at runtime.  W
 accesses for vector subscription can be enabled with
 @option{-Warray-bounds}.
 
+Vector shuffling is available using functions
+@code{__builtin_shuffle (vec, mask)} and
+@code{__builtin_shuffle (vec0, vec1, mask)}. Both functions construct
+a permutation of elements from one or two vectors and return a vector
+of the same type as input vector(s). The mask is a vector of
+integer-typed elements. The size of each element of the mask must be
+the same as the size of each input vector element. The number of
+elements in input vector(s) and mask must be the same.
+
+The elements of the input vectors are numbered from left to right across
+one or both of the vectors. Each element in the mask specifies a number
+of element from the input vector(s). Consider the following example.
+
+@smallexample
+typedef int v4si __attribute__ ((vector_size (16)));
+
+v4si a = @{1,2,3,4@};
+v4si b = @{5,6,7,8@};
+v4si mask1 = @{0,1,1,3@};
+v4si mask2 = @{0,4,2,5@};
+v4si res;
+
+res = __builtin_shuffle (a, mask1);       /* res is @{1,2,2,4@}  */
+res = __builtin_shuffle (a, b, mask2);    /* res is @{1,5,3,6@}  */
+@end smallexample
+
 You can declare variables and use them in function calls and returns, as
 well as in assignments and some casts.  You can specify a vector type as
 a return type for a function.  Vector types can also be used as function
Index: gcc/tree-pretty-print.c
===================================================================
--- gcc/tree-pretty-print.c	(revision 178354)
+++ gcc/tree-pretty-print.c	(working copy)
@@ -2067,6 +2067,16 @@  dump_generic_node (pretty_printer *buffe
       dump_generic_node (buffer, TREE_OPERAND (node, 2), spc, flags, false);
       pp_string (buffer, " > ");
       break;
+    
+    case VEC_SHUFFLE_EXPR:
+      pp_string (buffer, " VEC_SHUFFLE_EXPR < ");
+      dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false);
+      pp_string (buffer, " , ");
+      dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false);
+      pp_string (buffer, " , ");
+      dump_generic_node (buffer, TREE_OPERAND (node, 2), spc, flags, false);
+      pp_string (buffer, " > ");
+      break;
 
     case DOT_PROD_EXPR:
       pp_string (buffer, " DOT_PROD_EXPR < ");
Index: gcc/c-family/c-common.c
===================================================================
--- gcc/c-family/c-common.c	(revision 178354)
+++ gcc/c-family/c-common.c	(working copy)
@@ -425,6 +425,7 @@  const struct c_common_resword c_common_r
   { "__attribute__",	RID_ATTRIBUTE,	0 },
   { "__builtin_choose_expr", RID_CHOOSE_EXPR, D_CONLY },
   { "__builtin_complex", RID_BUILTIN_COMPLEX, D_CONLY },
+  { "__builtin_shuffle", RID_BUILTIN_SHUFFLE, D_CONLY },
   { "__builtin_offsetof", RID_OFFSETOF, 0 },
   { "__builtin_types_compatible_p", RID_TYPES_COMPATIBLE_P, D_CONLY },
   { "__builtin_va_arg",	RID_VA_ARG,	0 },
Index: gcc/c-family/c-common.h
===================================================================
--- gcc/c-family/c-common.h	(revision 178354)
+++ gcc/c-family/c-common.h	(working copy)
@@ -103,7 +103,7 @@  enum rid
   /* C extensions */
   RID_ASM,       RID_TYPEOF,   RID_ALIGNOF,  RID_ATTRIBUTE,  RID_VA_ARG,
   RID_EXTENSION, RID_IMAGPART, RID_REALPART, RID_LABEL,      RID_CHOOSE_EXPR,
-  RID_TYPES_COMPATIBLE_P,      RID_BUILTIN_COMPLEX,
+  RID_TYPES_COMPATIBLE_P,      RID_BUILTIN_COMPLEX,	     RID_BUILTIN_SHUFFLE,
   RID_DFLOAT32, RID_DFLOAT64, RID_DFLOAT128,
   RID_FRACT, RID_ACCUM,
 
Index: gcc/optabs.c
===================================================================
--- gcc/optabs.c	(revision 178354)
+++ gcc/optabs.c	(working copy)
@@ -6620,6 +6620,78 @@  vector_compare_rtx (tree cond, bool unsi
   return gen_rtx_fmt_ee (rcode, VOIDmode, ops[0].value, ops[1].value);
 }
 
+/* Return true if VEC_SHUFFLE_EXPR can be expanded using SIMD extensions
+   of the CPU.  */
+bool
+expand_vec_shuffle_expr_p (enum machine_mode mode, tree v0, tree v1, tree mask)
+{
+  int v0_mode_s = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (v0))));
+  int mask_mode_s = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (mask))));
+
+  if (TREE_CODE (mask) == VECTOR_CST
+      && targetm.vectorize.builtin_vec_perm_ok (TREE_TYPE (v0), mask))
+    return true;
+
+  if (!operand_equal_p (v0, v1, 0) || v0_mode_s != mask_mode_s
+      || TYPE_VECTOR_SUBPARTS (TREE_TYPE (v0))
+	 != TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)))
+    return false;
+
+  return direct_optab_handler (vshuffle_optab, mode) != CODE_FOR_nothing;
+}
+
+/* Generate instructions for VEC_COND_EXPR given its type and three
+   operands.  */
+rtx
+expand_vec_shuffle_expr (tree type, tree v0, tree v1, tree mask, rtx target)
+{
+  struct expand_operand ops[4];
+  enum insn_code icode;
+  enum machine_mode mode = TYPE_MODE (type);
+  rtx rtx_v0, rtx_mask;
+
+  gcc_assert (expand_vec_shuffle_expr_p (mode, v0, v1, mask));
+
+  if (TREE_CODE (mask) == VECTOR_CST)
+    {
+      tree m_type, call;
+      tree fn = targetm.vectorize.builtin_vec_perm (TREE_TYPE (v0), &m_type);
+
+      if (!fn)
+	goto vshuffle;
+
+      if (m_type != TREE_TYPE (TREE_TYPE (mask)))
+	{
+	  int units = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask));
+	  tree cvt = build_vector_type (m_type, units);
+	  mask = fold_convert (cvt, mask);
+	}
+
+      call = fold_build1 (ADDR_EXPR, build_pointer_type (TREE_TYPE (fn)), fn);
+      call = build_call_nary (type, call, 3, v0, v1, mask);
+
+      return expand_expr_real_1 (call, target, VOIDmode, EXPAND_NORMAL, NULL);
+    }
+
+vshuffle:
+  gcc_assert (operand_equal_p (v0, v1, 0));
+
+  icode = direct_optab_handler (vshuffle_optab, mode);
+
+  if (icode == CODE_FOR_nothing)
+    return 0;
+
+  rtx_v0 = expand_normal (v0);
+  rtx_mask = expand_normal (mask);
+
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], rtx_v0, mode);
+  create_input_operand (&ops[2], rtx_mask, mode);
+  expand_insn (icode, 3, ops);
+
+  return ops[0].value;
+}
+
 /* Return insn code for TYPE, the type of a VEC_COND_EXPR.  */
 
 static inline enum insn_code
Index: gcc/optabs.h
===================================================================
--- gcc/optabs.h	(revision 178354)
+++ gcc/optabs.h	(working copy)
@@ -636,6 +636,9 @@  enum direct_optab_index
   DOI_vcond,
   DOI_vcondu,
 
+  /* Vector shuffling.  */
+  DOI_vshuffle,
+
   /* Block move operation.  */
   DOI_movmem,
 
@@ -701,6 +704,7 @@  typedef struct direct_optab_d *direct_op
 #define reload_out_optab (&direct_optab_table[(int) DOI_reload_out])
 #define vcond_optab (&direct_optab_table[(int) DOI_vcond])
 #define vcondu_optab (&direct_optab_table[(int) DOI_vcondu])
+#define vshuffle_optab (&direct_optab_table[(int) DOI_vshuffle])
 #define movmem_optab (&direct_optab_table[(int) DOI_movmem])
 #define setmem_optab (&direct_optab_table[(int) DOI_setmem])
 #define cmpstr_optab (&direct_optab_table[(int) DOI_cmpstr])
@@ -879,8 +883,15 @@  extern rtx expand_widening_mult (enum ma
 /* Return tree if target supports vector operations for COND_EXPR.  */
 bool expand_vec_cond_expr_p (tree, enum machine_mode);
 
+/* Return tree if target supports vector operations for VEC_SHUFFLE_EXPR.  */
+bool expand_vec_shuffle_expr_p (enum machine_mode, tree, tree, tree);
+
 /* Generate code for VEC_COND_EXPR.  */
 extern rtx expand_vec_cond_expr (tree, tree, tree, tree, rtx);
+
+/* Generate code for VEC_SHUFFLE_EXPR.  */
+extern rtx expand_vec_shuffle_expr (tree, tree, tree, tree, rtx);
+
 /* Generate code for VEC_LSHIFT_EXPR and VEC_RSHIFT_EXPR.  */
 extern rtx expand_vec_shift_expr (sepops, rtx);
 
Index: gcc/genopinit.c
===================================================================
--- gcc/genopinit.c	(revision 178354)
+++ gcc/genopinit.c	(working copy)
@@ -255,6 +255,7 @@  static const char * const optabs[] =
   "set_optab_handler (vec_realign_load_optab, $A, CODE_FOR_$(vec_realign_load_$a$))",
   "set_direct_optab_handler (vcond_optab, $A, CODE_FOR_$(vcond$a$))",
   "set_direct_optab_handler (vcondu_optab, $A, CODE_FOR_$(vcondu$a$))",
+  "set_direct_optab_handler (vshuffle_optab, $A, CODE_FOR_$(vshuffle$a$))",
   "set_optab_handler (ssum_widen_optab, $A, CODE_FOR_$(widen_ssum$I$a3$))",
   "set_optab_handler (usum_widen_optab, $A, CODE_FOR_$(widen_usum$I$a3$))",
   "set_optab_handler (udot_prod_optab, $A, CODE_FOR_$(udot_prod$I$a$))",
Index: gcc/testsuite/gcc.c-torture/execute/vect-shuffle-2.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/vect-shuffle-2.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/vect-shuffle-2.c	(revision 0)
@@ -0,0 +1,44 @@ 
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define vidx(type, vec, idx) (*(((type *) &(vec)) + idx))
+
+#define shuf2compare(type, count, vres, v0, v1, mask) \
+do { \
+    int __i; \
+    for (__i = 0; __i < count; __i++) { \
+        if (vidx(type, vres, __i) != ((vidx(type, mask, __i) < count) ? \
+                          vidx(type, v0, vidx(type, mask, __i)) :  \
+                          vidx(type, v1, (vidx(type, mask, __i) - count)))) \
+            __builtin_abort (); \
+        } \
+} while (0)
+
+
+int main (int argc, char *argv[]) {
+    vector (8, short) v0 = {5, 5,5,5,5,5,argc,7};
+    vector (8, short) v1 = {argc, 1,8,8,4,9,argc,4};
+    vector (8, short) v2;
+
+    //vector (8, short) mask = {1,2,5,4,3,6,7};
+    
+    vector (8, short) mask0 = {0,2,3,1,4,5,6,7};
+    vector (8, short) mask1 = {0,12,3,4,3,0,10,9};
+    
+    vector (8, short) mask2 = {0,8,1,9,2,10,3,11};
+
+    v2 = __builtin_shuffle (v0, v1,  mask0);
+    shuf2compare (short, 8, v2, v0, v1, mask0);
+ 
+    v2 = __builtin_shuffle (v0, v1,  mask1);
+    shuf2compare (short, 8, v2, v0, v1, mask1);
+    
+    v2 = __builtin_shuffle (v0, v1,  mask2);
+    shuf2compare (short, 8, v2, v0, v1, mask2);
+
+    v2 = __builtin_shuffle (mask0, mask0,  v0);
+    shuf2compare (short, 8, v2, mask0, mask0, v0);
+
+    return 0; 
+}
+
Index: gcc/testsuite/gcc.c-torture/execute/vect-shuffle-4.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/vect-shuffle-4.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/vect-shuffle-4.c	(revision 0)
@@ -0,0 +1,50 @@ 
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define vidx(type, vec, idx) (*(((type *) &(vec)) + idx))
+
+#define shuf2compare(type, count, vres, v0, v1, mask) \
+do { \
+    int __i; \
+    for (__i = 0; __i < count; __i++) { \
+        if (vidx(type, vres, __i) != ((vidx(type, mask, __i) < count) ? \
+                          vidx(type, v0, vidx(type, mask, __i)) :  \
+                          vidx(type, v1, (vidx(type, mask, __i) - count)))) \
+            __builtin_abort (); \
+        } \
+} while (0)
+
+
+vector (8, short) __attribute__ ((noinline))
+f (vector (8, short) x, vector (8, short) y, vector (8, short) mask) {
+    return __builtin_shuffle (x, y, mask);
+}
+
+
+
+int main (int argc, char *argv[]) {
+    vector (8, short) v0 = {argc, 1,2,3,4,5,6,7};
+    vector (8, short) v1 = {argc, 1,argc,3,4,5,argc,7};
+    vector (8, short) v2;
+
+    //vector (8, short) mask = {1,2,5,4,3,6,7};
+    
+    vector (8, short) mask0 = {0,2,3,1,4,5,6,7};
+    vector (8, short) mask1 = {0,12,3,4,3,0,10,9};
+    vector (8, short) mask2 = {0,8,1,9,2,10,3,11};
+
+    v2 = f (v0, v1,  mask0);
+    shuf2compare (short, 8, v2, v0, v1, mask0);
+ 
+    v2 = f (v0, v1,  mask1);
+    shuf2compare (short, 8, v2, v0, v1, mask1);
+
+    v2 = f (v0, v1,  mask2);
+    shuf2compare (short, 8, v2, v0, v1, mask2);
+
+    v2 = f (mask0, mask0,  v0);
+    shuf2compare (short, 8, v2, mask0, mask0, v0);
+
+    return 0; 
+}
+
Index: gcc/testsuite/gcc.c-torture/execute/vect-shuffle-1.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/vect-shuffle-1.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/vect-shuffle-1.c	(revision 0)
@@ -0,0 +1,46 @@ 
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define vidx(type, vec, idx) (*(((type *) &(vec)) + idx))
+
+#define shufcompare(type, count, vres, v0, mask) \
+do { \
+    int __i; \
+    for (__i = 0; __i < count; __i++) { \
+        if (vidx(type, vres, __i) != vidx(type, v0, vidx(type, mask, __i))) \
+            __builtin_abort (); \
+    } \
+} while (0)
+
+
+int main (int argc, char *argv[]) {
+    /*vector (8, short) v0 = {argc, 1,2,3,4,5,6,7};
+    vector (8, short) v1 = {argc, 1,argc,3,4,5,argc,7};
+    vector (8, short) v2;
+   
+    vector (8, short) smask = {0,0,1,2,3,4,5,6};
+    
+    v2 = __builtin_shuffle (v0,  smask);
+    shufcompare (short, 8, v2, v0, smask);
+    v2 = __builtin_shuffle (v0, v1);
+    shufcompare (short, 8, v2, v0, v1);
+    v2 = __builtin_shuffle (smask, v0);
+    shufcompare (short, 8, v2, smask, v0);*/
+
+    vector (4, int) i0 = {argc, 1,2,3};
+    vector (4, int) i1 = {argc, 1, argc, 3};
+    vector (4, int) i2;
+
+    vector (4, int) imask = {0,3,2,1};
+
+    /*i2 = __builtin_shuffle (i0, imask);
+    shufcompare (int, 4, i2, i0, imask);*/
+    i2 = __builtin_shuffle (i0, i1);
+    shufcompare (int, 4, i2, i0, i1);
+    
+    i2 = __builtin_shuffle (imask, i0);
+    shufcompare (int, 4, i2, imask, i0);
+    
+    return 0;
+}
+
Index: gcc/testsuite/gcc.c-torture/execute/vect-shuffle-3.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/vect-shuffle-3.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/vect-shuffle-3.c	(revision 0)
@@ -0,0 +1,36 @@ 
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define vidx(type, vec, idx) (*(((type *) &(vec)) + idx))
+
+#define shufcompare(type, count, vres, v0, mask) \
+do { \
+    int __i; \
+    for (__i = 0; __i < count; __i++) { \
+        if (vidx(type, vres, __i) != vidx(type, v0, vidx(type, mask, __i))) \
+            __builtin_abort (); \
+    } \
+} while (0)
+
+vector (8, short) __attribute__ ((noinline))
+f (vector (8, short) x, vector (8, short) mask) {
+    return __builtin_shuffle (x, mask);
+}
+
+
+int main (int argc, char *argv[]) {
+    vector (8, short) v0 = {argc, 1,2,3,4,5,6,7};
+    vector (8, short) v1 = {argc, 1,argc,3,4,5,argc,7};
+    vector (8, short) v2;
+
+    vector (8, short) mask = {0,0,1,2,3,4,5,6};
+    
+    v2 = f (v0,  mask);
+    shufcompare (short, 8, v2, v0, mask);
+
+    v2 = f (v0, v1);
+    shufcompare (short, 8, v2, v0, v1);
+
+    return 0;
+}
+
Index: gcc/testsuite/gcc.c-torture/execute/vect-shuffle-5.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/vect-shuffle-5.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/vect-shuffle-5.c	(revision 0)
@@ -0,0 +1,64 @@ 
+/* Test that different type variants are compatible within
+   vector shuffling.  */
+
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define shufcompare(count, vres, v0, mask) \
+do { \
+    int __i; \
+    for (__i = 0; __i < count; __i++) { \
+        if (vres[__i] != v0[mask[__i]]) \
+            __builtin_abort (); \
+    } \
+} while (0)
+
+#define test_compat_mask(res, vec, mask) \
+  res = __builtin_shuffle (vec, mask); \
+  shufcompare(4, res, vec, mask); \
+  res = __builtin_shuffle (vec, c ## mask); \
+  shufcompare(4, res, vec, c ##  mask); \
+  res = __builtin_shuffle (vec, r ## mask); \
+  shufcompare(4, res, vec, r ##  mask); \
+  res = __builtin_shuffle (vec, d ## mask); \
+  shufcompare(4, res, vec, d ##  mask); \
+  res = __builtin_shuffle (vec, dc ## mask); \
+  shufcompare(4, res, vec, dc ##  mask); \
+
+#define test_compat_vec(res, vec, mask) \
+  test_compat_mask (res, vec, mask); \
+  test_compat_mask (res, c ## vec, mask); \
+  test_compat_mask (res, r ## vec, mask); \
+  test_compat_mask (res, d ## vec, mask); \
+  test_compat_mask (res, dc ## vec, mask); 
+
+#define test_compat(res, vec, mask) \
+  test_compat_vec (res, vec, mask); \
+  test_compat_vec (d ## res, vec, mask); \
+  test_compat_vec (r ## res, vec, mask);
+
+typedef vector (4, int) v4si;
+typedef const vector (4, int) v4sicst;
+
+int main (int argc, char *argv[]) {
+    vector (4, int) vec = {argc, 1,2,3};
+    const vector (4, int) cvec = {argc, 1,2,3};
+    register vector (4, int) rvec = {argc, 1,2,3};
+    v4si dvec = {argc, 1,2,3};
+    v4sicst dcvec = {argc, 1,2,3};
+    
+    vector (4, int) res; 
+    v4si dres;
+    register vector (4, int) rres;
+
+    vector (4, int) mask = {0,3,2,1};
+    const vector (4, int) cmask = {0,3,2,1};
+    register vector (4, int) rmask = {0,3,2,1};
+    v4si dmask = {0,3,2,1};
+    v4sicst dcmask = {0,3,2,1};
+
+    test_compat (res, vec, mask);
+
+    return 0;
+}
+
Index: gcc/testsuite/gcc.dg/builtin-complex-err-1.c
===================================================================
--- gcc/testsuite/gcc.dg/builtin-complex-err-1.c	(revision 178354)
+++ gcc/testsuite/gcc.dg/builtin-complex-err-1.c	(working copy)
@@ -19,8 +19,8 @@  _Complex float fc3 = __builtin_complex (
 void
 f (void)
 {
-  __builtin_complex (0.0); /* { dg-error "expected" } */
-  __builtin_complex (0.0, 0.0, 0.0); /* { dg-error "expected" } */
+  __builtin_complex (0.0); /* { dg-error "wrong number of arguments" } */
+  __builtin_complex (0.0, 0.0, 0.0); /* { dg-error "wrong number of arguments" } */
 }
 
-void (*p) (void) = __builtin_complex; /* { dg-error "expected" } */
+void (*p) (void) = __builtin_complex; /* { dg-error "cannot take address" } */
Index: gcc/c-tree.h
===================================================================
--- gcc/c-tree.h	(revision 178354)
+++ gcc/c-tree.h	(working copy)
@@ -579,6 +579,7 @@  extern tree c_begin_omp_task (void);
 extern tree c_finish_omp_task (location_t, tree, tree);
 extern tree c_finish_omp_clauses (tree);
 extern tree c_build_va_arg (location_t, tree, tree);
+extern tree c_build_vec_shuffle_expr (location_t, tree, tree, tree);
 
 /* Set to 0 at beginning of a function definition, set to 1 if
    a return statement that specifies a return value is seen.  */
Index: gcc/expr.c
===================================================================
--- gcc/expr.c	(revision 178354)
+++ gcc/expr.c	(working copy)
@@ -8605,6 +8605,10 @@  expand_expr_real_2 (sepops ops, rtx targ
     case VEC_PACK_FIX_TRUNC_EXPR:
       mode = TYPE_MODE (TREE_TYPE (treeop0));
       goto binop;
+    
+    case VEC_SHUFFLE_EXPR:
+      target = expand_vec_shuffle_expr (type, treeop0, treeop1, treeop2, target);
+      return target;
 
     case DOT_PROD_EXPR:
       {
Index: gcc/gimple-pretty-print.c
===================================================================
--- gcc/gimple-pretty-print.c	(revision 178354)
+++ gcc/gimple-pretty-print.c	(working copy)
@@ -417,6 +417,16 @@  dump_ternary_rhs (pretty_printer *buffer
       dump_generic_node (buffer, gimple_assign_rhs3 (gs), spc, flags, false);
       pp_string (buffer, ">");
       break;
+    
+    case VEC_SHUFFLE_EXPR:
+      pp_string (buffer, "VEC_SHUFFLE_EXPR <");
+      dump_generic_node (buffer, gimple_assign_rhs1 (gs), spc, flags, false);
+      pp_string (buffer, ", ");
+      dump_generic_node (buffer, gimple_assign_rhs2 (gs), spc, flags, false);
+      pp_string (buffer, ", ");
+      dump_generic_node (buffer, gimple_assign_rhs3 (gs), spc, flags, false);
+      pp_string (buffer, ">");
+      break;
 
     case REALIGN_LOAD_EXPR:
       pp_string (buffer, "REALIGN_LOAD <");
Index: gcc/c-typeck.c
===================================================================
--- gcc/c-typeck.c	(revision 178354)
+++ gcc/c-typeck.c	(working copy)
@@ -2307,7 +2307,7 @@  build_array_ref (location_t loc, tree ar
       if (TREE_CODE (TREE_TYPE (index)) != ARRAY_TYPE
 	  && TREE_CODE (TREE_TYPE (index)) != POINTER_TYPE)
 	{
-          error_at (loc, 
+          error_at (loc,
             "subscripted value is neither array nor pointer nor vector");
 
 	  return error_mark_node;
@@ -2339,8 +2339,8 @@  build_array_ref (location_t loc, tree ar
   index = default_conversion (index);
 
   gcc_assert (TREE_CODE (TREE_TYPE (index)) == INTEGER_TYPE);
-  
-  /* For vector[index], convert the vector to a 
+
+  /* For vector[index], convert the vector to a
      pointer of the underlying type.  */
   if (TREE_CODE (TREE_TYPE (array)) == VECTOR_TYPE)
     {
@@ -2348,11 +2348,11 @@  build_array_ref (location_t loc, tree ar
       tree type1;
 
       if (TREE_CODE (index) == INTEGER_CST)
-        if (!host_integerp (index, 1) 
-            || ((unsigned HOST_WIDE_INT) tree_low_cst (index, 1) 
+        if (!host_integerp (index, 1)
+            || ((unsigned HOST_WIDE_INT) tree_low_cst (index, 1)
                >= TYPE_VECTOR_SUBPARTS (TREE_TYPE (array))))
           warning_at (loc, OPT_Warray_bounds, "index value is out of bound");
-     
+
       c_common_mark_addressable_vec (array);
       type = build_qualified_type (TREE_TYPE (type), TYPE_QUALS (type));
       type = build_pointer_type (type);
@@ -2845,6 +2845,99 @@  build_function_call_vec (location_t loc,
     }
   return require_complete_type (result);
 }
+
+/* Build a VEC_SHUFFLE_EXPR if V0, V1 and MASK are not error_mark_nodes
+   and have vector types, V0 has the same type as V1, and the number of
+   elements of V0, V1, MASK is the same.
+
+   In case V1 is a NULL_TREE it is assumed that __builtin_shuffle was
+   called with two arguments.  In this case implementation passes the
+   first argument twice in order to share the same tree code.  This fact
+   could enable the mask-values being twice the vector length.  This is
+   an implementation accident and this semantics is not guaranteed to
+   the user.  */
+tree
+c_build_vec_shuffle_expr (location_t loc, tree v0, tree v1, tree mask)
+{
+  tree vec_shuffle, tmp;
+  bool wrap = true;
+  bool maybe_const = false;
+  bool two_arguments;
+
+  if (v1 == NULL_TREE)
+    {
+      two_arguments = true;
+      v1 = v0;
+    }
+
+  if (v0 == error_mark_node || v1 == error_mark_node
+      || mask == error_mark_node)
+    return error_mark_node;
+
+  if (TREE_CODE (TREE_TYPE (mask)) != VECTOR_TYPE
+      || TREE_CODE (TREE_TYPE (TREE_TYPE (mask))) != INTEGER_TYPE)
+    {
+      error_at (loc, "__builtin_shuffle last argument must "
+		     "be an integer vector");
+      return error_mark_node;
+    }
+
+  if (TREE_CODE (TREE_TYPE (v0)) != VECTOR_TYPE
+      || TREE_CODE (TREE_TYPE (v1)) != VECTOR_TYPE)
+    {
+      error_at (loc, "__builtin_shuffle arguments must be vectors");
+      return error_mark_node;
+    }
+
+  if (TYPE_MAIN_VARIANT (TREE_TYPE (v0)) != TYPE_MAIN_VARIANT (TREE_TYPE (v1)))
+    {
+      error_at (loc, "__builtin_shuffle argument vectors must be of "
+		     "the same type");
+      return error_mark_node;
+    }
+
+  if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (v0))
+      != TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask))
+      && TYPE_VECTOR_SUBPARTS (TREE_TYPE (v1))
+	 != TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)))
+    {
+      error_at (loc, "__builtin_shuffle number of elements of the "
+		     "argument vector(s) and the mask vector should "
+		     "be the same");
+      return error_mark_node;
+    }
+
+  if (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (v0))))
+      != GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (mask)))))
+    {
+      error_at (loc, "__builtin_shuffle argument vector(s) inner type "
+		     "must have the same size as inner type of the mask");
+      return error_mark_node;
+    }
+
+  /* Avoid C_MAYBE_CONST_EXPRs inside VEC_SHUFFLE_EXPR.  */
+  tmp = c_fully_fold (v0, false, &maybe_const);
+  v0 = save_expr (tmp);
+  wrap &= maybe_const;
+
+  if (!two_arguments)
+    {
+      v1 = c_fully_fold (v1, false, &maybe_const);
+      wrap &= maybe_const;
+    }
+  else
+    v1 = v0;
+
+  mask = c_fully_fold (mask, false, &maybe_const);
+  wrap &= maybe_const;
+
+  vec_shuffle = build3 (VEC_SHUFFLE_EXPR, TREE_TYPE (v0), v0, v1, mask);
+
+  if (!wrap)
+    vec_shuffle = c_wrap_maybe_const (vec_shuffle, true);
+
+  return vec_shuffle;
+}
 
 /* Convert the argument expressions in the vector VALUES
    to the types in the list TYPELIST.
@@ -3167,7 +3260,7 @@  convert_arguments (tree typelist, VEC(tr
 
   if (typetail != 0 && TREE_VALUE (typetail) != void_type_node)
     {
-      error_at (input_location, 
+      error_at (input_location,
 		"too few arguments to function %qE", function);
       if (fundecl && !DECL_BUILT_IN (fundecl))
 	inform (DECL_SOURCE_LOCATION (fundecl), "declared here");
@@ -3566,7 +3659,7 @@  build_unary_op (location_t location,
 
       /* Complain about anything that is not a true lvalue.  In
 	 Objective-C, skip this check for property_refs.  */
-      if (!objc_is_property_ref (arg) 
+      if (!objc_is_property_ref (arg)
 	  && !lvalue_or_else (location,
 			      arg, ((code == PREINCREMENT_EXPR
 				     || code == POSTINCREMENT_EXPR)
@@ -3683,7 +3776,7 @@  build_unary_op (location_t location,
 	   need to ask Objective-C to build the increment or decrement
 	   expression for it.  */
 	if (objc_is_property_ref (arg))
-	  return objc_build_incr_expr_for_property_ref (location, code, 
+	  return objc_build_incr_expr_for_property_ref (location, code,
 							arg, inc);
 
 	/* Report a read-only lvalue.  */
@@ -5926,7 +6019,7 @@  void
 pedwarn_init (location_t location, int opt, const char *gmsgid)
 {
   char *ofwhat;
-  
+
   /* The gmsgid may be a format string with %< and %>. */
   pedwarn (location, opt, gmsgid);
   ofwhat = print_spelling ((char *) alloca (spelling_length () + 1));
@@ -9344,8 +9437,8 @@  scalar_to_vector (location_t loc, enum t
   tree type1 = TREE_TYPE (op1);
   bool integer_only_op = false;
   enum stv_conv ret = stv_firstarg;
-  
-  gcc_assert (TREE_CODE (type0) == VECTOR_TYPE 
+
+  gcc_assert (TREE_CODE (type0) == VECTOR_TYPE
 	      || TREE_CODE (type1) == VECTOR_TYPE);
   switch (code)
     {
@@ -9370,7 +9463,7 @@  scalar_to_vector (location_t loc, enum t
       case BIT_AND_EXPR:
 	integer_only_op = true;
 	/* ... fall through ...  */
-      
+
       case PLUS_EXPR:
       case MINUS_EXPR:
       case MULT_EXPR:
@@ -9387,7 +9480,7 @@  scalar_to_vector (location_t loc, enum t
 	  }
 
 	if (TREE_CODE (type0) == INTEGER_TYPE
-	    && TREE_CODE (TREE_TYPE (type1)) == INTEGER_TYPE) 
+	    && TREE_CODE (TREE_TYPE (type1)) == INTEGER_TYPE)
 	  {
 	    if (unsafe_conversion_p (TREE_TYPE (type1), op0, false))
 	      {
@@ -9399,7 +9492,7 @@  scalar_to_vector (location_t loc, enum t
 	  }
 	else if (!integer_only_op
 		    /* Allow integer --> real conversion if safe.  */
-		 && (TREE_CODE (type0) == REAL_TYPE 
+		 && (TREE_CODE (type0) == REAL_TYPE
 		     || TREE_CODE (type0) == INTEGER_TYPE)
 		 && SCALAR_FLOAT_TYPE_P (TREE_TYPE (type1)))
 	  {
@@ -9414,7 +9507,7 @@  scalar_to_vector (location_t loc, enum t
       default:
 	break;
     }
- 
+
   return stv_nothing;
 }
 
@@ -9529,8 +9622,8 @@  build_binary_op (location_t location, en
     int_const = int_const_or_overflow = false;
 
   /* Do not apply default conversion in mixed vector/scalar expression.  */
-  if (convert_p 
-      && !((TREE_CODE (TREE_TYPE (op0)) == VECTOR_TYPE) 
+  if (convert_p
+      && !((TREE_CODE (TREE_TYPE (op0)) == VECTOR_TYPE)
 	   != (TREE_CODE (TREE_TYPE (op1)) == VECTOR_TYPE)))
     {
       op0 = default_conversion (op0);
@@ -9608,7 +9701,7 @@  build_binary_op (location_t location, en
   if ((code0 == VECTOR_TYPE) != (code1 == VECTOR_TYPE))
     {
       enum stv_conv convert_flag = scalar_to_vector (location, code, op0, op1);
-      
+
       switch (convert_flag)
 	{
 	  case stv_error:
@@ -9949,7 +10042,7 @@  build_binary_op (location_t location, en
 	    {
 	      if (code == EQ_EXPR)
 		warning_at (location,
-			    OPT_Waddress, 
+			    OPT_Waddress,
 			    "the comparison will always evaluate as %<false%> "
 			    "for the address of %qD will never be NULL",
 			    TREE_OPERAND (op1, 0));
Index: gcc/gimplify.c
===================================================================
--- gcc/gimplify.c	(revision 178354)
+++ gcc/gimplify.c	(working copy)
@@ -7286,6 +7286,7 @@  gimplify_expr (tree *expr_p, gimple_seq
 	  }
 
 	case FMA_EXPR:
+	case VEC_SHUFFLE_EXPR:
 	  /* Classified as tcc_expression.  */
 	  goto expr_3;
 
Index: gcc/tree.def
===================================================================
--- gcc/tree.def	(revision 178354)
+++ gcc/tree.def	(working copy)
@@ -497,6 +497,19 @@  DEFTREECODE (COND_EXPR, "cond_expr", tcc
 */
 DEFTREECODE (VEC_COND_EXPR, "vec_cond_expr", tcc_expression, 3)
 
+/* Vector shuffle expression.  A = VEC_SHUFFLE_EXPR<v0, v1, mask>
+   means
+
+   foreach i in length (mask):
+     A = mask[i] < length (v0) ? v0[mask[i]] : v1[mask[i] - length (mask)]
+
+   V0 and V1 are vectors of the same type.  MASK is an integer-typed
+   vector.  The number of MASK elements must be the same with the
+   number of elements in V0 and V1.  The size of the inner type
+   of the MASK and of the V0 and V1 must be the same.
+*/
+DEFTREECODE (VEC_SHUFFLE_EXPR, "vec_shuffle_expr", tcc_expression, 3)
+
 /* Declare local variables, including making RTL and allocating space.
    BIND_EXPR_VARS is a chain of VAR_DECL nodes for the variables.
    BIND_EXPR_BODY is the body, the expression to be computed using
Index: gcc/tree-inline.c
===================================================================
--- gcc/tree-inline.c	(revision 178354)
+++ gcc/tree-inline.c	(working copy)
@@ -3285,6 +3285,7 @@  estimate_operator_cost (enum tree_code c
        ??? We may consider mapping RTL costs to this.  */
     case COND_EXPR:
     case VEC_COND_EXPR:
+    case VEC_SHUFFLE_EXPR:
 
     case PLUS_EXPR:
     case POINTER_PLUS_EXPR:
Index: gcc/tree-vect-generic.c
===================================================================
--- gcc/tree-vect-generic.c	(revision 178354)
+++ gcc/tree-vect-generic.c	(working copy)
@@ -30,6 +30,7 @@  along with GCC; see the file COPYING3.
 #include "tree-pass.h"
 #include "flags.h"
 #include "ggc.h"
+#include "diagnostic.h"
 
 /* Need to include rtl.h, expr.h, etc. for optabs.  */
 #include "expr.h"
@@ -326,10 +327,10 @@  uniform_vector_p (tree vec)
         }
       if (i != TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec)))
 	return NULL_TREE;
-      
+
       return first;
     }
-  
+
   return NULL_TREE;
 }
 
@@ -432,6 +433,263 @@  type_for_widest_vector_mode (enum machin
     }
 }
 
+
+/* Build a reference to the element of the vector VECT.  Function
+   returns either the element itself, either BIT_FIELD_REF, or an
+   ARRAY_REF expression.
+
+   GSI is requred to insert temporary variables while building a
+   refernece to the element of the vector VECT.
+
+   PTMPVEC is a pointer to the temporary variable for caching
+   purposes.  In case when PTMPVEC is NULL new temporary variable
+   will be created.  */
+static tree
+vector_element (gimple_stmt_iterator *gsi, tree vect, tree idx, tree *ptmpvec)
+{
+  tree type;
+  gimple asgn;
+  tree tmpvec;
+  tree arraytype;
+  bool need_asgn = true;
+
+  gcc_assert (TREE_CODE (TREE_TYPE (vect)) == VECTOR_TYPE);
+
+  type = TREE_TYPE (vect);
+  if (TREE_CODE (idx) == INTEGER_CST)
+    {
+      unsigned HOST_WIDE_INT index;
+
+      if (!host_integerp (idx, 1)
+           || (index = tree_low_cst (idx, 1)) > TYPE_VECTOR_SUBPARTS (type)-1)
+        return error_mark_node;
+
+      if (TREE_CODE (vect) == VECTOR_CST)
+        {
+	  unsigned i;
+	  tree vals = TREE_VECTOR_CST_ELTS (vect);
+	  for (i = 0; vals; vals = TREE_CHAIN (vals), ++i)
+	    if (i == index)
+	       return TREE_VALUE (vals);
+	  return error_mark_node;
+        }
+      else if (TREE_CODE (vect) == CONSTRUCTOR)
+        {
+          unsigned i;
+          VEC (constructor_elt, gc) *vals = CONSTRUCTOR_ELTS (vect);
+          constructor_elt *elt;
+
+          for (i = 0; VEC_iterate (constructor_elt, vals, i, elt); i++)
+            if (operand_equal_p (elt->index, idx, 0))
+              return elt->value;
+          return fold_convert (TREE_TYPE (type), integer_zero_node);
+        }
+      else if (TREE_CODE (vect) == SSA_NAME)
+        {
+	  tree size = TYPE_SIZE (TREE_TYPE (type));
+          tree pos = fold_build2 (MULT_EXPR, TREE_TYPE (idx), idx, size);
+          return fold_build3 (BIT_FIELD_REF, TREE_TYPE (type), vect, size, pos);
+        }
+      else
+        return error_mark_node;
+    }
+
+  if (!ptmpvec)
+    tmpvec = create_tmp_var (TREE_TYPE (vect), "vectmp");
+  else if (!*ptmpvec)
+    tmpvec = *ptmpvec = create_tmp_var (TREE_TYPE (vect), "vectmp");
+  else
+    {
+      tmpvec = *ptmpvec;
+      need_asgn = false;
+    }
+
+  if (need_asgn)
+    {
+      TREE_ADDRESSABLE (tmpvec) = 1;
+      asgn = gimple_build_assign (tmpvec, vect);
+      gsi_insert_before (gsi, asgn, GSI_SAME_STMT);
+    }
+
+  arraytype = build_array_type_nelts (TREE_TYPE (type),
+				      TYPE_VECTOR_SUBPARTS (TREE_TYPE (vect)));
+
+  return build4 (ARRAY_REF, TREE_TYPE (type),
+                 build1 (VIEW_CONVERT_EXPR, arraytype, tmpvec),
+                 idx, NULL_TREE, NULL_TREE);
+}
+
+/* Check if VEC_SHUFFLE_EXPR within the given setting is supported
+   by hardware, or lower it piecewise.  Function returns false when
+   the expression must be replaced with TRAP_RETURN, true otherwise.
+
+   When VEC_SHUFFLE_EXPR has the same first and second operands:
+   VEC_SHUFFLE_EXPR <v0, v0, mask> the lowered version would be
+   {v0[mask[0]], v0[mask[1]], ...}
+   MASK and V0 must have the same number of elements.
+
+   Otherwise VEC_SHUFFLE_EXPR <v0, v1, mask> is lowered to
+   {mask[0] < len(v0) ? v0[mask[0]] : v1[mask[0]], ...}
+   V0 and V1 must have the same type.  MASK, V0, V1 must have the
+   same number of arguments.  */
+static bool
+lower_vec_shuffle (gimple_stmt_iterator *gsi, location_t loc)
+{
+
+  gimple stmt = gsi_stmt (*gsi);
+  tree mask = gimple_assign_rhs3 (stmt);
+  tree vec0 = gimple_assign_rhs1 (stmt);
+  tree vec1 = gimple_assign_rhs2 (stmt);
+  unsigned els = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask));
+  tree type0 = TREE_TYPE (TREE_TYPE (vec0));
+  VEC(constructor_elt,gc) *v = NULL;
+  tree vectype, constr;
+  tree vec0tmp = NULL_TREE, masktmp = NULL_TREE;
+
+  if (expand_vec_shuffle_expr_p (TYPE_MODE (TREE_TYPE (vec0)), vec0, vec1, mask))
+    {
+      tree t;
+
+      t = gimplify_build3 (gsi, VEC_SHUFFLE_EXPR, TREE_TYPE (vec0),
+			   vec0, vec1, mask);
+      gimple_assign_set_rhs_from_tree (gsi, t);
+      /* Statement should be updated by callee.  */
+      return true;
+    }
+
+  if (operand_equal_p (vec0, vec1, 0))
+    {
+      unsigned i;
+      tree vec0tmp = NULL_TREE;
+
+      v = VEC_alloc (constructor_elt, gc, els);
+      for (i = 0; i < els; i++)
+        {
+          tree idxval, vecel, t;
+
+	  idxval = vector_element (gsi, mask, size_int (i), &masktmp);
+          if (idxval == error_mark_node)
+            {
+              if (warning_at (loc, 0, "Invalid shuffling mask index %i", i))
+		inform (loc, "if this code is reached the programm will abort");
+	      return false;
+            }
+
+	  vecel = vector_element (gsi, vec0, idxval, &vec0tmp);
+          if (vecel == error_mark_node)
+            {
+              if (warning_at (loc, 0, "Invalid shuffling arguments"))
+		inform (loc, "if this code is reached the programm will abort");
+	      return false;
+            }
+
+          t = force_gimple_operand_gsi (gsi, vecel, true,
+					NULL_TREE, true, GSI_SAME_STMT);
+          CONSTRUCTOR_APPEND_ELT (v, size_int (i), t);
+        }
+    }
+  else
+    {
+      unsigned i;
+      tree var = create_tmp_var (type0, "vecel");
+      tree vec1tmp = NULL_TREE;
+
+      v = VEC_alloc (constructor_elt, gc, els);
+      for (i = 0; i < els; i++)
+        {
+          tree idxval, idx1val, cond, elval0, elval1, condexpr, t, ssatmp;
+          tree vec0el, vec1el;
+          gimple asgn;
+
+          idxval = vector_element (gsi, mask, size_int (i), &masktmp);
+	  if (idxval == error_mark_node)
+            {
+              if (warning_at (loc, 0, "Invalid shuffling mask index %i", i))
+		inform (loc, "if this code is reached the programm will abort");
+	      return false;
+            }
+
+          if (TREE_CODE (idxval) == INTEGER_CST)
+            {
+              if (tree_int_cst_lt (idxval, size_int (els)))
+                {
+                  vec0el = vector_element (gsi, vec0, idxval, &vec0tmp);
+                  t = force_gimple_operand_gsi (gsi, vec0el,
+                                    true, NULL_TREE, true, GSI_SAME_STMT);
+                }
+              else if (tree_int_cst_lt (idxval, size_int (2*els)))
+                {
+                  idx1val = fold_build2 (MINUS_EXPR, TREE_TYPE (idxval),
+                        idxval, build_int_cst (TREE_TYPE (idxval), els));
+
+                  vec1el = vector_element (gsi, vec1, idx1val, &vec1tmp);
+                  t = force_gimple_operand_gsi (gsi, vec1el, true,
+						NULL_TREE, true, GSI_SAME_STMT);
+                }
+              else
+                {
+                  if (warning_at (loc, 0, "Invalid shuffling mask index %i", i))
+		    inform (loc, "if this code is reached the "
+				  "programm will abort");
+		  return false;
+                }
+            }
+          else
+            {
+
+              idx1val = fold_build2 (MINUS_EXPR, TREE_TYPE (idxval),
+                            idxval, build_int_cst (TREE_TYPE (idxval), els));
+              idx1val = force_gimple_operand_gsi (gsi, idx1val,
+                                true, NULL_TREE, true, GSI_SAME_STMT);
+              cond = fold_build2 (GT_EXPR, boolean_type_node, \
+                             idxval, fold_convert (type0, size_int (els - 1)));
+
+	      vec0el = vector_element (gsi, vec0, idxval, &vec0tmp);
+              if (vec0el == error_mark_node)
+                {
+                  if (warning_at (loc, 0, "Invalid shuffling arguments"))
+		    inform (loc, "if this code is reached the "
+				 "programm will abort");
+		  return false;
+                }
+
+              elval0 = force_gimple_operand_gsi (gsi, vec0el,
+                                true, NULL_TREE, true, GSI_SAME_STMT);
+
+	      vec1el = vector_element (gsi, vec1, idx1val, &vec1tmp);
+              if (vec1el == error_mark_node)
+                {
+                  if (warning_at (loc, 0, "Invalid shuffling arguments"))
+		    inform (loc, "if this code is reached the "
+				 "programm will abort");
+		  return false;
+                }
+
+              elval1 = force_gimple_operand_gsi (gsi, vec1el,
+                                true, NULL_TREE, true, GSI_SAME_STMT);
+
+              condexpr = fold_build3 (COND_EXPR, type0, cond, \
+                                      elval1, elval0);
+
+              t = force_gimple_operand_gsi (gsi, condexpr, true, \
+                                        NULL_TREE, true, GSI_SAME_STMT);
+            }
+
+          asgn = gimple_build_assign (var, t);
+          ssatmp = make_ssa_name (var, asgn);
+          gimple_assign_set_lhs (asgn, ssatmp);
+          gsi_insert_before (gsi, asgn, GSI_SAME_STMT);
+          CONSTRUCTOR_APPEND_ELT (v, size_int (i), ssatmp);
+        }
+    }
+
+  vectype = build_vector_type (type0, els);
+  constr = build_constructor (vectype, v);
+  gimple_assign_set_rhs_from_tree (gsi, constr);
+  /* Statement should be updated by callee.  */
+  return true;
+}
+
 /* Process one statement.  If we identify a vector operation, expand it.  */
 
 static void
@@ -451,6 +709,25 @@  expand_vector_operations_1 (gimple_stmt_
   code = gimple_assign_rhs_code (stmt);
   rhs_class = get_gimple_rhs_class (code);
 
+  if (code == VEC_SHUFFLE_EXPR)
+    {
+      if (!lower_vec_shuffle (gsi, gimple_location (stmt)))
+	{
+	  gimple new_stmt;
+	  tree vec0;
+
+	  vec0 = gimple_assign_rhs1 (stmt);
+	  new_stmt = gimple_build_call (built_in_decls[BUILT_IN_TRAP], 0);
+	  gsi_insert_before (gsi, new_stmt,  GSI_SAME_STMT);
+	  split_block (gimple_bb (new_stmt), new_stmt);
+	  new_stmt = gimple_build_assign (gimple_assign_lhs (stmt), vec0);
+	  gsi_replace (gsi, new_stmt, false);
+	}
+
+      gimple_set_modified (gsi_stmt (*gsi), true);
+      update_stmt (gsi_stmt (*gsi));
+    }
+
   if (rhs_class != GIMPLE_UNARY_RHS && rhs_class != GIMPLE_BINARY_RHS)
     return;
 
@@ -485,9 +762,9 @@  expand_vector_operations_1 (gimple_stmt_
     {
       bool vector_scalar_shift;
       op = optab_for_tree_code (code, type, optab_scalar);
-      
+
       /* Vector/Scalar shift is supported.  */
-      vector_scalar_shift = (op && (optab_handler (op, TYPE_MODE (type)) 
+      vector_scalar_shift = (op && (optab_handler (op, TYPE_MODE (type))
 				    != CODE_FOR_nothing));
 
       /* If the 2nd argument is vector, we need a vector/vector shift.
@@ -500,10 +777,10 @@  expand_vector_operations_1 (gimple_stmt_
           /* Check whether we have vector <op> {x,x,x,x} where x
              could be a scalar variable or a constant. Transform
              vector <op> {x,x,x,x} ==> vector <op> scalar.  */
-          if (vector_scalar_shift 
+          if (vector_scalar_shift
               && ((TREE_CODE (rhs2) == VECTOR_CST
 		   && (first = uniform_vector_p (rhs2)) != NULL_TREE)
-		  || (TREE_CODE (rhs2) == SSA_NAME 
+		  || (TREE_CODE (rhs2) == SSA_NAME
 		      && (def_stmt = SSA_NAME_DEF_STMT (rhs2))
 		      && gimple_assign_single_p (def_stmt)
 		      && (first = uniform_vector_p
@@ -516,14 +793,14 @@  expand_vector_operations_1 (gimple_stmt_
           else
             op = optab_for_tree_code (code, type, optab_vector);
         }
-    
+
       /* Try for a vector/scalar shift, and if we don't have one, see if we
          have a vector/vector shift */
       else if (!vector_scalar_shift)
 	{
 	  op = optab_for_tree_code (code, type, optab_vector);
 
-	  if (op && (optab_handler (op, TYPE_MODE (type)) 
+	  if (op && (optab_handler (op, TYPE_MODE (type))
 		     != CODE_FOR_nothing))
 	    {
 	      /* Transform vector <op> scalar => vector <op> {x,x,x,x}.  */
@@ -613,9 +890,9 @@  expand_vector_operations_1 (gimple_stmt_
    if it may need the bit-twiddling tricks implemented in this file.  */
 
 static bool
-gate_expand_vector_operations (void)
+gate_expand_vector_operations_ssa (void)
 {
-  return flag_tree_vectorize != 0;
+  return optimize == 0;
 }
 
 static unsigned int
@@ -648,7 +925,7 @@  struct gimple_opt_pass pass_lower_vector
  {
   GIMPLE_PASS,
   "veclower",				/* name */
-  0,					/* gate */
+  gate_expand_vector_operations_ssa,    /* gate */
   expand_vector_operations,		/* execute */
   NULL,					/* sub */
   NULL,					/* next */
@@ -661,6 +938,7 @@  struct gimple_opt_pass pass_lower_vector
   TODO_update_ssa	                /* todo_flags_finish */
     | TODO_verify_ssa
     | TODO_verify_stmts | TODO_verify_flow
+    | TODO_cleanup_cfg
  }
 };
 
@@ -669,7 +947,7 @@  struct gimple_opt_pass pass_lower_vector
  {
   GIMPLE_PASS,
   "veclower2",				/* name */
-  gate_expand_vector_operations,	/* gate */
+  0,	                                /* gate */
   expand_vector_operations,		/* execute */
   NULL,					/* sub */
   NULL,					/* next */
@@ -682,6 +960,7 @@  struct gimple_opt_pass pass_lower_vector
   TODO_update_ssa	                /* todo_flags_finish */
     | TODO_verify_ssa
     | TODO_verify_stmts | TODO_verify_flow
+    | TODO_cleanup_cfg
  }
 };
 
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in	(revision 178354)
+++ gcc/Makefile.in	(working copy)
@@ -3178,7 +3178,7 @@  tree-vect-generic.o : tree-vect-generic.
     $(TM_H) $(TREE_FLOW_H) $(GIMPLE_H) tree-iterator.h $(TREE_PASS_H) \
     $(FLAGS_H) $(OPTABS_H) $(MACHMODE_H) $(EXPR_H) \
     langhooks.h $(FLAGS_H) $(DIAGNOSTIC_H) gt-tree-vect-generic.h $(GGC_H) \
-    coretypes.h insn-codes.h
+    coretypes.h insn-codes.h $(DIAGNOSTIC_H)
 df-core.o : df-core.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) \
    insn-config.h $(RECOG_H) $(FUNCTION_H) $(REGS_H) alloc-pool.h \
    hard-reg-set.h $(BASIC_BLOCK_H) $(DF_H) $(BITMAP_H) sbitmap.h $(TIMEVAR_H) \
Index: gcc/gimple.c
===================================================================
--- gcc/gimple.c	(revision 178354)
+++ gcc/gimple.c	(working copy)
@@ -2615,6 +2615,7 @@  get_gimple_rhs_num_ops (enum tree_code c
       || (SYM) == WIDEN_MULT_MINUS_EXPR					    \
       || (SYM) == DOT_PROD_EXPR						    \
       || (SYM) == REALIGN_LOAD_EXPR					    \
+      || (SYM) == VEC_SHUFFLE_EXPR					    \
       || (SYM) == FMA_EXPR) ? GIMPLE_TERNARY_RHS			    \
    : ((SYM) == COND_EXPR						    \
       || (SYM) == CONSTRUCTOR						    \
Index: gcc/tree-cfg.c
===================================================================
--- gcc/tree-cfg.c	(revision 178354)
+++ gcc/tree-cfg.c	(working copy)
@@ -3711,6 +3711,59 @@  verify_gimple_assign_ternary (gimple stm
 	}
       break;
 
+    case VEC_SHUFFLE_EXPR:
+      if (!useless_type_conversion_p (lhs_type, rhs1_type)
+	  || !useless_type_conversion_p (lhs_type, rhs2_type))
+	{
+	  error ("type mismatch in vector shuffle expression");
+	  debug_generic_expr (lhs_type);
+	  debug_generic_expr (rhs1_type);
+	  debug_generic_expr (rhs2_type);
+	  debug_generic_expr (rhs3_type);
+	  return true;
+	}
+
+      if (TREE_CODE (rhs1_type) != VECTOR_TYPE
+	  || TREE_CODE (rhs2_type) != VECTOR_TYPE
+	  || TREE_CODE (rhs3_type) != VECTOR_TYPE)
+	{
+	  error ("vector types expected in vector shuffle expression");
+	  debug_generic_expr (lhs_type);
+	  debug_generic_expr (rhs1_type);
+	  debug_generic_expr (rhs2_type);
+	  debug_generic_expr (rhs3_type);
+	  return true;
+	}
+
+      if (TYPE_VECTOR_SUBPARTS (rhs1_type) != TYPE_VECTOR_SUBPARTS (rhs2_type)
+	  || TYPE_VECTOR_SUBPARTS (rhs2_type)
+	     != TYPE_VECTOR_SUBPARTS (rhs3_type)
+	  || TYPE_VECTOR_SUBPARTS (rhs3_type)
+	     != TYPE_VECTOR_SUBPARTS (lhs_type))
+	{
+	  error ("vectors with different element number found "
+		 "in vector shuffle expression");
+	  debug_generic_expr (lhs_type);
+	  debug_generic_expr (rhs1_type);
+	  debug_generic_expr (rhs2_type);
+	  debug_generic_expr (rhs3_type);
+	  return true;
+	}
+
+      if (TREE_CODE (TREE_TYPE (rhs3_type)) != INTEGER_TYPE
+	  || GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs3_type)))
+	     != GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs1_type))))
+	{
+	  error ("invalid mask type in vector shuffle expression");
+	  debug_generic_expr (lhs_type);
+	  debug_generic_expr (rhs1_type);
+	  debug_generic_expr (rhs2_type);
+	  debug_generic_expr (rhs3_type);
+	  return true;
+	}
+
+      return false;
+
     case DOT_PROD_EXPR:
     case REALIGN_LOAD_EXPR:
       /* FIXME.  */
Index: gcc/passes.c
===================================================================
--- gcc/passes.c	(revision 178354)
+++ gcc/passes.c	(working copy)
@@ -1354,7 +1354,6 @@  init_optimization_passes (void)
 	  NEXT_PASS (pass_vectorize);
 	    {
 	      struct opt_pass **p = &pass_vectorize.pass.sub;
-	      NEXT_PASS (pass_lower_vector_ssa);
 	      NEXT_PASS (pass_dce_loop);
 	    }
           NEXT_PASS (pass_predcom);
@@ -1366,6 +1365,7 @@  init_optimization_passes (void)
 	  NEXT_PASS (pass_lim);
 	  NEXT_PASS (pass_tree_loop_done);
 	}
+      NEXT_PASS (pass_lower_vector_ssa);
       NEXT_PASS (pass_cse_reciprocals);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_vrp);
Index: gcc/c-parser.c
===================================================================
--- gcc/c-parser.c	(revision 178354)
+++ gcc/c-parser.c	(working copy)
@@ -5989,6 +5989,46 @@  c_parser_alignof_expression (c_parser *p
     }
 }
 
+/* Helper function to read arguments of builtins which are interfaces
+   for the middle-end nodes like COMPLEX_EXPR, VEC_SHUFFLE_EXPR and
+   others.  The name of the builtin is passed using BNAME parameter.
+   Function returns true if there were no errors while parsing and
+   stores the arguments in EXPR_LIST.  List of original types can be
+   obtained by passing non NULL value to ORIG_TYPES.  */
+static bool
+c_parser_get_builtin_args (c_parser *parser, const char *bname,
+			   VEC(tree,gc) **expr_list,
+			   VEC(tree,gc) **orig_types)
+{
+  location_t loc = c_parser_peek_token (parser)->location;
+  *expr_list = NULL;
+
+  if (c_parser_next_token_is_not (parser, CPP_OPEN_PAREN))
+    {
+      error_at (loc, "cannot take address of %qs", bname);
+      return false;
+    }
+
+  c_parser_consume_token (parser);
+
+  if (c_parser_next_token_is (parser, CPP_CLOSE_PAREN))
+    {
+      c_parser_consume_token (parser);
+      return true;
+    }
+    
+  if (orig_types)
+    *expr_list = c_parser_expr_list (parser, false, false, orig_types);
+  else
+    *expr_list = c_parser_expr_list (parser, false, false, NULL);
+
+  if (!c_parser_require (parser, CPP_CLOSE_PAREN, "expected %<)%>"))
+    return false;
+
+  return true;
+}
+
+
 /* Parse a postfix expression (C90 6.3.1-6.3.2, C99 6.5.1-6.5.2).
 
    postfix-expression:
@@ -6027,6 +6067,10 @@  c_parser_alignof_expression (c_parser *p
 			     assignment-expression )
      __builtin_types_compatible_p ( type-name , type-name )
      __builtin_complex ( assignment-expression , assignment-expression )
+     __builtin_shuffle ( assignment-expression , assignment-expression )
+     __builtin_shuffle ( assignment-expression , 
+			 assignment-expression ,
+			 assignment-expression, )
 
    offsetof-member-designator:
      identifier
@@ -6047,7 +6091,7 @@  c_parser_alignof_expression (c_parser *p
 static struct c_expr
 c_parser_postfix_expression (c_parser *parser)
 {
-  struct c_expr expr, e1, e2, e3;
+  struct c_expr expr, e1;
   struct c_type_name *t1, *t2;
   location_t loc = c_parser_peek_token (parser)->location;;
   expr.original_code = ERROR_MARK;
@@ -6333,45 +6377,55 @@  c_parser_postfix_expression (c_parser *p
 	  }
 	  break;
 	case RID_CHOOSE_EXPR:
-	  c_parser_consume_token (parser);
-	  if (!c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
-	    {
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  loc = c_parser_peek_token (parser)->location;
-	  e1 = c_parser_expr_no_commas (parser, NULL);
-	  if (!c_parser_require (parser, CPP_COMMA, "expected %<,%>"))
-	    {
-	      c_parser_skip_until_found (parser, CPP_CLOSE_PAREN, NULL);
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  e2 = c_parser_expr_no_commas (parser, NULL);
-	  if (!c_parser_require (parser, CPP_COMMA, "expected %<,%>"))
-	    {
-	      c_parser_skip_until_found (parser, CPP_CLOSE_PAREN, NULL);
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  e3 = c_parser_expr_no_commas (parser, NULL);
-	  c_parser_skip_until_found (parser, CPP_CLOSE_PAREN,
-				     "expected %<)%>");
 	  {
-	    tree c;
+	    VEC(tree,gc) *expr_list;
+	    VEC(tree,gc) *orig_types;
+	    tree e1value, e2value, e3value, c;
 
-	    c = e1.value;
-	    mark_exp_read (e2.value);
-	    mark_exp_read (e3.value);
+	    c_parser_consume_token (parser);
+	    if (!c_parser_get_builtin_args (parser,
+					    "__builtin_choose_expr",
+					    &expr_list, &orig_types))
+	      {
+		expr.value = error_mark_node;
+		break;
+	      }
+
+	    if (VEC_length (tree, expr_list) != 3)
+	      {
+		error_at (loc, "wrong number of arguments to "
+			       "%<__builtin_choose_expr%>");
+		expr.value = error_mark_node;
+		break;
+	      }
+	    
+	    e1value = VEC_index (tree, expr_list, 0);
+	    e2value = VEC_index (tree, expr_list, 1);
+	    e3value = VEC_index (tree, expr_list, 2);
+
+	    c = e1value;
+	    mark_exp_read (e2value);
+	    mark_exp_read (e3value);
 	    if (TREE_CODE (c) != INTEGER_CST
 		|| !INTEGRAL_TYPE_P (TREE_TYPE (c)))
 	      error_at (loc,
 			"first argument to %<__builtin_choose_expr%> not"
 			" a constant");
 	    constant_expression_warning (c);
-	    expr = integer_zerop (c) ? e3 : e2;
+	    
+	    if (integer_zerop (c))
+	      {
+		expr.value = e3value;
+		expr.original_type = VEC_index (tree, orig_types, 2);
+	      }
+	    else
+	      {
+		expr.value = e2value;
+		expr.original_type = VEC_index (tree, orig_types, 1);
+	      }
+
+	    break;
 	  }
-	  break;
 	case RID_TYPES_COMPATIBLE_P:
 	  c_parser_consume_token (parser);
 	  if (!c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
@@ -6410,57 +6464,96 @@  c_parser_postfix_expression (c_parser *p
 	  }
 	  break;
 	case RID_BUILTIN_COMPLEX:
-	  c_parser_consume_token (parser);
-	  if (!c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
-	    {
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  loc = c_parser_peek_token (parser)->location;
-	  e1 = c_parser_expr_no_commas (parser, NULL);
-	  if (!c_parser_require (parser, CPP_COMMA, "expected %<,%>"))
-	    {
-	      c_parser_skip_until_found (parser, CPP_CLOSE_PAREN, NULL);
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  e2 = c_parser_expr_no_commas (parser, NULL);
-	  c_parser_skip_until_found (parser, CPP_CLOSE_PAREN,
-				     "expected %<)%>");
-	  mark_exp_read (e1.value);
-	  if (TREE_CODE (e1.value) == EXCESS_PRECISION_EXPR)
-	    e1.value = convert (TREE_TYPE (e1.value),
-				TREE_OPERAND (e1.value, 0));
-	  mark_exp_read (e2.value);
-	  if (TREE_CODE (e2.value) == EXCESS_PRECISION_EXPR)
-	    e2.value = convert (TREE_TYPE (e2.value),
-				TREE_OPERAND (e2.value, 0));
-	  if (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (e1.value))
-	      || DECIMAL_FLOAT_TYPE_P (TREE_TYPE (e1.value))
-	      || !SCALAR_FLOAT_TYPE_P (TREE_TYPE (e2.value))
-	      || DECIMAL_FLOAT_TYPE_P (TREE_TYPE (e2.value)))
-	    {
-	      error_at (loc, "%<__builtin_complex%> operand "
-			"not of real binary floating-point type");
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  if (TYPE_MAIN_VARIANT (TREE_TYPE (e1.value))
-	      != TYPE_MAIN_VARIANT (TREE_TYPE (e2.value)))
-	    {
-	      error_at (loc,
-			"%<__builtin_complex%> operands of different types");
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  if (!flag_isoc99)
-	    pedwarn (loc, OPT_pedantic,
-		     "ISO C90 does not support complex types");
-	  expr.value = build2 (COMPLEX_EXPR,
-			       build_complex_type (TYPE_MAIN_VARIANT
-						   (TREE_TYPE (e1.value))),
-			       e1.value, e2.value);
-	  break;
+	  { 
+	    VEC(tree,gc) *expr_list;
+	    tree e1value, e2value;
+	    
+	    c_parser_consume_token (parser);
+	    if (!c_parser_get_builtin_args (parser,
+					    "__builtin_complex",
+					    &expr_list, NULL))
+	      {
+		expr.value = error_mark_node;
+		break;
+	      }
+
+	    if (VEC_length (tree, expr_list) != 2)
+	      {
+		error_at (loc, "wrong number of arguments to "
+			       "%<__builtin_complex%>");
+		expr.value = error_mark_node;
+		break;
+	      }
+	    
+	    e1value = VEC_index (tree, expr_list, 0);
+	    e2value = VEC_index (tree, expr_list, 1);
+
+	    mark_exp_read (e1value);
+	    if (TREE_CODE (e1value) == EXCESS_PRECISION_EXPR)
+	      e1value = convert (TREE_TYPE (e1value),
+				 TREE_OPERAND (e1value, 0));
+	    mark_exp_read (e2value);
+	    if (TREE_CODE (e2value) == EXCESS_PRECISION_EXPR)
+	      e2value = convert (TREE_TYPE (e2value),
+				 TREE_OPERAND (e2value, 0));
+	    if (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (e1value))
+		|| DECIMAL_FLOAT_TYPE_P (TREE_TYPE (e1value))
+		|| !SCALAR_FLOAT_TYPE_P (TREE_TYPE (e2value))
+		|| DECIMAL_FLOAT_TYPE_P (TREE_TYPE (e2value)))
+	      {
+		error_at (loc, "%<__builtin_complex%> operand "
+			  "not of real binary floating-point type");
+		expr.value = error_mark_node;
+		break;
+	      }
+	    if (TYPE_MAIN_VARIANT (TREE_TYPE (e1value))
+		!= TYPE_MAIN_VARIANT (TREE_TYPE (e2value)))
+	      {
+		error_at (loc,
+			  "%<__builtin_complex%> operands of different types");
+		expr.value = error_mark_node;
+		break;
+	      }
+	    if (!flag_isoc99)
+	      pedwarn (loc, OPT_pedantic,
+		       "ISO C90 does not support complex types");
+	    expr.value = build2 (COMPLEX_EXPR,
+				 build_complex_type (TYPE_MAIN_VARIANT
+						     (TREE_TYPE (e1value))),
+				 e1value, e2value);
+	    break;
+	  }
+	case RID_BUILTIN_SHUFFLE:
+	  {
+	    VEC(tree,gc) *expr_list;
+	    
+	    c_parser_consume_token (parser);
+	    if (!c_parser_get_builtin_args (parser,
+					    "__builtin_shuffle",
+					    &expr_list, NULL))
+	      {
+		expr.value = error_mark_node;
+		break;
+	      }
+
+	    if (VEC_length (tree, expr_list) == 2)
+	      expr.value = c_build_vec_shuffle_expr
+				(loc, VEC_index (tree, expr_list, 0),
+				 NULL_TREE,
+				 VEC_index (tree, expr_list, 1));
+	    else if (VEC_length (tree, expr_list) == 3)
+	      expr.value = c_build_vec_shuffle_expr
+				(loc, VEC_index (tree, expr_list, 0),
+				 VEC_index (tree, expr_list, 1),
+				 VEC_index (tree, expr_list, 2));
+	    else
+	      {
+		error_at (loc, "wrong number of arguments to "
+			       "%<__builtin_shuffle%>");
+		expr.value = error_mark_node;
+	      }
+	    break;
+	  }
 	case RID_AT_SELECTOR:
 	  gcc_assert (c_dialect_objc ());
 	  c_parser_consume_token (parser);
Index: gcc/config/i386/sse.md
===================================================================
--- gcc/config/i386/sse.md	(revision 178354)
+++ gcc/config/i386/sse.md	(working copy)
@@ -231,6 +231,12 @@  (define_mode_attr sseinsnmode
    (V4SF "V4SF") (V2DF "V2DF")
    (TI "TI") (V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI")])
 
+;; All 128bit vector modes
+(define_mode_attr sseshuffint
+  [(V16QI "V16QI") (V8HI "V8HI")
+   (V4SI "V4SI")  (V2DI "V2DI")
+   (V4SF "V4SI") (V2DF "V2DI")])
+
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr sseintvecmode
   [(V8SF "V8SI") (V4DF "V4DI")
@@ -6234,6 +6240,18 @@  (define_expand "vconduv2di"
   DONE;
 })
 
+(define_expand "vshuffle<mode>"
+  [(match_operand:V_128 0 "register_operand" "")
+   (match_operand:V_128 1 "general_operand" "")
+   (match_operand:<sseshuffint> 2 "general_operand" "")]
+  "TARGET_SSSE3 || TARGET_AVX"
+{
+  bool ok = ix86_expand_vshuffle (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel bitwise logical operations
Index: gcc/config/i386/i386-protos.h
===================================================================
--- gcc/config/i386/i386-protos.h	(revision 178354)
+++ gcc/config/i386/i386-protos.h	(working copy)
@@ -118,6 +118,7 @@  extern bool ix86_expand_int_movcc (rtx[]
 extern bool ix86_expand_fp_movcc (rtx[]);
 extern bool ix86_expand_fp_vcond (rtx[]);
 extern bool ix86_expand_int_vcond (rtx[]);
+extern bool ix86_expand_vshuffle (rtx[]);
 extern void ix86_expand_sse_unpack (rtx[], bool, bool);
 extern bool ix86_expand_int_addcc (rtx[]);
 extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 178354)
+++ gcc/config/i386/i386.c	(working copy)
@@ -18693,6 +18693,93 @@  ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
+bool
+ix86_expand_vshuffle (rtx operands[])
+{
+  rtx target = operands[0];
+  rtx op0 = operands[1];
+  rtx mask = operands[2];
+  rtx mm, vt, cv0, t1;
+  enum machine_mode mode = GET_MODE (op0);
+  enum machine_mode maskmode = GET_MODE (mask);
+  enum machine_mode maskinner = GET_MODE_INNER (mode);
+  rtx vec[16];
+  int w, i, j;
+
+  gcc_assert ((TARGET_SSSE3 || TARGET_AVX) && GET_MODE_BITSIZE (mode) == 128);
+
+  op0 = force_reg (mode, op0);
+  mask = force_reg (maskmode, mask);
+
+  /* Number of elements in the vector.  */
+  w = GET_MODE_BITSIZE (maskmode) / GET_MODE_BITSIZE (maskinner);
+
+  /* mask = mask & {w-1, w-1, w-1,...} */
+  for (i = 0; i < w; i++)
+    vec[i] = GEN_INT (w - 1);
+
+  mm = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+  mask = expand_simple_binop (maskmode, AND, mask, mm,
+			      NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Convert mask to vector of chars.  */
+  mask = simplify_gen_subreg (V16QImode, mask, maskmode, 0);
+  mask = force_reg (V16QImode, mask);
+
+
+  /* Build a helper mask wich we will use in pshufb
+     (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
+     (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}
+     ...  */
+  for (i = 0; i < w; i++)
+    for (j = 0; j < 16/w; j++)
+      vec[i*w+j] = GEN_INT (i*16/w);
+
+  vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+  vt = force_reg (V16QImode, vt);
+
+  t1 = gen_reg_rtx (V16QImode);
+  emit_insn (gen_ssse3_pshufbv16qi3 (t1, mask, vt));
+  mm = t1;
+
+  /* MM contains now something like
+     mm = {m[0], .., m[0], m[k], .., m[k], ... }, where
+     m[i] is an index of the element in the vector we are
+     selecting from.
+
+     Convert it into the byte positions by doing
+     mm = mm * {16/w, 16/w, ...}
+     mm = mm + {0,1,..,16/w, 0,1,..,16/w, ...}  */
+  for (i = 0; i < 16; i++)
+    vec[i] = GEN_INT (16/w);
+
+  cv0 = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+  mm = expand_simple_binop (V16QImode, MULT, mm, cv0,
+			    NULL_RTX, 0, OPTAB_DIRECT);
+
+  for (i = 0; i < w; i++)
+    for (j = 0; j < 16/w; j++)
+      vec[i*w+j] = GEN_INT (j);
+
+  cv0 = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+  mm = expand_simple_binop (V16QImode, PLUS, mm, cv0,
+			    NULL_RTX, 0, OPTAB_DIRECT);
+
+  t1 = gen_reg_rtx (V16QImode);
+
+  /* Convert OP0 to vector of chars.  */
+  op0 = simplify_gen_subreg (V16QImode, op0, mode, 0);
+  op0 = force_reg (V16QImode, op0);
+  emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mm));
+
+  /* Convert it back from vector of chars to the original mode.  */
+  t1 = simplify_gen_subreg (mode, t1, V16QImode, 0);
+
+  emit_insn (gen_rtx_SET (VOIDmode, target, t1));
+
+  return true;
+}
+
 /* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
    true if we should do zero extension, else sign extension.  HIGH_P is
    true if we want the N/2 high elements, else the low elements.  */
@@ -30911,6 +30998,9 @@  struct expand_vec_perm_d
 
 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
+static int extract_vec_perm_cst (struct expand_vec_perm_d *, tree);
+static bool ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask);
+
 
 /* Get a vector mode of the same size as the original but with elements
    twice as wide.  This is only guaranteed to apply to integral vectors.  */
@@ -32417,7 +32507,7 @@  void ix86_emit_i387_round (rtx op0, rtx
   res = gen_reg_rtx (outmode);
 
   half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
-  
+
   /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
 
   /* scratch = fxam(op1) */
@@ -34576,10 +34666,10 @@  ix86_vectorize_builtin_vec_perm_ok (tree
 
   vec_mask = extract_vec_perm_cst (&d, mask);
 
-  /* This hook is cannot be called in response to something that the
-     user does (unlike the builtin expander) so we shouldn't ever see
-     an error generated from the extract.  */
-  gcc_assert (vec_mask > 0 && vec_mask <= 3);
+  /* Check whether the mask can be applied to the vector type.  */
+  if (vec_mask < 0 || vec_mask > 3)
+    return false;
+
   one_vec = (vec_mask != 3);
 
   /* Implementable with shufps or pshufd.  */
Index: gcc/tree-ssa-operands.c
===================================================================
--- gcc/tree-ssa-operands.c	(revision 178354)
+++ gcc/tree-ssa-operands.c	(working copy)
@@ -943,6 +943,7 @@  get_expr_operands (gimple stmt, tree *ex
 
     case COND_EXPR:
     case VEC_COND_EXPR:
+    case VEC_SHUFFLE_EXPR:
       get_expr_operands (stmt, &TREE_OPERAND (expr, 0), uflags);
       get_expr_operands (stmt, &TREE_OPERAND (expr, 1), uflags);
       get_expr_operands (stmt, &TREE_OPERAND (expr, 2), uflags);