diff mbox

Vector shuffling

Message ID CABYV9SXxXdE3tZ0gQfGG_DiwDZVpOwSJVDgpmH7L-wARDf=+kA@mail.gmail.com
State New
Headers show

Commit Message

Artem Shinkarov Sept. 2, 2011, 5:14 p.m. UTC
New version of the patch with adjusted __builtin_complex and
__builtin_choose_expr.


Thanks,
Artem.

On Fri, Sep 2, 2011 at 5:08 PM, Artem Shinkarov
<artyom.shinkaroff@gmail.com> wrote:
> On Fri, Sep 2, 2011 at 4:41 PM, Joseph S. Myers <joseph@codesourcery.com> wrote:
>> On Fri, 2 Sep 2011, Artem Shinkarov wrote:
>>
>>> +  /* Avoid C_MAYBE_CONST_EXPRs inside VEC_SHUFFLE_EXPR.  */
>>> +  tmp = c_fully_fold (v0, false, &maybe_const);
>>> +  v0 = save_expr (tmp);
>>> +  wrap &= maybe_const;
>>
>> I suppose you need this save_expr because of the two-argument case, but
>> shouldn't need it otherwise.
>>
>>> +  if (!two_arguments)
>>> +    {
>>> +      tmp = c_fully_fold (v1, false, &maybe_const);
>>> +      v1 = save_expr (tmp);
>>
>> And you shouldn't need this save_expr at all.
>>
>>> +  tmp = c_fully_fold (mask, false, &maybe_const);
>>> +  mask = save_expr (tmp);
>>
>> Or this one.
>
> Joseph, I don't understand this comment. I have 2 or 3 arguments in
> the VEC_SHUFFLE_EXPR and any of them can be C_MAYBE_CONST_EXPR, so I
> need to wrap mask (the last argument) to avoid the following failure:
>
> #define vector(elcount, type)  \
>  __attribute__((vector_size((elcount)*sizeof(type)))) type
>
> extern int p, q, v, r;
> int main ()
> {
>  vector (4, int) i0 = {argc, 1,2,3};
>  vector (4, int) i1 = {argc, 1, argc, 3};
>  vector (4, int) imask = {0,3,2,1};
>  vector (4, int) extmask = {p,q,r,v};
>  i2 = __builtin_shuffle (i0, (p,q)? imask:extmask);
>  return 0;
> }
>
> and the same failure would happen if the __builtin_shuffle expression
> were in the following form:
> i2 = __builtin_shuffle (i0, (p,q)? imask:extmask, i2);
>
> All the rest -- agreed, and is fixed already.
>
>
> Thanks,
> Artem.
>
>>> +/* Helper function to read arguments of builtins which are interfaces
>>> +   for the middle-end nodes like COMPLEX_EXPR, VEC_SHUFLE_EXPR and
>>
>> Spelling of SHUFFLE.
>>
>>> +   others. The name of the builtin is passed using BNAME parameter.
>>
>> Two spaces after ".".
>>
>>> +   Function returns true if there were no errors while parsing and
>>> +   stores the arguments in EXPR_LIST*/
>>
>> ".  " at end of comment.
>>
>>> +static bool
>>> +c_parser_get_builtin_args (c_parser *  parser, const char *  bname,
>>> +                        VEC(tree,gc) **  expr_list)
>>
>> No spaces after "*".
>>
>>> +  if (c_parser_next_token_is_not (parser, CPP_OPEN_PAREN))
>>> +    {
>>> +      error_at (loc, "cannot take address of %<%s%>", bname);
>>
>> %qs is a simpler form of %<%s%>.
>>
>>> @@ -6461,6 +6500,35 @@ c_parser_postfix_expression (c_parser *p
>>
>> Should also convert __builtin_choose_expr and __builtin_complex to use the
>> new helper.
>>
>>> +         if (! c_parser_get_builtin_args (parser,
>>
>> No space after "!".
>>
>>> +           {
>>> +             error_at (loc, "%<__builtin_shuffle%> wrong number of arguments");
>>
>> "wrong number of arguments to %<__builtin_shuffle%>".
>>
>> --
>> Joseph S. Myers
>> joseph@codesourcery.com
>>
>
diff mbox

Patch

Index: gcc/doc/extend.texi
===================================================================
--- gcc/doc/extend.texi	(revision 178354)
+++ gcc/doc/extend.texi	(working copy)
@@ -6561,6 +6561,32 @@  invoke undefined behavior at runtime.  W
 accesses for vector subscription can be enabled with
 @option{-Warray-bounds}.
 
+Vector shuffling is available using the functions
+@code{__builtin_shuffle (vec, mask)} and
+@code{__builtin_shuffle (vec0, vec1, mask)}.  Both functions construct
+a permutation of elements from one or two vectors and return a vector
+of the same type as the input vector(s).  The mask is a vector of
+integer-typed elements.  The size of each element of the mask must be
+the same as the size of each input vector element.  The number of
+elements in the input vector(s) and in the mask must be the same.
+
+The elements of the input vectors are numbered from left to right across
+one or both of the vectors.  Each element of the mask specifies the number
+of the element to select from the input vector(s).  Consider this example.
+
+@smallexample
+typedef int v4si __attribute__ ((vector_size (16)));
+
+v4si a = @{1,2,3,4@};
+v4si b = @{5,6,7,8@};
+v4si mask1 = @{0,1,1,3@};
+v4si mask2 = @{0,4,2,5@};
+v4si res;
+
+res = __builtin_shuffle (a, mask1);       /* res is @{1,2,2,4@}  */
+res = __builtin_shuffle (a, b, mask2);    /* res is @{1,5,3,6@}  */
+@end smallexample
+
 You can declare variables and use them in function calls and returns, as
 well as in assignments and some casts.  You can specify a vector type as
 a return type for a function.  Vector types can also be used as function
Index: gcc/tree-pretty-print.c
===================================================================
--- gcc/tree-pretty-print.c	(revision 178354)
+++ gcc/tree-pretty-print.c	(working copy)
@@ -2067,6 +2067,16 @@  dump_generic_node (pretty_printer *buffe
       dump_generic_node (buffer, TREE_OPERAND (node, 2), spc, flags, false);
       pp_string (buffer, " > ");
       break;
+    
+    case VEC_SHUFFLE_EXPR:
+      pp_string (buffer, " VEC_SHUFFLE_EXPR < ");
+      dump_generic_node (buffer, TREE_OPERAND (node, 0), spc, flags, false);
+      pp_string (buffer, " , ");
+      dump_generic_node (buffer, TREE_OPERAND (node, 1), spc, flags, false);
+      pp_string (buffer, " , ");
+      dump_generic_node (buffer, TREE_OPERAND (node, 2), spc, flags, false);
+      pp_string (buffer, " > ");
+      break;
 
     case DOT_PROD_EXPR:
       pp_string (buffer, " DOT_PROD_EXPR < ");
Index: gcc/c-family/c-common.c
===================================================================
--- gcc/c-family/c-common.c	(revision 178354)
+++ gcc/c-family/c-common.c	(working copy)
@@ -425,6 +425,7 @@  const struct c_common_resword c_common_r
   { "__attribute__",	RID_ATTRIBUTE,	0 },
   { "__builtin_choose_expr", RID_CHOOSE_EXPR, D_CONLY },
   { "__builtin_complex", RID_BUILTIN_COMPLEX, D_CONLY },
+  { "__builtin_shuffle", RID_BUILTIN_SHUFFLE, D_CONLY },
   { "__builtin_offsetof", RID_OFFSETOF, 0 },
   { "__builtin_types_compatible_p", RID_TYPES_COMPATIBLE_P, D_CONLY },
   { "__builtin_va_arg",	RID_VA_ARG,	0 },
Index: gcc/c-family/c-common.h
===================================================================
--- gcc/c-family/c-common.h	(revision 178354)
+++ gcc/c-family/c-common.h	(working copy)
@@ -103,7 +103,7 @@  enum rid
   /* C extensions */
   RID_ASM,       RID_TYPEOF,   RID_ALIGNOF,  RID_ATTRIBUTE,  RID_VA_ARG,
   RID_EXTENSION, RID_IMAGPART, RID_REALPART, RID_LABEL,      RID_CHOOSE_EXPR,
-  RID_TYPES_COMPATIBLE_P,      RID_BUILTIN_COMPLEX,
+  RID_TYPES_COMPATIBLE_P,      RID_BUILTIN_COMPLEX,	     RID_BUILTIN_SHUFFLE,
   RID_DFLOAT32, RID_DFLOAT64, RID_DFLOAT128,
   RID_FRACT, RID_ACCUM,
 
@@ -898,6 +898,7 @@  extern tree build_function_call (locatio
 
 extern tree build_function_call_vec (location_t, tree,
     				     VEC(tree,gc) *, VEC(tree,gc) *);
+extern tree c_build_vec_shuffle_expr (location_t, tree, tree, tree);
 
 extern tree resolve_overloaded_builtin (location_t, tree, VEC(tree,gc) *);
 
Index: gcc/optabs.c
===================================================================
--- gcc/optabs.c	(revision 178354)
+++ gcc/optabs.c	(working copy)
@@ -6620,6 +6620,82 @@  vector_compare_rtx (tree cond, bool unsi
   return gen_rtx_fmt_ee (rcode, VOIDmode, ops[0].value, ops[1].value);
 }
 
+/* Return true if VEC_SHUFFLE_EXPR can be expanded using SIMD extensions
+   of the CPU.  */
+bool
+expand_vec_shuffle_expr_p (enum machine_mode mode, tree v0, 
+			   tree v1, tree mask)
+{
+  int v0_mode_s = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (v0))));
+  int mask_mode_s = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (mask))));
+
+  if (TREE_CODE (mask) == VECTOR_CST
+      && targetm.vectorize.builtin_vec_perm_ok (TREE_TYPE (v0), mask))
+    return true;
+  
+  if (v0 != v1 || v0_mode_s != mask_mode_s)
+    return false;
+    
+  return direct_optab_handler (vshuffle_optab, mode) != CODE_FOR_nothing;
+}
+
+/* Generate instructions for VEC_SHUFFLE_EXPR given its type and three
+   operands.  */
+rtx
+expand_vec_shuffle_expr (tree type, tree v0, tree v1, tree mask, rtx target)
+{
+  struct expand_operand ops[4];
+  enum insn_code icode;
+  enum machine_mode mode = TYPE_MODE (type);
+  rtx rtx_v0, rtx_mask;
+
+  gcc_assert (expand_vec_shuffle_expr_p (mode, v0, v1, mask));
+
+  if (TREE_CODE (mask) == VECTOR_CST)
+    {
+      tree m_type, call;
+      tree fn = targetm.vectorize.builtin_vec_perm (TREE_TYPE (v0), &m_type);
+      rtx t;
+
+      if (!fn)
+	goto vshuffle;
+
+      if (m_type != TREE_TYPE (TREE_TYPE (mask)))
+	{	
+	  int units = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask));
+	  tree cvt = build_vector_type (m_type, units);
+	  mask = fold_convert (cvt, mask);
+	}
+
+      fn = copy_node (fn);
+      call = fold_build1 (ADDR_EXPR, build_pointer_type (TREE_TYPE (fn)), fn);
+      call = build_call_nary (type /* ? */, call, 3, v0, v1, mask);
+
+      t = expand_normal (call);  
+      target = gen_reg_rtx (mode);
+      emit_insn (gen_rtx_SET (VOIDmode, target, t));
+      return target;
+    }
+
+vshuffle:
+  gcc_assert (v1 == v0);
+
+  icode = direct_optab_handler (vshuffle_optab, mode);
+
+  if (icode == CODE_FOR_nothing)
+    return 0;
+  
+  rtx_v0 = expand_normal (v0);
+  rtx_mask = expand_normal (mask);
+
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], rtx_v0, mode);
+  create_input_operand (&ops[2], rtx_mask, mode);
+  expand_insn (icode, 3, ops);
+  
+  return ops[0].value;
+}
+
 /* Return insn code for TYPE, the type of a VEC_COND_EXPR.  */
 
 static inline enum insn_code
Index: gcc/optabs.h
===================================================================
--- gcc/optabs.h	(revision 178354)
+++ gcc/optabs.h	(working copy)
@@ -636,6 +636,9 @@  enum direct_optab_index
   DOI_vcond,
   DOI_vcondu,
 
+  /* Vector shuffling.  */
+  DOI_vshuffle,
+
   /* Block move operation.  */
   DOI_movmem,
 
@@ -701,6 +704,7 @@  typedef struct direct_optab_d *direct_op
 #define reload_out_optab (&direct_optab_table[(int) DOI_reload_out])
 #define vcond_optab (&direct_optab_table[(int) DOI_vcond])
 #define vcondu_optab (&direct_optab_table[(int) DOI_vcondu])
+#define vshuffle_optab (&direct_optab_table[(int) DOI_vshuffle])
 #define movmem_optab (&direct_optab_table[(int) DOI_movmem])
 #define setmem_optab (&direct_optab_table[(int) DOI_setmem])
 #define cmpstr_optab (&direct_optab_table[(int) DOI_cmpstr])
@@ -879,8 +883,15 @@  extern rtx expand_widening_mult (enum ma
 /* Return tree if target supports vector operations for COND_EXPR.  */
 bool expand_vec_cond_expr_p (tree, enum machine_mode);
 
+/* Return true if target supports vector operations for VEC_SHUFFLE_EXPR.  */
+bool expand_vec_shuffle_expr_p (enum machine_mode, tree, tree, tree);
+
 /* Generate code for VEC_COND_EXPR.  */
 extern rtx expand_vec_cond_expr (tree, tree, tree, tree, rtx);
+
+/* Generate code for VEC_SHUFFLE_EXPR.  */
+extern rtx expand_vec_shuffle_expr (tree, tree, tree, tree, rtx);
+
 /* Generate code for VEC_LSHIFT_EXPR and VEC_RSHIFT_EXPR.  */
 extern rtx expand_vec_shift_expr (sepops, rtx);
 
Index: gcc/genopinit.c
===================================================================
--- gcc/genopinit.c	(revision 178354)
+++ gcc/genopinit.c	(working copy)
@@ -255,6 +255,7 @@  static const char * const optabs[] =
   "set_optab_handler (vec_realign_load_optab, $A, CODE_FOR_$(vec_realign_load_$a$))",
   "set_direct_optab_handler (vcond_optab, $A, CODE_FOR_$(vcond$a$))",
   "set_direct_optab_handler (vcondu_optab, $A, CODE_FOR_$(vcondu$a$))",
+  "set_direct_optab_handler (vshuffle_optab, $A, CODE_FOR_$(vshuffle$a$))",
   "set_optab_handler (ssum_widen_optab, $A, CODE_FOR_$(widen_ssum$I$a3$))",
   "set_optab_handler (usum_widen_optab, $A, CODE_FOR_$(widen_usum$I$a3$))",
   "set_optab_handler (udot_prod_optab, $A, CODE_FOR_$(udot_prod$I$a$))",
Index: gcc/testsuite/gcc.c-torture/execute/vect-shuffle-2.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/vect-shuffle-2.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/vect-shuffle-2.c	(revision 0)
@@ -0,0 +1,44 @@ 
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define vidx(type, vec, idx) (*(((type *) &(vec)) + idx))
+
+#define shuf2compare(type, count, vres, v0, v1, mask) \
+do { \
+    int __i; \
+    for (__i = 0; __i < count; __i++) { \
+        if (vidx(type, vres, __i) != ((vidx(type, mask, __i) < count) ? \
+                          vidx(type, v0, vidx(type, mask, __i)) :  \
+                          vidx(type, v1, (vidx(type, mask, __i) - count)))) \
+            __builtin_abort (); \
+        } \
+} while (0)
+
+
+int main (int argc, char *argv[]) {
+    vector (8, short) v0 = {5, 5,5,5,5,5,argc,7};
+    vector (8, short) v1 = {argc, 1,8,8,4,9,argc,4};
+    vector (8, short) v2;
+
+    //vector (8, short) mask = {1,2,5,4,3,6,7};
+    
+    vector (8, short) mask0 = {0,2,3,1,4,5,6,7};
+    vector (8, short) mask1 = {0,12,3,4,3,0,10,9};
+    
+    vector (8, short) mask2 = {0,8,1,9,2,10,3,11};
+
+    v2 = __builtin_shuffle (v0, v1,  mask0);
+    shuf2compare (short, 8, v2, v0, v1, mask0);
+ 
+    v2 = __builtin_shuffle (v0, v1,  mask1);
+    shuf2compare (short, 8, v2, v0, v1, mask1);
+    
+    v2 = __builtin_shuffle (v0, v1,  mask2);
+    shuf2compare (short, 8, v2, v0, v1, mask2);
+
+    v2 = __builtin_shuffle (mask0, mask0,  v0);
+    shuf2compare (short, 8, v2, mask0, mask0, v0);
+
+    return 0; 
+}
+
Index: gcc/testsuite/gcc.c-torture/execute/vect-shuffle-4.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/vect-shuffle-4.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/vect-shuffle-4.c	(revision 0)
@@ -0,0 +1,50 @@ 
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define vidx(type, vec, idx) (*(((type *) &(vec)) + idx))
+
+#define shuf2compare(type, count, vres, v0, v1, mask) \
+do { \
+    int __i; \
+    for (__i = 0; __i < count; __i++) { \
+        if (vidx(type, vres, __i) != ((vidx(type, mask, __i) < count) ? \
+                          vidx(type, v0, vidx(type, mask, __i)) :  \
+                          vidx(type, v1, (vidx(type, mask, __i) - count)))) \
+            __builtin_abort (); \
+        } \
+} while (0)
+
+
+vector (8, short) __attribute__ ((noinline))
+f (vector (8, short) x, vector (8, short) y, vector (8, short) mask) {
+    return __builtin_shuffle (x, y, mask);
+}
+
+
+
+int main (int argc, char *argv[]) {
+    vector (8, short) v0 = {argc, 1,2,3,4,5,6,7};
+    vector (8, short) v1 = {argc, 1,argc,3,4,5,argc,7};
+    vector (8, short) v2;
+
+    //vector (8, short) mask = {1,2,5,4,3,6,7};
+    
+    vector (8, short) mask0 = {0,2,3,1,4,5,6,7};
+    vector (8, short) mask1 = {0,12,3,4,3,0,10,9};
+    vector (8, short) mask2 = {0,8,1,9,2,10,3,11};
+
+    v2 = f (v0, v1,  mask0);
+    shuf2compare (short, 8, v2, v0, v1, mask0);
+ 
+    v2 = f (v0, v1,  mask1);
+    shuf2compare (short, 8, v2, v0, v1, mask1);
+
+    v2 = f (v0, v1,  mask2);
+    shuf2compare (short, 8, v2, v0, v1, mask2);
+
+    v2 = f (mask0, mask0,  v0);
+    shuf2compare (short, 8, v2, mask0, mask0, v0);
+
+    return 0; 
+}
+
Index: gcc/testsuite/gcc.c-torture/execute/vect-shuffle-1.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/vect-shuffle-1.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/vect-shuffle-1.c	(revision 0)
@@ -0,0 +1,46 @@ 
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define vidx(type, vec, idx) (*(((type *) &(vec)) + idx))
+
+#define shufcompare(type, count, vres, v0, mask) \
+do { \
+    int __i; \
+    for (__i = 0; __i < count; __i++) { \
+        if (vidx(type, vres, __i) != vidx(type, v0, vidx(type, mask, __i))) \
+            __builtin_abort (); \
+    } \
+} while (0)
+
+
+int main (int argc, char *argv[]) {
+    /*vector (8, short) v0 = {argc, 1,2,3,4,5,6,7};
+    vector (8, short) v1 = {argc, 1,argc,3,4,5,argc,7};
+    vector (8, short) v2;
+   
+    vector (8, short) smask = {0,0,1,2,3,4,5,6};
+    
+    v2 = __builtin_shuffle (v0,  smask);
+    shufcompare (short, 8, v2, v0, smask);
+    v2 = __builtin_shuffle (v0, v1);
+    shufcompare (short, 8, v2, v0, v1);
+    v2 = __builtin_shuffle (smask, v0);
+    shufcompare (short, 8, v2, smask, v0);*/
+
+    vector (4, int) i0 = {argc, 1,2,3};
+    vector (4, int) i1 = {argc, 1, argc, 3};
+    vector (4, int) i2;
+
+    vector (4, int) imask = {0,3,2,1};
+
+    /*i2 = __builtin_shuffle (i0, imask);
+    shufcompare (int, 4, i2, i0, imask);*/
+    i2 = __builtin_shuffle (i0, i1);
+    shufcompare (int, 4, i2, i0, i1);
+    
+    i2 = __builtin_shuffle (imask, i0);
+    shufcompare (int, 4, i2, imask, i0);
+    
+    return 0;
+}
+
Index: gcc/testsuite/gcc.c-torture/execute/vect-shuffle-3.c
===================================================================
--- gcc/testsuite/gcc.c-torture/execute/vect-shuffle-3.c	(revision 0)
+++ gcc/testsuite/gcc.c-torture/execute/vect-shuffle-3.c	(revision 0)
@@ -0,0 +1,36 @@ 
+#define vector(elcount, type)  \
+__attribute__((vector_size((elcount)*sizeof(type)))) type
+
+#define vidx(type, vec, idx) (*(((type *) &(vec)) + idx))
+
+#define shufcompare(type, count, vres, v0, mask) \
+do { \
+    int __i; \
+    for (__i = 0; __i < count; __i++) { \
+        if (vidx(type, vres, __i) != vidx(type, v0, vidx(type, mask, __i))) \
+            __builtin_abort (); \
+    } \
+} while (0)
+
+vector (8, short) __attribute__ ((noinline))
+f (vector (8, short) x, vector (8, short) mask) {
+    return __builtin_shuffle (x, mask);
+}
+
+
+int main (int argc, char *argv[]) {
+    vector (8, short) v0 = {argc, 1,2,3,4,5,6,7};
+    vector (8, short) v1 = {argc, 1,argc,3,4,5,argc,7};
+    vector (8, short) v2;
+
+    vector (8, short) mask = {0,0,1,2,3,4,5,6};
+    
+    v2 = f (v0,  mask);
+    shufcompare (short, 8, v2, v0, mask);
+
+    v2 = f (v0, v1);
+    shufcompare (short, 8, v2, v0, v1);
+
+    return 0;
+}
+
Index: gcc/testsuite/gcc.dg/builtin-complex-err-1.c
===================================================================
--- gcc/testsuite/gcc.dg/builtin-complex-err-1.c	(revision 178354)
+++ gcc/testsuite/gcc.dg/builtin-complex-err-1.c	(working copy)
@@ -19,8 +19,8 @@  _Complex float fc3 = __builtin_complex (
 void
 f (void)
 {
-  __builtin_complex (0.0); /* { dg-error "expected" } */
-  __builtin_complex (0.0, 0.0, 0.0); /* { dg-error "expected" } */
+  __builtin_complex (0.0); /* { dg-error "wrong number of arguments" } */
+  __builtin_complex (0.0, 0.0, 0.0); /* { dg-error "wrong number of arguments" } */
 }
 
-void (*p) (void) = __builtin_complex; /* { dg-error "expected" } */
+void (*p) (void) = __builtin_complex; /* { dg-error "cannot take address" } */
Index: gcc/expr.c
===================================================================
--- gcc/expr.c	(revision 178354)
+++ gcc/expr.c	(working copy)
@@ -8605,6 +8605,10 @@  expand_expr_real_2 (sepops ops, rtx targ
     case VEC_PACK_FIX_TRUNC_EXPR:
       mode = TYPE_MODE (TREE_TYPE (treeop0));
       goto binop;
+    
+    case VEC_SHUFFLE_EXPR:
+      target = expand_vec_shuffle_expr (type, treeop0, treeop1, treeop2, target);
+      return target;
 
     case DOT_PROD_EXPR:
       {
Index: gcc/gimple-pretty-print.c
===================================================================
--- gcc/gimple-pretty-print.c	(revision 178354)
+++ gcc/gimple-pretty-print.c	(working copy)
@@ -417,6 +417,16 @@  dump_ternary_rhs (pretty_printer *buffer
       dump_generic_node (buffer, gimple_assign_rhs3 (gs), spc, flags, false);
       pp_string (buffer, ">");
       break;
+    
+    case VEC_SHUFFLE_EXPR:
+      pp_string (buffer, "VEC_SHUFFLE_EXPR <");
+      dump_generic_node (buffer, gimple_assign_rhs1 (gs), spc, flags, false);
+      pp_string (buffer, ", ");
+      dump_generic_node (buffer, gimple_assign_rhs2 (gs), spc, flags, false);
+      pp_string (buffer, ", ");
+      dump_generic_node (buffer, gimple_assign_rhs3 (gs), spc, flags, false);
+      pp_string (buffer, ">");
+      break;
 
     case REALIGN_LOAD_EXPR:
       pp_string (buffer, "REALIGN_LOAD <");
Index: gcc/c-typeck.c
===================================================================
--- gcc/c-typeck.c	(revision 178354)
+++ gcc/c-typeck.c	(working copy)
@@ -2845,6 +2845,89 @@  build_function_call_vec (location_t loc,
     }
   return require_complete_type (result);
 }
+
+/* Build a VEC_SHUFFLE_EXPR if V0, V1 and MASK are not error_mark_nodes
+   and have vector types, V0 has the same type as V1, and the number of
+   elements of V0, V1, MASK is the same.  */
+tree
+c_build_vec_shuffle_expr (location_t loc, tree v0, tree v1, tree mask)
+{
+  tree vec_shuffle, tmp;
+  bool wrap = true;
+  bool maybe_const = false;
+  bool two_arguments = v0 == v1;
+
+
+  if (v0 == error_mark_node || v1 == error_mark_node 
+      || mask == error_mark_node)
+    return error_mark_node;
+
+  if (TREE_CODE (TREE_TYPE (mask)) != VECTOR_TYPE
+      || TREE_CODE (TREE_TYPE (TREE_TYPE (mask))) != INTEGER_TYPE)
+    {
+      error_at (loc, "__builtin_shuffle last argument must "
+		     "be an integer vector");
+      return error_mark_node;
+    }
+   
+  if (TREE_CODE (TREE_TYPE (v0)) != VECTOR_TYPE
+      || TREE_CODE (TREE_TYPE (v1)) != VECTOR_TYPE)
+    {
+      error_at (loc, "__builtin_shuffle arguments must be vectors");
+      return error_mark_node;
+    }
+
+  if (TREE_TYPE (v0) != TREE_TYPE (v1))
+    {
+      error_at (loc, "__builtin_shuffle argument vectors must be of "
+		     "the same type");
+      return error_mark_node;
+    }
+
+  if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (v0)) 
+      != TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask))
+      && TYPE_VECTOR_SUBPARTS (TREE_TYPE (v1))
+	 != TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)))
+    {
+      error_at (loc, "__builtin_shuffle number of elements of the "
+		     "argument vector(s) and the mask vector should "
+		     "be the same");
+      return error_mark_node;
+    }
+  
+  if (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (v0)))) 
+      != GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (TREE_TYPE (mask)))))
+    {
+      error_at (loc, "__builtin_shuffle argument vector(s) inner type "
+		     "must have the same size as inner type of the mask");
+      return error_mark_node;
+    }
+
+  /* Avoid C_MAYBE_CONST_EXPRs inside VEC_SHUFFLE_EXPR.  */
+  tmp = c_fully_fold (v0, false, &maybe_const);
+  v0 = save_expr (tmp);
+  wrap &= maybe_const;
+
+  if (!two_arguments)
+    {
+      tmp = c_fully_fold (v1, false, &maybe_const);
+      v1 = save_expr (tmp);
+      wrap &= maybe_const;
+    }
+  else
+    v1 = v0;
+  
+  tmp = c_fully_fold (mask, false, &maybe_const);
+  mask = save_expr (tmp);
+  wrap &= maybe_const;
+
+  vec_shuffle = build3 (VEC_SHUFFLE_EXPR, TREE_TYPE (v0), v0, v1, mask);
+
+  if (!wrap)
+    vec_shuffle = c_wrap_maybe_const (vec_shuffle, true);
+
+  return vec_shuffle;
+}
 
 /* Convert the argument expressions in the vector VALUES
    to the types in the list TYPELIST.
@@ -6120,7 +6203,14 @@  digest_init (location_t init_loc, tree t
 	  tree value;
 	  bool constant_p = true;
 
-	  /* Iterate through elements and check if all constructor
+	  /* Warn if constructor has fewer elements than the vector type.  */
+          if (CONSTRUCTOR_NELTS (inside_init) 
+              < TYPE_VECTOR_SUBPARTS (TREE_TYPE (inside_init)))
+            warning_at (init_loc, 0, "vector length does not match "
+                                     "initializer length, zero elements "
+                                     "will be inserted");
+          
+          /* Iterate through elements and check if all constructor
 	     elements are *_CSTs.  */
 	  FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (inside_init), ix, value)
 	    if (!CONSTANT_CLASS_P (value))
Index: gcc/gimplify.c
===================================================================
--- gcc/gimplify.c	(revision 178354)
+++ gcc/gimplify.c	(working copy)
@@ -7053,6 +7053,32 @@  gimplify_expr (tree *expr_p, gimple_seq
 	  }
 	  break;
 
+	case VEC_SHUFFLE_EXPR:
+	  {
+	    enum gimplify_status r0, r1, r2;
+
+	    if (TREE_OPERAND (*expr_p, 0) == TREE_OPERAND (*expr_p, 1))
+	      {
+		r0 = r1 = gimplify_expr (&TREE_OPERAND (*expr_p, 0), pre_p,
+					 post_p, is_gimple_val, fb_rvalue);
+		TREE_OPERAND (*expr_p, 1) = TREE_OPERAND (*expr_p, 0);
+	      }
+	    else
+	      {
+		 r0 = gimplify_expr (&TREE_OPERAND (*expr_p, 0), pre_p,
+				     post_p, is_gimple_val, fb_rvalue);
+		 r1 = gimplify_expr (&TREE_OPERAND (*expr_p, 1), pre_p,
+				     post_p, is_gimple_val, fb_rvalue);
+	      }
+
+	    r2 = gimplify_expr (&TREE_OPERAND (*expr_p, 2), pre_p,
+				post_p, is_gimple_val, fb_rvalue);
+	    recalculate_side_effects (*expr_p);
+
+	    ret = MIN (r0, MIN (r1, r2));
+	    break;
+	  }
+
 	case TARGET_MEM_REF:
 	  {
 	    enum gimplify_status r0 = GS_ALL_DONE, r1 = GS_ALL_DONE;
Index: gcc/tree.def
===================================================================
--- gcc/tree.def	(revision 178354)
+++ gcc/tree.def	(working copy)
@@ -497,6 +497,19 @@  DEFTREECODE (COND_EXPR, "cond_expr", tcc
 */
 DEFTREECODE (VEC_COND_EXPR, "vec_cond_expr", tcc_expression, 3)
 
+/* Vector shuffle expression.  A = VEC_SHUFFLE_EXPR<v0, v1, mask>
+   means
+
+   foreach i in length (mask):
+     A[i] = mask[i] < length (v0) ? v0[mask[i]] : v1[mask[i] - length (v0)]
+
+   V0 and V1 are vectors of the same type.  MASK is an integer-typed
+   vector.  The number of MASK elements must be the same as the
+   number of elements in V0 and V1.  The size of the inner type
+   of MASK and of V0 and V1 must be the same.
+*/
+DEFTREECODE (VEC_SHUFFLE_EXPR, "vec_shuffle_expr", tcc_expression, 3)
+
 /* Declare local variables, including making RTL and allocating space.
    BIND_EXPR_VARS is a chain of VAR_DECL nodes for the variables.
    BIND_EXPR_BODY is the body, the expression to be computed using
Index: gcc/tree-inline.c
===================================================================
--- gcc/tree-inline.c	(revision 178354)
+++ gcc/tree-inline.c	(working copy)
@@ -3285,6 +3285,7 @@  estimate_operator_cost (enum tree_code c
        ??? We may consider mapping RTL costs to this.  */
     case COND_EXPR:
     case VEC_COND_EXPR:
+    case VEC_SHUFFLE_EXPR:
 
     case PLUS_EXPR:
     case POINTER_PLUS_EXPR:
Index: gcc/tree-vect-generic.c
===================================================================
--- gcc/tree-vect-generic.c	(revision 178354)
+++ gcc/tree-vect-generic.c	(working copy)
@@ -30,6 +30,7 @@  along with GCC; see the file COPYING3.
 #include "tree-pass.h"
 #include "flags.h"
 #include "ggc.h"
+#include "diagnostic.h"
 
 /* Need to include rtl.h, expr.h, etc. for optabs.  */
 #include "expr.h"
@@ -432,6 +433,279 @@  type_for_widest_vector_mode (enum machin
     }
 }
 
+
+/* Build a reference to the element IDX of the vector VECT.  The function
+   returns either the element itself, a BIT_FIELD_REF, or an
+   ARRAY_REF expression; it returns error_mark_node on failure.
+
+   GSI is required to insert temporary variables while building a
+   reference to the element of the vector VECT.
+
+   PTMPVEC is a pointer to the temporary variable for caching
+   purposes.  If PTMPVEC is NULL, a new temporary variable
+   will be created.  */
+static tree
+vector_element (gimple_stmt_iterator *gsi, tree vect, tree idx, tree *ptmpvec)
+{
+  tree type;
+  gimple asgn; 
+  unsigned HOST_WIDE_INT maxval;
+  tree tmpvec; 
+  tree indextype, arraytype;
+  bool need_asgn = true;
+
+  gcc_assert (TREE_CODE (TREE_TYPE (vect)) == VECTOR_TYPE);
+
+  type = TREE_TYPE (vect);
+  if (TREE_CODE (idx) == INTEGER_CST)
+    {
+      unsigned HOST_WIDE_INT index;
+
+      if (!host_integerp (idx, 1)
+           || (index = tree_low_cst (idx, 1)) > TYPE_VECTOR_SUBPARTS (type)-1)
+        return error_mark_node;
+
+      if (TREE_CODE (vect) == VECTOR_CST)
+        {
+            unsigned i;
+            tree vals = TREE_VECTOR_CST_ELTS (vect);
+            for (i = 0; vals; vals = TREE_CHAIN (vals), ++i)
+              if (i == index)
+                 return TREE_VALUE (vals);
+            return error_mark_node;
+        }
+      else if (TREE_CODE (vect) == CONSTRUCTOR)
+        {
+          unsigned i;
+          VEC (constructor_elt, gc) *vals = CONSTRUCTOR_ELTS (vect);
+          constructor_elt *elt;
+
+          for (i = 0; VEC_iterate (constructor_elt, vals, i, elt); i++)
+            if (operand_equal_p (elt->index, idx, 0))
+              return elt->value; 
+          return fold_convert (TREE_TYPE (type), integer_zero_node);
+        }
+      else if (TREE_CODE (vect) == SSA_NAME)
+        {
+          tree el;
+          gimple vectdef = SSA_NAME_DEF_STMT (vect);
+          if (gimple_assign_single_p (vectdef)
+              && (el = vector_element (gsi, gimple_assign_rhs1 (vectdef), 
+                                       idx, ptmpvec)) 
+                 != error_mark_node)
+            return el;
+          else
+            {
+              tree size = TYPE_SIZE (TREE_TYPE (type));
+              tree pos = fold_build2 (MULT_EXPR, TREE_TYPE (idx), 
+                                      idx, size);
+              return fold_build3 (BIT_FIELD_REF, TREE_TYPE (type), 
+                             vect, size, pos);
+            }
+        }
+      else
+        return error_mark_node;
+    }
+  
+  if (!ptmpvec)
+    tmpvec = create_tmp_var (TREE_TYPE (vect), "vectmp");
+  else if (!*ptmpvec)
+    tmpvec = *ptmpvec = create_tmp_var (TREE_TYPE (vect), "vectmp");
+  else
+    {
+      tmpvec = *ptmpvec;
+      need_asgn = false;
+    }
+  
+  if (need_asgn)
+    {
+      TREE_ADDRESSABLE (tmpvec) = 1;
+      asgn = gimple_build_assign (tmpvec, vect);
+      gsi_insert_before (gsi, asgn, GSI_SAME_STMT);
+    }
+
+  maxval = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vect)) -1;
+  indextype = build_index_type (size_int (maxval));
+  arraytype = build_array_type (TREE_TYPE (type), indextype);
+  
+  return build4 (ARRAY_REF, TREE_TYPE (type),
+                 build1 (VIEW_CONVERT_EXPR, arraytype, tmpvec),
+                 idx, NULL_TREE, NULL_TREE);
+
+
+}
+
+/* Check if VEC_SHUFFLE_EXPR within the given setting is supported
+   by hardware, or lower it piecewise.
+
+   When VEC_SHUFFLE_EXPR has the same first and second operands:
+   VEC_SHUFFLE_EXPR <v0, v0, mask> the lowered version would be
+   {v0[mask[0]], v0[mask[1]], ...}
+   MASK and V0 must have the same number of elements.
+
+   Otherwise VEC_SHUFFLE_EXPR <v0, v1, mask> is lowered to
+   {mask[0] < len(v0) ? v0[mask[0]] : v1[mask[0] - len(v0)], ...}
+   V0 and V1 must have the same type.  MASK, V0, V1 must have the
+   same number of elements.  */
+static void
+lower_vec_shuffle (gimple_stmt_iterator *gsi, location_t loc)
+{
+#define TRAP_RETURN(new_stmt, stmt, gsi, vec0) \
+do { \
+  new_stmt = gimple_build_call (built_in_decls[BUILT_IN_TRAP], 0); \
+  gsi_insert_before (gsi, new_stmt,  GSI_SAME_STMT); \
+  split_block (gimple_bb (new_stmt), new_stmt); \
+  new_stmt = gimple_build_assign (gimple_call_lhs (stmt), vec0); \
+  gsi_replace (gsi, new_stmt, false); \
+  return; \
+} while (0) 
+ 
+  gimple stmt = gsi_stmt (*gsi);
+  tree mask = gimple_assign_rhs3 (stmt);
+  tree vec0 = gimple_assign_rhs1 (stmt);
+  tree vec1 = gimple_assign_rhs2 (stmt);
+  unsigned els = TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask));
+  tree type0 = TREE_TYPE (TREE_TYPE (vec0));
+  VEC(constructor_elt,gc) *v = NULL;
+  tree vectype, constr;
+  gimple new_stmt;
+  tree vec0tmp = NULL_TREE, masktmp = NULL_TREE;
+
+  if (expand_vec_shuffle_expr_p (TYPE_MODE (TREE_TYPE (vec0)), vec0, vec1, mask))
+    {
+      tree t;
+
+      t = gimplify_build3 (gsi, VEC_SHUFFLE_EXPR, TREE_TYPE (vec0),
+			   vec0, vec1, mask);
+      gimple_assign_set_rhs_from_tree (gsi, t);
+      /* Statement should be updated by callee.  */
+      return;
+    }
+
+  
+  if (vec0 == vec1)
+    {
+      unsigned i;
+      tree vec0tmp = NULL_TREE;
+      
+      v = VEC_alloc (constructor_elt, gc, els);
+      for (i = 0; i < els; i++)
+        {
+          tree idxval, vecel, t;
+	   
+	  idxval = vector_element (gsi, mask, size_int (i), &masktmp);
+          if (idxval == error_mark_node)
+            {
+              warning_at (loc, 0, "Invalid shuffling mask index %i", i);
+              TRAP_RETURN (new_stmt, stmt, gsi, vec0);
+            }
+
+	  vecel = vector_element (gsi, vec0, idxval, &vec0tmp);
+          if (vecel == error_mark_node)
+            {
+              warning_at (loc, 0, "Invalid shuffling arguments");
+              TRAP_RETURN (new_stmt, stmt, gsi, vec0);
+            }
+          
+          t = force_gimple_operand_gsi (gsi, vecel, true, 
+					NULL_TREE, true, GSI_SAME_STMT);
+          CONSTRUCTOR_APPEND_ELT (v, size_int (i), t);
+        }
+    }
+  else
+    {
+      unsigned i;
+      tree var = create_tmp_var (type0, "vecel");
+      tree vec1tmp = NULL_TREE;
+
+      v = VEC_alloc (constructor_elt, gc, els);
+      for (i = 0; i < els; i++)
+        {
+          tree idxval, idx1val, cond, elval0, elval1, condexpr, t, ssatmp;
+          tree vec0el, vec1el;
+          gimple asgn;
+          
+          idxval = vector_element (gsi, mask, size_int (i), &masktmp);
+	  if (idxval == error_mark_node)
+            {
+              warning_at (loc, 0, "Invalid shuffling mask index %i", i);
+              TRAP_RETURN (new_stmt, stmt, gsi, vec0);
+            }
+          
+          if (TREE_CODE (idxval) == INTEGER_CST)
+            {
+              if (tree_int_cst_lt (idxval, size_int (els)))
+                {
+                  vec0el = vector_element (gsi, vec0, idxval, &vec0tmp);
+                  t = force_gimple_operand_gsi (gsi, vec0el,
+                                    true, NULL_TREE, true, GSI_SAME_STMT);
+                }
+              else if (tree_int_cst_lt (idxval, size_int (2*els)))
+                {
+                  idx1val = fold_build2 (MINUS_EXPR, TREE_TYPE (idxval),
+                        idxval, build_int_cst (TREE_TYPE (idxval), els));
+                  
+                  vec1el = vector_element (gsi, vec1, idx1val, &vec1tmp);
+                  t = force_gimple_operand_gsi (gsi, vec1el, true, 
+						NULL_TREE, true, GSI_SAME_STMT);
+                }
+              else
+                {
+                  warning_at (loc, 0, "Invalid shuffling mask index %i", i);
+                  TRAP_RETURN (new_stmt, stmt, gsi, vec0);
+                }
+            }
+          else
+            {
+
+              idx1val = fold_build2 (MINUS_EXPR, TREE_TYPE (idxval),
+                            idxval, build_int_cst (TREE_TYPE (idxval), els));
+              idx1val = force_gimple_operand_gsi (gsi, idx1val, 
+                                true, NULL_TREE, true, GSI_SAME_STMT);
+              cond = build2 (GT_EXPR, boolean_type_node, \
+                             idxval, convert (type0, size_int (els - 1)));
+              
+	      vec0el = vector_element (gsi, vec0, idxval, &vec0tmp);
+              if (vec0el == error_mark_node)
+                {
+                  warning_at (loc, 0, "Invalid shuffling arguments");
+                  TRAP_RETURN (new_stmt, stmt, gsi, vec0);
+                }
+
+              elval0 = force_gimple_operand_gsi (gsi, vec0el, 
+                                true, NULL_TREE, true, GSI_SAME_STMT);
+	      
+	      vec1el = vector_element (gsi, vec1, idx1val, &vec1tmp);
+              if (vec1el == error_mark_node)
+                {
+                  warning_at (loc, 0, "Invalid shuffling arguments");
+                  TRAP_RETURN (new_stmt, stmt, gsi, vec0);
+                }
+
+              elval1 = force_gimple_operand_gsi (gsi, vec1el,
+                                true, NULL_TREE, true, GSI_SAME_STMT);
+
+              condexpr = fold_build3 (COND_EXPR, type0, cond, \
+                                      elval1, elval0);
+
+              t = force_gimple_operand_gsi (gsi, condexpr, true, \
+                                        NULL_TREE, true, GSI_SAME_STMT);
+            }
+          
+          asgn = gimple_build_assign (var, t);
+          ssatmp = make_ssa_name (var, asgn);
+          gimple_assign_set_lhs (asgn, ssatmp);
+          gsi_insert_before (gsi, asgn, GSI_SAME_STMT);
+          CONSTRUCTOR_APPEND_ELT (v, size_int (i), ssatmp);
+        }
+    }
+  
+  vectype = build_vector_type (type0, els);
+  constr = build_constructor (vectype, v);
+  gimple_assign_set_rhs_from_tree (gsi, constr);
+  /* Statement should be updated by callee.  */
+}
+
 /* Process one statement.  If we identify a vector operation, expand it.  */
 
 static void
@@ -451,6 +725,13 @@  expand_vector_operations_1 (gimple_stmt_
   code = gimple_assign_rhs_code (stmt);
   rhs_class = get_gimple_rhs_class (code);
 
+  if (code == VEC_SHUFFLE_EXPR)
+    {
+      lower_vec_shuffle (gsi, gimple_location (stmt));
+      gimple_set_modified (gsi_stmt (*gsi), true);
+      update_stmt (gsi_stmt (*gsi));
+    }
+
   if (rhs_class != GIMPLE_UNARY_RHS && rhs_class != GIMPLE_BINARY_RHS)
     return;
 
@@ -612,10 +893,11 @@  expand_vector_operations_1 (gimple_stmt_
 /* Use this to lower vector operations introduced by the vectorizer,
    if it may need the bit-twiddling tricks implemented in this file.  */
 
+
 static bool
-gate_expand_vector_operations (void)
+gate_expand_vector_operations_noop (void)
 {
-  return flag_tree_vectorize != 0;
+  return optimize == 0;
 }
 
 static unsigned int
@@ -648,7 +930,7 @@  struct gimple_opt_pass pass_lower_vector
  {
   GIMPLE_PASS,
   "veclower",				/* name */
-  0,					/* gate */
+  gate_expand_vector_operations_noop,   /* gate */
   expand_vector_operations,		/* execute */
   NULL,					/* sub */
   NULL,					/* next */
@@ -660,7 +942,8 @@  struct gimple_opt_pass pass_lower_vector
   0,					/* todo_flags_start */
   TODO_update_ssa	                /* todo_flags_finish */
     | TODO_verify_ssa
-    | TODO_verify_stmts | TODO_verify_flow
+    | TODO_verify_stmts | TODO_verify_flow 
+    | TODO_cleanup_cfg
  }
 };
 
@@ -669,7 +952,7 @@  struct gimple_opt_pass pass_lower_vector
  {
   GIMPLE_PASS,
   "veclower2",				/* name */
-  gate_expand_vector_operations,	/* gate */
+  0,	                                /* gate */
   expand_vector_operations,		/* execute */
   NULL,					/* sub */
   NULL,					/* next */
@@ -682,6 +965,7 @@  struct gimple_opt_pass pass_lower_vector
   TODO_update_ssa	                /* todo_flags_finish */
     | TODO_verify_ssa
     | TODO_verify_stmts | TODO_verify_flow
+    | TODO_cleanup_cfg
  }
 };
 
Index: gcc/gimple.c
===================================================================
--- gcc/gimple.c	(revision 178354)
+++ gcc/gimple.c	(working copy)
@@ -2615,6 +2615,7 @@  get_gimple_rhs_num_ops (enum tree_code c
       || (SYM) == WIDEN_MULT_MINUS_EXPR					    \
       || (SYM) == DOT_PROD_EXPR						    \
       || (SYM) == REALIGN_LOAD_EXPR					    \
+      || (SYM) == VEC_SHUFFLE_EXPR					    \
       || (SYM) == FMA_EXPR) ? GIMPLE_TERNARY_RHS			    \
    : ((SYM) == COND_EXPR						    \
       || (SYM) == CONSTRUCTOR						    \
Index: gcc/tree-cfg.c
===================================================================
--- gcc/tree-cfg.c	(revision 178354)
+++ gcc/tree-cfg.c	(working copy)
@@ -3713,6 +3713,7 @@  verify_gimple_assign_ternary (gimple stm
 
     case DOT_PROD_EXPR:
     case REALIGN_LOAD_EXPR:
+    case VEC_SHUFFLE_EXPR:
       /* FIXME.  */
       return false;
 
Index: gcc/passes.c
===================================================================
--- gcc/passes.c	(revision 178354)
+++ gcc/passes.c	(working copy)
@@ -1354,7 +1354,6 @@  init_optimization_passes (void)
 	  NEXT_PASS (pass_vectorize);
 	    {
 	      struct opt_pass **p = &pass_vectorize.pass.sub;
-	      NEXT_PASS (pass_lower_vector_ssa);
 	      NEXT_PASS (pass_dce_loop);
 	    }
           NEXT_PASS (pass_predcom);
@@ -1366,6 +1365,7 @@  init_optimization_passes (void)
 	  NEXT_PASS (pass_lim);
 	  NEXT_PASS (pass_tree_loop_done);
 	}
+      NEXT_PASS (pass_lower_vector_ssa);
       NEXT_PASS (pass_cse_reciprocals);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_vrp);
Index: gcc/c-parser.c
===================================================================
--- gcc/c-parser.c	(revision 178354)
+++ gcc/c-parser.c	(working copy)
@@ -5989,6 +5989,41 @@  c_parser_alignof_expression (c_parser *p
     }
 }
 
+/* Parse the parenthesized argument list of a builtin that is a front-end
+   interface to a middle-end node (COMPLEX_EXPR, VEC_SHUFFLE_EXPR, ...).
+   BNAME is the name of the builtin, used in diagnostics.  On success
+   return true and store the arguments in *EXPR_LIST (NULL if the list is
+   empty); return false if a parenthesis is missing.  */
+static bool
+c_parser_get_builtin_args (c_parser *parser, const char *bname,
+			   VEC(tree,gc) **expr_list)
+{
+  location_t start_loc = c_parser_peek_token (parser)->location;
+
+  *expr_list = NULL;
+
+  /* The builtin name must be followed by an opening parenthesis.  */
+  if (!c_parser_next_token_is (parser, CPP_OPEN_PAREN))
+    {
+      error_at (start_loc, "cannot take address of %qs", bname);
+      return false;
+    }
+  c_parser_consume_token (parser);
+
+  /* Non-empty list: parse the expressions, then demand the closing
+     parenthesis.  */
+  if (c_parser_next_token_is_not (parser, CPP_CLOSE_PAREN))
+    {
+      *expr_list = c_parser_expr_list (parser, false, false, NULL);
+      return c_parser_require (parser, CPP_CLOSE_PAREN, "expected %<)%>");
+    }
+
+  /* Empty list: just swallow the closing parenthesis.  */
+  c_parser_consume_token (parser);
+  return true;
+}
+
+
 /* Parse a postfix expression (C90 6.3.1-6.3.2, C99 6.5.1-6.5.2).
 
    postfix-expression:
@@ -6027,6 +6062,10 @@  c_parser_alignof_expression (c_parser *p
 			     assignment-expression )
      __builtin_types_compatible_p ( type-name , type-name )
      __builtin_complex ( assignment-expression , assignment-expression )
+     __builtin_shuffle ( assignment-expression , assignment-expression )
+     __builtin_shuffle ( assignment-expression , 
+			 assignment-expression ,
+			 assignment-expression )
 
    offsetof-member-designator:
      identifier
@@ -6047,7 +6086,7 @@  c_parser_alignof_expression (c_parser *p
 static struct c_expr
 c_parser_postfix_expression (c_parser *parser)
 {
-  struct c_expr expr, e1, e2, e3;
+  struct c_expr expr, e1;
   struct c_type_name *t1, *t2;
   location_t loc = c_parser_peek_token (parser)->location;;
   expr.original_code = ERROR_MARK;
@@ -6333,45 +6372,42 @@  c_parser_postfix_expression (c_parser *p
 	  }
 	  break;
 	case RID_CHOOSE_EXPR:
-	  c_parser_consume_token (parser);
-	  if (!c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
-	    {
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  loc = c_parser_peek_token (parser)->location;
-	  e1 = c_parser_expr_no_commas (parser, NULL);
-	  if (!c_parser_require (parser, CPP_COMMA, "expected %<,%>"))
-	    {
-	      c_parser_skip_until_found (parser, CPP_CLOSE_PAREN, NULL);
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  e2 = c_parser_expr_no_commas (parser, NULL);
-	  if (!c_parser_require (parser, CPP_COMMA, "expected %<,%>"))
-	    {
-	      c_parser_skip_until_found (parser, CPP_CLOSE_PAREN, NULL);
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  e3 = c_parser_expr_no_commas (parser, NULL);
-	  c_parser_skip_until_found (parser, CPP_CLOSE_PAREN,
-				     "expected %<)%>");
 	  {
-	    tree c;
+	    VEC(tree,gc) *expr_list;
+	    tree e1value, e2value, e3value, c;
 
-	    c = e1.value;
-	    mark_exp_read (e2.value);
-	    mark_exp_read (e3.value);
+	    c_parser_consume_token (parser);
+	    if (!c_parser_get_builtin_args (parser, 
+					    "__builtin_choose_expr", &expr_list))
+	      {
+		expr.value = error_mark_node;
+		break;
+	      }
+
+	    if (VEC_length (tree, expr_list) != 3)
+	      {
+		error_at (loc, "wrong number of arguments to "
+			       "%<__builtin_choose_expr%>");
+		expr.value = error_mark_node;
+		break;
+	      }
+	    
+	    e1value = VEC_index (tree, expr_list, 0);
+	    e2value = VEC_index (tree, expr_list, 1);
+	    e3value = VEC_index (tree, expr_list, 2);
+
+	    c = e1value;
+	    mark_exp_read (e2value);
+	    mark_exp_read (e3value);
 	    if (TREE_CODE (c) != INTEGER_CST
 		|| !INTEGRAL_TYPE_P (TREE_TYPE (c)))
 	      error_at (loc,
 			"first argument to %<__builtin_choose_expr%> not"
 			" a constant");
 	    constant_expression_warning (c);
-	    expr = integer_zerop (c) ? e3 : e2;
+	    expr.value = integer_zerop (c) ? e3value : e2value;
+	    break;
 	  }
-	  break;
 	case RID_TYPES_COMPATIBLE_P:
 	  c_parser_consume_token (parser);
 	  if (!c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
@@ -6410,57 +6446,94 @@  c_parser_postfix_expression (c_parser *p
 	  }
 	  break;
 	case RID_BUILTIN_COMPLEX:
-	  c_parser_consume_token (parser);
-	  if (!c_parser_require (parser, CPP_OPEN_PAREN, "expected %<(%>"))
-	    {
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  loc = c_parser_peek_token (parser)->location;
-	  e1 = c_parser_expr_no_commas (parser, NULL);
-	  if (!c_parser_require (parser, CPP_COMMA, "expected %<,%>"))
-	    {
-	      c_parser_skip_until_found (parser, CPP_CLOSE_PAREN, NULL);
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  e2 = c_parser_expr_no_commas (parser, NULL);
-	  c_parser_skip_until_found (parser, CPP_CLOSE_PAREN,
-				     "expected %<)%>");
-	  mark_exp_read (e1.value);
-	  if (TREE_CODE (e1.value) == EXCESS_PRECISION_EXPR)
-	    e1.value = convert (TREE_TYPE (e1.value),
-				TREE_OPERAND (e1.value, 0));
-	  mark_exp_read (e2.value);
-	  if (TREE_CODE (e2.value) == EXCESS_PRECISION_EXPR)
-	    e2.value = convert (TREE_TYPE (e2.value),
-				TREE_OPERAND (e2.value, 0));
-	  if (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (e1.value))
-	      || DECIMAL_FLOAT_TYPE_P (TREE_TYPE (e1.value))
-	      || !SCALAR_FLOAT_TYPE_P (TREE_TYPE (e2.value))
-	      || DECIMAL_FLOAT_TYPE_P (TREE_TYPE (e2.value)))
-	    {
-	      error_at (loc, "%<__builtin_complex%> operand "
-			"not of real binary floating-point type");
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  if (TYPE_MAIN_VARIANT (TREE_TYPE (e1.value))
-	      != TYPE_MAIN_VARIANT (TREE_TYPE (e2.value)))
-	    {
-	      error_at (loc,
-			"%<__builtin_complex%> operands of different types");
-	      expr.value = error_mark_node;
-	      break;
-	    }
-	  if (!flag_isoc99)
-	    pedwarn (loc, OPT_pedantic,
-		     "ISO C90 does not support complex types");
-	  expr.value = build2 (COMPLEX_EXPR,
-			       build_complex_type (TYPE_MAIN_VARIANT
-						   (TREE_TYPE (e1.value))),
-			       e1.value, e2.value);
-	  break;
+	  { 
+	    VEC(tree,gc) *expr_list;
+	    tree e1value, e2value;
+	    
+	    c_parser_consume_token (parser);
+	    if (!c_parser_get_builtin_args (parser, 
+					    "__builtin_complex", &expr_list))
+	      {
+		expr.value = error_mark_node;
+		break;
+	      }
+
+	    if (VEC_length (tree, expr_list) != 2)
+	      {
+		error_at (loc, "wrong number of arguments to "
+			       "%<__builtin_complex%>");
+		expr.value = error_mark_node;
+		break;
+	      }
+	    
+	    e1value = VEC_index (tree, expr_list, 0);
+	    e2value = VEC_index (tree, expr_list, 1);
+
+	    mark_exp_read (e1value);
+	    if (TREE_CODE (e1value) == EXCESS_PRECISION_EXPR)
+	      e1value = convert (TREE_TYPE (e1value),
+				 TREE_OPERAND (e1value, 0));
+	    mark_exp_read (e2value);
+	    if (TREE_CODE (e2value) == EXCESS_PRECISION_EXPR)
+	      e2value = convert (TREE_TYPE (e2value),
+				 TREE_OPERAND (e2value, 0));
+	    if (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (e1value))
+		|| DECIMAL_FLOAT_TYPE_P (TREE_TYPE (e1value))
+		|| !SCALAR_FLOAT_TYPE_P (TREE_TYPE (e2value))
+		|| DECIMAL_FLOAT_TYPE_P (TREE_TYPE (e2value)))
+	      {
+		error_at (loc, "%<__builtin_complex%> operand "
+			  "not of real binary floating-point type");
+		expr.value = error_mark_node;
+		break;
+	      }
+	    if (TYPE_MAIN_VARIANT (TREE_TYPE (e1value))
+		!= TYPE_MAIN_VARIANT (TREE_TYPE (e2value)))
+	      {
+		error_at (loc,
+			  "%<__builtin_complex%> operands of different types");
+		expr.value = error_mark_node;
+		break;
+	      }
+	    if (!flag_isoc99)
+	      pedwarn (loc, OPT_pedantic,
+		       "ISO C90 does not support complex types");
+	    expr.value = build2 (COMPLEX_EXPR,
+				 build_complex_type (TYPE_MAIN_VARIANT
+						     (TREE_TYPE (e1value))),
+				 e1value, e2value);
+	    break;
+	  }
+	case RID_BUILTIN_SHUFFLE:
+	  {
+	    VEC(tree,gc) *expr_list;
+	    
+	    c_parser_consume_token (parser);
+	    if (!c_parser_get_builtin_args (parser, 
+					    "__builtin_shuffle", &expr_list))
+	      {
+		expr.value = error_mark_node;
+		break;
+	      }
+
+	    if (VEC_length (tree, expr_list) == 2)
+	      expr.value = c_build_vec_shuffle_expr 
+				(loc, VEC_index (tree, expr_list, 0),
+				 VEC_index (tree, expr_list, 0),
+				 VEC_index (tree, expr_list, 1));
+	    else if (VEC_length (tree, expr_list) == 3)
+	      expr.value = c_build_vec_shuffle_expr 
+				(loc, VEC_index (tree, expr_list, 0),
+				 VEC_index (tree, expr_list, 1),
+				 VEC_index (tree, expr_list, 2));
+	    else
+	      {
+		error_at (loc, "wrong number of arguments to "
+			       "%<__builtin_shuffle%>");
+		expr.value = error_mark_node;
+	      }
+	    break;
+	  }
 	case RID_AT_SELECTOR:
 	  gcc_assert (c_dialect_objc ());
 	  c_parser_consume_token (parser);
Index: gcc/config/i386/sse.md
===================================================================
--- gcc/config/i386/sse.md	(revision 178354)
+++ gcc/config/i386/sse.md	(working copy)
@@ -231,6 +231,12 @@  (define_mode_attr sseinsnmode
    (V4SF "V4SF") (V2DF "V2DF")
    (TI "TI") (V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI")])
 
+;; Mapping of 128bit vector modes to the integer vector mode of the same shape
+(define_mode_attr sseshuffint
+  [(V16QI "V16QI") (V8HI "V8HI") 
+   (V4SI "V4SI")  (V2DI "V2DI")
+   (V4SF "V4SI") (V2DF "V2DI")])
+
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr sseintvecmode
   [(V8SF "V8SI") (V4DF "V4DI")
@@ -6234,6 +6240,18 @@  (define_expand "vconduv2di"
   DONE;
 })
 
+;; Variable vector shuffle; expanded through pshufb, hence SSSE3.
+(define_expand "vshuffle<mode>"
+  [(match_operand:V_128 0 "register_operand" "")
+   (match_operand:V_128 1 "general_operand" "")
+   (match_operand:<sseshuffint> 2 "general_operand" "")]
+  "TARGET_SSSE3 || TARGET_AVX"
+{
+  bool ok = ix86_expand_vshuffle (operands);
+  gcc_assert (ok);
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel bitwise logical operations
Index: gcc/config/i386/i386-protos.h
===================================================================
--- gcc/config/i386/i386-protos.h	(revision 178354)
+++ gcc/config/i386/i386-protos.h	(working copy)
@@ -118,6 +118,7 @@  extern bool ix86_expand_int_movcc (rtx[]
 extern bool ix86_expand_fp_movcc (rtx[]);
 extern bool ix86_expand_fp_vcond (rtx[]);
 extern bool ix86_expand_int_vcond (rtx[]);
+extern bool ix86_expand_vshuffle (rtx[]);
 extern void ix86_expand_sse_unpack (rtx[], bool, bool);
 extern bool ix86_expand_int_addcc (rtx[]);
 extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 178354)
+++ gcc/config/i386/i386.c	(working copy)
@@ -18693,6 +18693,96 @@  ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
+/* Expand a variable vector shuffle: OPERANDS[0] is the destination,
+   OPERANDS[1] the 128-bit vector to select from and OPERANDS[2] the
+   integer vector of element indices.  The element indices are turned
+   into a byte selector for pshufb.  Always returns true.  */
+bool
+ix86_expand_vshuffle (rtx operands[])
+{
+  rtx target = operands[0];
+  rtx op0 = operands[1];
+  rtx mask = operands[2];
+  rtx mm, vt, cv0, t1;
+  enum machine_mode mode = GET_MODE (op0);
+  enum machine_mode maskmode = GET_MODE (mask);
+  enum machine_mode maskinner = GET_MODE_INNER (mode);
+  rtx vec[16];
+  int w, i, j;
+
+  /* pshufb is an SSSE3 instruction.  */
+  gcc_assert ((TARGET_SSSE3 || TARGET_AVX) && GET_MODE_BITSIZE (mode) == 128);
+
+  op0 = force_reg (mode, op0);
+  mask = force_reg (maskmode, mask);
+
+  /* Number of elements in the vector.  */
+  w = GET_MODE_BITSIZE (maskmode) / GET_MODE_BITSIZE (maskinner);
+
+  /* mask = mask & {w-1, w-1, w-1,...} */
+  for (i = 0; i < w; i++)
+    vec[i] = GEN_INT (w - 1);
+
+  mm = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+  mm = force_reg (maskmode, mm);
+
+  mask = gen_rtx_AND (maskmode, mask, mm);
+
+  /* Convert mask to vector of chars.  */
+  mask = simplify_gen_subreg (V16QImode, mask, maskmode, 0);
+  mask = force_reg (V16QImode, mask);
+
+  /* Build a helper mask which we will use in pshufb
+     (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
+     (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}
+     ...  */
+  for (i = 0; i < w; i++)
+    for (j = 0; j < 16/w; j++)
+      vec[i*(16/w)+j] = GEN_INT (i*16/w);
+
+  vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+  vt = force_reg (V16QImode, vt);
+
+  t1 = gen_reg_rtx (V16QImode);
+  emit_insn (gen_ssse3_pshufbv16qi3 (t1, mask, vt));
+  mm = t1;
+
+  /* MM contains now something like
+     mm = {m[0], .., m[0], m[k], .., m[k], ... }, where
+     m[i] is an index of the element in the vector we are
+     selecting from.
+
+     Convert it into the byte positions by doing
+     mm = mm * {16/w, 16/w, ...}
+     mm = mm + {0,1,..,16/w-1, 0,1,..,16/w-1, ...}  */
+  for (i = 0; i < 16; i++)
+    vec[i] = GEN_INT (16/w);
+
+  cv0 = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+  cv0 = force_reg (V16QImode, cv0);
+  mm = gen_rtx_MULT (V16QImode, mm, cv0);
+
+  for (i = 0; i < w; i++)
+    for (j = 0; j < 16/w; j++)
+      vec[i*(16/w)+j] = GEN_INT (j);
+
+  cv0 = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+  cv0 = force_reg (V16QImode, cv0);
+  mm = gen_rtx_PLUS (V16QImode, mm, cv0);
+  mm = force_reg (V16QImode, mm);
+
+  /* Convert OP0 to vector of chars.  */
+  t1 = gen_reg_rtx (V16QImode);
+  op0 = simplify_gen_subreg (V16QImode, op0, mode, 0);
+  op0 = force_reg (V16QImode, op0);
+  emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mm));
+
+  /* Convert it back from vector of chars to the original mode.  */
+  t1 = simplify_gen_subreg (mode, t1, V16QImode, 0);
+  emit_insn (gen_rtx_SET (VOIDmode, target, t1));
+  return true;
+}
+
 /* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
    true if we should do zero extension, else sign extension.  HIGH_P is
    true if we want the N/2 high elements, else the low elements.  */
@@ -30911,6 +31001,9 @@  struct expand_vec_perm_d
 
 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
+static int extract_vec_perm_cst (struct expand_vec_perm_d *, tree);
+static bool ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask);
+
 
 /* Get a vector mode of the same size as the original but with elements
    twice as wide.  This is only guaranteed to apply to integral vectors.  */
@@ -34576,10 +34669,10 @@  ix86_vectorize_builtin_vec_perm_ok (tree
 
   vec_mask = extract_vec_perm_cst (&d, mask);
 
-  /* This hook is cannot be called in response to something that the
-     user does (unlike the builtin expander) so we shouldn't ever see
-     an error generated from the extract.  */
-  gcc_assert (vec_mask > 0 && vec_mask <= 3);
+  /* Check whether the mask can be applied to the vector type.  */
+  if (vec_mask < 0 || vec_mask > 3)
+    return false;
+  
   one_vec = (vec_mask != 3);
 
   /* Implementable with shufps or pshufd.  */
Index: gcc/tree-ssa-operands.c
===================================================================
--- gcc/tree-ssa-operands.c	(revision 178354)
+++ gcc/tree-ssa-operands.c	(working copy)
@@ -943,6 +943,7 @@  get_expr_operands (gimple stmt, tree *ex
 
     case COND_EXPR:
     case VEC_COND_EXPR:
+    case VEC_SHUFFLE_EXPR:
       get_expr_operands (stmt, &TREE_OPERAND (expr, 0), uflags);
       get_expr_operands (stmt, &TREE_OPERAND (expr, 1), uflags);
       get_expr_operands (stmt, &TREE_OPERAND (expr, 2), uflags);