Patchwork [RFC] Masked load/store vectorization

login
register
mail settings
Submitter Jakub Jelinek
Date Nov. 2, 2012, 5:14 p.m.
Message ID <20121102171415.GD1881@tucnak.redhat.com>
Download mbox | patch
Permalink /patch/196605/
State New
Headers show

Comments

Jakub Jelinek - Nov. 2, 2012, 5:14 p.m.
Hi!

This is an RFC patch that implements masked (conditional) vector
loads/stores roughly as discussed with richi (i.e. internal function
calls, added during if-conversion pass, and if not vectorized
removed after vectorization again).
The if-unconversion pass is just very basic; I guess it could often
do a better job by trying to guess which statements in the bb with
MASK_LOAD/MASK_STORE were in which bb, depending on what condition,
because right now vrp2 is able to fix up something, but not undo
COND_EXPRs on PHI nodes, and there is no sink pass after vect.

Testcases it can handle include e.g.
extern void abort (void);

__attribute__((noinline, noclone)) void
foo (float *__restrict x, float *__restrict y, float *__restrict z)
{
  float *__restrict p = __builtin_assume_aligned (x, 32);
  float *__restrict q = __builtin_assume_aligned (y, 32);
  float *__restrict r = __builtin_assume_aligned (z, 32);
  int i;
  for (i = 0; i < 1024; i++)
    {
      if (p[i] < 0.0f)
        q[i] = p[i] + 2.0f;
      else
        p[i] = r[i] + 3.0f;
    }
}

float a[1024] __attribute__((aligned (32)));
float b[1024] __attribute__((aligned (32)));
float c[1024] __attribute__((aligned (32)));

int
main ()
{
  int i;
  for (i = 0; i < 1024; i++)
    {
      a[i] = (i & 1) ? -i : i;
      b[i] = 7 * i;
      c[i] = a[i] - 3.0f;
    }
  foo (a, b, c);
  for (i = 0; i < 1024; i++)
    if (a[i] != ((i & 1) ? -i : i)
        || b[i] != ((i & 1) ? a[i] + 2.0f : 7 * i)
        || c[i] != a[i] - 3.0f)
      abort ();
  for (i = 0; i < 5000000; i++)
    foo (a, b, c);
  return 0;  
}
or
void
foo (double *x, double *y)
{
  double *p = __builtin_assume_aligned (x, 16);
  double *q = __builtin_assume_aligned (y, 16);
  double z, h;
  int i;
  for (i = 0; i < 1024; i++)
    {
      if (p[i] < 0.0)
        z = q[i], h = q[i] * 7.0 + 3.0;
      else
        z = p[i] + 6.0, h = p[1024 + i];
      p[i] = z + 2.0 * h;
    }
}
or
#define N 1024
float vf1[N+16], vf2[N], vf3[N];
double vd1[N+16], vd2[N];
int k[N];
long l[N];
short n[N];

__attribute__((noinline, noclone)) void
f1 (void)
{
  int i;
  for (i = 0; i < N; i++)
    {
      float f;
      if (vf3[i] < 0.0f)
        f = vf1[k[i]];
      else
        f = 7.0f;
      vf2[i] = f;
    }
}

(all with -O3 -mavx or -O3 -mavx2, respectively).  vmaskmov*/vpmaskmov* and
masked gather insns are then used.

2012-11-02  Jakub Jelinek  <jakub@redhat.com>

	* Makefile.in (tree-if-conv.o): Depend on $(TARGET_H), $(EXPR_H)
	and $(OPTABS_H).
	* config/i386/sse.md (maskload<mode>, maskstore<mode>): New expanders.
	* tree-data-ref.c (struct data_ref_loc_d): Replace pos field with ref.
	(get_references_in_stmt): Don't record operand addresses, but
	operands themselves.  Handle MASK_LOAD and MASK_STORE.
	(find_data_references_in_stmt, graphite_find_data_references_in_stmt,
	create_rdg_vertices): Adjust users of pos field of data_ref_loc_d.
	* internal-fn.def (MASK_LOAD, MASK_STORE): New internal fns.
	* tree-if-conv.c: Add target.h, expr.h and optabs.h includes.
	(if_convertible_phi_p, insert_gimplified_predicates): Add
	any_mask_load_store argument, if true, handle it like
	flag_tree_loop_if_convert_stores.
	(ifcvt_can_use_mask_load_store): New function.
	(if_convertible_gimple_assign_stmt_p): Add any_mask_load_store
	argument, check if some conditional loads or stores can't be
	converted into MASK_LOAD or MASK_STORE.
	(if_convertible_stmt_p): Add any_mask_load_store argument,
	pass it down to if_convertible_gimple_assign_stmt_p.
	(if_convertible_loop_p_1): Add any_mask_load_store argument,
	pass it down to if_convertible_stmt_p and if_convertible_phi_p,
	call if_convertible_phi_p only after all if_convertible_stmt_p
	calls.
	(if_convertible_loop_p): Add any_mask_load_store argument,
	pass it down to if_convertible_loop_p_1.
	(predicate_mem_writes): Emit MASK_LOAD and/or MASK_STORE calls.
	(combine_blocks): Add any_mask_load_store argument, pass
	it down to insert_gimplified_predicates and call predicate_mem_writes
	if it is set.
	(tree_if_conversion): Add any_mask_load_store_p argument,
	adjust if_convertible_loop_p, combine_blocks calls and gather
	whether any mask loads/stores have been generated.
	(need_if_unconversion): New variable.
	(main_tree_if_conversion): Adjust tree_if_conversion caller,
	if any masked loads/stores have been created, set
	need_if_unconversion and return TODO_update_ssa_only_virtuals.
	(gate_tree_if_unconversion, main_tree_if_unconversion): New
	functions.
	(pass_if_unconversion): New pass descriptor.
	* tree-vect-data-refs.c (vect_check_gather): Handle
	MASK_LOAD/MASK_STORE.
	(vect_analyze_data_refs, vect_supportable_dr_alignment): Likewise.
	* gimple.h (gimple_expr_type): Handle MASK_STORE.
	* internal-fn.c (expand_MASK_LOAD, expand_MASK_STORE): New functions.
	* tree-vect-loop.c (vect_determine_vectorization_factor): Handle
	MASK_STORE.
	* passes.c (init_optimization_passes): Add pass_if_unconversion.
	* optabs.def (maskload_optab, maskstore_optab): New optabs.
	* tree-pass.h (pass_if_unconversion): New extern decl.
	* tree-vect-stmts.c (vect_mark_relevant): Don't crash if lhs
	is NULL.
	(exist_non_indexing_operands_for_use_p): Handle MASK_LOAD
	and MASK_STORE.
	(vectorizable_mask_load_store): New function.
	(vectorizable_call): Call it for MASK_LOAD or MASK_STORE.
	(vect_transform_stmt): Handle MASK_STORE.



	Jakub

Patch

--- gcc/Makefile.in.jj	2012-11-02 09:01:55.000000000 +0100
+++ gcc/Makefile.in	2012-11-02 17:48:10.144549562 +0100
@@ -2378,7 +2378,7 @@  tree-nested.o: tree-nested.c $(CONFIG_H)
 tree-if-conv.o: tree-if-conv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
    $(TREE_H) $(FLAGS_H) $(BASIC_BLOCK_H) $(TREE_FLOW_H) \
    $(CFGLOOP_H) $(TREE_DATA_REF_H) $(TREE_PASS_H) $(DIAGNOSTIC_H) \
-   $(DBGCNT_H) $(GIMPLE_PRETTY_PRINT_H)
+   $(DBGCNT_H) $(GIMPLE_PRETTY_PRINT_H) $(TARGET_H) $(EXPR_H) $(OPTABS_H)
 tree-iterator.o : tree-iterator.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) \
    coretypes.h $(GGC_H) tree-iterator.h $(GIMPLE_H) gt-tree-iterator.h
 tree-dfa.o : tree-dfa.c $(TREE_FLOW_H) $(CONFIG_H) $(SYSTEM_H) \
--- gcc/config/i386/sse.md.jj	2012-11-02 09:01:47.493572793 +0100
+++ gcc/config/i386/sse.md	2012-11-02 10:12:48.873876499 +0100
@@ -11294,6 +11294,23 @@  (define_insn "<avx_avx2>_maskstore<ssemo
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_expand "maskload<mode>"
+  [(set (match_operand:V48_AVX2 0 "register_operand")
+	(unspec:V48_AVX2
+	  [(match_operand:<sseintvecmode> 2 "register_operand")
+	   (match_operand:V48_AVX2 1 "memory_operand")]
+	  UNSPEC_MASKMOV))]
+  "TARGET_AVX")
+
+(define_expand "maskstore<mode>"
+  [(set (match_operand:V48_AVX2 0 "memory_operand")
+	(unspec:V48_AVX2
+	  [(match_operand:<sseintvecmode> 2 "register_operand")
+	   (match_operand:V48_AVX2 1 "register_operand")
+	   (match_dup 0)]
+	  UNSPEC_MASKMOV))]
+  "TARGET_AVX")
+
 (define_insn_and_split "avx_<castmode><avxsizesuffix>_<castmode>"
   [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m")
 	(unspec:AVX256MODE2P
--- gcc/tree-data-ref.c.jj	2012-11-02 09:01:47.069575609 +0100
+++ gcc/tree-data-ref.c	2012-11-02 09:13:34.203881900 +0100
@@ -4283,11 +4283,11 @@  compute_all_dependences (VEC (data_refer
 
 typedef struct data_ref_loc_d
 {
-    /* Position of the memory reference.  */
-    tree *pos;
+  /* The memory reference.  */
+  tree ref;
 
-      /* True if the memory reference is read.  */
-      bool is_read;
+  /* True if the memory reference is read.  */
+  bool is_read;
 } data_ref_loc;
 
 DEF_VEC_O (data_ref_loc);
@@ -4301,7 +4301,7 @@  get_references_in_stmt (gimple stmt, VEC
 {
   bool clobbers_memory = false;
   data_ref_loc ref;
-  tree *op0, *op1;
+  tree op0, op1;
   enum gimple_code stmt_code = gimple_code (stmt);
 
   *references = NULL;
@@ -4310,7 +4310,10 @@  get_references_in_stmt (gimple stmt, VEC
      As we cannot model data-references to not spelled out
      accesses give up if they may occur.  */
   if ((stmt_code == GIMPLE_CALL
-       && !(gimple_call_flags (stmt) & ECF_CONST))
+       && !(gimple_call_flags (stmt) & ECF_CONST)
+       && (!gimple_call_internal_p (stmt)
+	   || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
+	       && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
       || (stmt_code == GIMPLE_ASM
 	  && (gimple_asm_volatile_p (stmt) || gimple_vuse (stmt))))
     clobbers_memory = true;
@@ -4321,15 +4324,15 @@  get_references_in_stmt (gimple stmt, VEC
   if (stmt_code == GIMPLE_ASSIGN)
     {
       tree base;
-      op0 = gimple_assign_lhs_ptr (stmt);
-      op1 = gimple_assign_rhs1_ptr (stmt);
+      op0 = gimple_assign_lhs (stmt);
+      op1 = gimple_assign_rhs1 (stmt);
 
-      if (DECL_P (*op1)
-	  || (REFERENCE_CLASS_P (*op1)
-	      && (base = get_base_address (*op1))
+      if (DECL_P (op1)
+	  || (REFERENCE_CLASS_P (op1)
+	      && (base = get_base_address (op1))
 	      && TREE_CODE (base) != SSA_NAME))
 	{
-	  ref.pos = op1;
+	  ref.ref = op1;
 	  ref.is_read = true;
 	  VEC_safe_push (data_ref_loc, heap, *references, ref);
 	}
@@ -4338,16 +4341,35 @@  get_references_in_stmt (gimple stmt, VEC
     {
       unsigned i, n;
 
-      op0 = gimple_call_lhs_ptr (stmt);
+      ref.is_read = false;
+      if (gimple_call_internal_p (stmt))
+	switch (gimple_call_internal_fn (stmt))
+	  {
+	  case IFN_MASK_LOAD:
+	    ref.is_read = true;
+	  case IFN_MASK_STORE:
+	    ref.ref = build2 (MEM_REF,
+			      ref.is_read
+			      ? TREE_TYPE (gimple_call_lhs (stmt))
+			      : TREE_TYPE (gimple_call_arg (stmt, 3)),
+			      gimple_call_arg (stmt, 0),
+			      gimple_call_arg (stmt, 1));
+	    VEC_safe_push (data_ref_loc, heap, *references, ref);
+	    return false;
+	  default:
+	    break;
+	  }
+
+      op0 = gimple_call_lhs (stmt);
       n = gimple_call_num_args (stmt);
       for (i = 0; i < n; i++)
 	{
-	  op1 = gimple_call_arg_ptr (stmt, i);
+	  op1 = gimple_call_arg (stmt, i);
 
-	  if (DECL_P (*op1)
-	      || (REFERENCE_CLASS_P (*op1) && get_base_address (*op1)))
+	  if (DECL_P (op1)
+	      || (REFERENCE_CLASS_P (op1) && get_base_address (op1)))
 	    {
-	      ref.pos = op1;
+	      ref.ref = op1;
 	      ref.is_read = true;
 	      VEC_safe_push (data_ref_loc, heap, *references, ref);
 	    }
@@ -4356,11 +4378,11 @@  get_references_in_stmt (gimple stmt, VEC
   else
     return clobbers_memory;
 
-  if (*op0
-      && (DECL_P (*op0)
-	  || (REFERENCE_CLASS_P (*op0) && get_base_address (*op0))))
+  if (op0
+      && (DECL_P (op0)
+	  || (REFERENCE_CLASS_P (op0) && get_base_address (op0))))
     {
-      ref.pos = op0;
+      ref.ref = op0;
       ref.is_read = false;
       VEC_safe_push (data_ref_loc, heap, *references, ref);
     }
@@ -4390,7 +4412,7 @@  find_data_references_in_stmt (struct loo
   FOR_EACH_VEC_ELT (data_ref_loc, references, i, ref)
     {
       dr = create_data_ref (nest, loop_containing_stmt (stmt),
-			    *ref->pos, stmt, ref->is_read);
+			    ref->ref, stmt, ref->is_read);
       gcc_assert (dr != NULL);
       VEC_safe_push (data_reference_p, heap, *datarefs, dr);
     }
@@ -4422,7 +4444,7 @@  graphite_find_data_references_in_stmt (l
 
   FOR_EACH_VEC_ELT (data_ref_loc, references, i, ref)
     {
-      dr = create_data_ref (nest, loop, *ref->pos, stmt, ref->is_read);
+      dr = create_data_ref (nest, loop, ref->ref, stmt, ref->is_read);
       gcc_assert (dr != NULL);
       VEC_safe_push (data_reference_p, heap, *datarefs, dr);
     }
@@ -5058,7 +5080,7 @@  create_rdg_vertices (struct graph *rdg,
 	  else
 	    RDGV_HAS_MEM_READS (v) = true;
 	  dr = create_data_ref (loop, loop_containing_stmt (stmt),
-				*ref->pos, stmt, ref->is_read);
+				ref->ref, stmt, ref->is_read);
 	  if (dr)
 	    VEC_safe_push (data_reference_p, heap, RDGV_DATAREFS (v), dr);
 	}
--- gcc/internal-fn.def.jj	2012-11-02 09:01:47.590572147 +0100
+++ gcc/internal-fn.def	2012-11-02 09:13:34.215881819 +0100
@@ -1,5 +1,5 @@ 
 /* Internal functions.
-   Copyright (C) 2011 Free Software Foundation, Inc.
+   Copyright (C) 2011, 2012 Free Software Foundation, Inc.
 
 This file is part of GCC.
 
@@ -40,3 +40,5 @@  along with GCC; see the file COPYING3.
 
 DEF_INTERNAL_FN (LOAD_LANES, ECF_CONST | ECF_LEAF)
 DEF_INTERNAL_FN (STORE_LANES, ECF_CONST | ECF_LEAF)
+DEF_INTERNAL_FN (MASK_LOAD, ECF_PURE | ECF_LEAF)
+DEF_INTERNAL_FN (MASK_STORE, ECF_LEAF)
--- gcc/tree-if-conv.c.jj	2012-11-02 09:01:47.108575349 +0100
+++ gcc/tree-if-conv.c	2012-11-02 16:46:18.306241821 +0100
@@ -96,6 +96,9 @@  along with GCC; see the file COPYING3.
 #include "tree-scalar-evolution.h"
 #include "tree-pass.h"
 #include "dbgcnt.h"
+#include "target.h"
+#include "expr.h"
+#include "optabs.h"
 
 /* List of basic blocks in if-conversion-suitable order.  */
 static basic_block *ifc_bbs;
@@ -448,7 +451,8 @@  bb_with_exit_edge_p (struct loop *loop,
    - there is a virtual PHI in a BB other than the loop->header.  */
 
 static bool
-if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
+if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi,
+		      bool any_mask_load_store)
 {
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
@@ -463,7 +467,7 @@  if_convertible_phi_p (struct loop *loop,
       return false;
     }
 
-  if (flag_tree_loop_if_convert_stores)
+  if (flag_tree_loop_if_convert_stores || any_mask_load_store)
     return true;
 
   /* When the flag_tree_loop_if_convert_stores is not set, check
@@ -679,6 +683,84 @@  ifcvt_could_trap_p (gimple stmt, VEC (da
   return gimple_could_trap_p (stmt);
 }
 
+/* Return true if STMT could be converted into a masked load or store
+   (conditional load or store based on a mask computed from bb predicate).  */
+
+static bool
+ifcvt_can_use_mask_load_store (gimple stmt)
+{
+  tree lhs, ref;
+  enum machine_mode mode, vmode;
+  optab op;
+  basic_block bb;
+  unsigned int vector_sizes;
+
+  if (!flag_tree_vectorize
+      || !gimple_assign_single_p (stmt)
+      || gimple_has_volatile_ops (stmt))
+    return false;
+
+  /* Avoid creating mask loads/stores if we'd need to chain
+     conditions, to make it easier to undo them.  */
+  bb = gimple_bb (stmt);
+  if (!single_pred_p (bb)
+      || is_predicated (single_pred (bb)))
+    return false;
+
+  /* Check whether this is a load or store.  */
+  lhs = gimple_assign_lhs (stmt);
+  if (TREE_CODE (lhs) != SSA_NAME)
+    {
+      if (!is_gimple_val (gimple_assign_rhs1 (stmt)))
+	return false;
+      op = maskstore_optab;
+      ref = lhs;
+    }
+  else if (gimple_assign_load_p (stmt))
+    {
+      op = maskload_optab;
+      ref = gimple_assign_rhs1 (stmt);
+    }
+  else
+    return false;
+
+  /* And whether REF isn't a MEM_REF with non-addressable decl.  */
+  if (TREE_CODE (ref) == MEM_REF
+      && TREE_CODE (TREE_OPERAND (ref, 0)) == ADDR_EXPR
+      && DECL_P (TREE_OPERAND (TREE_OPERAND (ref, 0), 0))
+      && !TREE_ADDRESSABLE (TREE_OPERAND (TREE_OPERAND (ref, 0), 0)))
+    return false;
+
+  /* Mask should be integer mode of the same size as the load/store
+     mode.  */
+  mode = TYPE_MODE (TREE_TYPE (lhs));
+  if (int_mode_for_mode (mode) == BLKmode)
+    return false;
+
+  /* See if there is any chance the mask load or store might be
+     vectorized.  If not, punt.  */
+  vmode = targetm.vectorize.preferred_simd_mode (mode);
+  if (!VECTOR_MODE_P (vmode))
+    return false;
+
+  if (optab_handler (op, vmode) != CODE_FOR_nothing)
+    return true;
+
+  vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
+  while (vector_sizes != 0)
+    {
+      unsigned int cur = 1 << floor_log2 (vector_sizes);
+      vector_sizes &= ~cur;
+      if (cur <= GET_MODE_SIZE (mode))
+	continue;
+      vmode = mode_for_vector (mode, cur / GET_MODE_SIZE (mode));
+      if (VECTOR_MODE_P (vmode)
+	  && optab_handler (op, vmode) != CODE_FOR_nothing)
+	return true;
+    }
+  return false;
+}
+
 /* Return true when STMT is if-convertible.
 
    GIMPLE_ASSIGN statement is not if-convertible if,
@@ -688,7 +770,8 @@  ifcvt_could_trap_p (gimple stmt, VEC (da
 
 static bool
 if_convertible_gimple_assign_stmt_p (gimple stmt,
-				     VEC (data_reference_p, heap) *refs)
+				     VEC (data_reference_p, heap) *refs,
+				     bool *any_mask_load_store)
 {
   tree lhs = gimple_assign_lhs (stmt);
   basic_block bb;
@@ -714,10 +797,18 @@  if_convertible_gimple_assign_stmt_p (gim
       return false;
     }
 
+  gimple_set_plf (stmt, GF_PLF_1, false);
+
   if (flag_tree_loop_if_convert_stores)
     {
       if (ifcvt_could_trap_p (stmt, refs))
 	{
+	  if (ifcvt_can_use_mask_load_store (stmt))
+	    {
+	      gimple_set_plf (stmt, GF_PLF_1, true);
+	      *any_mask_load_store = true;
+	      return true;
+	    }
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    fprintf (dump_file, "tree could trap...\n");
 	  return false;
@@ -727,6 +818,12 @@  if_convertible_gimple_assign_stmt_p (gim
 
   if (gimple_assign_rhs_could_trap_p (stmt))
     {
+      if (ifcvt_can_use_mask_load_store (stmt))
+	{
+	  gimple_set_plf (stmt, GF_PLF_1, true);
+	  *any_mask_load_store = true;
+	  return true;
+	}
       if (dump_file && (dump_flags & TDF_DETAILS))
 	fprintf (dump_file, "tree could trap...\n");
       return false;
@@ -738,6 +835,12 @@  if_convertible_gimple_assign_stmt_p (gim
       && bb != bb->loop_father->header
       && !bb_with_exit_edge_p (bb->loop_father, bb))
     {
+      if (ifcvt_can_use_mask_load_store (stmt))
+	{
+	  gimple_set_plf (stmt, GF_PLF_1, true);
+	  *any_mask_load_store = true;
+	  return true;
+	}
       if (dump_file && (dump_flags & TDF_DETAILS))
 	{
 	  fprintf (dump_file, "LHS is not var\n");
@@ -756,7 +859,8 @@  if_convertible_gimple_assign_stmt_p (gim
    - it is a GIMPLE_LABEL or a GIMPLE_COND.  */
 
 static bool
-if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs)
+if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs,
+		       bool *any_mask_load_store)
 {
   switch (gimple_code (stmt))
     {
@@ -766,7 +870,8 @@  if_convertible_stmt_p (gimple stmt, VEC
       return true;
 
     case GIMPLE_ASSIGN:
-      return if_convertible_gimple_assign_stmt_p (stmt, refs);
+      return if_convertible_gimple_assign_stmt_p (stmt, refs,
+						  any_mask_load_store);
 
     case GIMPLE_CALL:
       {
@@ -1072,7 +1177,8 @@  static bool
 if_convertible_loop_p_1 (struct loop *loop,
 			 VEC (loop_p, heap) **loop_nest,
 			 VEC (data_reference_p, heap) **refs,
-			 VEC (ddr_p, heap) **ddrs)
+			 VEC (ddr_p, heap) **ddrs,
+			 bool *any_mask_load_store)
 {
   bool res;
   unsigned int i;
@@ -1128,17 +1234,27 @@  if_convertible_loop_p_1 (struct loop *lo
       basic_block bb = ifc_bbs[i];
       gimple_stmt_iterator itr;
 
-      for (itr = gsi_start_phis (bb); !gsi_end_p (itr); gsi_next (&itr))
-	if (!if_convertible_phi_p (loop, bb, gsi_stmt (itr)))
-	  return false;
-
       /* Check the if-convertibility of statements in predicated BBs.  */
       if (is_predicated (bb))
 	for (itr = gsi_start_bb (bb); !gsi_end_p (itr); gsi_next (&itr))
-	  if (!if_convertible_stmt_p (gsi_stmt (itr), *refs))
+	  if (!if_convertible_stmt_p (gsi_stmt (itr), *refs,
+				      any_mask_load_store))
 	    return false;
     }
 
+  /* Checking PHIs needs to be done after stmts, as the fact whether there
+     are any masked loads or stores affects the tests.  */
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      basic_block bb = ifc_bbs[i];
+      gimple_stmt_iterator itr;
+
+      for (itr = gsi_start_phis (bb); !gsi_end_p (itr); gsi_next (&itr))
+	if (!if_convertible_phi_p (loop, bb, gsi_stmt (itr),
+				   *any_mask_load_store))
+	  return false;
+    }
+
   if (dump_file)
     fprintf (dump_file, "Applying if-conversion\n");
 
@@ -1154,7 +1270,7 @@  if_convertible_loop_p_1 (struct loop *lo
    - if its basic blocks and phi nodes are if convertible.  */
 
 static bool
-if_convertible_loop_p (struct loop *loop)
+if_convertible_loop_p (struct loop *loop, bool *any_mask_load_store)
 {
   edge e;
   edge_iterator ei;
@@ -1196,7 +1312,8 @@  if_convertible_loop_p (struct loop *loop
   refs = VEC_alloc (data_reference_p, heap, 5);
   ddrs = VEC_alloc (ddr_p, heap, 25);
   loop_nest = VEC_alloc (loop_p, heap, 3);
-  res = if_convertible_loop_p_1 (loop, &loop_nest, &refs, &ddrs);
+  res = if_convertible_loop_p_1 (loop, &loop_nest, &refs, &ddrs,
+				 any_mask_load_store);
 
   if (flag_tree_loop_if_convert_stores)
     {
@@ -1414,7 +1531,7 @@  predicate_all_scalar_phis (struct loop *
    gimplification of the predicates.  */
 
 static void
-insert_gimplified_predicates (loop_p loop)
+insert_gimplified_predicates (loop_p loop, bool any_mask_load_store)
 {
   unsigned int i;
 
@@ -1435,7 +1552,8 @@  insert_gimplified_predicates (loop_p loo
       stmts = bb_predicate_gimplified_stmts (bb);
       if (stmts)
 	{
-	  if (flag_tree_loop_if_convert_stores)
+	  if (flag_tree_loop_if_convert_stores
+	      || any_mask_load_store)
 	    {
 	      /* Insert the predicate of the BB just after the label,
 		 as the if-conversion of memory writes will use this
@@ -1594,9 +1712,49 @@  predicate_mem_writes (loop_p loop)
 	}
 
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-	if ((stmt = gsi_stmt (gsi))
-	    && gimple_assign_single_p (stmt)
-	    && gimple_vdef (stmt))
+	if ((stmt = gsi_stmt (gsi)) == NULL
+	    || !gimple_assign_single_p (stmt))
+	  continue;
+	else if (gimple_plf (stmt, GF_PLF_1))
+	  {
+	    tree lhs = gimple_assign_lhs (stmt);
+	    tree rhs = gimple_assign_rhs1 (stmt);
+	    tree ref, addr, ptr, masktype, mask_op0, mask_op1, mask;
+	    gimple new_stmt;
+	    int bitsize = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (lhs)));
+
+	    masktype = build_nonstandard_integer_type (bitsize, 1);
+	    mask_op0 = build_int_cst (masktype, swap ? 0 : -1);
+	    mask_op1 = build_int_cst (masktype, swap ? -1 : 0);
+	    ref = TREE_CODE (lhs) == SSA_NAME ? rhs : lhs;
+	    addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (ref),
+					     true, NULL_TREE, true,
+					     GSI_SAME_STMT);
+	    cond = force_gimple_operand_gsi_1 (&gsi, unshare_expr (cond),
+					       is_gimple_condexpr, NULL_TREE,
+					       true, GSI_SAME_STMT);
+	    mask = fold_build_cond_expr (masktype, unshare_expr (cond),
+					 mask_op0, mask_op1);
+	    mask = ifc_temp_var (masktype, mask, &gsi);
+	    ptr = build_int_cst (reference_alias_ptr_type (ref), 0);
+	    /* Copy points-to info if possible.  */
+	    if (TREE_CODE (addr) == SSA_NAME && !SSA_NAME_PTR_INFO (addr))
+	      copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr),
+			     ref);
+	    if (TREE_CODE (lhs) == SSA_NAME)
+	      {
+		new_stmt
+		  = gimple_build_call_internal (IFN_MASK_LOAD, 3, addr,
+						ptr, mask);
+		gimple_call_set_lhs (new_stmt, lhs);
+	      }
+	    else
+	      new_stmt
+		= gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
+					      mask, rhs);
+	    gsi_replace (&gsi, new_stmt, false);
+	  }
+	else if (gimple_vdef (stmt))
 	  {
 	    tree lhs = gimple_assign_lhs (stmt);
 	    tree rhs = gimple_assign_rhs1 (stmt);
@@ -1666,7 +1824,7 @@  remove_conditions_and_labels (loop_p loo
    blocks.  Replace PHI nodes with conditional modify expressions.  */
 
 static void
-combine_blocks (struct loop *loop)
+combine_blocks (struct loop *loop, bool any_mask_load_store)
 {
   basic_block bb, exit_bb, merge_target_bb;
   unsigned int orig_loop_num_nodes = loop->num_nodes;
@@ -1675,10 +1833,10 @@  combine_blocks (struct loop *loop)
   edge_iterator ei;
 
   remove_conditions_and_labels (loop);
-  insert_gimplified_predicates (loop);
+  insert_gimplified_predicates (loop, any_mask_load_store);
   predicate_all_scalar_phis (loop);
 
-  if (flag_tree_loop_if_convert_stores)
+  if (flag_tree_loop_if_convert_stores || any_mask_load_store)
     predicate_mem_writes (loop);
 
   /* Merge basic blocks: first remove all the edges in the loop,
@@ -1775,23 +1933,25 @@  combine_blocks (struct loop *loop)
    profitability analysis.  Returns true when something changed.  */
 
 static bool
-tree_if_conversion (struct loop *loop)
+tree_if_conversion (struct loop *loop, bool *any_mask_load_store_p)
 {
   bool changed = false;
   ifc_bbs = NULL;
+  bool any_mask_load_store = false;
 
-  if (!if_convertible_loop_p (loop)
+  if (!if_convertible_loop_p (loop, &any_mask_load_store)
       || !dbg_cnt (if_conversion_tree))
     goto cleanup;
 
   /* Now all statements are if-convertible.  Combine all the basic
      blocks into one huge basic block doing the if-conversion
      on-the-fly.  */
-  combine_blocks (loop);
+  combine_blocks (loop, any_mask_load_store);
 
-  if (flag_tree_loop_if_convert_stores)
+  if (flag_tree_loop_if_convert_stores || any_mask_load_store)
     mark_virtual_operands_for_renaming (cfun);
 
+  *any_mask_load_store_p |= any_mask_load_store;
   changed = true;
 
  cleanup:
@@ -1809,6 +1969,9 @@  tree_if_conversion (struct loop *loop)
   return changed;
 }
 
+/* Flag whether if-unconversion pass will be needed afterwards.  */
+static bool need_if_unconversion;
+
 /* Tree if-conversion pass management.  */
 
 static unsigned int
@@ -1818,17 +1981,20 @@  main_tree_if_conversion (void)
   struct loop *loop;
   bool changed = false;
   unsigned todo = 0;
+  bool any_mask_load_store = false;
 
   if (number_of_loops () <= 1)
     return 0;
 
   FOR_EACH_LOOP (li, loop, 0)
-    changed |= tree_if_conversion (loop);
+    changed |= tree_if_conversion (loop, &any_mask_load_store);
+
+  need_if_unconversion = any_mask_load_store;
 
   if (changed)
     todo |= TODO_cleanup_cfg;
 
-  if (changed && flag_tree_loop_if_convert_stores)
+  if (changed && (flag_tree_loop_if_convert_stores || any_mask_load_store))
     todo |= TODO_update_ssa_only_virtuals;
 
   free_dominance_info (CDI_POST_DOMINATORS);
@@ -1865,6 +2031,139 @@  struct gimple_opt_pass pass_if_conversio
   NULL,					/* sub */
   NULL,					/* next */
   0,					/* static_pass_number */
+  TV_NONE,				/* tv_id */
+  PROP_cfg | PROP_ssa,			/* properties_required */
+  0,					/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_verify_stmts | TODO_verify_flow
+					/* todo_flags_finish */
+ }
+};
+
+/* Undo creation of MASK_LOAD or MASK_STORE, if it hasn't
+   been successfully vectorized.  */
+
+static bool
+gate_tree_if_unconversion (void)
+{
+  return need_if_unconversion;
+}
+
+static unsigned int
+main_tree_if_unconversion (void)
+{
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+
+  need_if_unconversion = false;
+  FOR_EACH_BB (bb)
+    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      {
+	gimple stmt = gsi_stmt (gsi);
+	if (is_gimple_call (stmt)
+	    && gimple_call_internal_p (stmt)
+	    && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
+		|| gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	    && INTEGRAL_TYPE_P (TREE_TYPE (gimple_call_arg (stmt, 2))))
+	  {
+	    tree cond = gimple_call_arg (stmt, 2), mem, type;
+	    edge e1, e2, e3;
+	    bool swapped_p = false;
+	    gimple cond_stmt, new_stmt;
+
+	    if (TREE_CODE (cond) == SSA_NAME
+		&& !SSA_NAME_IS_DEFAULT_DEF (cond))
+	      {
+		gimple def_stmt = SSA_NAME_DEF_STMT (cond);
+		if (is_gimple_assign (def_stmt)
+		    && gimple_bb (def_stmt) == bb
+		    && gimple_assign_rhs_code (def_stmt) == COND_EXPR)
+		  {
+		    tree rhs2 = gimple_assign_rhs2 (def_stmt);
+		    tree rhs3 = gimple_assign_rhs3 (def_stmt);
+		    if (integer_all_onesp (rhs2) && integer_zerop (rhs3))
+		      cond = gimple_assign_rhs1 (def_stmt);
+		    else if (integer_zerop (rhs2) && integer_all_onesp (rhs3))
+		      {
+			cond = gimple_assign_rhs1 (def_stmt);
+			swapped_p = true;
+		      }
+		  }
+	      }
+	    gsi_prev (&gsi);
+	    e1 = split_block (bb, gsi_stmt (gsi));
+	    e2 = split_block (e1->dest, stmt);
+	    e3 = make_edge (e1->src, e2->dest,
+			    swapped_p ? EDGE_TRUE_VALUE : EDGE_FALSE_VALUE);
+	    e1->flags = (e1->flags & ~EDGE_FALLTHRU)
+			| (swapped_p ? EDGE_FALSE_VALUE : EDGE_TRUE_VALUE);
+	    set_immediate_dominator (CDI_DOMINATORS, e2->dest, e1->src);
+	    if (cond == gimple_call_arg (stmt, 2))
+	      cond_stmt
+		= gimple_build_cond (NE_EXPR, cond,
+				     build_int_cst (TREE_TYPE (cond), 0),
+				     NULL_TREE, NULL_TREE);
+	    else
+	      cond_stmt
+		= gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
+	    gsi = gsi_last_bb (e1->src);
+	    gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
+	    if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
+	      type = TREE_TYPE (gimple_call_lhs (stmt));
+	    else
+	      type = TREE_TYPE (gimple_call_arg (stmt, 3));
+	    mem = build2 (MEM_REF, type, gimple_call_arg (stmt, 0),
+			  gimple_call_arg (stmt, 1));
+	    if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
+	      new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
+					      mem);
+	    else
+	      new_stmt = gimple_build_assign (mem, gimple_call_arg (stmt, 3));
+	    gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+	    if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
+	      {
+		gimple phi;
+		tree res = gimple_assign_lhs (new_stmt);
+		tree tem = make_ssa_name (TREE_TYPE (res), NULL);
+		tree zero = build_zero_cst (TREE_TYPE (res));
+		gimple_assign_set_lhs (new_stmt, tem);
+		gimple_call_set_lhs (stmt, NULL_TREE);
+		phi = create_phi_node (res, e2->dest);
+		add_phi_arg (phi, tem, e2, gimple_location (stmt));
+		add_phi_arg (phi, zero, e3, gimple_location (stmt));
+		SSA_NAME_DEF_STMT (res) = phi;
+	      }
+	    else
+	      {
+		gimple phi;
+		tree new_vdef = copy_ssa_name (gimple_vuse (stmt), new_stmt);
+		gimple_set_vdef (new_stmt, new_vdef);
+		phi = create_phi_node (gimple_vdef (stmt), e2->dest);
+		add_phi_arg (phi, new_vdef, e2, UNKNOWN_LOCATION);
+		add_phi_arg (phi, gimple_vuse (stmt), e3, UNKNOWN_LOCATION);
+		SSA_NAME_DEF_STMT (gimple_vdef (stmt)) = phi;
+	      }
+	    gsi = gsi_for_stmt (stmt);
+	    gsi_replace (&gsi, new_stmt, false);
+	    gsi = gsi_for_stmt (cond_stmt);
+	  }
+      }
+
+  return 0;
+}
+
+struct gimple_opt_pass pass_if_unconversion =
+{
+ {
+  GIMPLE_PASS,
+  "ifuncvt",				/* name */
+  OPTGROUP_NONE,			/* optinfo_flags */
+  gate_tree_if_unconversion,		/* gate */
+  main_tree_if_unconversion,		/* execute */
+  NULL,					/* sub */
+  NULL,					/* next */
+  0,					/* static_pass_number */
   TV_NONE,				/* tv_id */
   PROP_cfg | PROP_ssa,			/* properties_required */
   0,					/* properties_provided */
--- gcc/tree-vect-data-refs.c.jj	2012-11-02 09:01:47.097575423 +0100
+++ gcc/tree-vect-data-refs.c	2012-11-02 09:29:49.647621187 +0100
@@ -2707,6 +2707,24 @@  vect_check_gather (gimple stmt, loop_vec
   enum machine_mode pmode;
   int punsignedp, pvolatilep;
 
+  base = DR_REF (dr);
+  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
+     see if we can use the def stmt of the address.  */
+  if (is_gimple_call (stmt)
+      && gimple_call_internal_p (stmt)
+      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
+	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+      && TREE_CODE (base) == MEM_REF
+      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
+      && integer_zerop (TREE_OPERAND (base, 1))
+      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
+    {
+      gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
+      if (is_gimple_assign (def_stmt)
+	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
+	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
+    }
+
   /* The gather builtins need address of the form
      loop_invariant + vector * {1, 2, 4, 8}
      or
@@ -2719,7 +2737,7 @@  vect_check_gather (gimple stmt, loop_vec
      vectorized.  The following code attempts to find such a preexistng
      SSA_NAME OFF and put the loop invariants into a tree BASE
      that can be gimplified before the loop.  */
-  base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
+  base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
 			      &pmode, &punsignedp, &pvolatilep, false);
   gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
 
@@ -3186,7 +3204,10 @@  vect_analyze_data_refs (loop_vec_info lo
       offset = unshare_expr (DR_OFFSET (dr));
       init = unshare_expr (DR_INIT (dr));
 
-      if (is_gimple_call (stmt))
+      if (is_gimple_call (stmt)
+	  && (!gimple_call_internal_p (stmt)
+	      || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
+		  && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
 	{
 	  if (dump_enabled_p ())
 	    {
@@ -4893,6 +4914,14 @@  vect_supportable_dr_alignment (struct da
   if (aligned_access_p (dr) && !check_aligned_accesses)
     return dr_aligned;
 
+  /* For now assume all conditional loads/stores support unaligned
+     access without any special code.  */
+  if (is_gimple_call (stmt)
+      && gimple_call_internal_p (stmt)
+      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
+	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
+    return dr_unaligned_supported;
+
   if (loop_vinfo)
     {
       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
--- gcc/gimple.h.jj	2012-11-02 09:01:47.206574700 +0100
+++ gcc/gimple.h	2012-11-02 09:13:34.211881846 +0100
@@ -4938,7 +4938,13 @@  gimple_expr_type (const_gimple stmt)
 	 useless conversion involved.  That means returning the
 	 original RHS type as far as we can reconstruct it.  */
       if (code == GIMPLE_CALL)
-	type = gimple_call_return_type (stmt);
+	{
+	  if (gimple_call_internal_p (stmt)
+	      && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	    type = TREE_TYPE (gimple_call_arg (stmt, 3));
+	  else
+	    type = gimple_call_return_type (stmt);
+	}
       else
 	switch (gimple_assign_rhs_code (stmt))
 	  {
--- gcc/internal-fn.c.jj	2012-11-02 09:01:47.582572197 +0100
+++ gcc/internal-fn.c	2012-11-02 10:06:36.167504880 +0100
@@ -1,5 +1,5 @@ 
 /* Internal functions.
-   Copyright (C) 2011 Free Software Foundation, Inc.
+   Copyright (C) 2011, 2012 Free Software Foundation, Inc.
 
 This file is part of GCC.
 
@@ -109,6 +109,52 @@  expand_STORE_LANES (gimple stmt)
   expand_insn (get_multi_vector_move (type, vec_store_lanes_optab), 2, ops);
 }
 
+/* Expand an IFN_MASK_LOAD internal call STMT: emit a maskload insn that
+   loads into the call's lhs those vector elements whose corresponding
+   mask element is set.  Call layout: arg 0 = base pointer, arg 1 =
+   offset, arg 2 = mask; the loaded value is the call's lhs.  */
+static void
+expand_MASK_LOAD (gimple stmt)
+{
+  struct expand_operand ops[3];
+  tree type, lhs, rhs, maskt;
+  rtx mem, target, mask;
+
+  maskt = gimple_call_arg (stmt, 2);
+  lhs = gimple_call_lhs (stmt);
+  type = TREE_TYPE (lhs);
+  /* Rebuild the memory reference the call stands for.  */
+  rhs = build2 (MEM_REF, type, gimple_call_arg (stmt, 0),
+		gimple_call_arg (stmt, 1));
+
+  /* NOTE(review): EXPAND_WRITE is presumably used to obtain the bare
+     MEM rtx instead of a value copied into a register (the assert
+     below requires a MEM) -- confirm.  */
+  mem = expand_expr (rhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  gcc_assert (MEM_P (mem));
+  mask = expand_normal (maskt);
+  target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  create_output_operand (&ops[0], target, TYPE_MODE (type));
+  create_fixed_operand (&ops[1], mem);
+  create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
+  expand_insn (optab_handler (maskload_optab, TYPE_MODE (type)), 3, ops);
+}
+
+/* Expand an IFN_MASK_STORE internal call STMT: emit a maskstore insn
+   that stores to memory those vector elements of arg 3 whose
+   corresponding mask element is set.  Call layout: arg 0 = base
+   pointer, arg 1 = offset, arg 2 = mask, arg 3 = value to store.  */
+static void
+expand_MASK_STORE (gimple stmt)
+{
+  struct expand_operand ops[3];
+  tree type, lhs, rhs, maskt;
+  rtx mem, reg, mask;
+
+  maskt = gimple_call_arg (stmt, 2);
+  rhs = gimple_call_arg (stmt, 3);
+  type = TREE_TYPE (rhs);
+  /* Rebuild the memory reference the call stands for.  */
+  lhs = build2 (MEM_REF, type, gimple_call_arg (stmt, 0),
+		gimple_call_arg (stmt, 1));
+
+  mem = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  gcc_assert (MEM_P (mem));
+  mask = expand_normal (maskt);
+  reg = expand_normal (rhs);
+  create_fixed_operand (&ops[0], mem);
+  create_input_operand (&ops[1], reg, TYPE_MODE (type));
+  create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
+  expand_insn (optab_handler (maskstore_optab, TYPE_MODE (type)), 3, ops);
+}
+
 /* Routines to expand each internal function, indexed by function number.
    Each routine has the prototype:
 
--- gcc/tree-vect-loop.c.jj	2012-11-02 09:01:47.543572462 +0100
+++ gcc/tree-vect-loop.c	2012-11-02 09:13:34.209881860 +0100
@@ -351,7 +351,11 @@  vect_determine_vectorization_factor (loo
 		analyze_pattern_stmt = false;
 	    }
 
-	  if (gimple_get_lhs (stmt) == NULL_TREE)
+	  if (gimple_get_lhs (stmt) == NULL_TREE
+	      /* MASK_STORE has no lhs, but is ok.  */
+	      && (!is_gimple_call (stmt)
+		  || !gimple_call_internal_p (stmt)
+		  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 	    {
 	      if (dump_enabled_p ())
 		{
@@ -388,7 +392,12 @@  vect_determine_vectorization_factor (loo
 	  else
 	    {
 	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
-	      scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
+	      if (is_gimple_call (stmt)
+		  && gimple_call_internal_p (stmt)
+		  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+		scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
+	      else
+		scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 	      if (dump_enabled_p ())
 		{
 		  dump_printf_loc (MSG_NOTE, vect_location,
--- gcc/passes.c.jj	2012-11-01 09:33:29.000000000 +0100
+++ gcc/passes.c	2012-11-02 16:26:09.452391414 +0100
@@ -1484,6 +1484,7 @@  init_optimization_passes (void)
 	      struct opt_pass **p = &pass_vectorize.pass.sub;
 	      NEXT_PASS (pass_dce_loop);
 	    }
+	  NEXT_PASS (pass_if_unconversion);
           NEXT_PASS (pass_predcom);
 	  NEXT_PASS (pass_complete_unroll);
 	  NEXT_PASS (pass_slp_vectorize);
--- gcc/optabs.def.jj	2012-11-02 09:01:47.148575082 +0100
+++ gcc/optabs.def	2012-11-02 09:13:34.205881885 +0100
@@ -242,6 +242,8 @@  OPTAB_D (sdot_prod_optab, "sdot_prod$I$a
 OPTAB_D (ssum_widen_optab, "widen_ssum$I$a3")
 OPTAB_D (udot_prod_optab, "udot_prod$I$a")
 OPTAB_D (usum_widen_optab, "widen_usum$I$a3")
+OPTAB_D (maskload_optab, "maskload$a")
+OPTAB_D (maskstore_optab, "maskstore$a")
 OPTAB_D (vec_extract_optab, "vec_extract$a")
 OPTAB_D (vec_init_optab, "vec_init$a")
 OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
--- gcc/tree-pass.h.jj	2012-11-01 09:33:25.000000000 +0100
+++ gcc/tree-pass.h	2012-11-02 11:52:38.598409785 +0100
@@ -1,5 +1,5 @@ 
 /* Definitions for describing one tree-ssa optimization pass.
-   Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+   Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
    Free Software Foundation, Inc.
    Contributed by Richard Henderson <rth@redhat.com>
 
@@ -284,6 +284,7 @@  extern struct gimple_opt_pass pass_recor
 extern struct gimple_opt_pass pass_graphite;
 extern struct gimple_opt_pass pass_graphite_transforms;
 extern struct gimple_opt_pass pass_if_conversion;
+extern struct gimple_opt_pass pass_if_unconversion;
 extern struct gimple_opt_pass pass_loop_distribution;
 extern struct gimple_opt_pass pass_vectorize;
 extern struct gimple_opt_pass pass_slp_vectorize;
--- gcc/tree-vect-stmts.c.jj	2012-11-02 09:01:47.555572381 +0100
+++ gcc/tree-vect-stmts.c	2012-11-02 10:32:47.782340547 +0100
@@ -218,7 +218,7 @@  vect_mark_relevant (VEC(gimple,heap) **w
           /* This use is out of pattern use, if LHS has other uses that are
              pattern uses, we should mark the stmt itself, and not the pattern
              stmt.  */
-	  if (TREE_CODE (lhs) == SSA_NAME)
+	  if (lhs && TREE_CODE (lhs) == SSA_NAME)
 	    FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
 	      {
 		if (is_gimple_debug (USE_STMT (use_p)))
@@ -376,7 +376,27 @@  exist_non_indexing_operands_for_use_p (t
      first case, and whether var corresponds to USE.  */
 
   if (!gimple_assign_copy_p (stmt))
-    return false;
+    {
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt))
+	switch (gimple_call_internal_fn (stmt))
+	  {
+	  case IFN_MASK_STORE:
+	    operand = gimple_call_arg (stmt, 3);
+	    if (operand == use)
+	      return true;
+	    /* FALLTHRU */
+	  case IFN_MASK_LOAD:
+	    operand = gimple_call_arg (stmt, 2);
+	    if (operand == use)
+	      return true;
+	    break;
+	  default:
+	    break;
+	  }
+      return false;
+    }
+
   if (TREE_CODE (gimple_assign_lhs (stmt)) == SSA_NAME)
     return false;
   operand = gimple_assign_rhs1 (stmt);
@@ -1693,6 +1713,401 @@  vectorizable_function (gimple call, tree
 						        vectype_in);
 }
 
+
+static tree permute_vec_elements (tree, tree, tree, gimple,
+				  gimple_stmt_iterator *);
+
+
+/* Check if STMT -- an IFN_MASK_LOAD or IFN_MASK_STORE internal call --
+   can be vectorized, and if VEC_STMT is non-NULL, perform the
+   transformation: emit either vector MASK_LOAD/MASK_STORE calls or,
+   for STMT_VINFO_GATHER_P stmts, masked gather builtin calls.
+   Return true if STMT is vectorizable this way.  */
+static bool
+vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
+			      gimple *vec_stmt, slp_tree slp_node)
+{
+  tree vec_dest = NULL;
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  stmt_vec_info prev_stmt_info;
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
+  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree elem_type;
+  gimple new_stmt;
+  tree dummy;
+  tree dataref_ptr = NULL_TREE;
+  gimple ptr_incr;
+  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  int ncopies;
+  int i, j;
+  bool inv_p;
+  tree gather_base = NULL_TREE, gather_off = NULL_TREE;
+  tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE;
+  int gather_scale = 1;
+  enum vect_def_type gather_dt = vect_unknown_def_type;
+  bool is_store;
+  tree mask;
+  gimple def_stmt;
+  tree def;
+  enum vect_def_type dt;
+
+  /* FORNOW: not handled for SLP.  */
+  if (slp_node != NULL)
+    return false;
+
+  ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
+  gcc_assert (ncopies >= 1);
+
+  is_store = gimple_call_internal_fn (stmt) == IFN_MASK_STORE;
+  mask = gimple_call_arg (stmt, 2);
+  /* The mask element precision must match the vector element size.  */
+  if (TYPE_PRECISION (TREE_TYPE (mask))
+      != GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))))
+    return false;
+
+  /* FORNOW. This restriction should be relaxed.  */
+  if (nested_in_vect_loop && ncopies > 1)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "multiple types in nested loop.");
+      return false;
+    }
+
+  if (!STMT_VINFO_RELEVANT_P (stmt_info))
+    return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+    return false;
+
+  if (!STMT_VINFO_DATA_REF (stmt_info))
+    return false;
+
+  elem_type = TREE_TYPE (vectype);
+
+  /* Grouped and strided accesses are not supported.  */
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    return false;
+
+  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+    return false;
+
+  if (STMT_VINFO_GATHER_P (stmt_info))
+    {
+      gimple def_stmt;
+      tree def;
+      gather_decl = vect_check_gather (stmt, loop_vinfo, &gather_base,
+				       &gather_off, &gather_scale);
+      gcc_assert (gather_decl);
+      if (!vect_is_simple_use_1 (gather_off, NULL, loop_vinfo, NULL,
+				 &def_stmt, &def, &gather_dt,
+				 &gather_off_vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "gather index use not simple.");
+	  return false;
+	}
+    }
+  /* Negative step is not supported.  */
+  else if (tree_int_cst_compare (nested_in_vect_loop
+				 ? STMT_VINFO_DR_STEP (stmt_info)
+				 : DR_STEP (dr), size_zero_node) < 0)
+    return false;
+  else if (optab_handler (is_store ? maskstore_optab : maskload_optab,
+			  TYPE_MODE (vectype)) == CODE_FOR_nothing)
+    return false;
+
+  if (TREE_CODE (mask) != SSA_NAME)
+    return false;
+
+  if (!vect_is_simple_use (mask, stmt, loop_vinfo, NULL,
+			   &def_stmt, &def, &dt))
+    return false;
+
+  if (is_store)
+    {
+      tree rhs = gimple_call_arg (stmt, 3);
+      if (!vect_is_simple_use (rhs, stmt, loop_vinfo, NULL,
+			       &def_stmt, &def, &dt))
+	return false;
+    }
+
+  if (!vec_stmt) /* transformation not required.  */
+    {
+      STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
+      return true;
+    }
+
+  /** Transform.  **/
+
+  if (STMT_VINFO_GATHER_P (stmt_info))
+    {
+      tree vec_oprnd0 = NULL_TREE, op;
+      tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gather_decl));
+      tree rettype, srctype, ptrtype, idxtype, masktype, scaletype;
+      tree ptr, vec_mask = NULL_TREE, mask_op, var, scale;
+      tree perm_mask = NULL_TREE, prev_res = NULL_TREE;
+      edge pe = loop_preheader_edge (loop);
+      gimple_seq seq;
+      basic_block new_bb;
+      enum { NARROW, NONE, WIDEN } modifier;
+      int gather_off_nunits = TYPE_VECTOR_SUBPARTS (gather_off_vectype);
+
+      /* Decide whether the index vector needs widening/narrowing
+	 relative to the data vector and build the needed perm mask.  */
+      if (nunits == gather_off_nunits)
+	modifier = NONE;
+      else if (nunits == gather_off_nunits / 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, gather_off_nunits);
+	  modifier = WIDEN;
+
+	  for (i = 0; i < gather_off_nunits; ++i)
+	    sel[i] = i | nunits;
+
+	  perm_mask = vect_gen_perm_mask (gather_off_vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	}
+      else if (nunits == gather_off_nunits * 2)
+	{
+	  unsigned char *sel = XALLOCAVEC (unsigned char, nunits);
+	  modifier = NARROW;
+
+	  for (i = 0; i < nunits; ++i)
+	    sel[i] = i < gather_off_nunits
+		     ? i : i + nunits - gather_off_nunits;
+
+	  perm_mask = vect_gen_perm_mask (vectype, sel);
+	  gcc_assert (perm_mask != NULL_TREE);
+	  ncopies *= 2;
+	}
+      else
+	gcc_unreachable ();
+
+      rettype = TREE_TYPE (TREE_TYPE (gather_decl));
+      srctype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      ptrtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      idxtype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      masktype = TREE_VALUE (arglist); arglist = TREE_CHAIN (arglist);
+      scaletype = TREE_VALUE (arglist);
+      gcc_checking_assert (types_compatible_p (srctype, rettype)
+			   && types_compatible_p (srctype, masktype));
+
+      vec_dest = vect_create_destination_var (gimple_call_lhs (stmt), vectype);
+
+      /* Gimplify the loop-invariant base in the preheader.  */
+      ptr = fold_convert (ptrtype, gather_base);
+      if (!is_gimple_min_invariant (ptr))
+	{
+	  ptr = force_gimple_operand (ptr, &seq, true, NULL_TREE);
+	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+	  gcc_assert (!new_bb);
+	}
+
+      scale = build_int_cst (scaletype, gather_scale);
+
+      prev_stmt_info = NULL;
+      for (j = 0; j < ncopies; ++j)
+	{
+	  if (modifier == WIDEN && (j & 1))
+	    op = permute_vec_elements (vec_oprnd0, vec_oprnd0,
+				       perm_mask, stmt, gsi);
+	  else if (j == 0)
+	    op = vec_oprnd0
+	      = vect_get_vec_def_for_operand (gather_off, stmt, NULL);
+	  else
+	    op = vec_oprnd0
+	      = vect_get_vec_def_for_stmt_copy (gather_dt, vec_oprnd0);
+
+	  if (!useless_type_conversion_p (idxtype, TREE_TYPE (op)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (op))
+			  == TYPE_VECTOR_SUBPARTS (idxtype));
+	      var = vect_get_new_vect_var (idxtype, vect_simple_var, NULL);
+	      var = make_ssa_name (var, NULL);
+	      op = build1 (VIEW_CONVERT_EXPR, idxtype, op);
+	      new_stmt
+		= gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var,
+						op, NULL_TREE);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      op = var;
+	    }
+
+	  if (j == 0)
+	    vec_mask = vect_get_vec_def_for_operand (mask, stmt, NULL);
+	  else
+	    {
+	      vect_is_simple_use (vec_mask, NULL, loop_vinfo, NULL, &def_stmt,
+				  &def, &dt);
+	      vec_mask = vect_get_vec_def_for_stmt_copy (dt, vec_mask);
+	    }
+
+	  mask_op = vec_mask;
+	  if (!useless_type_conversion_p (masktype, TREE_TYPE (vec_mask)))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask_op))
+			  == TYPE_VECTOR_SUBPARTS (masktype));
+	      var = vect_get_new_vect_var (masktype, vect_simple_var, NULL);
+	      var = make_ssa_name (var, NULL);
+	      mask_op = build1 (VIEW_CONVERT_EXPR, masktype, mask_op);
+	      new_stmt
+		= gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var,
+						mask_op, NULL_TREE);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      mask_op = var;
+	    }
+
+	  /* NOTE(review): MASK_OP is passed both as the source operand
+	     (first arg) and as the mask -- presumably masked-off lanes
+	     keep the source value, so its contents don't matter there;
+	     confirm against the gather builtin's signature.  */
+	  new_stmt
+	    = gimple_build_call (gather_decl, 5, mask_op, ptr, op, mask_op,
+				 scale);
+
+	  if (!useless_type_conversion_p (vectype, rettype))
+	    {
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (vectype)
+			  == TYPE_VECTOR_SUBPARTS (rettype));
+	      var = vect_get_new_vect_var (rettype, vect_simple_var, NULL);
+	      op = make_ssa_name (var, new_stmt);
+	      gimple_call_set_lhs (new_stmt, op);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      var = make_ssa_name (vec_dest, NULL);
+	      op = build1 (VIEW_CONVERT_EXPR, vectype, op);
+	      new_stmt
+		= gimple_build_assign_with_ops (VIEW_CONVERT_EXPR, var, op,
+						NULL_TREE);
+	    }
+	  else
+	    {
+	      var = make_ssa_name (vec_dest, new_stmt);
+	      gimple_call_set_lhs (new_stmt, var);
+	    }
+
+	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+	  /* For NARROW, two gathers produce one data vector; combine
+	     every odd result with the saved even one.  */
+	  if (modifier == NARROW)
+	    {
+	      if ((j & 1) == 0)
+		{
+		  prev_res = var;
+		  continue;
+		}
+	      var = permute_vec_elements (prev_res, var,
+					  perm_mask, stmt, gsi);
+	      new_stmt = SSA_NAME_DEF_STMT (var);
+	    }
+
+	  if (prev_stmt_info == NULL)
+	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+	  else
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+	  prev_stmt_info = vinfo_for_stmt (new_stmt);
+	}
+      return true;
+    }
+  else if (is_store)
+    {
+      /* Emit NCOPIES vector MASK_STOREs, bumping the data ref pointer
+	 between copies.  */
+      tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
+      prev_stmt_info = NULL;
+      for (i = 0; i < ncopies; i++)
+	{
+	  unsigned align, misalign;
+
+	  if (i == 0)
+	    {
+	      tree rhs = gimple_call_arg (stmt, 3);
+	      vec_rhs = vect_get_vec_def_for_operand (rhs, stmt, NULL);
+	      vec_mask = vect_get_vec_def_for_operand (mask, stmt, NULL);
+	      /* We should have caught mismatched types earlier.  */
+	      gcc_assert (useless_type_conversion_p (vectype,
+						     TREE_TYPE (vec_rhs)));
+	      dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, NULL,
+						      NULL_TREE, &dummy, gsi,
+						      &ptr_incr, false, &inv_p);
+	      gcc_assert (!inv_p);
+	    }
+	  else
+	    {
+	      vect_is_simple_use (vec_rhs, NULL, loop_vinfo, NULL, &def_stmt,
+				  &def, &dt);
+	      vec_rhs = vect_get_vec_def_for_stmt_copy (dt, vec_rhs);
+	      vect_is_simple_use (vec_mask, NULL, loop_vinfo, NULL, &def_stmt,
+				  &def, &dt);
+	      vec_mask = vect_get_vec_def_for_stmt_copy (dt, vec_mask);
+	      dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
+					     TYPE_SIZE_UNIT (vectype));
+	    }
+
+	  /* Record what is known about the pointer's alignment; unknown
+	     misalignment falls back to element alignment.  */
+	  align = TYPE_ALIGN_UNIT (vectype);
+	  if (aligned_access_p (dr))
+	    misalign = 0;
+	  else if (DR_MISALIGNMENT (dr) == -1)
+	    {
+	      align = TYPE_ALIGN_UNIT (elem_type);
+	      misalign = 0;
+	    }
+	  else
+	    misalign = DR_MISALIGNMENT (dr);
+	  set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
+				  misalign);
+	  new_stmt
+	    = gimple_build_call_internal (IFN_MASK_STORE, 4, dataref_ptr,
+					  gimple_call_arg (stmt, 1),
+					  vec_mask, vec_rhs);
+	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	  if (i == 0)
+	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+	  else
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+	  prev_stmt_info = vinfo_for_stmt (new_stmt);
+	}
+    }
+  else
+    {
+      /* Emit NCOPIES vector MASK_LOADs, bumping the data ref pointer
+	 between copies.  */
+      tree vec_mask = NULL_TREE;
+      prev_stmt_info = NULL;
+      vec_dest = vect_create_destination_var (gimple_call_lhs (stmt), vectype);
+      for (i = 0; i < ncopies; i++)
+	{
+	  unsigned align, misalign;
+
+	  if (i == 0)
+	    {
+	      vec_mask = vect_get_vec_def_for_operand (mask, stmt, NULL);
+	      dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, NULL,
+						      NULL_TREE, &dummy, gsi,
+						      &ptr_incr, false, &inv_p);
+	      gcc_assert (!inv_p);
+	    }
+	  else
+	    {
+	      vect_is_simple_use (vec_mask, NULL, loop_vinfo, NULL, &def_stmt,
+				  &def, &dt);
+	      vec_mask = vect_get_vec_def_for_stmt_copy (dt, vec_mask);
+	      dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
+					     TYPE_SIZE_UNIT (vectype));
+	    }
+
+	  /* Record what is known about the pointer's alignment; unknown
+	     misalignment falls back to element alignment.  */
+	  align = TYPE_ALIGN_UNIT (vectype);
+	  if (aligned_access_p (dr))
+	    misalign = 0;
+	  else if (DR_MISALIGNMENT (dr) == -1)
+	    {
+	      align = TYPE_ALIGN_UNIT (elem_type);
+	      misalign = 0;
+	    }
+	  else
+	    misalign = DR_MISALIGNMENT (dr);
+	  set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
+				  misalign);
+	  new_stmt
+	    = gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
+					  gimple_call_arg (stmt, 1),
+					  vec_mask);
+	  gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest, NULL));
+	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	  if (i == 0)
+	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+	  else
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+	  prev_stmt_info = vinfo_for_stmt (new_stmt);
+	}
+    }
+
+  return true;
+}
+
+
 /* Function vectorizable_call.
 
    Check if STMT performs a function call that can be vectorized.
@@ -1735,10 +2150,16 @@  vectorizable_call (gimple stmt, gimple_s
   if (!is_gimple_call (stmt))
     return false;
 
-  if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
+  if (stmt_can_throw_internal (stmt))
     return false;
 
-  if (stmt_can_throw_internal (stmt))
+  if (gimple_call_internal_p (stmt)
+      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
+	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
+    return vectorizable_mask_load_store (stmt, gsi, vec_stmt,
+					 slp_node);
+
+  if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
     return false;
 
   vectype_out = STMT_VINFO_VECTYPE (stmt_info);
@@ -3442,10 +3863,6 @@  vectorizable_shift (gimple stmt, gimple_
 }
 
 
-static tree permute_vec_elements (tree, tree, tree, gimple,
-				  gimple_stmt_iterator *);
-
-
 /* Function vectorizable_operation.
 
    Check if STMT performs a binary, unary or ternary operation that can
@@ -5836,6 +6253,10 @@  vect_transform_stmt (gimple stmt, gimple
     case call_vec_info_type:
       done = vectorizable_call (stmt, gsi, &vec_stmt, slp_node);
       stmt = gsi_stmt (*gsi);
+      if (is_gimple_call (stmt)
+	  && gimple_call_internal_p (stmt)
+	  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	is_store = true;
       break;
 
     case reduc_vec_info_type: