Patchwork [gomp4] Some progress on #pragma omp simd

login
register
mail settings
Submitter Jakub Jelinek
Date June 11, 2013, 4:56 p.m.
Message ID <20130611165616.GG2336@tucnak.redhat.com>
Download mbox | patch
Permalink /patch/250576/
State New
Headers show

Comments

Jakub Jelinek - June 11, 2013, 4:56 p.m.
On Sat, Apr 27, 2013 at 08:17:35PM +0200, Jakub Jelinek wrote:
> One way we could implement the SIMD private/lastprivate/reduction
> vars and for Cilk+ also firstprivate ones might be:
> - query the target what the maximum possible vectorization factor for the
>   loop is (and min that with simdlen if any), let's call it MAXVF
...

I've made some progress on this, but am still far from being there, so
looking for comments.  Known unhandled things are that the inliner doesn't
remap the simd uids (neither in loop structure nor in
__builtin_GOMP.simd_{vf,lane} arguments), the 
adjust_simduid_builtins subpass doesn't actually try to shrink the sizes
of the arrays back to the decided vectorization factor, LTO doesn't handle
simd uids, and most importantly that the vectorizer still does a poor job on
it.  Also, not sure what to do for lastprivate, probably use the magic
arrays and just in the epilogue of the loop compute which of the array items
belonged to the last iteration somehow.

I was using:
int a, b[1024];
struct S { S (); ~S (); int s; };

#pragma omp declare simd
__attribute__((noinline, noclone)) void
bar (int &x, int &y)
{
  x += 4;
  y += 4;
}

int
foo (void)
{
  int i, j = 0, x;
  S s;
  #pragma omp simd private(x) private(s) reduction(+:j)
  for (i = 0; i < 1024; i++)
    {
      a = 6;
      x = 8;
      s.s += 1;
      b[i] += a + x + s.s;
      j += b[i];
    }
  return j;
}
as a testcase, x being non-addressable scalar is just handled
normally, it shouldn't be observable if we have just one copy or more.
s, being addressable class that also needs to be constructed/destructed is
using the magic arrays and so is reduction.  While the vectorizer can
recognize some reductions, e.g. without -ffast-math it will not vectorize
any floating point ones because that means changing the order of
computations, while when they are mandated to be one copy per simd lane,
the order of computations is clear and thus can be vectorized.
Also, especially with user defined reductions the reduction can be a
function call etc.

So, what I get out of ompexp on this is an initializing loop like:
  <bb 3>:
  D.2719 = __builtin_GOMP.simd_vf (1);
  D.2713 = 0;
  goto <bb 5>;

  <bb 4>:
  D.2715[D.2713] = 0;
  D.2718 = &D.2717[D.2713];
  S::S (D.2718);
  D.2713 = D.2713 + 1;
  
  <bb 5>:
  if (D.2713 < D.2719)
    goto <bb 4>;
  else
    goto <bb 6>;
then the main loop:
  <bb 6>:
  i = 0;
  goto <bb 8>;

  <bb 7>:
  D.2714 = __builtin_GOMP.simd_lane (1);
  a = 6;
  x = 8;
  D.2701 = D.2717[D.2714].s;
  D.2702 = D.2701 + 1;
  D.2717[D.2714].s = D.2702;
  D.2703 = b[i];
  a.0 = a;
  D.2705 = a.0 + x;
  D.2701 = D.2717[D.2714].s;
  D.2706 = D.2705 + D.2701;
  D.2707 = D.2703 + D.2706;
  b[i] = D.2707;
  D.2703 = b[i];
  D.2727 = D.2715[D.2714];
  D.2728 = D.2703 + D.2727;
  D.2715[D.2714] = D.2728;
  i = i + 1;

  <bb 8>:
  if (i < 1024)
    goto <bb 7>;
  else
    goto <bb 9>;
and lastly a destruction/reduction loop:
  <bb 9>:
  D.2723 = __builtin_GOMP.simd_vf (1);
  D.2713 = 0;
  goto <bb 11>;

  <bb 10>:
  D.2716 = D.2715[D.2713];
  j = j + D.2716;
  D.2718 = &D.2717[D.2713];
  S::~S (D.2718);
  D.2713 = D.2713 + 1;

  <bb 11>:
  if (D.2713 < D.2723)
    goto <bb 10>;
  else
    goto <bb 12>;

I've discussed with richi on IRC the vectorizer data ref stuff somewhat,
but in the end kept this special function simd_lane, as the array indexes
modulo vectorization factor are nothing close to what the vectorizer was able
to handle.  What I get in *.optimized is:
  <bb 5>:
  
  <bb 6>:
  # ivtmp.41_45 = PHI <ivtmp.41_32(5), ivtmp.41_64(11)>
  vect__17.10_7 = MEM[(struct S[32] *)&D.2717];
  vect__18.11_1 = vect__17.10_7 + { 1, 1, 1, 1, 1, 1, 1, 1 };
  MEM[(struct S[32] *)&D.2717] = vect__18.11_1;
  _65 = (void *) ivtmp.41_45;
  vect__20.17_50 = MEM[base: _65, offset: 0B];
  vect__21.18_52 = vect__17.10_7 + { 15, 15, 15, 15, 15, 15, 15, 15 };
  vect__22.20_53 = vect__20.17_50 + vect__21.18_52;
  MEM[base: _65, offset: 0B] = vect__22.20_53;
  vect__24.25_58 = MEM[(int[32] *)&D.2715];
  vect__25.26_59 = vect__22.20_53 + vect__24.25_58;
  MEM[(int[32] *)&D.2715] = vect__25.26_59;
  ivtmp.41_32 = ivtmp.41_45 + 32;
  if (ivtmp.41_32 != _20)
    goto <bb 5>;
  else
    goto <bb 7>;
though, there is no pass that would be able to turn the memory accesses
into a vector load before the loop and just operations on vector register
followed by vector store at the end of the loop, and while CSE is able to
improve it somewhat at the RTL level, I still end up with unnecessary stack
stores inside of the loop.
        vmovdqa 160(%rsp), %ymm0
        movl    $b, %eax
        vmovdqa .LC0(%rip), %ymm3
        vmovdqa .LC1(%rip), %ymm2
.L8:
        vmovdqa %ymm0, %ymm1
        addq    $32, %rax
        vpaddd  %ymm3, %ymm0, %ymm0
        vmovdqa %ymm0, 160(%rsp)	! This is not needed
        vpaddd  %ymm2, %ymm1, %ymm1
        vpaddd  -32(%rax), %ymm1, %ymm1
        vmovdqa %ymm1, -32(%rax)
        cmpq    $b+4096, %rax
        vpaddd  32(%rsp), %ymm1, %ymm1	! 32(%rsp) could be read before loop
        vmovdqa %ymm1, 32(%rsp)		! This is not needed
        jne     .L8
! and we could just store %ymm0 to 160(%rsp) here
and %ymm4 to 32(%rsp).

Thoughts?



	Jakub
Aldy Hernandez - June 13, 2013, 8:15 p.m.
> it.  Also, not sure what to do for lastprivate, probably use the magic
> arrays and just in the epilogue of the loop compute which of the array items
> belonged to the last iteration somehow.

Can't you do (for lastprivate(abc) something like:

	if (i == 1024) {
		abc = magic_abc[__builtin_GOMP.simd_lane (1)];
	}

> #pragma omp declare simd
> __attribute__((noinline, noclone)) void
> bar (int &x, int &y)
> {
>    x += 4;
>    y += 4;
> }

Does bar() have anything to do with this example, or was this an oversight?

> using the magic arrays and so is reduction.  While the vectorizer can
> recognize some reductions, e.g. without -ffast-math it will not vectorize
> any floating point ones because that means changing the order of
> computations, while when they are mandated to be one copy per simd lane,
> the order of computations is clear and thus can be vectorized.

Let me see if I understand (all things floating point confuse me). 
You're saying that the vectorizer, in its present state will refuse to 
vectorize reductions with floats because it may possibly change the 
order of computations, but we should override that behavior for OMP simd 
loops?

>    D.2717[D.2714].s = D.2702;
>    D.2703 = b[i];
>    a.0 = a;
>    D.2705 = a.0 + x;
>    D.2701 = D.2717[D.2714].s;

Is there some subtlety in which we have to dereference D.2717 twice 
here, or can we reuse D.2702?

Aldy
Jakub Jelinek - June 13, 2013, 8:20 p.m.
On Thu, Jun 13, 2013 at 03:15:45PM -0500, Aldy Hernandez wrote:
> 
> >it.  Also, not sure what to do for lastprivate, probably use the magic
> >arrays and just in the epilogue of the loop compute which of the array items
> >belonged to the last iteration somehow.
> 
> Can't you do (for lastprivate(abc) something like:
> 
> 	if (i == 1024) {
> 		abc = magic_abc[__builtin_GOMP.simd_lane (1)];
> 	}

Well, if you do that inside of the loop, you make it probably not
vectorizable.  So you need something like:
abc = magic_abc[(count - 1) & (__builtin_GOMP.simd_vf (1) - 1)];
or so.

> >#pragma omp declare simd
> >__attribute__((noinline, noclone)) void
> >bar (int &x, int &y)
> >{
> >   x += 4;
> >   y += 4;
> >}
> 
> Does bar() have anything to do with this example, or was this an oversight?

It was there just to make the stuff addressable during gimplification, and
possibly no longer addressable afterwards.

> >using the magic arrays and so is reduction.  While the vectorizer can
> >recognize some reductions, e.g. without -ffast-math it will not vectorize
> >any floating point ones because that means changing the order of
> >computations, while when they are mandated to be one copy per simd lane,
> >the order of computations is clear and thus can be vectorized.
> 
> Let me see if I understand (all things floating point confuse me).
> You're saying that the vectorizer, in its present state will refuse
> to vectorize reductions with floats because it may possibly change
> the order of computations, but we should override that behavior for
> OMP simd loops?

No, I'm saying that in simd loops the order of computations is different
(and depending on the vectorization factor), as each SIMD lane is supposed
to have its own private variable and at the end everything is reduced
together.

> >   D.2717[D.2714].s = D.2702;
> >   D.2703 = b[i];
> >   a.0 = a;
> >   D.2705 = a.0 + x;
> >   D.2701 = D.2717[D.2714].s;
> 
> Is there some subtlety in which we have to dereference D.2717 twice
> here, or can we reuse D.2702?

Usually it is FRE/PRE that optimizes at least the loads, and DSE stores,
but FRE/PRE isn't run after vectorization I think.

	Jakub

Patch

--- gcc/tree-vectorizer.h.jj	2013-05-20 13:21:34.000000000 +0200
+++ gcc/tree-vectorizer.h	2013-06-11 10:01:12.539179209 +0200
@@ -576,6 +576,9 @@  typedef struct _stmt_vec_info {
   /* For loads only, true if this is a gather load.  */
   bool gather_p;
   bool stride_load_p;
+
+  /* For both loads and stores.  */
+  bool simd_lane_access_p;
 } *stmt_vec_info;
 
 /* Access Functions.  */
@@ -591,6 +594,7 @@  typedef struct _stmt_vec_info {
 #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
 #define STMT_VINFO_GATHER_P(S)		   (S)->gather_p
 #define STMT_VINFO_STRIDE_LOAD_P(S)	   (S)->stride_load_p
+#define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
 
 #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_base_address
 #define STMT_VINFO_DR_INIT(S)              (S)->dr_init
--- gcc/Makefile.in.jj	2013-05-20 13:21:43.000000000 +0200
+++ gcc/Makefile.in	2013-05-20 13:21:43.000000000 +0200
@@ -2633,7 +2633,7 @@  tree-vect-data-refs.o: tree-vect-data-re
 tree-vectorizer.o: tree-vectorizer.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(DUMPFILE_H) $(TM_H) $(GGC_H) $(TREE_H) $(TREE_FLOW_H) \
    $(CFGLOOP_H) $(TREE_PASS_H) $(TREE_VECTORIZER_H) \
-   $(TREE_PRETTY_PRINT_H)
+   $(TREE_PRETTY_PRINT_H) $(HASH_TABLE_H) tree-ssa-propagate.h
 tree-loop-distribution.o: tree-loop-distribution.c $(CONFIG_H) $(SYSTEM_H) \
    coretypes.h $(TREE_FLOW_H) $(CFGLOOP_H) $(TREE_DATA_REF_H) $(TREE_PASS_H)
 tree-parloops.o: tree-parloops.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
--- gcc/tree-data-ref.c.jj	2013-05-13 16:49:07.000000000 +0200
+++ gcc/tree-data-ref.c	2013-06-10 18:18:10.865489362 +0200
@@ -4331,10 +4331,24 @@  get_references_in_stmt (gimple stmt, vec
   /* ASM_EXPR and CALL_EXPR may embed arbitrary side effects.
      As we cannot model data-references to not spelled out
      accesses give up if they may occur.  */
-  if ((stmt_code == GIMPLE_CALL
-       && !(gimple_call_flags (stmt) & ECF_CONST))
-      || (stmt_code == GIMPLE_ASM
-	  && (gimple_asm_volatile_p (stmt) || gimple_vuse (stmt))))
+  if (stmt_code == GIMPLE_CALL
+      && !(gimple_call_flags (stmt) & ECF_CONST))
+    {
+      /* Allow __builtin_GOMP.simd_lane in their own loops.  */
+      if (!gimple_call_builtin_p (stmt, BUILT_IN_GOMP_SIMD_LANE))
+	clobbers_memory = true;
+      else
+	{
+	  struct loop *loop = gimple_bb (stmt)->loop_father;
+	  tree uid = gimple_call_arg (stmt, 0);
+	  if (loop == NULL
+	      || !host_integerp (uid, 1)
+	      || loop->simduid != tree_low_cst (uid, 1))
+	    clobbers_memory = true;
+	}
+    }
+  else if (stmt_code == GIMPLE_ASM
+	   && (gimple_asm_volatile_p (stmt) || gimple_vuse (stmt)))
     clobbers_memory = true;
 
   if (!gimple_vuse (stmt))
--- gcc/tree-vect-data-refs.c.jj	2013-05-20 13:21:36.000000000 +0200
+++ gcc/tree-vect-data-refs.c	2013-06-11 14:04:33.043359212 +0200
@@ -2877,6 +2877,7 @@  vect_analyze_data_refs (loop_vec_info lo
       stmt_vec_info stmt_info;
       tree base, offset, init;
       bool gather = false;
+      bool simd_lane_access = false;
       int vf;
 
       if (!dr || !DR_REF (dr))
@@ -2894,12 +2895,17 @@  vect_analyze_data_refs (loop_vec_info lo
       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
 	  || !DR_STEP (dr))
         {
-	  /* If target supports vector gather loads, see if they can't
-	     be used.  */
-	  if (loop_vinfo
-	      && DR_IS_READ (dr)
+	  bool maybe_gather
+	    = DR_IS_READ (dr)
 	      && !TREE_THIS_VOLATILE (DR_REF (dr))
-	      && targetm.vectorize.builtin_gather != NULL
+	      && targetm.vectorize.builtin_gather != NULL;
+	  bool maybe_simd_lane_access
+	    = loop_vinfo && loop->simduid;
+
+	  /* If target supports vector gather loads, or if this might be
+	     a SIMD lane access, see if they can't be used.  */
+	  if (loop_vinfo
+	      && (maybe_gather || maybe_simd_lane_access)
 	      && !nested_in_vect_loop_p (loop, stmt))
 	    {
 	      struct data_reference *newdr
@@ -2912,14 +2918,56 @@  vect_analyze_data_refs (loop_vec_info lo
 		  && DR_STEP (newdr)
 		  && integer_zerop (DR_STEP (newdr)))
 		{
-		  dr = newdr;
-		  gather = true;
+		  if (maybe_simd_lane_access)
+		    {
+		      tree off = DR_OFFSET (newdr);
+		      STRIP_NOPS (off);
+		      if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
+			  && TREE_CODE (off) == MULT_EXPR
+			  && host_integerp (TREE_OPERAND (off, 1), 1))
+			{
+			  tree step = TREE_OPERAND (off, 1);
+			  off = TREE_OPERAND (off, 0);
+			  STRIP_NOPS (off);
+			  if (CONVERT_EXPR_P (off)
+			      && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
+									  0)))
+				 < TYPE_PRECISION (TREE_TYPE (off)))
+			    off = TREE_OPERAND (off, 0);
+			  if (TREE_CODE (off) == SSA_NAME)
+			    {
+			      gimple def = SSA_NAME_DEF_STMT (off);
+			      tree reft = TREE_TYPE (DR_REF (newdr));
+			      if (gimple_call_builtin_p (def,
+						BUILT_IN_GOMP_SIMD_LANE)
+				  && host_integerp (gimple_call_arg (def, 0),
+						    1)
+				  && (unsigned)
+				     tree_low_cst (gimple_call_arg (def, 0), 1)
+				     == loop->simduid
+				  /* For now.  */
+				  && tree_int_cst_equal (TYPE_SIZE_UNIT (reft),
+							 step))
+				{
+				  DR_OFFSET (newdr) = ssize_int (0);
+				  DR_STEP (newdr) = step;
+				  dr = newdr;
+				  simd_lane_access = true;
+				}
+			    }
+			}
+		    }
+		  if (!simd_lane_access && maybe_gather)
+		    {
+		      dr = newdr;
+		      gather = true;
+		    }
 		}
-	      else
+	      if (!gather && !simd_lane_access)
 		free_data_ref (newdr);
 	    }
 
-	  if (!gather)
+	  if (!gather && !simd_lane_access)
 	    {
 	      if (dump_enabled_p ())
 		{
@@ -2946,7 +2994,7 @@  vect_analyze_data_refs (loop_vec_info lo
           if (bb_vinfo)
 	    break;
 
-	  if (gather)
+	  if (gather || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
         }
@@ -2979,7 +3027,7 @@  vect_analyze_data_refs (loop_vec_info lo
           if (bb_vinfo)
 	    break;
 
-	  if (gather)
+	  if (gather || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
@@ -2998,7 +3046,7 @@  vect_analyze_data_refs (loop_vec_info lo
           if (bb_vinfo)
 	    break;
 
-	  if (gather)
+	  if (gather || simd_lane_access)
 	    free_data_ref (dr);
           return false;
 	}
@@ -3019,7 +3067,7 @@  vect_analyze_data_refs (loop_vec_info lo
 	  if (bb_vinfo)
 	    break;
 
-	  if (gather)
+	  if (gather || simd_lane_access)
 	    free_data_ref (dr);
 	  return false;
 	}
@@ -3154,12 +3202,17 @@  vect_analyze_data_refs (loop_vec_info lo
           if (bb_vinfo)
 	    break;
 
-	  if (gather)
+	  if (gather || simd_lane_access)
 	    free_data_ref (dr);
           return false;
         }
 
       STMT_VINFO_DATA_REF (stmt_info) = dr;
+      if (simd_lane_access)
+	{
+	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
+	  datarefs[i] = dr;
+	}
 
       /* Set vectype for STMT.  */
       scalar_type = TREE_TYPE (DR_REF (dr));
@@ -3180,7 +3233,7 @@  vect_analyze_data_refs (loop_vec_info lo
           if (bb_vinfo)
 	    break;
 
-	  if (gather)
+	  if (gather || simd_lane_access)
 	    {
 	      STMT_VINFO_DATA_REF (stmt_info) = NULL;
 	      free_data_ref (dr);
--- gcc/tree-vectorizer.c.jj	2013-05-20 13:21:34.000000000 +0200
+++ gcc/tree-vectorizer.c	2013-06-11 18:34:30.162161013 +0200
@@ -66,13 +66,99 @@  along with GCC; see the file COPYING3.
 #include "cfgloop.h"
 #include "tree-vectorizer.h"
 #include "tree-pass.h"
+#include "hash-table.h"
+#include "tree-ssa-propagate.h"
 
 /* Loop or bb location.  */
 LOC vect_location;
 
 /* Vector mapping GIMPLE stmt to stmt_vec_info. */
 vec<vec_void_p> stmt_vec_info_vec;
+
+/* For mapping simduid to vectorization factor.  */
+
+struct simduid_to_vf : typed_free_remove<simduid_to_vf>
+{
+  unsigned int simduid;
+  int vf;
 
+  /* hash_table support.  */
+  typedef simduid_to_vf value_type;
+  typedef simduid_to_vf compare_type;
+  static inline hashval_t hash (const value_type *);
+  static inline int equal (const value_type *, const compare_type *);
+};
+
+inline hashval_t
+simduid_to_vf::hash (const value_type *p)
+{
+  return p->simduid;
+}
+
+inline int
+simduid_to_vf::equal (const value_type *p1, const value_type *p2)
+{
+  return p1->simduid == p2->simduid;
+}
+
+static void
+adjust_simduid_builtins (hash_table <simduid_to_vf> &htab)
+{
+  basic_block bb;
+
+  FOR_EACH_BB (bb)
+    {
+      gimple_stmt_iterator i;
+
+      for (i = gsi_start_bb (bb); !gsi_end_p (i); gsi_next (&i))
+	{
+	  unsigned int vf = 1;
+	  bool is_lane = false;
+	  gimple stmt = gsi_stmt (i);
+	  if (!is_gimple_call (stmt))
+	    continue;
+	  if (gimple_call_builtin_p (stmt, BUILT_IN_GOMP_SIMD_LANE))
+	    is_lane = true;
+	  else if (!gimple_call_builtin_p (stmt, BUILT_IN_GOMP_SIMD_VF))
+	    continue;
+	  gcc_assert (host_integerp (gimple_call_arg (stmt, 0), 1));
+	  simduid_to_vf *p = NULL, data;
+	  data.simduid = tree_low_cst (gimple_call_arg (stmt, 0), 1);
+	  if (htab.is_created ())
+	    p = htab.find (&data);
+	  if (p)
+	    vf = p->vf;
+	  if (!is_lane)
+	    update_call_from_tree (&i, build_int_cst (unsigned_type_node, vf));
+	  else if (vf == 1)
+	    update_call_from_tree (&i, build_int_cst (unsigned_type_node, 0));
+	  else
+	    {
+	      struct loop *loop = bb->loop_father;
+	      gcc_assert (loop && loop->header);
+	      tree result = make_ssa_name (unsigned_type_node, NULL);
+	      tree incremented = make_ssa_name (unsigned_type_node, NULL);
+	      tree masked = make_ssa_name (unsigned_type_node, NULL);
+	      tree zero = build_int_cst (unsigned_type_node, 0);
+	      tree one = build_int_cst (unsigned_type_node, 1);
+	      tree vfm1 = build_int_cst (unsigned_type_node, vf - 1);
+	      gimple phi = create_phi_node (result, loop->header);
+	      edge e;
+	      edge_iterator ei;
+	      FOR_EACH_EDGE (e, ei, loop->header->preds)
+		add_phi_arg (phi, e->src == loop->latch ? masked : zero, e,
+			     UNKNOWN_LOCATION);
+	      update_call_from_tree (&i, result);
+	      gimple g = gimple_build_assign_with_ops (PLUS_EXPR, incremented,
+						       result, one);
+	      gsi_insert_after (&i, g, GSI_NEW_STMT);
+	      g = gimple_build_assign_with_ops (BIT_AND_EXPR, masked,
+						incremented, vfm1);
+	      gsi_insert_after (&i, g, GSI_NEW_STMT);
+	    }
+	}
+    }
+}
 
 /* Function vectorize_loops.
 
@@ -86,12 +172,17 @@  vectorize_loops (void)
   unsigned int vect_loops_num;
   loop_iterator li;
   struct loop *loop;
+  hash_table <simduid_to_vf> simduid_to_vf_htab;
 
   vect_loops_num = number_of_loops (cfun);
 
   /* Bail out if there are no loops.  */
   if (vect_loops_num <= 1)
-    return 0;
+    {
+      if (cfun->has_simduid_loops)
+	adjust_simduid_builtins (simduid_to_vf_htab);
+      return 0;
+    }
 
   init_stmt_vec_info_vec ();
 
@@ -126,6 +217,17 @@  vectorize_loops (void)
 	/* Now that the loop has been vectorized, allow it to be unrolled
 	   etc.  */
 	loop->force_vect = false;
+
+	if (loop->simduid)
+	  {
+	    simduid_to_vf *simduid_to_vf_data = XNEW (simduid_to_vf);
+	    if (!simduid_to_vf_htab.is_created ())
+	      simduid_to_vf_htab.create (15);
+	    simduid_to_vf_data->simduid = loop->simduid;
+	    simduid_to_vf_data->vf = loop_vinfo->vectorization_factor;
+	    *simduid_to_vf_htab.find_slot (simduid_to_vf_data, INSERT)
+	      = simduid_to_vf_data;
+	  }
       }
 
   vect_location = UNKNOWN_LOC;
@@ -153,6 +255,11 @@  vectorize_loops (void)
 
   free_stmt_vec_info_vec ();
 
+  if (cfun->has_simduid_loops)
+    adjust_simduid_builtins (simduid_to_vf_htab);
+  if (simduid_to_vf_htab.is_created ())
+    simduid_to_vf_htab.dispose ();
+
   if (num_vectorized_loops > 0)
     {
       /* If we vectorized any loop only virtual SSA form needs to be updated.
--- gcc/gimplify.c.jj	2013-06-04 20:55:56.000000000 +0200
+++ gcc/gimplify.c	2013-06-10 10:48:20.040213161 +0200
@@ -6898,7 +6898,7 @@  gimplify_omp_for (tree *expr_p, gimple_s
 						 (splay_tree_key)decl);
 	  if (n != NULL && (n->value & GOVD_DATA_SHARE_CLASS) != 0)
 	    omp_notice_variable (gimplify_omp_ctxp, decl, true);
-	  else
+	  else if (TREE_VEC_LENGTH (OMP_FOR_INIT (for_stmt)) == 1)
 	    {
 	      c = build_omp_clause (input_location, OMP_CLAUSE_LINEAR);
 	      OMP_CLAUSE_LINEAR_NO_COPYIN (c) = 1;
@@ -6911,6 +6911,20 @@  gimplify_omp_for (tree *expr_p, gimple_s
 	      omp_add_variable (gimplify_omp_ctxp, decl,
 				GOVD_LINEAR | GOVD_EXPLICIT | GOVD_SEEN);
 	    }
+	  else
+	    {
+	      bool lastprivate
+		= (!has_decl_expr
+		   || !bitmap_bit_p (has_decl_expr, DECL_UID (decl)));
+	      c = build_omp_clause (input_location,
+				    lastprivate ? OMP_CLAUSE_LASTPRIVATE
+						: OMP_CLAUSE_PRIVATE);
+	      OMP_CLAUSE_DECL (c) = decl;
+	      OMP_CLAUSE_CHAIN (c) = OMP_FOR_CLAUSES (for_stmt);
+	      omp_add_variable (gimplify_omp_ctxp, decl,
+				(lastprivate ? GOVD_LASTPRIVATE : GOVD_PRIVATE)
+				| GOVD_EXPLICIT | GOVD_SEEN);
+	    }
 	}
       else if (is_private)
 	omp_notice_variable (gimplify_omp_ctxp, decl, true);
--- gcc/fortran/f95-lang.c.jj	2013-05-13 16:48:52.000000000 +0200
+++ gcc/fortran/f95-lang.c	2013-06-10 16:24:33.395248031 +0200
@@ -538,6 +538,7 @@  gfc_builtin_function (tree decl)
 #define ATTR_CONST_NOTHROW_LEAF_LIST	(ECF_NOTHROW | ECF_LEAF | ECF_CONST)
 #define ATTR_NOTHROW_LIST		(ECF_NOTHROW)
 #define ATTR_CONST_NOTHROW_LIST		(ECF_NOTHROW | ECF_CONST)
+#define ATTR_NOVOPS_NOTHROW_LEAF_LIST	(ECF_NOTHROW | ECF_LEAF | ECF_NOVOPS)
 
 static void
 gfc_define_builtin (const char *name, tree type, enum built_in_function code,
--- gcc/tree.c.jj	2013-06-04 20:55:56.000000000 +0200
+++ gcc/tree.c	2013-06-10 12:17:53.082094837 +0200
@@ -263,7 +263,8 @@  unsigned const char omp_clause_num_ops[]
   0, /* OMP_CLAUSE_FOR  */
   0, /* OMP_CLAUSE_PARALLEL  */
   0, /* OMP_CLAUSE_SECTIONS  */
-  0  /* OMP_CLAUSE_TASKGROUP  */
+  0, /* OMP_CLAUSE_TASKGROUP  */
+  1, /* OMP_CLAUSE__SIMDUID_  */
 };
 
 const char * const omp_clause_code_name[] =
@@ -304,7 +305,8 @@  const char * const omp_clause_code_name[
   "for",
   "parallel",
   "sections",
-  "taskgroup"
+  "taskgroup",
+  "_simduid_"
 };
 
 
@@ -11018,6 +11020,7 @@  walk_tree_1 (tree *tp, walk_tree_fn func
 	case OMP_CLAUSE_DIST_SCHEDULE:
 	case OMP_CLAUSE_SAFELEN:
 	case OMP_CLAUSE_SIMDLEN:
+	case OMP_CLAUSE__SIMDUID_:
 	  WALK_SUBTREE (OMP_CLAUSE_OPERAND (*tp, 0));
 	  /* FALLTHRU */
 
--- gcc/tree-pretty-print.c.jj	2013-06-04 20:55:56.000000000 +0200
+++ gcc/tree-pretty-print.c	2013-06-10 12:17:06.775835897 +0200
@@ -585,6 +585,13 @@  dump_omp_clause (pretty_printer *buffer,
       pp_character (buffer, ')');
       break;
 
+    case OMP_CLAUSE__SIMDUID_:
+      pp_string (buffer, "_simduid_(");
+      dump_generic_node (buffer, OMP_CLAUSE__SIMDUID__UID (clause),
+			 spc, flags, false);
+      pp_character (buffer, ')');
+      break;
+
     case OMP_CLAUSE_INBRANCH:
       pp_string (buffer, "inbranch");
       break;
--- gcc/cfgloop.h.jj	2013-05-14 15:30:08.000000000 +0200
+++ gcc/cfgloop.h	2013-06-10 18:17:23.463286893 +0200
@@ -174,6 +174,10 @@  struct GTY ((chain_next ("%h.next"))) lo
      of the loop can be safely evaluated concurrently.  */
   int safelen;
 
+  /* For SIMD loops, this is a unique identifier of the loop, referenced
+     by __builtin_GOMP.simd_vf and __builtin_GOMP.simd_lane builtins.  */
+  unsigned int simduid;
+
   /* True if we should try harder to vectorize this loop.  */
   bool force_vect;
 
--- gcc/builtin-attrs.def.jj	2013-03-20 10:05:01.000000000 +0100
+++ gcc/builtin-attrs.def	2013-06-10 16:16:43.580971774 +0200
@@ -112,6 +112,9 @@  DEF_ATTR_TREE_LIST (ATTR_NOVOPS_LIST, AT
 
 DEF_ATTR_TREE_LIST (ATTR_NOVOPS_LEAF_LIST, ATTR_LEAF, ATTR_NULL, ATTR_NOVOPS_LIST)
 
+DEF_ATTR_TREE_LIST (ATTR_NOVOPS_NOTHROW_LEAF_LIST, ATTR_NOTHROW, ATTR_NULL, \
+		    ATTR_NOVOPS_LEAF_LIST)
+
 DEF_ATTR_TREE_LIST (ATTR_LEAF_LIST, ATTR_LEAF, ATTR_NULL, ATTR_NULL)
 
 DEF_ATTR_TREE_LIST (ATTR_NOTHROW_LIST, ATTR_NOTHROW, ATTR_NULL, ATTR_NULL)
--- gcc/function.h.jj	2013-05-20 13:21:34.000000000 +0200
+++ gcc/function.h	2013-06-11 15:04:43.218100041 +0200
@@ -654,6 +654,10 @@  struct GTY(()) function {
   /* Nonzero if the current function contains any loops with
      loop->force_vect set.  */
   unsigned int has_force_vect_loops : 1;
+
+  /* Nonzero if the current function contains any loops with
+     nonzero value in loop->simduid.  */
+  unsigned int has_simduid_loops : 1;
 };
 
 /* Add the decl D to the local_decls list of FUN.  */
--- gcc/tree.h.jj	2013-06-06 18:48:23.000000000 +0200
+++ gcc/tree.h	2013-06-10 16:17:36.688105238 +0200
@@ -450,7 +450,10 @@  enum omp_clause_code
   OMP_CLAUSE_SECTIONS,
 
   /* OpenMP clause: taskgroup.  */
-  OMP_CLAUSE_TASKGROUP
+  OMP_CLAUSE_TASKGROUP,
+
+  /* Internally used only clause, holding SIMD uid.  */
+  OMP_CLAUSE__SIMDUID_
 };
 
 /* The definition of tree nodes fills the next several pages.  */
@@ -1991,6 +1994,9 @@  extern void protected_set_expr_location
 #define OMP_CLAUSE_SIMDLEN_EXPR(NODE) \
   OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_SIMDLEN), 0)
 
+#define OMP_CLAUSE__SIMDUID__UID(NODE) \
+  OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE__SIMDUID_), 0)
+
 enum omp_clause_schedule_kind
 {
   OMP_CLAUSE_SCHEDULE_STATIC,
--- gcc/omp-builtins.def.jj	2013-05-29 10:07:14.000000000 +0200
+++ gcc/omp-builtins.def	2013-06-10 16:17:28.319240363 +0200
@@ -218,3 +218,8 @@  DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SINGLE_C
 		  BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST)
 DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SINGLE_COPY_END, "GOMP_single_copy_end",
 		  BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
+
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SIMD_LANE, "GOMP.simd_lane",
+		  BT_FN_UINT_UINT, ATTR_NOVOPS_NOTHROW_LEAF_LIST)
+DEF_GOMP_BUILTIN (BUILT_IN_GOMP_SIMD_VF, "GOMP.simd_vf",
+		  BT_FN_UINT_UINT, ATTR_CONST_NOTHROW_LEAF_LIST)
--- gcc/tree-vect-stmts.c.jj	2013-05-13 16:49:39.000000000 +0200
+++ gcc/tree-vect-stmts.c	2013-06-11 14:48:11.788834092 +0200
@@ -4041,7 +4041,7 @@  vectorizable_store (gimple stmt, gimple_
   for (j = 0; j < ncopies; j++)
     {
       gimple new_stmt;
-      gimple ptr_incr;
+      gimple ptr_incr = NULL;
 
       if (j == 0)
 	{
@@ -4085,9 +4085,13 @@  vectorizable_store (gimple stmt, gimple_
 	  /* We should have catched mismatched types earlier.  */
 	  gcc_assert (useless_type_conversion_p (vectype,
 						 TREE_TYPE (vec_oprnd)));
-	  dataref_ptr = vect_create_data_ref_ptr (first_stmt, aggr_type, NULL,
-						  NULL_TREE, &dummy, gsi,
-						  &ptr_incr, false, &inv_p);
+	  bool simd_lane_access_p
+	    = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info);
+	  dataref_ptr
+	    = vect_create_data_ref_ptr (first_stmt, aggr_type,
+					simd_lane_access_p ? loop : NULL,
+					NULL_TREE, &dummy, gsi, &ptr_incr,
+					simd_lane_access_p, &inv_p);
 	  gcc_assert (bb_vinfo || !inv_p);
 	}
       else
@@ -4314,7 +4318,7 @@  vectorizable_load (gimple stmt, gimple_s
   tree dummy;
   enum dr_alignment_support alignment_support_scheme;
   tree dataref_ptr = NULL_TREE;
-  gimple ptr_incr;
+  gimple ptr_incr = NULL;
   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
   int ncopies;
   int i, j, group_size, group_gap;
@@ -4947,9 +4951,14 @@  vectorizable_load (gimple stmt, gimple_s
     {
       /* 1. Create the vector or array pointer update chain.  */
       if (j == 0)
-        dataref_ptr = vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
-						offset, &dummy, gsi,
-						&ptr_incr, false, &inv_p);
+	{
+	  bool simd_lane_access_p
+	    = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info);
+	  dataref_ptr
+	    = vect_create_data_ref_ptr (first_stmt, aggr_type, at_loop,
+					offset, &dummy, gsi, &ptr_incr,
+					simd_lane_access_p, &inv_p);
+	}
       else
         dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
 				       TYPE_SIZE_UNIT (aggr_type));
--- gcc/omp-low.c.jj	2013-06-04 20:55:56.000000000 +0200
+++ gcc/omp-low.c	2013-06-11 15:05:21.662715957 +0200
@@ -2417,6 +2417,60 @@  omp_clause_aligned_alignment (tree claus
   return build_int_cst (integer_type_node, al);
 }
 
+/* Return maximum possible vectorization factor for the target.  */
+
+static int
+omp_max_vf (void)
+{
+  if (!flag_tree_vectorize
+      && global_options_set.x_flag_tree_vectorize)
+    return 1;
+
+  int vs = targetm.vectorize.autovectorize_vector_sizes ();
+  if (vs)
+    {
+      vs = 1 << floor_log2 (vs);
+      return vs;
+    }
+  enum machine_mode vqimode = targetm.vectorize.preferred_simd_mode (QImode);
+  if (GET_MODE_CLASS (vqimode) == MODE_VECTOR_INT)
+    return GET_MODE_NUNITS (vqimode);
+  return 1;
+}
+
+/* Helper function of lower_rec_input_clauses, used for #pragma omp simd
+   privatization.  */
+
+static bool
+lower_rec_simd_input_clauses (tree new_var, int &max_vf, tree &idx, tree &lane,
+			      tree &ivar, tree &lvar)
+{
+  if (max_vf == 0)
+    {
+      max_vf = omp_max_vf ();
+      if (max_vf > 1)
+	{
+	  idx = create_tmp_var (unsigned_type_node, NULL);
+	  lane = create_tmp_var (unsigned_type_node, NULL);
+	}
+    }
+  if (max_vf == 1)
+    return false;
+
+  tree atype = build_array_type_nelts (TREE_TYPE (new_var), max_vf);
+  tree avar = create_tmp_var_raw (atype, NULL);
+  if (TREE_ADDRESSABLE (new_var))
+    TREE_ADDRESSABLE (avar) = 1;
+  gimple_add_tmp_var (avar);
+  ivar = build4 (ARRAY_REF, TREE_TYPE (new_var), avar, idx,
+		 NULL_TREE, NULL_TREE);
+  lvar = build4 (ARRAY_REF, TREE_TYPE (new_var), avar, lane,
+		 NULL_TREE, NULL_TREE);
+  SET_DECL_VALUE_EXPR (new_var, lvar);
+  DECL_HAS_VALUE_EXPR_P (new_var) = 1;
+  return true;
+}
+
 /* Generate code to implement the input clauses, FIRSTPRIVATE and COPYIN,
    from the receiver (aka child) side and initializers for REFERENCE_TYPE
    private variables.  Initialization statements go in ILIST, while calls
@@ -2430,6 +2484,15 @@  lower_rec_input_clauses (tree clauses, g
   bool copyin_by_ref = false;
   bool lastprivate_firstprivate = false;
   int pass;
+  static int simd_uid;
+  bool is_simd = (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR
+		  && (gimple_omp_for_kind (ctx->stmt) == GF_OMP_FOR_KIND_SIMD
+		      || (gimple_omp_for_kind (ctx->stmt)
+			  == GF_OMP_FOR_KIND_FOR_SIMD)));
+  int max_vf = 0;
+  tree lane = NULL_TREE, idx = NULL_TREE;
+  tree ivar = NULL_TREE, lvar = NULL_TREE;
+  gimple_seq llist[2] = { NULL, NULL };
 
   copyin_seq = NULL;
 
@@ -2645,6 +2708,33 @@  lower_rec_input_clauses (tree clauses, g
 		x = NULL;
 	    do_private:
 	      x = lang_hooks.decls.omp_clause_default_ctor (c, new_var, x);
+	      if (is_simd)
+		{
+		  tree y = lang_hooks.decls.omp_clause_dtor (c, new_var);
+		  if ((TREE_ADDRESSABLE (new_var) || x || y)
+		      && lower_rec_simd_input_clauses (new_var, max_vf, idx,
+						       lane, ivar, lvar))
+		    {
+		      if (x)
+			x = lang_hooks.decls.omp_clause_default_ctor (c, ivar,
+								      x);
+		      if (x)
+			gimplify_and_add (x, &llist[0]);
+		      if (y)
+			{
+			  y = lang_hooks.decls.omp_clause_dtor (c, ivar);
+			  if (y)
+			    {
+			      gimple_seq tseq = NULL;
+
+			      dtor = y;
+			      gimplify_stmt (&dtor, &tseq);
+			      gimple_seq_add_seq (&llist[1], tseq);
+			    }
+			}
+		      break;
+		    }
+		}
 	      if (x)
 		gimplify_and_add (x, ilist);
 	      /* FALLTHRU */
@@ -2687,10 +2777,42 @@  lower_rec_input_clauses (tree clauses, g
 		}
 	    do_firstprivate:
 	      x = build_outer_var_ref (var, ctx);
+	      if (is_simd)
+		{
+		  if (lower_rec_simd_input_clauses (new_var, max_vf, idx,
+						    lane, ivar, lvar))
+		    {
+		      if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LINEAR)
+			{
+			  tree stept = POINTER_TYPE_P (TREE_TYPE (x))
+				       ? sizetype : TREE_TYPE (x);
+			  tree t = fold_convert (stept,
+						 OMP_CLAUSE_LINEAR_STEP (c));
+			  t = fold_build2 (MULT_EXPR, stept,
+					   fold_convert (stept, idx), t);
+			  if (POINTER_TYPE_P (TREE_TYPE (x)))
+			    x = fold_build2 (POINTER_PLUS_EXPR,
+					     TREE_TYPE (x), x, t);
+			  else
+			    x = fold_build2 (PLUS_EXPR, TREE_TYPE (x), x, t);
+			}
+		      x = lang_hooks.decls.omp_clause_copy_ctor (c, ivar, x);
+		      gimplify_and_add (x, &llist[0]);
+		      x = lang_hooks.decls.omp_clause_dtor (c, ivar);
+		      if (x)
+			{
+			  gimple_seq tseq = NULL;
+
+			  dtor = x;
+			  gimplify_stmt (&dtor, &tseq);
+			  gimple_seq_add_seq (&llist[1], tseq);
+			}
+		      break;
+		    }
+		}
 	      x = lang_hooks.decls.omp_clause_copy_ctor (c, new_var, x);
 	      gimplify_and_add (x, ilist);
 	      goto do_dtor;
-	      break;
 
 	    case OMP_CLAUSE_COPYIN:
 	      by_ref = use_pointer_for_field (var, NULL);
@@ -2706,6 +2828,8 @@  lower_rec_input_clauses (tree clauses, g
 		  tree placeholder = OMP_CLAUSE_REDUCTION_PLACEHOLDER (c);
 		  x = build_outer_var_ref (var, ctx);
 
+		  /* FIXME: Not handled yet.  */
+		  gcc_assert (!is_simd);
 		  if (is_reference (var))
 		    x = build_fold_addr_expr_loc (clause_loc, x);
 		  SET_DECL_VALUE_EXPR (placeholder, x);
@@ -2720,7 +2844,27 @@  lower_rec_input_clauses (tree clauses, g
 		{
 		  x = omp_reduction_init (c, TREE_TYPE (new_var));
 		  gcc_assert (TREE_CODE (TREE_TYPE (new_var)) != ARRAY_TYPE);
-		  gimplify_assign (new_var, x, ilist);
+		  if (is_simd
+		      && lower_rec_simd_input_clauses (new_var, max_vf, idx,
+						       lane, ivar, lvar))
+		    {
+		      enum tree_code code = OMP_CLAUSE_REDUCTION_CODE (c);
+		      tree ref = build_outer_var_ref (var, ctx);
+
+		      gimplify_assign (ivar, x, &llist[0]);
+
+
+		      /* reduction(-:var) sums up the partial results, so it
+			 acts identically to reduction(+:var).  */
+		      if (code == MINUS_EXPR)
+			code = PLUS_EXPR;
+
+		      x = build2 (code, TREE_TYPE (ref), ref, ivar);
+		      ref = build_outer_var_ref (var, ctx);
+		      gimplify_assign (ref, x, &llist[1]);
+		    }
+		  else
+		    gimplify_assign (new_var, x, ilist);
 		}
 	      break;
 
@@ -2730,6 +2874,47 @@  lower_rec_input_clauses (tree clauses, g
 	}
     }
 
+  if (lane)
+    {
+      tree uid_cst = build_int_cst (unsigned_type_node, ++simd_uid);
+      gimple g
+	= gimple_build_call (builtin_decl_explicit (BUILT_IN_GOMP_SIMD_LANE), 1,
+			     uid_cst);
+      gimple_call_set_lhs (g, lane);
+      gimple_stmt_iterator gsi = gsi_start_1 (gimple_omp_body_ptr (ctx->stmt));
+      gsi_insert_before_without_update (&gsi, g, GSI_SAME_STMT);
+      c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE__SIMDUID_);
+      OMP_CLAUSE__SIMDUID__UID (c) = uid_cst;
+      OMP_CLAUSE_CHAIN (c) = gimple_omp_for_clauses (ctx->stmt);
+      gimple_omp_for_set_clauses (ctx->stmt, c);
+      for (int i = 0; i < 2; i++)
+	if (llist[i])
+	  {
+	    tree vf = create_tmp_var (unsigned_type_node, NULL);
+	    tree fndecl = builtin_decl_explicit (BUILT_IN_GOMP_SIMD_VF);
+	    g = gimple_build_call (fndecl, 1, uid_cst);
+	    gimple_call_set_lhs (g, vf);
+	    gimple_seq *seq = i == 0 ? ilist : dlist;
+	    gimple_seq_add_stmt (seq, g);
+	    tree t = build_int_cst (unsigned_type_node, 0);
+	    g = gimple_build_assign_with_ops (INTEGER_CST, idx, t, NULL_TREE);
+	    gimple_seq_add_stmt (seq, g);
+	    tree body = create_artificial_label (UNKNOWN_LOCATION);
+	    tree header = create_artificial_label (UNKNOWN_LOCATION);
+	    tree end = create_artificial_label (UNKNOWN_LOCATION);
+	    gimple_seq_add_stmt (seq, gimple_build_goto (header));
+	    gimple_seq_add_stmt (seq, gimple_build_label (body));
+	    gimple_seq_add_seq (seq, llist[i]);
+	    t = build_int_cst (unsigned_type_node, 1);
+	    g = gimple_build_assign_with_ops (PLUS_EXPR, idx, idx, t);
+	    gimple_seq_add_stmt (seq, g);
+	    gimple_seq_add_stmt (seq, gimple_build_label (header));
+	    g = gimple_build_cond (LT_EXPR, idx, vf, body, end);
+	    gimple_seq_add_stmt (seq, g);
+	    gimple_seq_add_stmt (seq, gimple_build_label (end));
+	  }
+    }
+
   /* The copyin sequence is not to be executed by the main thread, since
      that would result in self-copies.  Perhaps not visible to scalars,
      but it certainly is to C++ operator=.  */
@@ -2872,6 +3057,12 @@  lower_reduction_clauses (tree clauses, g
   tree x, c;
   int count = 0;
 
+  /* SIMD reductions are handled in lower_rec_input_clauses.  */
+  if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR
+      && (gimple_omp_for_kind (ctx->stmt) == GF_OMP_FOR_KIND_SIMD
+	  || (gimple_omp_for_kind (ctx->stmt) == GF_OMP_FOR_KIND_FOR_SIMD)))
+    return;
+
   /* First see if there is exactly one reduction clause.  Use OMP_ATOMIC
      update in that case, otherwise use a lock.  */
   for (c = clauses; c && count < 2; c = OMP_CLAUSE_CHAIN (c))
@@ -5204,6 +5395,8 @@  expand_omp_simd (struct omp_region *regi
   int i;
   tree safelen = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt),
 				  OMP_CLAUSE_SAFELEN);
+  tree simduid = find_omp_clause (gimple_omp_for_clauses (fd->for_stmt),
+				  OMP_CLAUSE__SIMDUID_);
 
   type = TREE_TYPE (fd->loop.v);
   entry_bb = region->entry;
@@ -5456,6 +5649,11 @@  expand_omp_simd (struct omp_region *regi
 	  else
 	    loop->safelen = tree_low_cst (safelen, 1);
 	}
+      if (simduid)
+	{
+	  loop->simduid = tree_low_cst (OMP_CLAUSE__SIMDUID__UID (simduid), 1);
+	  cfun->has_simduid_loops = true;
+	}
       /* If not -fno-tree-vectorize, hint that we want to vectorize
 	 the loop.  */
       if (flag_tree_vectorize