diff mbox series

#pragma omp scan inclusive vectorization

Message ID 20190617212715.GW19695@tucnak
State New
Headers show
Series #pragma omp scan inclusive vectorization | expand

Commit Message

Jakub Jelinek June 17, 2019, 9:27 p.m. UTC
Hi!

On Mon, Jun 17, 2019 at 08:35:23AM +0200, Richard Biener wrote:
> Ugh, not pretty but probably best we can do.  Btw, can you please
> add support for the SLP case and group_size == 1?  I know I'm slow
> with the branch ripping out the non-SLP path but it would save me
> some extra work (possibly).

Here is what I've committed so far.  The SLP & group_size == 1 stuff
I'll try to help you with on your branch when you are close to merging it in,
though there is a small complication, because this scan stuff isn't 100%
identical to just unrolling it several times, the "orig" vector is broadcast
from the last value to all the lanes.

I'll work incrementally on further improvements (e.g. the C++ testcase is
still not vectorized; we end up with
  _18 = .GOMP_SIMD_LANE (simduid.0_14(D), 0);
...
  _11 = (sizetype) _18;
  _9 = _11 * 4;
  _28 = &D.2456 + _9;
  _32 = MEM[(int *)_28];
instead of the usual _32 = D.2456[_18], nothing for whatever reason folds
that back together and the current simd lane handling probably doesn't
recognize that), handling references, exclusive scan etc.

Bootstrapped/regtested on x86_64-linux and i686-linux.

2019-06-17  Jakub Jelinek  <jakub@redhat.com>

	* omp-low.c (struct omp_context): Add scan_inclusive field.
	(scan_omp_1_stmt) <case GIMPLE_OMP_SCAN>: Set ctx->scan_inclusive
	if inclusive scan.
	(struct omplow_simd_context): Add lastlane member.
	(lower_rec_simd_input_clauses): Add rvar argument, handle inscan
	reductions.  Build 2 or 3 argument .GOMP_SIMD_LANE calls rather than
	1 or 2 argument.
	(lower_rec_input_clauses): Handle inscan reductions in simd contexts.
	(lower_lastprivate_clauses): Set TREE_THIS_NOTRAP on the ARRAY_REF.
	(lower_omp_scan): New function.
	(lower_omp_1) <case GIMPLE_OMP_SCAN>: Use lower_omp_scan.
	* tree-ssa-dce.c (eliminate_unnecessary_stmts): For IFN_GOMP_SIMD_LANE
	check 3rd argument if present rather than 2nd.
	* tree-vectorizer.h (struct _loop_vec_info): Add scan_map member.
	(struct _stmt_vec_info): Change simd_lane_access_p from bool into
	2-bit bitfield.
	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
	scan_map.  For IFN_GOMP_SIMD_LANE check 3rd argument if present rather
	than 2nd.
	(_loop_vec_info::~_loop_vec_info): Delete scan_map.
	* tree-vect-data-refs.c (vect_analyze_data_ref_accesses): Allow two
	different STMT_VINFO_SIMD_LANE_ACCESS_P refs if they have the same
	init.
	(vect_find_stmt_data_reference): Encode in ->aux the 2nd
	IFN_GOMP_SIMD_LANE argument.
	(vect_analyze_data_refs): Set STMT_VINFO_SIMD_LANE_ACCESS_P from the
	encoded ->aux value.
	* tree-vect-stmts.c: Include attribs.h.
	(vectorizable_call): Adjust comment about IFN_GOMP_SIMD_LANE.
	(scan_operand_equal_p, check_scan_store, vectorizable_scan_store): New
	functions.
	(vectorizable_load): For STMT_VINFO_SIMD_LANE_ACCESS_P tests use != 0.
	(vectorizable_store): Handle STMT_VINFO_SIMD_LANE_ACCESS_P > 1.
cp/
	* semantics.c (finish_omp_clauses): For OMP_CLAUSE_REDUCTION_INSCAN
	set need_copy_assignment.
testsuite/
	* gcc.dg/vect/vect-simd-8.c: New test.
	* gcc.dg/vect/vect-simd-9.c: New test.
	* g++.dg/vect/simd-2.cc: New test.
	* g++.dg/gomp/scan-1.C: New test.



	Jakub
diff mbox series

Patch

--- gcc/omp-low.c.jj	2019-06-15 09:06:53.794030048 +0200
+++ gcc/omp-low.c	2019-06-17 18:13:09.426055084 +0200
@@ -141,6 +141,9 @@  struct omp_context
   /* True if lower_omp_1 should look up lastprivate conditional in parent
      context.  */
   bool combined_into_simd_safelen0;
+
+  /* True if there is nested scan context with inclusive clause.  */
+  bool scan_inclusive;
 };
 
 static splay_tree all_contexts;
@@ -3329,11 +3332,15 @@  scan_omp_1_stmt (gimple_stmt_iterator *g
       scan_omp_single (as_a <gomp_single *> (stmt), ctx);
       break;
 
+    case GIMPLE_OMP_SCAN:
+      if (tree clauses = gimple_omp_scan_clauses (as_a <gomp_scan *> (stmt)))
+	if (OMP_CLAUSE_CODE (clauses) == OMP_CLAUSE_INCLUSIVE)
+	  ctx->scan_inclusive = true;
+      /* FALLTHRU */
     case GIMPLE_OMP_SECTION:
     case GIMPLE_OMP_MASTER:
     case GIMPLE_OMP_ORDERED:
     case GIMPLE_OMP_CRITICAL:
-    case GIMPLE_OMP_SCAN:
     case GIMPLE_OMP_GRID_BODY:
       ctx = new_omp_context (stmt, ctx);
       scan_omp (gimple_omp_body_ptr (stmt), ctx);
@@ -3671,6 +3678,7 @@  struct omplow_simd_context {
   omplow_simd_context () { memset (this, 0, sizeof (*this)); }
   tree idx;
   tree lane;
+  tree lastlane;
   vec<tree, va_heap> simt_eargs;
   gimple_seq simt_dlist;
   poly_uint64_pod max_vf;
@@ -3682,7 +3690,8 @@  struct omplow_simd_context {
 
 static bool
 lower_rec_simd_input_clauses (tree new_var, omp_context *ctx,
-			      omplow_simd_context *sctx, tree &ivar, tree &lvar)
+			      omplow_simd_context *sctx, tree &ivar,
+			      tree &lvar, tree *rvar = NULL)
 {
   if (known_eq (sctx->max_vf, 0U))
     {
@@ -3738,7 +3747,27 @@  lower_rec_simd_input_clauses (tree new_v
 	= tree_cons (get_identifier ("omp simd array"), NULL,
 		     DECL_ATTRIBUTES (avar));
       gimple_add_tmp_var (avar);
-      ivar = build4 (ARRAY_REF, TREE_TYPE (new_var), avar, sctx->idx,
+      tree iavar = avar;
+      if (rvar)
+	{
+	  /* For inscan reductions, create another array temporary,
+	     which will hold the reduced value.  */
+	  iavar = create_tmp_var_raw (atype);
+	  if (TREE_ADDRESSABLE (new_var))
+	    TREE_ADDRESSABLE (iavar) = 1;
+	  DECL_ATTRIBUTES (iavar)
+	    = tree_cons (get_identifier ("omp simd array"), NULL,
+			 tree_cons (get_identifier ("omp simd inscan"), NULL,
+				    DECL_ATTRIBUTES (iavar)));
+	  gimple_add_tmp_var (iavar);
+	  ctx->cb.decl_map->put (avar, iavar);
+	  if (sctx->lastlane == NULL_TREE)
+	    sctx->lastlane = create_tmp_var (unsigned_type_node);
+	  *rvar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar,
+			  sctx->lastlane, NULL_TREE, NULL_TREE);
+	  TREE_THIS_NOTRAP (*rvar) = 1;
+	}
+      ivar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar, sctx->idx,
 		     NULL_TREE, NULL_TREE);
       lvar = build4 (ARRAY_REF, TREE_TYPE (new_var), avar, sctx->lane,
 		     NULL_TREE, NULL_TREE);
@@ -3814,7 +3843,7 @@  lower_rec_input_clauses (tree clauses, g
   omplow_simd_context sctx = omplow_simd_context ();
   tree simt_lane = NULL_TREE, simtrec = NULL_TREE;
   tree ivar = NULL_TREE, lvar = NULL_TREE, uid = NULL_TREE;
-  gimple_seq llist[3] = { };
+  gimple_seq llist[4] = { };
   tree nonconst_simd_if = NULL_TREE;
 
   copyin_seq = NULL;
@@ -5155,9 +5184,14 @@  lower_rec_input_clauses (tree clauses, g
 		      new_vard = TREE_OPERAND (new_var, 0);
 		      gcc_assert (DECL_P (new_vard));
 		    }
+		  tree rvar = NULL_TREE, *rvarp = NULL;
+		  if (is_simd
+		      && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
+		      && OMP_CLAUSE_REDUCTION_INSCAN (c))
+		    rvarp = &rvar;
 		  if (is_simd
 		      && lower_rec_simd_input_clauses (new_var, ctx, &sctx,
-						       ivar, lvar))
+						       ivar, lvar, rvarp))
 		    {
 		      if (new_vard == new_var)
 			{
@@ -5173,6 +5207,93 @@  lower_rec_input_clauses (tree clauses, g
 		      x = lang_hooks.decls.omp_clause_default_ctor
 				(c, unshare_expr (ivar),
 				 build_outer_var_ref (var, ctx));
+		      if (rvarp)
+			{
+			  if (x)
+			    {
+			      gimplify_and_add (x, &llist[0]);
+
+			      tree ivar2 = unshare_expr (lvar);
+			      TREE_OPERAND (ivar2, 1) = sctx.idx;
+			      x = lang_hooks.decls.omp_clause_default_ctor
+				    (c, ivar2, build_outer_var_ref (var, ctx));
+			      gimplify_and_add (x, &llist[0]);
+
+			      /* For types that need construction, add another
+				 private var which will be default constructed
+				 and optionally initialized with
+				 OMP_CLAUSE_REDUCTION_GIMPLE_INIT, as in the
+				 loop we want to assign this value instead of
+				 constructing and destructing it in each
+				 iteration.  */
+			      tree nv = create_tmp_var_raw (TREE_TYPE (ivar));
+			      gimple_add_tmp_var (nv);
+			      ctx->cb.decl_map->put (TREE_OPERAND (ivar, 0),
+						     nv);
+			      x = lang_hooks.decls.omp_clause_default_ctor
+				    (c, nv, build_outer_var_ref (var, ctx));
+			      gimplify_and_add (x, ilist);
+
+			      if (OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c))
+				{
+				  tseq = OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c);
+				  x = DECL_VALUE_EXPR (new_var);
+				  SET_DECL_VALUE_EXPR (new_var, nv);
+				  lower_omp (&tseq, ctx);
+				  SET_DECL_VALUE_EXPR (new_var, x);
+				  gimple_seq_add_seq (ilist, tseq);
+				  OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c) = NULL;
+				}
+
+			      x = lang_hooks.decls.omp_clause_dtor (c, nv);
+			      if (x)
+				{
+				  tseq = NULL;
+				  dtor = x;
+				  gimplify_stmt (&dtor, &tseq);
+				  gimple_seq_add_seq (dlist, tseq);
+				}
+			    }
+
+			  tree ref = build_outer_var_ref (var, ctx);
+			  x = unshare_expr (ivar);
+			  x = lang_hooks.decls.omp_clause_assign_op (c, x,
+								     ref);
+			  gimplify_and_add (x, &llist[0]);
+
+			  ref = build_outer_var_ref (var, ctx);
+			  x = lang_hooks.decls.omp_clause_assign_op (c, ref,
+								     rvar);
+			  gimplify_and_add (x, &llist[3]);
+
+			  DECL_HAS_VALUE_EXPR_P (placeholder) = 0;
+			  if (new_vard == new_var)
+			    SET_DECL_VALUE_EXPR (new_var, lvar);
+			  else
+			    SET_DECL_VALUE_EXPR (new_vard,
+						 build_fold_addr_expr (lvar));
+
+			  x = lang_hooks.decls.omp_clause_dtor (c, ivar);
+			  if (x)
+			    {
+			      tseq = NULL;
+			      dtor = x;
+			      gimplify_stmt (&dtor, &tseq);
+			      gimple_seq_add_seq (&llist[1], tseq);
+			    }
+
+			  tree ivar2 = unshare_expr (lvar);
+			  TREE_OPERAND (ivar2, 1) = sctx.idx;
+			  x = lang_hooks.decls.omp_clause_dtor (c, ivar2);
+			  if (x)
+			    {
+			      tseq = NULL;
+			      dtor = x;
+			      gimplify_stmt (&dtor, &tseq);
+			      gimple_seq_add_seq (&llist[1], tseq);
+			    }
+			  break;
+			}
 		      if (x)
 			gimplify_and_add (x, &llist[0]);
 		      if (OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c))
@@ -5240,6 +5361,41 @@  lower_rec_input_clauses (tree clauses, g
 				 : build_outer_var_ref (var, ctx));
 		  if (x)
 		    gimplify_and_add (x, ilist);
+
+		  if (rvarp)
+		    {
+		      if (x)
+			{
+			  tree nv = create_tmp_var_raw (TREE_TYPE (new_vard));
+			  gimple_add_tmp_var (nv);
+			  ctx->cb.decl_map->put (new_var, nv);
+			  x = lang_hooks.decls.omp_clause_default_ctor
+				(c, nv, build_outer_var_ref (var, ctx));
+			  gimplify_and_add (x, ilist);
+			  if (OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c))
+			    {
+			      tseq = OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c);
+			      SET_DECL_VALUE_EXPR (new_var, nv);
+			      DECL_HAS_VALUE_EXPR_P (new_var) = 1;
+			      lower_omp (&tseq, ctx);
+			      SET_DECL_VALUE_EXPR (new_var, NULL_TREE);
+			      DECL_HAS_VALUE_EXPR_P (new_var) = 0;
+			      gimple_seq_add_seq (ilist, tseq);
+			    }
+			  OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c) = NULL;
+			  x = lang_hooks.decls.omp_clause_dtor (c, nv);
+			  if (x)
+			    {
+			      tseq = NULL;
+			      dtor = x;
+			      gimplify_stmt (&dtor, &tseq);
+			      gimple_seq_add_seq (dlist, tseq);
+			    }
+			}
+		      DECL_HAS_VALUE_EXPR_P (placeholder) = 0;
+		      goto do_dtor;
+		    }
+
 		  if (OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c))
 		    {
 		      tseq = OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c);
@@ -5324,12 +5480,32 @@  lower_rec_input_clauses (tree clauses, g
 		      new_vard = TREE_OPERAND (new_var, 0);
 		      gcc_assert (DECL_P (new_vard));
 		    }
+		  tree rvar = NULL_TREE, *rvarp = NULL;
+		  if (is_simd
+		      && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
+		      && OMP_CLAUSE_REDUCTION_INSCAN (c))
+		    rvarp = &rvar;
 		  if (is_simd
 		      && lower_rec_simd_input_clauses (new_var, ctx, &sctx,
-						       ivar, lvar))
+						       ivar, lvar, rvarp))
 		    {
+		      if (new_vard != new_var)
+			{
+			  SET_DECL_VALUE_EXPR (new_vard,
+					       build_fold_addr_expr (lvar));
+			  DECL_HAS_VALUE_EXPR_P (new_vard) = 1;
+			}
+
 		      tree ref = build_outer_var_ref (var, ctx);
 
+		      if (rvarp)
+			{
+			  gimplify_assign (ivar, ref, &llist[0]);
+			  ref = build_outer_var_ref (var, ctx);
+			  gimplify_assign (ref, rvar, &llist[3]);
+			  break;
+			}
+
 		      gimplify_assign (unshare_expr (ivar), x, &llist[0]);
 
 		      if (sctx.is_simt)
@@ -5346,14 +5522,8 @@  lower_rec_input_clauses (tree clauses, g
 		      ref = build_outer_var_ref (var, ctx);
 		      gimplify_assign (ref, x, &llist[1]);
 
-		      if (new_vard != new_var)
-			{
-			  SET_DECL_VALUE_EXPR (new_vard,
-					       build_fold_addr_expr (lvar));
-			  DECL_HAS_VALUE_EXPR_P (new_vard) = 1;
-			}
 		    }
-		  else
+		  else if (rvarp == NULL)
 		    {
 		      if (omp_is_reference (var) && is_simd)
 			handle_simd_reference (clause_loc, new_vard, ilist);
@@ -5456,14 +5626,23 @@  lower_rec_input_clauses (tree clauses, g
   if (sctx.lane)
     {
       gimple *g = gimple_build_call_internal (IFN_GOMP_SIMD_LANE,
-					      1 + (nonconst_simd_if != NULL),
-					      uid, nonconst_simd_if);
+					      2 + (nonconst_simd_if != NULL),
+					      uid, integer_zero_node,
+					      nonconst_simd_if);
       gimple_call_set_lhs (g, sctx.lane);
       gimple_stmt_iterator gsi = gsi_start_1 (gimple_omp_body_ptr (ctx->stmt));
       gsi_insert_before_without_update (&gsi, g, GSI_SAME_STMT);
       g = gimple_build_assign (sctx.lane, INTEGER_CST,
 			       build_int_cst (unsigned_type_node, 0));
       gimple_seq_add_stmt (ilist, g);
+      if (sctx.lastlane)
+	{
+	  g = gimple_build_call_internal (IFN_GOMP_SIMD_LAST_LANE,
+					  2, uid, sctx.lane);
+	  gimple_call_set_lhs (g, sctx.lastlane);
+	  gimple_seq_add_stmt (dlist, g);
+	  gimple_seq_add_seq (dlist, llist[3]);
+	}
       /* Emit reductions across SIMT lanes in log_2(simt_vf) steps.  */
       if (llist[2])
 	{
@@ -5865,6 +6044,7 @@  lower_lastprivate_clauses (tree clauses,
 		  new_var = build4 (ARRAY_REF, TREE_TYPE (val),
 				    TREE_OPERAND (val, 0), lastlane,
 				    NULL_TREE, NULL_TREE);
+		  TREE_THIS_NOTRAP (new_var) = 1;
 		}
 	    }
 	  else if (maybe_simt)
@@ -8371,6 +8551,167 @@  lower_omp_ordered (gimple_stmt_iterator
 }
 
 
+/* Expand code for an OpenMP scan directive and the structured block
+   before the scan directive.  */
+
+static void
+lower_omp_scan (gimple_stmt_iterator *gsi_p, omp_context *ctx)
+{
+  gimple *stmt = gsi_stmt (*gsi_p);
+  bool has_clauses
+    = gimple_omp_scan_clauses (as_a <gomp_scan *> (stmt)) != NULL;
+  tree lane = NULL_TREE;
+  gimple_seq before = NULL;
+  omp_context *octx = ctx->outer;
+  gcc_assert (octx);
+  bool input_phase = has_clauses ^ octx->scan_inclusive;
+  if (gimple_code (octx->stmt) == GIMPLE_OMP_FOR
+      && (gimple_omp_for_kind (octx->stmt) & GF_OMP_FOR_SIMD)
+      && !gimple_omp_for_combined_into_p (octx->stmt)
+      && octx->scan_inclusive)
+    {
+      if (tree c = omp_find_clause (gimple_omp_for_clauses (octx->stmt),
+				    OMP_CLAUSE__SIMDUID_))
+	{
+	  tree uid = OMP_CLAUSE__SIMDUID__DECL (c);
+	  lane = create_tmp_var (unsigned_type_node);
+	  tree t = build_int_cst (integer_type_node, 1 + !input_phase);
+	  gimple *g
+	    = gimple_build_call_internal (IFN_GOMP_SIMD_LANE, 2, uid, t);
+	  gimple_call_set_lhs (g, lane);
+	  gimple_seq_add_stmt (&before, g);
+	}
+      for (tree c = gimple_omp_for_clauses (octx->stmt);
+	   c; c = OMP_CLAUSE_CHAIN (c))
+	if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
+	    && OMP_CLAUSE_REDUCTION_INSCAN (c))
+	  {
+	    tree var = OMP_CLAUSE_DECL (c);
+	    tree new_var = lookup_decl (var, octx);
+	    tree val = new_var;
+	    tree var2 = NULL_TREE;
+	    tree var3 = NULL_TREE;
+	    if (DECL_HAS_VALUE_EXPR_P (new_var))
+	      {
+		val = DECL_VALUE_EXPR (new_var);
+		if (TREE_CODE (val) == ARRAY_REF
+		    && VAR_P (TREE_OPERAND (val, 0)))
+		  {
+		    tree v = TREE_OPERAND (val, 0);
+		    if (lookup_attribute ("omp simd array",
+					  DECL_ATTRIBUTES (v)))
+		      {
+			val = unshare_expr (val);
+			TREE_OPERAND (val, 1) = lane;
+			var2 = lookup_decl (v, octx);
+			if (input_phase
+			    && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
+			  var3 = maybe_lookup_decl (var2, octx);
+			if (!input_phase)
+			  {
+			    var2 = build4 (ARRAY_REF, TREE_TYPE (val),
+					   var2, lane, NULL_TREE, NULL_TREE);
+			    TREE_THIS_NOTRAP (var2) = 1;
+			  }
+			else
+			  var2 = val;
+		      }
+		  }
+	      }
+	    else
+	      {
+		var2 = build_outer_var_ref (var, octx);
+		if (input_phase && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
+		  {
+		    var3 = maybe_lookup_decl (new_var, octx);
+		    if (var3 == new_var)
+		      var3 = NULL_TREE;
+		  }
+	      }
+	    if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
+	      {
+		tree placeholder = OMP_CLAUSE_REDUCTION_PLACEHOLDER (c);
+		if (input_phase)
+		  {
+		    if (var3)
+		      {
+			/* If we've added a separate identity element
+			   variable, copy it over into val.  */
+			tree x = lang_hooks.decls.omp_clause_assign_op (c, val,
+									var3);
+			gimplify_and_add (x, &before);
+		      }
+		    else if (OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c))
+		      {
+			/* Otherwise, assign to it the identity element.  */
+			gimple_seq tseq = OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c);
+			tree x = (DECL_HAS_VALUE_EXPR_P (new_var)
+				  ? DECL_VALUE_EXPR (new_var) : NULL_TREE);
+			tree ref = build_outer_var_ref (var, octx);
+			SET_DECL_VALUE_EXPR (new_var, val);
+			SET_DECL_VALUE_EXPR (placeholder, ref);
+			DECL_HAS_VALUE_EXPR_P (placeholder) = 1;
+			lower_omp (&tseq, octx);
+			SET_DECL_VALUE_EXPR (new_var, x);
+			SET_DECL_VALUE_EXPR (placeholder, NULL_TREE);
+			DECL_HAS_VALUE_EXPR_P (placeholder) = 0;
+			if (x == NULL_TREE)
+			  DECL_HAS_VALUE_EXPR_P (new_var) = 0;
+			gimple_seq_add_seq (&before, tseq);
+			OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c) = NULL;
+		      }
+		  }
+		else
+		  {
+		    gimple_seq tseq = OMP_CLAUSE_REDUCTION_GIMPLE_MERGE (c);
+		    tree x = (DECL_HAS_VALUE_EXPR_P (new_var)
+			      ? DECL_VALUE_EXPR (new_var) : NULL_TREE);
+		    SET_DECL_VALUE_EXPR (new_var, val);
+		    SET_DECL_VALUE_EXPR (placeholder, var2);
+		    DECL_HAS_VALUE_EXPR_P (placeholder) = 1;
+		    lower_omp (&tseq, octx);
+		    gimple_seq_add_seq (&before, tseq);
+		    OMP_CLAUSE_REDUCTION_GIMPLE_MERGE (c) = NULL;
+		    SET_DECL_VALUE_EXPR (new_var, x);
+		    SET_DECL_VALUE_EXPR (placeholder, NULL_TREE);
+		    DECL_HAS_VALUE_EXPR_P (placeholder) = 0;
+		    x = lang_hooks.decls.omp_clause_assign_op (c, val, var2);
+		    gimplify_and_add (x, &before);
+		  }
+	      }
+	    else
+	      {
+		if (input_phase)
+		  {
+		    /* input phase.  Set val to initializer before
+		       the body.  */
+		    tree x = omp_reduction_init (c, TREE_TYPE (new_var));
+		    gimplify_assign (val, x, &before);
+		  }
+		else
+		  {
+		    /* scan phase.  */
+		    enum tree_code code = OMP_CLAUSE_REDUCTION_CODE (c);
+		    if (code == MINUS_EXPR)
+		      code = PLUS_EXPR;
+
+		    tree x = build2 (code, TREE_TYPE (var2),
+				     unshare_expr (var2), unshare_expr (val));
+		    gimplify_assign (unshare_expr (var2), x, &before);
+		    gimplify_assign (val, var2, &before);
+		  }
+	      }
+	  }
+    }
+  else if (has_clauses)
+    sorry_at (gimple_location (stmt),
+	      "%<#pragma omp scan%> not supported yet");
+  gsi_insert_seq_after (gsi_p, gimple_omp_body (stmt), GSI_SAME_STMT);
+  gsi_insert_seq_after (gsi_p, before, GSI_SAME_STMT);
+  gsi_replace (gsi_p, gimple_build_nop (), true);
+}
+
+
 /* Gimplify a GIMPLE_OMP_CRITICAL statement.  This is a relatively simple
    substitution of a couple of function calls.  But in the NAMED case,
    requires that languages coordinate a symbol name.  It is therefore
@@ -10843,11 +11184,7 @@  lower_omp_1 (gimple_stmt_iterator *gsi_p
     case GIMPLE_OMP_SCAN:
       ctx = maybe_lookup_ctx (stmt);
       gcc_assert (ctx);
-      gsi_insert_seq_after (gsi_p, gimple_omp_body (stmt), GSI_SAME_STMT);
-      if (gimple_omp_scan_clauses (as_a <gomp_scan *> (stmt)))
-	sorry_at (gimple_location (stmt),
-		  "%<#pragma omp scan%> not supported yet");
-      gsi_replace (gsi_p, gimple_build_nop (), true);
+      lower_omp_scan (gsi_p, ctx);
       break;
     case GIMPLE_OMP_CRITICAL:
       ctx = maybe_lookup_ctx (stmt);
--- gcc/tree-ssa-dce.c.jj	2019-06-15 09:06:53.659032138 +0200
+++ gcc/tree-ssa-dce.c	2019-06-17 10:27:07.595344442 +0200
@@ -1339,14 +1339,14 @@  eliminate_unnecessary_stmts (void)
 		  update_stmt (stmt);
 		  release_ssa_name (name);
 
-		  /* GOMP_SIMD_LANE (unless two argument) or ASAN_POISON
+		  /* GOMP_SIMD_LANE (unless three argument) or ASAN_POISON
 		     without lhs is not needed.  */
 		  if (gimple_call_internal_p (stmt))
 		    switch (gimple_call_internal_fn (stmt))
 		      {
 		      case IFN_GOMP_SIMD_LANE:
-			if (gimple_call_num_args (stmt) >= 2
-			    && !integer_nonzerop (gimple_call_arg (stmt, 1)))
+			if (gimple_call_num_args (stmt) >= 3
+			    && !integer_nonzerop (gimple_call_arg (stmt, 2)))
 			  break;
 			/* FALLTHRU */
 		      case IFN_ASAN_POISON:
--- gcc/tree-vectorizer.h.jj	2019-06-15 09:06:53.556033732 +0200
+++ gcc/tree-vectorizer.h	2019-06-17 10:27:07.589344539 +0200
@@ -491,6 +491,10 @@  typedef struct _loop_vec_info : public v
   /* Map of IV base/step expressions to inserted name in the preheader.  */
   hash_map<tree_operand_hash, tree> *ivexpr_map;
 
+  /* Map of OpenMP "omp simd array" scan variables to corresponding
+     rhs of the store of the initializer.  */
+  hash_map<tree, tree> *scan_map;
+
   /* The unrolling factor needed to SLP the loop. In case of that pure SLP is
      applied to the loop, i.e., no unrolling is needed, this is 1.  */
   poly_uint64 slp_unrolling_factor;
@@ -913,7 +917,7 @@  struct _stmt_vec_info {
   bool strided_p;
 
   /* For both loads and stores.  */
-  bool simd_lane_access_p;
+  unsigned simd_lane_access_p : 2;
 
   /* Classifies how the load or store is going to be implemented
      for loop vectorization.  */
--- gcc/tree-vect-loop.c.jj	2019-06-15 09:06:53.615032818 +0200
+++ gcc/tree-vect-loop.c	2019-06-17 10:27:07.636343783 +0200
@@ -824,6 +824,7 @@  _loop_vec_info::_loop_vec_info (struct l
     peeling_for_alignment (0),
     ptr_mask (0),
     ivexpr_map (NULL),
+    scan_map (NULL),
     slp_unrolling_factor (1),
     single_scalar_iteration_cost (0),
     vectorizable (false),
@@ -863,8 +864,8 @@  _loop_vec_info::_loop_vec_info (struct l
 	  gimple *stmt = gsi_stmt (si);
 	  gimple_set_uid (stmt, 0);
 	  add_stmt (stmt);
-	  /* If .GOMP_SIMD_LANE call for the current loop has 2 arguments, the
-	     second argument is the #pragma omp simd if (x) condition, when 0,
+	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
+	     third argument is the #pragma omp simd if (x) condition, when 0,
 	     loop shouldn't be vectorized, when non-zero constant, it should
 	     be vectorized normally, otherwise versioned with vectorized loop
 	     done if the condition is non-zero at runtime.  */
@@ -872,12 +873,12 @@  _loop_vec_info::_loop_vec_info (struct l
 	      && is_gimple_call (stmt)
 	      && gimple_call_internal_p (stmt)
 	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
-	      && gimple_call_num_args (stmt) >= 2
+	      && gimple_call_num_args (stmt) >= 3
 	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
 	      && (loop_in->simduid
 		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
 	    {
-	      tree arg = gimple_call_arg (stmt, 1);
+	      tree arg = gimple_call_arg (stmt, 2);
 	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
 		simd_if_cond = arg;
 	      else
@@ -959,6 +960,7 @@  _loop_vec_info::~_loop_vec_info ()
 
   release_vec_loop_masks (&masks);
   delete ivexpr_map;
+  delete scan_map;
 
   loop->aux = NULL;
 }
--- gcc/tree-vect-data-refs.c.jj	2019-06-15 09:06:53.709031364 +0200
+++ gcc/tree-vect-data-refs.c	2019-06-17 10:27:07.574344780 +0200
@@ -3003,6 +3003,13 @@  vect_analyze_data_ref_accesses (vec_info
 	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
 	    break;
 
+	  /* Different .GOMP_SIMD_LANE calls still give the same lane,
+	     just hold extra information.  */
+	  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
+	      && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
+	      && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
+	    break;
+
 	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
 	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
 	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
@@ -4101,7 +4108,8 @@  vect_find_stmt_data_reference (loop_p lo
 			  DR_STEP_ALIGNMENT (newdr)
 			    = highest_pow2_factor (step);
 			  /* Mark as simd-lane access.  */
-			  newdr->aux = (void *)-1;
+			  tree arg2 = gimple_call_arg (def, 1);
+			  newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
 			  free_data_ref (dr);
 			  datarefs->safe_push (newdr);
 			  return opt_result::success ();
@@ -4210,14 +4218,17 @@  vect_analyze_data_refs (vec_info *vinfo,
         }
 
       /* See if this was detected as SIMD lane access.  */
-      if (dr->aux == (void *)-1)
+      if (dr->aux == (void *)-1
+	  || dr->aux == (void *)-2
+	  || dr->aux == (void *)-3)
 	{
 	  if (nested_in_vect_loop_p (loop, stmt_info))
 	    return opt_result::failure_at (stmt_info->stmt,
 					   "not vectorized:"
 					   " data ref analysis failed: %G",
 					   stmt_info->stmt);
-	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
+	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
+	    = -(uintptr_t) dr->aux;
 	}
 
       tree base = get_base_address (DR_REF (dr));
--- gcc/tree-vect-stmts.c.jj	2019-06-15 09:06:53.752030698 +0200
+++ gcc/tree-vect-stmts.c	2019-06-17 18:42:10.936670834 +0200
@@ -54,6 +54,7 @@  along with GCC; see the file COPYING3.
 #include "tree-ssa-loop-niter.h"
 #include "gimple-fold.h"
 #include "regs.h"
+#include "attribs.h"
 
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
@@ -3257,7 +3258,7 @@  vectorizable_call (stmt_vec_info stmt_in
   if (nargs == 0 || nargs > 4)
     return false;
 
-  /* Ignore the argument of IFN_GOMP_SIMD_LANE, it is magic.  */
+  /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic.  */
   combined_fn cfn = gimple_call_combined_fn (stmt);
   if (cfn == CFN_GOMP_SIMD_LANE)
     {
@@ -6320,6 +6321,489 @@  get_group_alias_ptr_type (stmt_vec_info
 }
 
 
+/* Function scan_operand_equal_p.
+
+   Helper function for check_scan_store.  Compare two references
+   with .GOMP_SIMD_LANE bases.  */
+
+static bool
+scan_operand_equal_p (tree ref1, tree ref2)
+{
+  machine_mode mode1, mode2;
+  poly_int64 bitsize1, bitsize2, bitpos1, bitpos2;
+  tree offset1, offset2;
+  int unsignedp1, unsignedp2, reversep1, reversep2;
+  int volatilep1 = 0, volatilep2 = 0;
+  tree base1 = get_inner_reference (ref1, &bitsize1, &bitpos1, &offset1,
+				    &mode1, &unsignedp1, &reversep1,
+				    &volatilep1);
+  tree base2 = get_inner_reference (ref2, &bitsize2, &bitpos2, &offset2,
+				    &mode2, &unsignedp2, &reversep2,
+				    &volatilep2);
+  if (reversep1 || reversep2 || volatilep1 || volatilep2)
+    return false;
+  if (!operand_equal_p (base1, base2, 0))
+    return false;
+  if (maybe_ne (bitpos1, 0) || maybe_ne (bitpos2, 0))
+    return false;
+  if (maybe_ne (bitsize1, bitsize2))
+    return false;
+  if (!operand_equal_p (offset1, offset2, 0))
+    return false;
+  return true;
+}
+
+
+/* Function check_scan_store.
+
+   Check magic stores for #pragma omp scan {in,ex}clusive reductions.  */
+
+static bool
+check_scan_store (stmt_vec_info stmt_info, tree vectype,
+		  enum vect_def_type rhs_dt, bool slp, tree mask,
+		  vect_memory_access_type memory_access_type)
+{
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+  tree ref_type;
+
+  gcc_assert (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1);
+  if (slp
+      || mask
+      || memory_access_type != VMAT_CONTIGUOUS
+      || TREE_CODE (DR_BASE_ADDRESS (dr_info->dr)) != ADDR_EXPR
+      || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0))
+      || loop_vinfo == NULL
+      || LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      || STMT_VINFO_GROUPED_ACCESS (stmt_info)
+      || !integer_zerop (DR_OFFSET (dr_info->dr))
+      || !integer_zerop (DR_INIT (dr_info->dr))
+      || !(ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr)))
+      || !alias_sets_conflict_p (get_alias_set (vectype),
+				 get_alias_set (TREE_TYPE (ref_type))))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "unsupported OpenMP scan store.\n");
+      return false;
+    }
+
+  /* We need to pattern match code built by OpenMP lowering and simplified
+     by following optimizations into something we can handle.
+     #pragma omp simd reduction(inscan,+:r)
+     for (...)
+       {
+	 r += something ();
+	 #pragma omp scan inclusive (r)
+	 use (r);
+       }
+     shall have body with:
+       // Initialization for input phase, store the reduction initializer:
+       _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0);
+       _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1);
+       D.2042[_21] = 0;
+       // Actual input phase:
+       ...
+       r.0_5 = D.2042[_20];
+       _6 = _4 + r.0_5;
+       D.2042[_20] = _6;
+       // Initialization for scan phase:
+       _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 2);
+       _26 = D.2043[_25];
+       _27 = D.2042[_25];
+       _28 = _26 + _27;
+       D.2043[_25] = _28;
+       D.2042[_25] = _28;
+       // Actual scan phase:
+       ...
+       r.1_8 = D.2042[_20];
+       ...
+     The "omp simd array" variable D.2042 holds the privatized copy used
+     inside of the loop and D.2043 is another one that holds copies of
+     the current original list item.  The separate GOMP_SIMD_LANE ifn
+     kinds are there in order to allow optimizing the initializer store
+     and combiner sequence, e.g. if it is originally some C++ish user
+     defined reduction, but allow the vectorizer to pattern recognize it
+     and turn into the appropriate vectorized scan.  */
+
+  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 2)
+    {
+      /* Match the D.2042[_21] = 0; store above.  Just require that
+	 it is a constant or external definition store.  */
+      if (rhs_dt != vect_constant_def && rhs_dt != vect_external_def)
+	{
+	 fail_init:
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "unsupported OpenMP scan initializer store.\n");
+	  return false;
+	}
+
+      if (! loop_vinfo->scan_map)
+	loop_vinfo->scan_map = new hash_map<tree, tree>;
+      tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
+      tree &cached = loop_vinfo->scan_map->get_or_insert (var);
+      if (cached)
+	goto fail_init;
+      cached = gimple_assign_rhs1 (STMT_VINFO_STMT (stmt_info));
+
+      /* These stores can be vectorized normally.  */
+      return true;
+    }
+
+  if (rhs_dt != vect_internal_def)
+    {
+     fail:
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "unsupported OpenMP scan combiner pattern.\n");
+      return false;
+    }
+
+  gimple *stmt = STMT_VINFO_STMT (stmt_info);
+  tree rhs = gimple_assign_rhs1 (stmt);
+  if (TREE_CODE (rhs) != SSA_NAME)
+    goto fail;
+
+  use_operand_p use_p;
+  imm_use_iterator iter;
+  gimple *other_store_stmt = NULL;
+  FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
+    {
+      gimple *use_stmt = USE_STMT (use_p);
+      if (use_stmt == stmt || is_gimple_debug (use_stmt))
+	continue;
+      if (gimple_bb (use_stmt) != gimple_bb (stmt)
+	  || !gimple_store_p (use_stmt)
+	  || other_store_stmt)
+	goto fail;
+      other_store_stmt = use_stmt;
+    }
+  if (other_store_stmt == NULL)
+    goto fail;
+  stmt_vec_info other_store_stmt_info
+    = loop_vinfo->lookup_stmt (other_store_stmt);
+  if (other_store_stmt_info == NULL
+      || STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info) != 3)
+    goto fail;
+
+  gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
+  if (gimple_bb (def_stmt) != gimple_bb (stmt)
+      || !is_gimple_assign (def_stmt)
+      || gimple_assign_rhs_class (def_stmt) != GIMPLE_BINARY_RHS)
+    goto fail;
+
+  enum tree_code code = gimple_assign_rhs_code (def_stmt);
+  /* For pointer addition, we should use the normal plus for the vector
+     operation.  */
+  switch (code)
+    {
+    case POINTER_PLUS_EXPR:
+      code = PLUS_EXPR;
+      break;
+    case MULT_HIGHPART_EXPR:
+      goto fail;
+    default:
+      break;
+    }
+  if (TREE_CODE_LENGTH (code) != binary_op || !commutative_tree_code (code))
+    goto fail;
+
+  tree rhs1 = gimple_assign_rhs1 (def_stmt);
+  tree rhs2 = gimple_assign_rhs2 (def_stmt);
+  if (TREE_CODE (rhs1) != SSA_NAME
+      || TREE_CODE (rhs2) != SSA_NAME)
+    goto fail;
+
+  gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
+  gimple *load2_stmt = SSA_NAME_DEF_STMT (rhs2);
+  if (gimple_bb (load1_stmt) != gimple_bb (stmt)
+      || !gimple_assign_load_p (load1_stmt)
+      || gimple_bb (load2_stmt) != gimple_bb (stmt)
+      || !gimple_assign_load_p (load2_stmt))
+    goto fail;
+
+  stmt_vec_info load1_stmt_info = loop_vinfo->lookup_stmt (load1_stmt);
+  stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
+  if (load1_stmt_info == NULL
+      || load2_stmt_info == NULL
+      || STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info) != 3
+      || STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info) != 3)
+    goto fail;
+
+  if (scan_operand_equal_p (gimple_assign_lhs (stmt),
+			    gimple_assign_rhs1 (load2_stmt)))
+    {
+      std::swap (rhs1, rhs2);
+      std::swap (load1_stmt, load2_stmt);
+      std::swap (load1_stmt_info, load2_stmt_info);
+    }
+  if (!scan_operand_equal_p (gimple_assign_lhs (stmt),
+			     gimple_assign_rhs1 (load1_stmt))
+      || !scan_operand_equal_p (gimple_assign_lhs (other_store_stmt),
+				gimple_assign_rhs1 (load2_stmt)))
+    goto fail;
+
+  dr_vec_info *other_dr_info = STMT_VINFO_DR_INFO (other_store_stmt_info);
+  if (TREE_CODE (DR_BASE_ADDRESS (other_dr_info->dr)) != ADDR_EXPR
+      || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (other_dr_info->dr), 0)))
+    goto fail;
+
+  tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
+  tree var2 = TREE_OPERAND (DR_BASE_ADDRESS (other_dr_info->dr), 0);
+  if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var1))
+      || !lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var2))
+      || (!lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
+	 == (!lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var2))))
+    goto fail;
+
+  if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
+    std::swap (var1, var2);
+
+  if (loop_vinfo->scan_map == NULL)
+    goto fail;
+  tree *init = loop_vinfo->scan_map->get (var1);
+  if (init == NULL)
+    goto fail;
+
+  /* The IL is as expected, now check if we can actually vectorize it.
+       _26 = D.2043[_25];
+       _27 = D.2042[_25];
+       _28 = _26 + _27;
+       D.2043[_25] = _28;
+       D.2042[_25] = _28;
+     should be vectorized as (where _40 is the vectorized rhs
+     from the D.2042[_21] = 0; store):
+       _30 = MEM <vector(8) int> [(int *)&D.2043];
+       _31 = MEM <vector(8) int> [(int *)&D.2042];
+       _32 = VEC_PERM_EXPR <_31, _40, { 8, 0, 1, 2, 3, 4, 5, 6 }>;
+       _33 = _31 + _32;
+       // _33 = { _31[0], _31[0]+_31[1], _31[1]+_31[2], ..., _31[6]+_31[7] };
+       _34 = VEC_PERM_EXPR <_33, _40, { 8, 9, 0, 1, 2, 3, 4, 5 }>;
+       _35 = _33 + _34;
+       // _35 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
+       //         _31[1]+.._31[4], ... _31[4]+.._31[7] };
+       _36 = VEC_PERM_EXPR <_35, _40, { 8, 9, 10, 11, 0, 1, 2, 3 }>;
+       _37 = _35 + _36;
+       // _37 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
+       //         _31[0]+.._31[4], ... _31[0]+.._31[7] };
+       _38 = _30 + _37;
+       _39 = VEC_PERM_EXPR <_38, _38, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
+       MEM <vector(8) int> [(int *)&D.2043] = _39;
+       MEM <vector(8) int> [(int *)&D.2042] = _38;  */
+  enum machine_mode vec_mode = TYPE_MODE (vectype);
+  optab optab = optab_for_tree_code (code, vectype, optab_default);
+  if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
+    goto fail;
+
+  unsigned HOST_WIDE_INT nunits;
+  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
+    goto fail;
+  int units_log2 = exact_log2 (nunits);
+  if (units_log2 <= 0)
+    goto fail;
+
+  for (int i = 0; i <= units_log2; ++i)
+    {
+      unsigned HOST_WIDE_INT j, k;
+      vec_perm_builder sel (nunits, nunits, 1);
+      sel.quick_grow (nunits);
+      if (i == units_log2)
+	{
+	  for (j = 0; j < nunits; ++j)
+	    sel[j] = nunits - 1;
+	}
+      else
+	{
+	  for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
+	    sel[j] = nunits + j;
+	  for (k = 0; j < nunits; ++j, ++k)
+	    sel[j] = k;
+	}
+      vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
+      if (!can_vec_perm_const_p (vec_mode, indices))
+	goto fail;
+    }
+
+  return true;
+}
+
+
+/* Function vectorizable_scan_store.
+
+   Helper of vectorizable_store, arguments like on vectorizable_store.
+   Handle only the transformation, checking is done in check_scan_store.  */
+
+static bool
+vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+			 stmt_vec_info *vec_stmt, int ncopies)
+{
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+  tree ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
+  vec_info *vinfo = stmt_info->vinfo;
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "transform scan store. ncopies = %d\n", ncopies);
+
+  gimple *stmt = STMT_VINFO_STMT (stmt_info);
+  tree rhs = gimple_assign_rhs1 (stmt);
+  gcc_assert (TREE_CODE (rhs) == SSA_NAME);
+
+  gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
+  enum tree_code code = gimple_assign_rhs_code (def_stmt);
+  if (code == POINTER_PLUS_EXPR)
+    code = PLUS_EXPR;
+  gcc_assert (TREE_CODE_LENGTH (code) == binary_op
+	      && commutative_tree_code (code));
+  tree rhs1 = gimple_assign_rhs1 (def_stmt);
+  tree rhs2 = gimple_assign_rhs2 (def_stmt);
+  gcc_assert (TREE_CODE (rhs1) == SSA_NAME && TREE_CODE (rhs2) == SSA_NAME);
+  gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
+  gimple *load2_stmt = SSA_NAME_DEF_STMT (rhs2);
+  stmt_vec_info load1_stmt_info = loop_vinfo->lookup_stmt (load1_stmt);
+  stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
+  dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info);
+  dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info);
+  tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0);
+  tree var2 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0);
+
+  if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
+    {
+      std::swap (rhs1, rhs2);
+      std::swap (var1, var2);
+    }
+
+  tree *init = loop_vinfo->scan_map->get (var1);
+  gcc_assert (init);
+
+  tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
+  bool inscan_var_store
+    = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
+
+  unsigned HOST_WIDE_INT nunits;
+  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
+    gcc_unreachable ();
+  int units_log2 = exact_log2 (nunits);
+  gcc_assert (units_log2 > 0);
+  auto_vec<tree, 16> perms;
+  perms.quick_grow (units_log2 + 1);
+  for (int i = 0; i <= units_log2; ++i)
+    {
+      unsigned HOST_WIDE_INT j, k;
+      vec_perm_builder sel (nunits, nunits, 1);
+      sel.quick_grow (nunits);
+      if (i == units_log2)
+	{
+	  for (j = 0; j < nunits; ++j)
+	    sel[j] = nunits - 1;
+	}
+      else
+	{
+	  for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j)
+	    sel[j] = nunits + j;
+	  for (k = 0; j < nunits; ++j, ++k)
+	    sel[j] = k;
+	}
+      vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits);
+      perms[i] = vect_gen_perm_mask_checked (vectype, indices);
+    }
+
+  stmt_vec_info prev_stmt_info = NULL;
+  tree vec_oprnd1 = NULL_TREE;
+  tree vec_oprnd2 = NULL_TREE;
+  tree vec_oprnd3 = NULL_TREE;
+  tree dataref_ptr = unshare_expr (DR_BASE_ADDRESS (dr_info->dr));
+  tree dataref_offset = build_int_cst (ref_type, 0);
+  tree bump = vect_get_data_ptr_increment (dr_info, vectype, VMAT_CONTIGUOUS);
+  tree orig = NULL_TREE;
+  for (int j = 0; j < ncopies; j++)
+    {
+      stmt_vec_info new_stmt_info;
+      if (j == 0)
+	{
+	  vec_oprnd1 = vect_get_vec_def_for_operand (*init, stmt_info);
+	  vec_oprnd2 = vect_get_vec_def_for_operand (rhs1, stmt_info);
+	  vec_oprnd3 = vect_get_vec_def_for_operand (rhs2, stmt_info);
+	  orig = vec_oprnd3;
+	}
+      else
+	{
+	  vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1);
+	  vec_oprnd2 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd2);
+	  vec_oprnd3 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd3);
+	  if (!inscan_var_store)
+	    dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
+	}
+
+      tree v = vec_oprnd2;
+      for (int i = 0; i < units_log2; ++i)
+	{
+	  tree new_temp = make_ssa_name (vectype);
+	  gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR, v,
+					   vec_oprnd1, perms[i]);
+	  new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
+	  if (prev_stmt_info == NULL)
+	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
+	  else
+	    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
+	  prev_stmt_info = new_stmt_info;
+
+	  tree new_temp2 = make_ssa_name (vectype);
+	  g = gimple_build_assign (new_temp2, code, v, new_temp);
+	  new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
+	  STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
+	  prev_stmt_info = new_stmt_info;
+
+	  v = new_temp2;
+	}
+
+      tree new_temp = make_ssa_name (vectype);
+      gimple *g = gimple_build_assign (new_temp, code, orig, v);
+      new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
+      STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
+      prev_stmt_info = new_stmt_info;
+
+      orig = make_ssa_name (vectype);
+      g = gimple_build_assign (orig, VEC_PERM_EXPR, new_temp, new_temp,
+			       perms[units_log2]);
+      new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
+      STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
+      prev_stmt_info = new_stmt_info;
+
+      if (!inscan_var_store)
+	{
+	  tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr,
+				       dataref_offset);
+	  vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
+	  g = gimple_build_assign (data_ref, new_temp);
+	  new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
+	  STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
+	  prev_stmt_info = new_stmt_info;
+	}
+    }
+
+  if (inscan_var_store)
+    for (int j = 0; j < ncopies; j++)
+      {
+	if (j != 0)
+	  dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
+
+	tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr,
+				     dataref_offset);
+	vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
+	gimple *g = gimple_build_assign (data_ref, orig);
+	stmt_vec_info new_stmt_info
+	  = vect_finish_stmt_generation (stmt_info, g, gsi);
+	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
+	prev_stmt_info = new_stmt_info;
+      }
+  return true;
+}
+
+
 /* Function vectorizable_store.
 
    Check if STMT_INFO defines a non scalar data-ref (array/pointer/structure)
@@ -6514,6 +6998,13 @@  vectorizable_store (stmt_vec_info stmt_i
       group_size = vec_num = 1;
     }
 
+  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) > 1 && !vec_stmt)
+    {
+      if (!check_scan_store (stmt_info, vectype, rhs_dt, slp, mask,
+			     memory_access_type))
+	return false;
+    }
+
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
@@ -6737,6 +7228,8 @@  vectorizable_store (stmt_vec_info stmt_i
 	}
       return true;
     }
+  else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3)
+    return vectorizable_scan_store (stmt_info, gsi, vec_stmt, ncopies);
 
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     DR_GROUP_STORE_COUNT (DR_GROUP_FIRST_ELEMENT (stmt_info))++;
@@ -7162,7 +7655,7 @@  vectorizable_store (stmt_vec_info stmt_i
 	  gcc_assert (useless_type_conversion_p (vectype,
 						 TREE_TYPE (vec_oprnd)));
 	  bool simd_lane_access_p
-	    = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info);
+	    = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0;
 	  if (simd_lane_access_p
 	      && !loop_masks
 	      && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR
@@ -8347,7 +8840,7 @@  vectorizable_load (stmt_vec_info stmt_in
       if (j == 0)
 	{
 	  bool simd_lane_access_p
-	    = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info);
+	    = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0;
 	  if (simd_lane_access_p
 	      && TREE_CODE (DR_BASE_ADDRESS (first_dr_info->dr)) == ADDR_EXPR
 	      && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (first_dr_info->dr), 0))
--- gcc/cp/semantics.c.jj	2019-06-10 14:18:17.458525716 +0200
+++ gcc/cp/semantics.c	2019-06-17 10:54:33.771837898 +0200
@@ -7688,6 +7688,8 @@  finish_omp_clauses (tree clauses, enum c
 	case OMP_CLAUSE_REDUCTION:
 	  if (reduction_seen == -2)
 	    OMP_CLAUSE_REDUCTION_INSCAN (c) = 0;
+	  if (OMP_CLAUSE_REDUCTION_INSCAN (c))
+	    need_copy_assignment = true;
 	  need_implicitly_determined = true;
 	  break;
 	case OMP_CLAUSE_IN_REDUCTION:
--- gcc/testsuite/gcc.dg/vect/vect-simd-8.c.jj	2019-06-17 10:27:07.595344442 +0200
+++ gcc/testsuite/gcc.dg/vect/vect-simd-8.c	2019-06-17 17:39:06.516299158 +0200
@@ -0,0 +1,118 @@ 
+/* { dg-require-effective-target size32plus } */
+/* { dg-additional-options "-fopenmp-simd" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
+
+#include "tree-vect.h"
+
+int r, a[1024], b[1024];
+
+__attribute__((noipa)) void
+foo (int *a, int *b)
+{
+  #pragma omp simd reduction (inscan, +:r)
+  for (int i = 0; i < 1024; i++)
+    {
+      r += a[i];
+      #pragma omp scan inclusive(r)
+      b[i] = r;
+    }
+}
+
+__attribute__((noipa)) int
+bar (void)
+{
+  int s = 0;
+  #pragma omp simd reduction (inscan, +:s)
+  for (int i = 0; i < 1024; i++)
+    {
+      s += 2 * a[i];
+      #pragma omp scan inclusive(s)
+      b[i] = s;
+    }
+  return s;
+}
+
+__attribute__((noipa)) void
+baz (int *a, int *b)
+{
+  #pragma omp simd reduction (inscan, +:r) if (simd: 0)
+  for (int i = 0; i < 1024; i++)
+    {
+      r += a[i];
+      #pragma omp scan inclusive(r)
+      b[i] = r;
+    }
+}
+
+__attribute__((noipa)) int
+qux (void)
+{
+  int s = 0;
+  #pragma omp simd reduction (inscan, +:s) simdlen (1)
+  for (int i = 0; i < 1024; i++)
+    {
+      s += 2 * a[i];
+      #pragma omp scan inclusive(s)
+      b[i] = s;
+    }
+  return s;
+}
+
+int
+main ()
+{
+  int s = 0;
+  check_vect ();
+  for (int i = 0; i < 1024; ++i)
+    {
+      a[i] = i;
+      b[i] = -1;
+      asm ("" : "+g" (i));
+    }
+  foo (a, b);
+  if (r != 1024 * 1023 / 2)
+    abort ();
+  for (int i = 0; i < 1024; ++i)
+    {
+      s += i;
+      if (b[i] != s)
+	abort ();
+      else
+	b[i] = 25;
+    }
+  if (bar () != 1024 * 1023)
+    abort ();
+  s = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      s += 2 * i;
+      if (b[i] != s)
+	abort ();
+      else
+	b[i] = -1;
+    }
+  r = 0;
+  baz (a, b);
+  if (r != 1024 * 1023 / 2)
+    abort ();
+  s = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      s += i;
+      if (b[i] != s)
+	abort ();
+      else
+	b[i] = -25;
+    }
+  if (qux () != 1024 * 1023)
+    abort ();
+  s = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      s += 2 * i;
+      if (b[i] != s)
+	abort ();
+    }
+  return 0;
+}
--- gcc/testsuite/gcc.dg/vect/vect-simd-9.c.jj	2019-06-17 17:41:04.937434721 +0200
+++ gcc/testsuite/gcc.dg/vect/vect-simd-9.c	2019-06-17 17:41:25.133116754 +0200
@@ -0,0 +1,120 @@ 
+/* { dg-require-effective-target size32plus } */
+/* { dg-additional-options "-fopenmp-simd" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
+
+#include "tree-vect.h"
+
+int r, a[1024], b[1024];
+
+#pragma omp declare reduction (foo: int: omp_out += omp_in) initializer (omp_priv = 0)
+
+__attribute__((noipa)) void
+foo (int *a, int *b)
+{
+  #pragma omp simd reduction (inscan, foo:r)
+  for (int i = 0; i < 1024; i++)
+    {
+      r += a[i];
+      #pragma omp scan inclusive(r)
+      b[i] = r;
+    }
+}
+
+__attribute__((noipa)) int
+bar (void)
+{
+  int s = 0;
+  #pragma omp simd reduction (inscan, foo:s)
+  for (int i = 0; i < 1024; i++)
+    {
+      s += 2 * a[i];
+      #pragma omp scan inclusive(s)
+      b[i] = s;
+    }
+  return s;
+}
+
+__attribute__((noipa)) void
+baz (int *a, int *b)
+{
+  #pragma omp simd reduction (inscan, foo:r) if (simd: 0)
+  for (int i = 0; i < 1024; i++)
+    {
+      r += a[i];
+      #pragma omp scan inclusive(r)
+      b[i] = r;
+    }
+}
+
+__attribute__((noipa)) int
+qux (void)
+{
+  int s = 0;
+  #pragma omp simd reduction (inscan, foo:s) simdlen (1)
+  for (int i = 0; i < 1024; i++)
+    {
+      s += 2 * a[i];
+      #pragma omp scan inclusive(s)
+      b[i] = s;
+    }
+  return s;
+}
+
+int
+main ()
+{
+  int s = 0;
+  check_vect ();
+  for (int i = 0; i < 1024; ++i)
+    {
+      a[i] = i;
+      b[i] = -1;
+      asm ("" : "+g" (i));
+    }
+  foo (a, b);
+  if (r != 1024 * 1023 / 2)
+    abort ();
+  for (int i = 0; i < 1024; ++i)
+    {
+      s += i;
+      if (b[i] != s)
+	abort ();
+      else
+	b[i] = 25;
+    }
+  if (bar () != 1024 * 1023)
+    abort ();
+  s = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      s += 2 * i;
+      if (b[i] != s)
+	abort ();
+      else
+	b[i] = -1;
+    }
+  r = 0;
+  baz (a, b);
+  if (r != 1024 * 1023 / 2)
+    abort ();
+  s = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      s += i;
+      if (b[i] != s)
+	abort ();
+      else
+	b[i] = -25;
+    }
+  if (qux () != 1024 * 1023)
+    abort ();
+  s = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      s += 2 * i;
+      if (b[i] != s)
+	abort ();
+    }
+  return 0;
+}
--- gcc/testsuite/g++.dg/vect/simd-2.cc.jj	2019-06-17 18:25:22.704460820 +0200
+++ gcc/testsuite/g++.dg/vect/simd-2.cc	2019-06-17 18:48:17.082095017 +0200
@@ -0,0 +1,153 @@ 
+// { dg-require-effective-target size32plus }
+// { dg-additional-options "-fopenmp-simd" }
+// { dg-additional-options "-mavx" { target avx_runtime } }
+// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { xfail *-*-* } } }
+
+#include "../../gcc.dg/vect/tree-vect.h"
+
+struct S {
+  inline S ();
+  inline ~S ();
+  inline S (const S &);
+  inline S & operator= (const S &);
+  int s;
+};
+
+S::S () : s (0)
+{
+}
+
+S::~S ()
+{
+}
+
+S::S (const S &x)
+{
+  s = x.s;
+}
+
+S &
+S::operator= (const S &x)
+{
+  s = x.s;
+  return *this;
+}
+
+static inline void
+ini (S &x)
+{
+  x.s = 0;
+}
+
+S r, a[1024], b[1024];
+
+#pragma omp declare reduction (+: S: omp_out.s += omp_in.s)
+#pragma omp declare reduction (plus: S: omp_out.s += omp_in.s) initializer (ini (omp_priv))
+
+__attribute__((noipa)) void
+foo (S *a, S *b)
+{
+  #pragma omp simd reduction (inscan, +:r)
+  for (int i = 0; i < 1024; i++)
+    {
+      r.s += a[i].s;
+      #pragma omp scan inclusive(r)
+      b[i] = r;
+    }
+}
+
+__attribute__((noipa)) S
+bar (void)
+{
+  S s;
+  #pragma omp simd reduction (inscan, plus:s)
+  for (int i = 0; i < 1024; i++)
+    {
+      s.s += 2 * a[i].s;
+      #pragma omp scan inclusive(s)
+      b[i] = s;
+    }
+  return S (s);
+}
+
+__attribute__((noipa)) void
+baz (S *a, S *b)
+{
+  #pragma omp simd reduction (inscan, +:r) simdlen(1)
+  for (int i = 0; i < 1024; i++)
+    {
+      r.s += a[i].s;
+      #pragma omp scan inclusive(r)
+      b[i] = r;
+    }
+}
+
+__attribute__((noipa)) S
+qux (void)
+{
+  S s;
+  #pragma omp simd if (0) reduction (inscan, plus:s)
+  for (int i = 0; i < 1024; i++)
+    {
+      s.s += 2 * a[i].s;
+      #pragma omp scan inclusive(s)
+      b[i] = s;
+    }
+  return S (s);
+}
+
+int
+main ()
+{
+  S s;
+  check_vect ();
+  for (int i = 0; i < 1024; ++i)
+    {
+      a[i].s = i;
+      b[i].s = -1;
+      asm ("" : "+g" (i));
+    }
+  foo (a, b);
+  if (r.s != 1024 * 1023 / 2)
+    abort ();
+  for (int i = 0; i < 1024; ++i)
+    {
+      s.s += i;
+      if (b[i].s != s.s)
+	abort ();
+      else
+	b[i].s = 25;
+    }
+  if (bar ().s != 1024 * 1023)
+    abort ();
+  s.s = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      s.s += 2 * i;
+      if (b[i].s != s.s)
+	abort ();
+    }
+  r.s = 0;
+  baz (a, b);
+  if (r.s != 1024 * 1023 / 2)
+    abort ();
+  s.s = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      s.s += i;
+      if (b[i].s != s.s)
+	abort ();
+      else
+	b[i].s = 25;
+    }
+  if (qux ().s != 1024 * 1023)
+    abort ();
+  s.s = 0;
+  for (int i = 0; i < 1024; ++i)
+    {
+      s.s += 2 * i;
+      if (b[i].s != s.s)
+	abort ();
+    }
+  return 0;
+}
--- gcc/testsuite/g++.dg/gomp/scan-1.C.jj	2019-06-17 19:15:16.850095767 +0200
+++ gcc/testsuite/g++.dg/gomp/scan-1.C	2019-06-17 19:19:44.683982049 +0200
@@ -0,0 +1,26 @@ 
+// { dg-do compile { target c++11 } }
+
+struct S { S (); ~S (); S &operator = (const S &) = delete; int s; };	// { dg-message "declared here" }
+#pragma omp declare reduction (+ : S : omp_out.s += omp_in.s)
+
+S s;
+
+void
+foo (void)
+{
+  #pragma omp simd reduction (+: s)
+  for (int i = 0; i < 64; ++i)
+    s.s += i;
+}
+
+void
+bar (int *x)
+{
+  #pragma omp simd reduction (inscan, +: s)	// { dg-error "use of deleted function" }
+  for (int i = 0; i < 64; ++i)
+    {
+      s.s += i;
+      #pragma omp scan inclusive (s)		// { dg-error "" }
+      x[i] = s.s;
+    }
+}