Message ID | 20190621065729.GW815@tucnak |
---|---|
State | New |
Headers | show |
Series | [committed] Add OpenMP 5 exclusive scan support for simd constructs | expand |
On Fri, 21 Jun 2019 at 08:57, Jakub Jelinek <jakub@redhat.com> wrote: > > Hi! > > The following patch adds exclusive scan support for simd, it is similar to > the inclusive scan, just we need to swap the input and scan phases and > use slightly different pattern at the start of the scan phase, so that it > computes what we need. > > Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk. > > 2019-06-21 Jakub Jelinek <jakub@redhat.com> > > * omp-low.c (lower_rec_simd_input_clauses): Add rvar2 argument, > create another "omp scan inscan exclusive" array if > !ctx->scan_inclusive. > (lower_rec_input_clauses): Handle exclusive scan inscan reductions. > (lower_omp_scan): Likewise. > * tree-vectorizer.h (struct _stmt_vec_info): Use 3-bit instead of > 2-bit bitfield for simd_lane_access_p member. > * tree-vect-data-refs.c (vect_analyze_data_refs): Also handle > aux == (void *)-4 as simd lane access. > * tree-vect-stmts.c (check_scan_store): Handle exclusive scan. Update > comment with permutations to show the canonical permutation order. > (vectorizable_scan_store): Handle exclusive scan. > (vectorizable_store): Call vectorizable_scan_store even for > STMT_VINFO_SIMD_LANE_ACCESS_P > 3. > > * gcc.dg/vect/vect-simd-12.c: New test. > * gcc.dg/vect/vect-simd-13.c: New test. > * gcc.dg/vect/vect-simd-14.c: New test. > * gcc.dg/vect/vect-simd-15.c: New test. > * gcc.target/i386/sse2-vect-simd-12.c: New test. > * gcc.target/i386/sse2-vect-simd-13.c: New test. > * gcc.target/i386/sse2-vect-simd-14.c: New test. > * gcc.target/i386/sse2-vect-simd-15.c: New test. > * gcc.target/i386/avx2-vect-simd-12.c: New test. > * gcc.target/i386/avx2-vect-simd-13.c: New test. > * gcc.target/i386/avx2-vect-simd-14.c: New test. > * gcc.target/i386/avx2-vect-simd-15.c: New test. > * gcc.target/i386/avx512f-vect-simd-12.c: New test. > * gcc.target/i386/avx512f-vect-simd-13.c: New test. > * gcc.target/i386/avx512f-vect-simd-14.c: New test. > * gcc.target/i386/avx512bw-vect-simd-15.c: New test. > * g++.dg/vect/simd-6.cc: New test. > * g++.dg/vect/simd-7.cc: New test. > * g++.dg/vect/simd-8.cc: New test. > * g++.dg/vect/simd-9.cc: New test. > * c-c++-common/gomp/scan-2.c: Don't expect any diagnostics. > > --- gcc/omp-low.c.jj 2019-06-20 13:26:29.085150770 +0200 > +++ gcc/omp-low.c 2019-06-20 15:46:25.964253058 +0200 > @@ -3692,7 +3692,8 @@ struct omplow_simd_context { > static bool > lower_rec_simd_input_clauses (tree new_var, omp_context *ctx, > omplow_simd_context *sctx, tree &ivar, > - tree &lvar, tree *rvar = NULL) > + tree &lvar, tree *rvar = NULL, > + tree *rvar2 = NULL) > { > if (known_eq (sctx->max_vf, 0U)) > { > @@ -3767,6 +3768,25 @@ lower_rec_simd_input_clauses (tree new_v > *rvar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar, > sctx->lastlane, NULL_TREE, NULL_TREE); > TREE_THIS_NOTRAP (*rvar) = 1; > + > + if (!ctx->scan_inclusive) > + { > + /* And for exclusive scan yet another one, which will > + hold the value during the scan phase. */ > + tree savar = create_tmp_var_raw (atype); > + if (TREE_ADDRESSABLE (new_var)) > + TREE_ADDRESSABLE (savar) = 1; > + DECL_ATTRIBUTES (savar) > + = tree_cons (get_identifier ("omp simd array"), NULL, > + tree_cons (get_identifier ("omp simd inscan " > + "exclusive"), NULL, > + DECL_ATTRIBUTES (savar))); > + gimple_add_tmp_var (savar); > + ctx->cb.decl_map->put (iavar, savar); > + *rvar2 = build4 (ARRAY_REF, TREE_TYPE (new_var), savar, > + sctx->idx, NULL_TREE, NULL_TREE); > + TREE_THIS_NOTRAP (*rvar2) = 1; > + } > } > ivar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar, sctx->idx, > NULL_TREE, NULL_TREE); > @@ -5185,14 +5205,15 @@ lower_rec_input_clauses (tree clauses, g > new_vard = TREE_OPERAND (new_var, 0); > gcc_assert (DECL_P (new_vard)); > } > - tree rvar = NULL_TREE, *rvarp = NULL; > + tree rvar = NULL_TREE, *rvarp = NULL, rvar2 = NULL_TREE; > if (is_simd > && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION > && OMP_CLAUSE_REDUCTION_INSCAN (c)) > rvarp = &rvar; > if (is_simd > && lower_rec_simd_input_clauses (new_var, ctx, &sctx, > - ivar, lvar, rvarp)) > + ivar, lvar, rvarp, > + &rvar2)) > { > if (new_vard == new_var) > { > @@ -5220,6 +5241,14 @@ lower_rec_input_clauses (tree clauses, g > (c, ivar2, build_outer_var_ref (var, ctx)); > gimplify_and_add (x, &llist[0]); > > + if (rvar2) > + { > + x = lang_hooks.decls.omp_clause_default_ctor > + (c, unshare_expr (rvar2), > + build_outer_var_ref (var, ctx)); > + gimplify_and_add (x, &llist[0]); > + } > + > /* For types that need construction, add another > private var which will be default constructed > and optionally initialized with > @@ -5229,7 +5258,9 @@ lower_rec_input_clauses (tree clauses, g > iteration. */ > tree nv = create_tmp_var_raw (TREE_TYPE (ivar)); > gimple_add_tmp_var (nv); > - ctx->cb.decl_map->put (TREE_OPERAND (ivar, 0), > + ctx->cb.decl_map->put (TREE_OPERAND (rvar2 > + ? rvar2 > + : ivar, 0), > nv); > x = lang_hooks.decls.omp_clause_default_ctor > (c, nv, build_outer_var_ref (var, ctx)); > @@ -5296,6 +5327,18 @@ lower_rec_input_clauses (tree clauses, g > gimplify_stmt (&dtor, &tseq); > gimple_seq_add_seq (&llist[1], tseq); > } > + > + if (rvar2) > + { > + x = lang_hooks.decls.omp_clause_dtor (c, rvar2); > + if (x) > + { > + tseq = NULL; > + dtor = x; > + gimplify_stmt (&dtor, &tseq); > + gimple_seq_add_seq (&llist[1], tseq); > + } > + } > break; > } > if (x) > @@ -5390,6 +5433,24 @@ lower_rec_input_clauses (tree clauses, g > gimple_seq_add_seq (ilist, tseq); > } > OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c) = NULL; > + if (!ctx->scan_inclusive) > + { > + tree nv2 > + = create_tmp_var_raw (TREE_TYPE (new_var)); > + gimple_add_tmp_var (nv2); > + ctx->cb.decl_map->put (nv, nv2); > + x = lang_hooks.decls.omp_clause_default_ctor > + (c, nv2, build_outer_var_ref (var, ctx)); > + gimplify_and_add (x, ilist); > + x = lang_hooks.decls.omp_clause_dtor (c, nv2); > + if (x) > + { > + tseq = NULL; > + dtor = x; > + gimplify_stmt (&dtor, &tseq); > + gimple_seq_add_seq (dlist, tseq); > + } > + } > x = lang_hooks.decls.omp_clause_dtor (c, nv); > if (x) > { > @@ -5399,6 +5460,21 @@ lower_rec_input_clauses (tree clauses, g > gimple_seq_add_seq (dlist, tseq); > } > } > + else if (!ctx->scan_inclusive > + && TREE_ADDRESSABLE (TREE_TYPE (new_var))) > + { > + tree nv2 = create_tmp_var_raw (TREE_TYPE (new_var)); > + gimple_add_tmp_var (nv2); > + ctx->cb.decl_map->put (new_vard, nv2); > + x = lang_hooks.decls.omp_clause_dtor (c, nv2); > + if (x) > + { > + tseq = NULL; > + dtor = x; > + gimplify_stmt (&dtor, &tseq); > + gimple_seq_add_seq (dlist, tseq); > + } > + } > DECL_HAS_VALUE_EXPR_P (placeholder) = 0; > goto do_dtor; > } > @@ -5487,14 +5563,15 @@ lower_rec_input_clauses (tree clauses, g > new_vard = TREE_OPERAND (new_var, 0); > gcc_assert (DECL_P (new_vard)); > } > - tree rvar = NULL_TREE, *rvarp = NULL; > + tree rvar = NULL_TREE, *rvarp = NULL, rvar2 = NULL_TREE; > if (is_simd > && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION > && OMP_CLAUSE_REDUCTION_INSCAN (c)) > rvarp = &rvar; > if (is_simd > && lower_rec_simd_input_clauses (new_var, ctx, &sctx, > - ivar, lvar, rvarp)) > + ivar, lvar, rvarp, > + &rvar2)) > { > if (new_vard != new_var) > { > @@ -8573,18 +8650,40 @@ lower_omp_scan (gimple_stmt_iterator *gs > gimple_seq before = NULL; > omp_context *octx = ctx->outer; > gcc_assert (octx); > + if (!octx->scan_inclusive && !has_clauses) > + { > + gimple_stmt_iterator gsi2 = *gsi_p; > + gsi_next (&gsi2); > + gimple *stmt2 = gsi_stmt (gsi2); > + /* For exclusive scan, swap GIMPLE_OMP_SCAN without clauses > + with following GIMPLE_OMP_SCAN with clauses, so that input_phase, > + the one with exclusive clause(s), comes first. */ > + if (stmt2 > + && gimple_code (stmt2) == GIMPLE_OMP_SCAN > + && gimple_omp_scan_clauses (as_a <gomp_scan *> (stmt2)) != NULL) > + { > + gsi_remove (gsi_p, false); > + gsi_insert_after (gsi_p, stmt, GSI_SAME_STMT); > + ctx = maybe_lookup_ctx (stmt2); > + gcc_assert (ctx); > + lower_omp_scan (gsi_p, ctx); > + return; > + } > + } > + > bool input_phase = has_clauses ^ octx->scan_inclusive; > if (gimple_code (octx->stmt) == GIMPLE_OMP_FOR > && (gimple_omp_for_kind (octx->stmt) & GF_OMP_FOR_SIMD) > - && !gimple_omp_for_combined_into_p (octx->stmt) > - && octx->scan_inclusive) > + && !gimple_omp_for_combined_into_p (octx->stmt)) > { > if (tree c = omp_find_clause (gimple_omp_for_clauses (octx->stmt), > OMP_CLAUSE__SIMDUID_)) > { > tree uid = OMP_CLAUSE__SIMDUID__DECL (c); > lane = create_tmp_var (unsigned_type_node); > - tree t = build_int_cst (integer_type_node, 1 + !input_phase); > + tree t = build_int_cst (integer_type_node, > + input_phase ? 1 > + : octx->scan_inclusive ? 2 : 3); > gimple *g > = gimple_build_call_internal (IFN_GOMP_SIMD_LANE, 2, uid, t); > gimple_call_set_lhs (g, lane); > @@ -8601,6 +8700,8 @@ lower_omp_scan (gimple_stmt_iterator *gs > tree val = new_var; > tree var2 = NULL_TREE; > tree var3 = NULL_TREE; > + tree var4 = NULL_TREE; > + tree lane0 = NULL_TREE; > tree new_vard = new_var; > if (omp_is_reference (var)) > { > @@ -8623,16 +8724,26 @@ lower_omp_scan (gimple_stmt_iterator *gs > DECL_ATTRIBUTES (v))) > { > val = unshare_expr (val); > + lane0 = TREE_OPERAND (val, 1); > TREE_OPERAND (val, 1) = lane; > var2 = lookup_decl (v, octx); > + if (!octx->scan_inclusive) > + var4 = lookup_decl (var2, octx); > if (input_phase > && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) > - var3 = maybe_lookup_decl (var2, octx); > + var3 = maybe_lookup_decl (var4 ? var4 : var2, octx); > if (!input_phase) > { > var2 = build4 (ARRAY_REF, TREE_TYPE (val), > var2, lane, NULL_TREE, NULL_TREE); > TREE_THIS_NOTRAP (var2) = 1; > + if (!octx->scan_inclusive) > + { > + var4 = build4 (ARRAY_REF, TREE_TYPE (val), > + var4, lane, NULL_TREE, > + NULL_TREE); > + TREE_THIS_NOTRAP (var4) = 1; > + } > } > else > var2 = val; > @@ -8643,12 +8754,28 @@ lower_omp_scan (gimple_stmt_iterator *gs > else > { > var2 = build_outer_var_ref (var, octx); > - if (input_phase && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) > + if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) > { > var3 = maybe_lookup_decl (new_vard, octx); > - if (var3 == new_vard) > + if (var3 == new_vard || var3 == NULL_TREE) > var3 = NULL_TREE; > + else if (!octx->scan_inclusive && !input_phase) > + { > + var4 = maybe_lookup_decl (var3, octx); > + if (var4 == var3 || var4 == NULL_TREE) > + { > + if (TREE_ADDRESSABLE (TREE_TYPE (new_var))) > + { > + var4 = var3; > + var3 = NULL_TREE; > + } > + else > + var4 = NULL_TREE; > + } > + } > } > + if (!octx->scan_inclusive && !input_phase && var4 == NULL_TREE) > + var4 = create_tmp_var (TREE_TYPE (val)); > } > if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) > { > @@ -8689,9 +8816,17 @@ lower_omp_scan (gimple_stmt_iterator *gs > } > else > { > + tree x; > + if (!octx->scan_inclusive) > + { > + tree v4 = unshare_expr (var4); > + tree v2 = unshare_expr (var2); > + x = lang_hooks.decls.omp_clause_assign_op (c, v4, v2); > + gimplify_and_add (x, &before); > + } > gimple_seq tseq = OMP_CLAUSE_REDUCTION_GIMPLE_MERGE (c); > - tree x = (DECL_HAS_VALUE_EXPR_P (new_vard) > - ? DECL_VALUE_EXPR (new_vard) : NULL_TREE); > + x = (DECL_HAS_VALUE_EXPR_P (new_vard) > + ? DECL_VALUE_EXPR (new_vard) : NULL_TREE); > tree vexpr = val; > if (x && omp_is_reference (var)) > vexpr = build_fold_addr_expr_loc (clause_loc, val); > @@ -8706,8 +8841,18 @@ lower_omp_scan (gimple_stmt_iterator *gs > SET_DECL_VALUE_EXPR (new_vard, x); > SET_DECL_VALUE_EXPR (placeholder, NULL_TREE); > DECL_HAS_VALUE_EXPR_P (placeholder) = 0; > - x = lang_hooks.decls.omp_clause_assign_op (c, val, var2); > - gimplify_and_add (x, &before); > + if (octx->scan_inclusive) > + { > + x = lang_hooks.decls.omp_clause_assign_op (c, val, > + var2); > + gimplify_and_add (x, &before); > + } > + else if (lane0 == NULL_TREE) > + { > + x = lang_hooks.decls.omp_clause_assign_op (c, val, > + var4); > + gimplify_and_add (x, &before); > + } > } > } > else > @@ -8728,10 +8873,29 @@ lower_omp_scan (gimple_stmt_iterator *gs > > tree x = build2 (code, TREE_TYPE (var2), > unshare_expr (var2), unshare_expr (val)); > - gimplify_assign (unshare_expr (var2), x, &before); > - gimplify_assign (val, var2, &before); > + if (octx->scan_inclusive) > + { > + gimplify_assign (unshare_expr (var2), x, &before); > + gimplify_assign (val, var2, &before); > + } > + else > + { > + gimplify_assign (unshare_expr (var4), > + unshare_expr (var2), &before); > + gimplify_assign (var2, x, &before); > + if (lane0 == NULL_TREE) > + gimplify_assign (val, var4, &before); > + } > } > } > + if (!octx->scan_inclusive && !input_phase && lane0) > + { > + tree vexpr = unshare_expr (var4); > + TREE_OPERAND (vexpr, 1) = lane0; > + if (omp_is_reference (var)) > + vexpr = build_fold_addr_expr_loc (clause_loc, vexpr); > + SET_DECL_VALUE_EXPR (new_vard, vexpr); > + } > } > } > else if (has_clauses) > --- gcc/tree-vectorizer.h.jj 2019-06-20 13:26:29.078150879 +0200 > +++ gcc/tree-vectorizer.h 2019-06-20 14:18:04.241075200 +0200 > @@ -917,7 +917,7 @@ struct _stmt_vec_info { > bool strided_p; > > /* For both loads and stores. */ > - unsigned simd_lane_access_p : 2; > + unsigned simd_lane_access_p : 3; > > /* Classifies how the load or store is going to be implemented > for loop vectorization. */ > --- gcc/tree-vect-data-refs.c.jj 2019-06-20 13:55:35.421150589 +0200 > +++ gcc/tree-vect-data-refs.c 2019-06-20 14:18:04.240075216 +0200 > @@ -4223,7 +4223,8 @@ vect_analyze_data_refs (vec_info *vinfo, > /* See if this was detected as SIMD lane access. */ > if (dr->aux == (void *)-1 > || dr->aux == (void *)-2 > - || dr->aux == (void *)-3) > + || dr->aux == (void *)-3 > + || dr->aux == (void *)-4) > { > if (nested_in_vect_loop_p (loop, stmt_info)) > return opt_result::failure_at (stmt_info->stmt, > --- gcc/tree-vect-stmts.c.jj 2019-06-20 13:26:29.084150785 +0200 > +++ gcc/tree-vect-stmts.c 2019-06-20 14:18:04.239075231 +0200 > @@ -6512,7 +6512,37 @@ check_scan_store (stmt_vec_info stmt_inf > kinds are there in order to allow optimizing the initializer store > and combiner sequence, e.g. if it is originally some C++ish user > defined reduction, but allow the vectorizer to pattern recognize it > - and turn into the appropriate vectorized scan. */ > + and turn into the appropriate vectorized scan. > + > + For exclusive scan, this is slightly different: > + #pragma omp simd reduction(inscan,+:r) > + for (...) > + { > + use (r); > + #pragma omp scan exclusive (r) > + r += something (); > + } > + shall have body with: > + // Initialization for input phase, store the reduction initializer: > + _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0); > + _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1); > + D.2042[_21] = 0; > + // Actual input phase: > + ... > + r.0_5 = D.2042[_20]; > + _6 = _4 + r.0_5; > + D.2042[_20] = _6; > + // Initialization for scan phase: > + _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 3); > + _26 = D.2043[_25]; > + D.2044[_25] = _26; > + _27 = D.2042[_25]; > + _28 = _26 + _27; > + D.2043[_25] = _28; > + // Actual scan phase: > + ... > + r.1_8 = D.2044[_20]; > + ... */ > > if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 2) > { > @@ -6553,26 +6583,52 @@ check_scan_store (stmt_vec_info stmt_inf > if (TREE_CODE (rhs) != SSA_NAME) > goto fail; > > - use_operand_p use_p; > - imm_use_iterator iter; > gimple *other_store_stmt = NULL; > - FOR_EACH_IMM_USE_FAST (use_p, iter, rhs) > + tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0); > + bool inscan_var_store > + = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL; > + > + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4) > { > - gimple *use_stmt = USE_STMT (use_p); > - if (use_stmt == stmt || is_gimple_debug (use_stmt)) > - continue; > - if (gimple_bb (use_stmt) != gimple_bb (stmt) > - || !gimple_store_p (use_stmt) > - || other_store_stmt) > - goto fail; > - other_store_stmt = use_stmt; > + if (!inscan_var_store) > + { > + use_operand_p use_p; > + imm_use_iterator iter; > + FOR_EACH_IMM_USE_FAST (use_p, iter, rhs) > + { > + gimple *use_stmt = USE_STMT (use_p); > + if (use_stmt == stmt || is_gimple_debug (use_stmt)) > + continue; > + if (gimple_bb (use_stmt) != gimple_bb (stmt) > + || !is_gimple_assign (use_stmt) > + || gimple_assign_rhs_class (use_stmt) != GIMPLE_BINARY_RHS > + || other_store_stmt > + || TREE_CODE (gimple_assign_lhs (use_stmt)) != SSA_NAME) > + goto fail; > + other_store_stmt = use_stmt; > + } > + if (other_store_stmt == NULL) > + goto fail; > + rhs = gimple_assign_lhs (other_store_stmt); > + if (!single_imm_use (rhs, &use_p, &other_store_stmt)) > + goto fail; > + } > } > - if (other_store_stmt == NULL) > - goto fail; > - stmt_vec_info other_store_stmt_info > - = loop_vinfo->lookup_stmt (other_store_stmt); > - if (other_store_stmt_info == NULL > - || STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info) != 3) > + else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3) > + { > + use_operand_p use_p; > + imm_use_iterator iter; > + FOR_EACH_IMM_USE_FAST (use_p, iter, rhs) > + { > + gimple *use_stmt = USE_STMT (use_p); > + if (use_stmt == stmt || is_gimple_debug (use_stmt)) > + continue; > + if (other_store_stmt) > + goto fail; > + other_store_stmt = use_stmt; > + } > + } > + else > goto fail; > > gimple *def_stmt = SSA_NAME_DEF_STMT (rhs); > @@ -6599,8 +6655,7 @@ check_scan_store (stmt_vec_info stmt_inf > > tree rhs1 = gimple_assign_rhs1 (def_stmt); > tree rhs2 = gimple_assign_rhs2 (def_stmt); > - if (TREE_CODE (rhs1) != SSA_NAME > - || TREE_CODE (rhs2) != SSA_NAME) > + if (TREE_CODE (rhs1) != SSA_NAME || TREE_CODE (rhs2) != SSA_NAME) > goto fail; > > gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1); > @@ -6615,22 +6670,83 @@ check_scan_store (stmt_vec_info stmt_inf > stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt); > if (load1_stmt_info == NULL > || load2_stmt_info == NULL > - || STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info) != 3 > - || STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info) != 3) > + || (STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info) > + != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)) > + || (STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info) > + != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info))) > goto fail; > > - if (scan_operand_equal_p (gimple_assign_lhs (stmt), > + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && inscan_var_store) > + { > + dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info); > + if (TREE_CODE (DR_BASE_ADDRESS (load1_dr_info->dr)) != ADDR_EXPR > + || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0))) > + goto fail; > + tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0); > + tree lrhs; > + if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1))) > + lrhs = rhs1; > + else > + lrhs = rhs2; > + use_operand_p use_p; > + imm_use_iterator iter; > + FOR_EACH_IMM_USE_FAST (use_p, iter, lrhs) > + { > + gimple *use_stmt = USE_STMT (use_p); > + if (use_stmt == def_stmt || is_gimple_debug (use_stmt)) > + continue; > + if (other_store_stmt) > + goto fail; > + other_store_stmt = use_stmt; > + } > + } > + > + if (other_store_stmt == NULL) > + goto fail; > + if (gimple_bb (other_store_stmt) != gimple_bb (stmt) > + || !gimple_store_p (other_store_stmt)) > + goto fail; > + > + stmt_vec_info other_store_stmt_info > + = loop_vinfo->lookup_stmt (other_store_stmt); > + if (other_store_stmt_info == NULL > + || (STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info) > + != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info))) > + goto fail; > + > + gimple *stmt1 = stmt; > + gimple *stmt2 = other_store_stmt; > + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store) > + std::swap (stmt1, stmt2); > + if (scan_operand_equal_p (gimple_assign_lhs (stmt1), > gimple_assign_rhs1 (load2_stmt))) > { > std::swap (rhs1, rhs2); > std::swap (load1_stmt, load2_stmt); > std::swap (load1_stmt_info, load2_stmt_info); > } > - if (!scan_operand_equal_p (gimple_assign_lhs (stmt), > - gimple_assign_rhs1 (load1_stmt)) > - || !scan_operand_equal_p (gimple_assign_lhs (other_store_stmt), > + if (!scan_operand_equal_p (gimple_assign_lhs (stmt1), > + gimple_assign_rhs1 (load1_stmt))) > + goto fail; > + > + tree var3 = NULL_TREE; > + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3 > + && !scan_operand_equal_p (gimple_assign_lhs (stmt2), > gimple_assign_rhs1 (load2_stmt))) > goto fail; > + else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4) > + { > + dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info); > + if (TREE_CODE (DR_BASE_ADDRESS (load2_dr_info->dr)) != ADDR_EXPR > + || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0))) > + goto fail; > + var3 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0); > + if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var3)) > + || lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var3)) > + || lookup_attribute ("omp simd inscan exclusive", > + DECL_ATTRIBUTES (var3))) > + goto fail; > + } > > dr_vec_info *other_dr_info = STMT_VINFO_DR_INFO (other_store_stmt_info); > if (TREE_CODE (DR_BASE_ADDRESS (other_dr_info->dr)) != ADDR_EXPR > @@ -6648,6 +6764,14 @@ check_scan_store (stmt_vec_info stmt_inf > if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1))) > std::swap (var1, var2); > > + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4) > + { > + if (!lookup_attribute ("omp simd inscan exclusive", > + DECL_ATTRIBUTES (var1))) > + goto fail; > + var1 = var3; > + } > + > if (loop_vinfo->scan_map == NULL) > goto fail; > tree *init = loop_vinfo->scan_map->get (var1); > @@ -6655,6 +6779,7 @@ check_scan_store (stmt_vec_info stmt_inf > goto fail; > > /* The IL is as expected, now check if we can actually vectorize it. > + Inclusive scan: > _26 = D.2043[_25]; > _27 = D.2042[_25]; > _28 = _26 + _27; > @@ -6664,21 +6789,49 @@ check_scan_store (stmt_vec_info stmt_inf > from the D.2042[_21] = 0; store): > _30 = MEM <vector(8) int> [(int *)&D.2043]; > _31 = MEM <vector(8) int> [(int *)&D.2042]; > - _32 = VEC_PERM_EXPR <_31, _40, { 8, 0, 1, 2, 3, 4, 5, 6 }>; > + _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>; > _33 = _31 + _32; > // _33 = { _31[0], _31[0]+_31[1], _31[1]+_31[2], ..., _31[6]+_31[7] }; > - _34 = VEC_PERM_EXPR <_33, _40, { 8, 9, 0, 1, 2, 3, 4, 5 }>; > + _34 = VEC_PERM_EXPR <_40, _33, { 0, 1, 8, 9, 10, 11, 12, 13 }>; > _35 = _33 + _34; > // _35 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3], > // _31[1]+.._31[4], ... _31[4]+.._31[7] }; > - _36 = VEC_PERM_EXPR <_35, _40, { 8, 9, 10, 11, 0, 1, 2, 3 }>; > + _36 = VEC_PERM_EXPR <_40, _35, { 0, 1, 2, 3, 8, 9, 10, 11 }>; > _37 = _35 + _36; > // _37 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3], > // _31[0]+.._31[4], ... _31[0]+.._31[7] }; > _38 = _30 + _37; > _39 = VEC_PERM_EXPR <_38, _38, { 7, 7, 7, 7, 7, 7, 7, 7 }>; > MEM <vector(8) int> [(int *)&D.2043] = _39; > - MEM <vector(8) int> [(int *)&D.2042] = _38; */ > + MEM <vector(8) int> [(int *)&D.2042] = _38; > + Exclusive scan: > + _26 = D.2043[_25]; > + D.2044[_25] = _26; > + _27 = D.2042[_25]; > + _28 = _26 + _27; > + D.2043[_25] = _28; > + should be vectorized as (where _40 is the vectorized rhs > + from the D.2042[_21] = 0; store): > + _30 = MEM <vector(8) int> [(int *)&D.2043]; > + _31 = MEM <vector(8) int> [(int *)&D.2042]; > + _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>; > + _33 = VEC_PERM_EXPR <_40, _32, { 0, 8, 9, 10, 11, 12, 13, 14 }>; > + _34 = _32 + _33; > + // _34 = { 0, _31[0], _31[0]+_31[1], _31[1]+_31[2], _31[2]+_31[3], > + // _31[3]+_31[4], ... _31[5]+.._31[6] }; > + _35 = VEC_PERM_EXPR <_40, _34, { 0, 1, 8, 9, 10, 11, 12, 13 }>; > + _36 = _34 + _35; > + // _36 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3], > + // _31[1]+.._31[4], ... _31[3]+.._31[6] }; > + _37 = VEC_PERM_EXPR <_40, _36, { 0, 1, 2, 3, 8, 9, 10, 11 }>; > + _38 = _36 + _37; > + // _38 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3], > + // _31[0]+.._31[4], ... _31[0]+.._31[6] }; > + _39 = _30 + _38; > + _50 = _31 + _39; > + _51 = VEC_PERM_EXPR <_50, _50, { 7, 7, 7, 7, 7, 7, 7, 7 }>; > + MEM <vector(8) int> [(int *)&D.2044] = _39; > + MEM <vector(8) int> [(int *)&D.2042] = _51; */ > enum machine_mode vec_mode = TYPE_MODE (vectype); > optab optab = optab_for_tree_code (code, vectype, optab_default); > if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing) > @@ -6715,6 +6868,24 @@ vectorizable_scan_store (stmt_vec_info s > tree rhs = gimple_assign_rhs1 (stmt); > gcc_assert (TREE_CODE (rhs) == SSA_NAME); > > + tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0); > + bool inscan_var_store > + = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL; > + > + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store) > + { > + use_operand_p use_p; > + imm_use_iterator iter; > + FOR_EACH_IMM_USE_FAST (use_p, iter, rhs) > + { > + gimple *use_stmt = USE_STMT (use_p); > + if (use_stmt == stmt || is_gimple_debug (use_stmt)) > + continue; > + rhs = gimple_assign_lhs (use_stmt); > + break; > + } > + } > + > gimple *def_stmt = SSA_NAME_DEF_STMT (rhs); > enum tree_code code = gimple_assign_rhs_code (def_stmt); > if (code == POINTER_PLUS_EXPR) > @@ -6737,15 +6908,12 @@ vectorizable_scan_store (stmt_vec_info s > { > std::swap (rhs1, rhs2); > std::swap (var1, var2); > + std::swap (load1_dr_info, load2_dr_info); > } > > tree *init = loop_vinfo->scan_map->get (var1); > gcc_assert (init); > > - tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0); > - bool inscan_var_store > - = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL; > - > unsigned HOST_WIDE_INT nunits; > if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)) > gcc_unreachable (); > @@ -6789,29 +6957,50 @@ vectorizable_scan_store (stmt_vec_info s > tree vec_oprnd1 = NULL_TREE; > tree vec_oprnd2 = NULL_TREE; > tree vec_oprnd3 = NULL_TREE; > - tree dataref_ptr = unshare_expr (DR_BASE_ADDRESS (dr_info->dr)); > + tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr); > tree dataref_offset = build_int_cst (ref_type, 0); > tree bump = vect_get_data_ptr_increment (dr_info, vectype, VMAT_CONTIGUOUS); > + tree ldataref_ptr = NULL_TREE; > tree orig = NULL_TREE; > + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store) > + ldataref_ptr = DR_BASE_ADDRESS (load1_dr_info->dr); > for (int j = 0; j < ncopies; j++) > { > stmt_vec_info new_stmt_info; > if (j == 0) > { > vec_oprnd1 = vect_get_vec_def_for_operand (*init, stmt_info); > - vec_oprnd2 = vect_get_vec_def_for_operand (rhs1, stmt_info); > + if (ldataref_ptr == NULL) > + vec_oprnd2 = vect_get_vec_def_for_operand (rhs1, stmt_info); > vec_oprnd3 = vect_get_vec_def_for_operand (rhs2, stmt_info); > orig = vec_oprnd3; > } > else > { > vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); > - vec_oprnd2 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd2); > + if (ldataref_ptr == NULL) > + vec_oprnd2 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd2); > vec_oprnd3 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd3); > if (!inscan_var_store) > dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump); > } > > + if (ldataref_ptr) > + { > + vec_oprnd2 = make_ssa_name (vectype); > + tree data_ref = fold_build2 (MEM_REF, vectype, > + unshare_expr (ldataref_ptr), > + dataref_offset); > + vect_copy_ref_info (data_ref, DR_REF (load1_dr_info->dr)); > + gimple *g = gimple_build_assign (vec_oprnd2, data_ref); > + new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); > + if (prev_stmt_info == NULL) > + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; > + else > + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; > + prev_stmt_info = new_stmt_info; > + } > + > tree v = vec_oprnd2; > for (int i = 0; i < units_log2; ++i) > { > @@ -6848,6 +7037,17 @@ vectorizable_scan_store (stmt_vec_info s > new_temp = new_temp2; > } > > + /* For exclusive scan, perform the perms[i] permutation once > + more. */ > + if (i == 0 > + && STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 > + && v == vec_oprnd2) > + { > + v = new_temp; > + --i; > + continue; > + } > + > tree new_temp2 = make_ssa_name (vectype); > g = gimple_build_assign (new_temp2, code, v, new_temp); > new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); > @@ -6863,16 +7063,30 @@ vectorizable_scan_store (stmt_vec_info s > STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; > prev_stmt_info = new_stmt_info; > > + tree last_perm_arg = new_temp; > + /* For exclusive scan, new_temp computed above is the exclusive scan > + prefix sum. Turn it into inclusive prefix sum for the broadcast > + of the last element into orig. */ > + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4) > + { > + last_perm_arg = make_ssa_name (vectype); > + g = gimple_build_assign (last_perm_arg, code, new_temp, vec_oprnd2); > + new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); > + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; > + prev_stmt_info = new_stmt_info; > + } > + > orig = make_ssa_name (vectype); > - g = gimple_build_assign (orig, VEC_PERM_EXPR, new_temp, new_temp, > - perms[units_log2]); > + g = gimple_build_assign (orig, VEC_PERM_EXPR, last_perm_arg, > + last_perm_arg, perms[units_log2]); > new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); > STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; > prev_stmt_info = new_stmt_info; > > if (!inscan_var_store) > { > - tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, > + tree data_ref = fold_build2 (MEM_REF, vectype, > + unshare_expr (dataref_ptr), > dataref_offset); > vect_copy_ref_info (data_ref, DR_REF (dr_info->dr)); > g = gimple_build_assign (data_ref, new_temp); > @@ -6888,7 +7102,8 @@ vectorizable_scan_store (stmt_vec_info s > if (j != 0) > dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump); > > - tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, > + tree data_ref = fold_build2 (MEM_REF, vectype, > + unshare_expr (dataref_ptr), > dataref_offset); > vect_copy_ref_info (data_ref, DR_REF (dr_info->dr)); > gimple *g = gimple_build_assign (data_ref, orig); > @@ -7325,7 +7540,7 @@ vectorizable_store (stmt_vec_info stmt_i > } > return true; > } > - else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3) > + else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3) > return vectorizable_scan_store (stmt_info, gsi, vec_stmt, ncopies); > > if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) > --- gcc/testsuite/gcc.dg/vect/vect-simd-12.c.jj 2019-06-20 15:08:50.260400440 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-simd-12.c 2019-06-20 15:08:24.332805239 +0200 > @@ -0,0 +1,122 @@ > +/* { dg-require-effective-target size32plus } */ > +/* { dg-additional-options "-fopenmp-simd" } */ > +/* { dg-additional-options "-mavx" { target avx_runtime } } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ > + > +#ifndef main > +#include "tree-vect.h" > +#endif > + > +int r, a[1024], b[1024]; > + > +__attribute__((noipa)) void > +foo (int *a, int *b) > +{ > + #pragma omp simd reduction (inscan, +:r) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r += a[i]; > + } > +} > + > +__attribute__((noipa)) int > +bar (void) > +{ > + int s = 0; > + #pragma omp simd reduction (inscan, +:s) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s += 2 * a[i]; > + } > + return s; > +} > + > +__attribute__((noipa)) void > +baz (int *a, int *b) > +{ > + #pragma omp simd reduction (inscan, +:r) if (simd: 0) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r += a[i]; > + } > +} > + > +__attribute__((noipa)) int > +qux (void) > +{ > + int s = 0; > + #pragma omp simd reduction (inscan, +:s) simdlen (1) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s += 2 * a[i]; > + } > + return s; > +} > + > +int > +main () > +{ > + int s = 0; > +#ifndef main > + check_vect (); > +#endif > + for (int i = 0; i < 1024; ++i) > + { > + a[i] = i; > + b[i] = -1; > + asm ("" : "+g" (i)); > + } > + foo (a, b); > + if (r != 1024 * 1023 / 2) > + abort (); > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = 25; > + s += i; > + } > + if (bar () != 1024 * 1023) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = -1; > + s += 2 * i; > + } > + r = 0; > + baz (a, b); > + if (r != 1024 * 1023 / 2) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = -25; > + s += i; > + } > + if (qux () != 1024 * 1023) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + s += 2 * i; > + } > + return 0; > +} > --- gcc/testsuite/gcc.dg/vect/vect-simd-13.c.jj 2019-06-20 15:47:23.580359715 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-simd-13.c 2019-06-20 15:13:23.500134387 +0200 > @@ -0,0 +1,124 @@ > +/* { dg-require-effective-target size32plus } */ > +/* { dg-additional-options "-fopenmp-simd" } */ > +/* { dg-additional-options "-mavx" { target avx_runtime } } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ > + > +#ifndef main > +#include "tree-vect.h" > +#endif > + > +int r, a[1024], b[1024]; > + > +#pragma omp declare reduction (foo: int: omp_out += omp_in) initializer (omp_priv = 0) > + > +__attribute__((noipa)) void > +foo (int *a, int *b) > +{ > + #pragma omp simd reduction (inscan, foo:r) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r += a[i]; > + } > +} > + > +__attribute__((noipa)) int > +bar (void) > +{ > + int s = 0; > + #pragma omp simd reduction (inscan, foo:s) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s += 2 * a[i]; > + } > + return s; > +} > + > +__attribute__((noipa)) void > +baz (int *a, int *b) > +{ > + #pragma omp simd reduction (inscan, foo:r) if (simd: 0) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r += a[i]; > + } > +} > + > +__attribute__((noipa)) int > +qux (void) > +{ > + int s = 0; > + #pragma omp simd reduction (inscan, foo:s) simdlen (1) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s += 2 * a[i]; > + } > + return s; > +} > + > +int > +main () > +{ > + int s = 0; > +#ifndef main > + check_vect (); > +#endif > + for (int i = 0; i < 1024; ++i) > + { > + a[i] = i; > + b[i] = -1; > + asm ("" : "+g" (i)); > + } > + foo (a, b); > + if (r != 1024 * 1023 / 2) > + abort (); > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = 25; > + s += i; > + } > + if (bar () != 1024 * 1023) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = -1; > + s += 2 * i; > + } > + r = 0; > + baz (a, b); > + if (r != 1024 * 1023 / 2) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = -25; > + s += i; > + } > + if (qux () != 1024 * 1023) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + s += 2 * i; > + } > + return 0; > +} > --- gcc/testsuite/gcc.dg/vect/vect-simd-14.c.jj 2019-06-20 15:48:30.536321539 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-simd-14.c 2019-06-20 15:54:39.291617792 +0200 > @@ -0,0 +1,94 @@ > +/* { dg-require-effective-target size32plus } */ > +/* { dg-additional-options "-fopenmp-simd" } */ > +/* { dg-additional-options "-mavx" { target avx_runtime } } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ > + > +#ifndef main > +#include "tree-vect.h" > +#endif > + > +float r = 1.0f, a[1024], b[1024]; > + > +__attribute__((noipa)) void > +foo (float *a, float *b) > +{ > + #pragma omp simd reduction (inscan, *:r) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r *= a[i]; > + } > +} > + > +__attribute__((noipa)) float > +bar (void) > +{ > + float s = -__builtin_inff (); > + #pragma omp simd reduction (inscan, max:s) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s = s > a[i] ? s : a[i]; > + } > + return s; > +} > + > +int > +main () > +{ > + float s = 1.0f; > +#ifndef main > + check_vect (); > +#endif > + for (int i = 0; i < 1024; ++i) > + { > + if (i < 80) > + a[i] = (i & 1) ? 0.25f : 0.5f; > + else if (i < 200) > + a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f; > + else if (i < 280) > + a[i] = (i & 1) ? 0.25f : 0.5f; > + else if (i < 380) > + a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f; > + else > + switch (i % 6) > + { > + case 0: a[i] = 0.25f; break; > + case 1: a[i] = 2.0f; break; > + case 2: a[i] = -1.0f; break; > + case 3: a[i] = -4.0f; break; > + case 4: a[i] = 0.5f; break; > + case 5: a[i] = 1.0f; break; > + default: a[i] = 0.0f; break; > + } > + b[i] = -19.0f; > + asm ("" : "+g" (i)); > + } > + foo (a, b); > + if (r * 16384.0f != 0.125f) > + abort (); > + float m = -175.25f; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = -231.75f; > + s *= a[i]; > + a[i] = m - ((i % 3) == 1 ? 2.0f : (i % 3) == 2 ? 4.0f : 0.0f); > + m += 0.75f; > + } > + if (bar () != 592.0f) > + abort (); > + s = -__builtin_inff (); > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + if (s < a[i]) > + s = a[i]; > + } > + return 0; > +} Hi, I've noticed that this new test (gcc.dg/vect/vect-simd-14.c) fails at execution time on arm targets. It does pass on aarch64. Christophe > --- gcc/testsuite/gcc.dg/vect/vect-simd-15.c.jj 2019-06-20 15:50:34.483399705 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-simd-15.c 2019-06-20 15:52:09.976919050 +0200 > @@ -0,0 +1,186 @@ > +/* { dg-require-effective-target size32plus } */ > +/* { dg-additional-options "-fopenmp-simd" } */ > +/* { dg-additional-options "-mavx" { target avx_runtime } } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ > + > +#ifndef main > +#include "tree-vect.h" > +#endif > + > +int r, a[1024], b[1024]; > +unsigned short r2, b2[1024]; > +unsigned char r3, b3[1024]; > + > +__attribute__((noipa)) void > +foo (int *a, int *b, unsigned short *b2, unsigned char *b3) > +{ > + #pragma omp simd reduction (inscan, +:r, r2, r3) > + for (int i = 0; i < 1024; i++) > + { > + { > + b[i] = r; > + b2[i] = r2; > + b3[i] = r3; > + } > + #pragma omp scan exclusive(r, r2, r3) > + { r += a[i]; r2 += a[i]; r3 += a[i]; } > + } > +} > + > +__attribute__((noipa)) int > +bar (unsigned short *s2p, unsigned char *s3p) > +{ > + int s = 0; > + unsigned short s2 = 0; > + unsigned char s3 = 0; > + #pragma omp simd reduction (inscan, +:s, s2, s3) > + for (int i = 0; i < 1024; i++) > + { > + { b[i] = s; b2[i] = s2; b3[i] = s3; } > + #pragma omp scan exclusive(s, s2, s3) > + { > + s += 2 * a[i]; > + s2 += 2 * a[i]; > + s3 += 2 * a[i]; > + } > + } > + *s2p = s2; > + *s3p = s3; > + return s; > +} > + > +__attribute__((noipa)) void > +baz (int *a, int *b, unsigned short *b2, unsigned char *b3) > +{ > + #pragma omp simd reduction (inscan, +:r, r2, r3) if (simd: 0) > + for (int i = 0; i < 1024; i++) > + { > + { > + b[i] = r; > + b2[i] = r2; > + b3[i] = r3; > + } > + #pragma omp scan exclusive(r, r2, r3) > + { > + r += a[i]; > + r2 += a[i]; > + r3 += a[i]; > + } > + } > +} > + > +__attribute__((noipa)) int > +qux (unsigned short *s2p, unsigned char *s3p) > +{ > + int s = 0; > + unsigned short s2 = 0; > + unsigned char s3 = 0; > + #pragma omp simd reduction (inscan, +:s, s2, s3) simdlen (1) > + for (int i = 0; i < 1024; i++) > + { > + { b[i] = s; b2[i] = s2; b3[i] = s3; } > + #pragma omp scan exclusive(s, s2, s3) > + { s += 2 * a[i]; s2 += 2 * a[i]; s3 += 2 * a[i]; } > + } > + *s2p = s2; > + *s3p = s3; > + return s; > +} > + > +int > +main () > +{ > + int s = 0; > + unsigned short s2; > + unsigned char s3; > +#ifndef main > + check_vect (); > +#endif > + for (int i = 0; i < 1024; ++i) > + { > + a[i] = i; > + b[i] = -1; > + b2[i] = -1; > + b3[i] = -1; > + asm ("" : "+g" (i)); > + } > + foo (a, b, b2, b3); > + if (r != 1024 * 1023 / 2 > + || r2 != (unsigned short) r > + || r3 != (unsigned char) r) > + abort (); > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s > + || b2[i] != (unsigned short) s > + || b3[i] != (unsigned char) s) > + abort (); > + else > + { > + b[i] = 25; > + b2[i] = 24; > + b3[i] = 26; > + } > + s += i; > + } > + if (bar (&s2, &s3) != 1024 * 1023) > + abort (); > + if (s2 != (unsigned short) (1024 * 1023) > + || s3 != (unsigned char) (1024 * 1023)) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s > + || b2[i] != (unsigned short) s > + || b3[i] != (unsigned char) s) > + abort (); > + else > + { > + b[i] = -1; > + b2[i] = -1; > + b3[i] = -1; > + } > + s += 2 * i; > + } > + r = 0; > + r2 = 0; > + r3 = 0; > + baz (a, b, b2, b3); > + if (r != 1024 * 1023 / 2 > + || r2 != (unsigned short) r > + || r3 != (unsigned char) r) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s > + || b2[i] != (unsigned short) s > + || b3[i] != (unsigned char) s) > + abort (); > + else > + { > + b[i] = 25; > + b2[i] = 24; > + b3[i] = 26; > + } > + s += i; > + } > + s2 = 0; > + s3 = 0; > + if (qux (&s2, &s3) != 1024 * 1023) > + abort (); > + if (s2 != (unsigned short) (1024 * 1023) > + || s3 != (unsigned char) (1024 * 1023)) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s > + || b2[i] != (unsigned short) s > + || b3[i] != (unsigned char) s) > + abort (); > + s += 2 * i; > + } > + return 0; > +} > --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-12.c.jj 2019-06-20 15:58:35.276983324 +0200 > +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-12.c 2019-06-20 15:58:35.274983355 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target sse2 } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "sse2-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-12.c" > + > +static void > +sse2_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-13.c.jj 2019-06-20 15:58:35.283983216 +0200 > +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-13.c 2019-06-20 15:58:35.281983247 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target sse2 } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "sse2-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-13.c" > + > +static void > +sse2_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-14.c.jj 2019-06-20 15:58:35.288983139 +0200 > +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-14.c 2019-06-20 15:58:35.287983154 +0200 > @@ -0,0 +1,15 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target sse2 } */ > + > +#include "sse2-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-14.c" > + > +static void > +sse2_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-15.c.jj 2019-06-20 15:58:35.293983061 +0200 > +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-15.c 2019-06-20 15:58:35.292983077 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target sse2 } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "sse2-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-15.c" > + > +static void > +sse2_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-12.c.jj 2019-06-20 15:58:35.299982969 +0200 > +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-12.c 2019-06-20 15:58:35.297982999 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target avx2 } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "avx2-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-12.c" > + > +static void > +avx2_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-13.c.jj 2019-06-20 15:58:35.305982876 +0200 > +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-13.c 2019-06-20 15:58:35.303982907 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target avx2 } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "avx2-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-13.c" > + > +static void > +avx2_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-14.c.jj 2019-06-20 15:58:35.310982799 +0200 > +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-14.c 2019-06-20 15:58:35.309982815 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target avx2 } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "avx2-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-14.c" > + > +static void > +avx2_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-15.c.jj 2019-06-20 15:58:35.316982707 +0200 > +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-15.c 2019-06-20 15:58:35.314982738 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target avx2 } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "avx2-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-15.c" > + > +static void > +avx2_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-12.c.jj 2019-06-20 15:58:35.323982599 +0200 > +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-12.c 2019-06-20 15:58:35.321982630 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target avx512f } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "avx512f-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-12.c" > + > +static void > +avx512f_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-13.c.jj 2019-06-20 15:58:35.328982522 +0200 > +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-13.c 2019-06-20 15:58:35.326982553 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target avx512f } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "avx512f-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-13.c" > + > +static void > +avx512f_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-14.c.jj 2019-06-20 15:58:35.333982445 +0200 > +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-14.c 2019-06-20 15:58:35.332982461 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target avx512f } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "avx512f-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-14.c" > + > +static void > +avx512f_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-15.c.jj 2019-06-20 15:58:35.347982230 +0200 > +++ gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-15.c 2019-06-20 15:58:35.346982245 +0200 > @@ -0,0 +1,16 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -fopenmp-simd -mavx512bw -mprefer-vector-width=512 -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target avx512bw } */ > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ > + > +#include "avx512bw-check.h" > + > +#define main() do_main () > + > +#include "../../gcc.dg/vect/vect-simd-15.c" > + > +static void > +avx512bw_test (void) > +{ > + do_main (); > +} > --- gcc/testsuite/g++.dg/vect/simd-6.cc.jj 2019-06-20 16:00:34.800142524 +0200 > +++ gcc/testsuite/g++.dg/vect/simd-6.cc 2019-06-20 16:07:41.722559826 +0200 > @@ -0,0 +1,161 @@ > +// { dg-require-effective-target size32plus } > +// { dg-additional-options "-fopenmp-simd" } > +// { dg-additional-options "-mavx" { target avx_runtime } } > +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { xfail *-*-* } } } > + > +#include "../../gcc.dg/vect/tree-vect.h" > + > +template <typename T> > +struct S { > + inline S (); > + inline ~S (); > + inline S (const S &); > + inline S & operator= (const S &); > + T s; > +}; > + > +template <typename T> > +S<T>::S () : s (0) > +{ > +} > + > +template <typename T> > +S<T>::~S () > +{ > +} > + > +template <typename T> > +S<T>::S (const S &x) > +{ > + s = x.s; > +} > + > +template <typename T> > +S<T> & > +S<T>::operator= (const S &x) > +{ > + s = x.s; > + return *this; > +} > + > +template <typename T> > +static inline void > +ini (S<T> &x) > +{ > + x.s = 0; > +} > + > +S<int> r, a[1024], b[1024]; > + > +#pragma omp declare reduction (+: S<int>: omp_out.s += omp_in.s) > +#pragma omp declare reduction (plus: S<int>: omp_out.s += omp_in.s) initializer (ini (omp_priv)) > + > +template <typename T> > +__attribute__((noipa)) void > +foo (S<T> *a, S<T> *b) > +{ > + #pragma omp simd reduction (inscan, +:r) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r.s += a[i].s; > + } > +} > + > +template <typename T> > +__attribute__((noipa)) S<T> > +bar (void) > +{ > + S<T> s; > + #pragma omp simd reduction (inscan, plus:s) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s.s += 2 * a[i].s; > + } > + return S<T> (s); > +} > + > +__attribute__((noipa)) void > +baz (S<int> *a, S<int> *b) > +{ > + #pragma omp simd reduction (inscan, +:r) simdlen(1) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r.s += a[i].s; > + } > +} > + > +__attribute__((noipa)) S<int> > +qux (void) > +{ > + S<int> s; > + #pragma omp simd if (0) reduction (inscan, plus:s) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s.s += 2 * a[i].s; > + } > + return S<int> (s); > +} > + > +int > +main () > +{ > + S<int> s; > + check_vect (); > + for (int i = 0; i < 1024; ++i) > + { > + a[i].s = i; > + b[i].s = -1; > + asm ("" : "+g" (i)); > + } > + foo (a, b); > + if (r.s != 1024 * 1023 / 2) > + abort (); > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i].s != s.s) > + abort (); > + else > + b[i].s = 25; > + s.s += i; > + } > + if (bar<int> ().s != 1024 * 1023) > + abort (); > + s.s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i].s != s.s) > + abort (); > + s.s += 2 * i; > + } > + r.s = 0; > + baz (a, b); > + if (r.s != 1024 * 1023 / 2) > + abort (); > + s.s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i].s != s.s) > + abort (); > + else > + b[i].s = 25; > + s.s += i; > + } > + if (qux ().s != 1024 * 1023) > + abort (); > + s.s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i].s != s.s) > + abort (); > + s.s += 2 * i; > + } > + return 0; > +} > --- gcc/testsuite/g++.dg/vect/simd-7.cc.jj 2019-06-20 16:00:51.095891542 +0200 > +++ gcc/testsuite/g++.dg/vect/simd-7.cc 2019-06-20 16:12:50.222747875 +0200 > @@ -0,0 +1,124 @@ > +// { dg-require-effective-target size32plus } > +// { dg-additional-options "-fopenmp-simd" } > +// { dg-additional-options "-mavx" { target avx_runtime } } > +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ > + > +#include "../../gcc.dg/vect/tree-vect.h" > + > +int r, a[1024], b[1024], q; > + > +template <typename T, typename U> > +__attribute__((noipa)) void > +foo (T a, T b, U r) > +{ > + #pragma omp simd reduction (inscan, +:r) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r += a[i]; > + } > +} > + > +template <typename T> > +__attribute__((noipa)) T > +bar (void) > +{ > + T &s = q; > + q = 0; > + #pragma omp simd reduction (inscan, +:s) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s += 2 * a[i]; > + } > + return s; > +} > + > +template <typename T> > +__attribute__((noipa)) void > +baz (T *a, T *b, T &r) > +{ > + #pragma omp simd reduction (inscan, +:r) if (simd: 0) > + for (T i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r += a[i]; > + } > +} > + > +template <typename T> > +__attribute__((noipa)) int > +qux (void) > +{ > + T s = q; > + q = 0; > + #pragma omp simd reduction (inscan, +:s) simdlen (1) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s += 2 * a[i]; > + } > + return s; > +} > + > +int > +main () > +{ > + int s = 0; > + check_vect (); > + for (int i = 0; i < 1024; ++i) > + { > + a[i] = i; > + b[i] = -1; > + asm ("" : "+g" (i)); > + } > + foo<int *, int &> (a, b, r); > + if (r != 1024 * 1023 / 2) > + abort (); > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = 25; > + s += i; > + } > + if (bar<int> () != 1024 * 1023) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = -1; > + s += 2 * i; > + } > + r = 0; > + baz<int> (a, b, r); > + if (r != 1024 * 1023 / 2) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = -25; > + s += i; > + } > + if (qux<int &> () != 1024 * 1023) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + s += 2 * i; > + } > + return 0; > +} > --- gcc/testsuite/g++.dg/vect/simd-8.cc.jj 2019-06-20 16:00:54.154844430 +0200 > +++ gcc/testsuite/g++.dg/vect/simd-8.cc 2019-06-20 16:15:37.994133891 +0200 > @@ -0,0 +1,122 @@ > +// { dg-require-effective-target size32plus } > +// { dg-additional-options "-fopenmp-simd" } > +// { dg-additional-options "-mavx" { target avx_runtime } } > +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } > + > +#include "../../gcc.dg/vect/tree-vect.h" > + > +int r, a[1024], b[1024], q; > + > +#pragma omp declare reduction (foo: int: omp_out += omp_in) initializer (omp_priv = 0) > + > +__attribute__((noipa)) void > +foo (int *a, int *b, int &r) > +{ > + #pragma omp simd reduction (inscan, foo:r) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r += a[i]; > + } > +} > + > +__attribute__((noipa)) int > +bar (void) > +{ > + int &s = q; > + q = 0; > + #pragma omp simd reduction (inscan, foo:s) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s += 2 * a[i]; > + } > + return s; > +} > + > +__attribute__((noipa)) void > +baz (int *a, int *b, int &r) > +{ > + #pragma omp simd reduction (inscan, foo:r) if (simd: 0) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r += a[i]; > + } > +} > + > +__attribute__((noipa)) int > +qux (void) > +{ > + int &s = q; > + q = 0; > + #pragma omp simd reduction (inscan, foo:s) simdlen (1) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s += 2 * a[i]; > + } > + return s; > +} > + > +int > +main () > +{ > + int s = 0; > + check_vect (); > + for (int i = 0; i < 1024; ++i) > + { > + a[i] = i; > + b[i] = -1; > + asm ("" : "+g" (i)); > + } > + foo (a, b, r); > + if (r != 1024 * 1023 / 2) > + abort (); > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = 25; > + s += i; > + } > + if (bar () != 1024 * 1023) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = -1; > + s += 2 * i; > + } > + r = 0; > + baz (a, b, r); > + if (r != 1024 * 1023 / 2) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + else > + b[i] = -25; > + s += i; > + } > + if (qux () != 1024 * 1023) > + abort (); > + s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i] != s) > + abort (); > + s += 2 * i; > + } > + return 0; > +} > --- gcc/testsuite/g++.dg/vect/simd-9.cc.jj 2019-06-20 16:00:57.197797566 +0200 > +++ gcc/testsuite/g++.dg/vect/simd-9.cc 2019-06-20 16:17:27.484427949 +0200 > @@ -0,0 +1,153 @@ > +// { dg-require-effective-target size32plus } > +// { dg-additional-options "-fopenmp-simd" } > +// { dg-additional-options "-mavx" { target avx_runtime } } > +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { xfail *-*-* } } } > + > +#include "../../gcc.dg/vect/tree-vect.h" > + > +struct S { > + inline S (); > + inline ~S (); > + inline S (const S &); > + inline S & operator= (const S &); > + int s; > +}; > + > +S::S () : s (0) > +{ > +} > + > +S::~S () > +{ > +} > + > +S::S (const S &x) > +{ > + s = x.s; > +} > + > +S & > +S::operator= (const S &x) > +{ > + s = x.s; > + return *this; > +} > + > +static inline void > +ini (S &x) > +{ > + x.s = 0; > +} > + > +S r, a[1024], b[1024]; > + > +#pragma omp declare reduction (+: S: omp_out.s += omp_in.s) > +#pragma omp declare reduction (plus: S: omp_out.s += omp_in.s) initializer (ini (omp_priv)) > + > +__attribute__((noipa)) void > +foo (S *a, S *b, S &r) > +{ > + #pragma omp simd reduction (inscan, +:r) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r.s += a[i].s; > + } > +} > + > +__attribute__((noipa)) S > +bar (void) > +{ > + S s; > + #pragma omp simd reduction (inscan, plus:s) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s.s += 2 * a[i].s; > + } > + return s; > +} > + > +__attribute__((noipa)) void > +baz (S *a, S *b, S &r) > +{ > + #pragma omp simd reduction (inscan, +:r) simdlen(1) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = r; > + #pragma omp scan exclusive(r) > + r.s += a[i].s; > + } > +} > + > +__attribute__((noipa)) S > +qux (void) > +{ > + S s; > + #pragma omp simd if (0) reduction (inscan, plus:s) > + for (int i = 0; i < 1024; i++) > + { > + b[i] = s; > + #pragma omp scan exclusive(s) > + s.s += 2 * a[i].s; > + } > + return s; > +} > + > +int > +main () > +{ > + S s; > + check_vect (); > + for (int i = 0; i < 1024; ++i) > + { > + a[i].s = i; > + b[i].s = -1; > + asm ("" : "+g" (i)); > + } > + foo (a, b, r); > + if (r.s != 1024 * 1023 / 2) > + abort (); > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i].s != s.s) > + abort (); > + else > + b[i].s = 25; > + s.s += i; > + } > + if (bar ().s != 1024 * 1023) > + abort (); > + s.s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i].s != s.s) > + abort (); > + s.s += 2 * i; > + } > + r.s = 0; > + baz (a, b, r); > + if (r.s != 1024 * 1023 / 2) > + abort (); > + s.s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i].s != s.s) > + abort (); > + else > + b[i].s = 25; > + s.s += i; > + } > + if (qux ().s != 1024 * 1023) > + abort (); > + s.s = 0; > + for (int i = 0; i < 1024; ++i) > + { > + if (b[i].s != s.s) > + abort (); > + s.s += 2 * i; > + } > + return 0; > +} > --- gcc/testsuite/c-c++-common/gomp/scan-2.c.jj 2019-06-10 14:18:17.461525669 +0200 > +++ gcc/testsuite/c-c++-common/gomp/scan-2.c 2019-06-20 23:54:03.615422149 +0200 > @@ -8,7 +8,7 @@ f1 (int *c, int *d) > for (i = 0; i < 64; i++) > { > d[i] = a; > - #pragma omp scan exclusive (a) /* { dg-message "sorry, unimplemented: '#pragma omp scan' not supported yet" } */ > + #pragma omp scan exclusive (a) > a += c[i]; > } > } > > Jakub
--- gcc/omp-low.c.jj 2019-06-20 13:26:29.085150770 +0200 +++ gcc/omp-low.c 2019-06-20 15:46:25.964253058 +0200 @@ -3692,7 +3692,8 @@ struct omplow_simd_context { static bool lower_rec_simd_input_clauses (tree new_var, omp_context *ctx, omplow_simd_context *sctx, tree &ivar, - tree &lvar, tree *rvar = NULL) + tree &lvar, tree *rvar = NULL, + tree *rvar2 = NULL) { if (known_eq (sctx->max_vf, 0U)) { @@ -3767,6 +3768,25 @@ lower_rec_simd_input_clauses (tree new_v *rvar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar, sctx->lastlane, NULL_TREE, NULL_TREE); TREE_THIS_NOTRAP (*rvar) = 1; + + if (!ctx->scan_inclusive) + { + /* And for exclusive scan yet another one, which will + hold the value during the scan phase. */ + tree savar = create_tmp_var_raw (atype); + if (TREE_ADDRESSABLE (new_var)) + TREE_ADDRESSABLE (savar) = 1; + DECL_ATTRIBUTES (savar) + = tree_cons (get_identifier ("omp simd array"), NULL, + tree_cons (get_identifier ("omp simd inscan " + "exclusive"), NULL, + DECL_ATTRIBUTES (savar))); + gimple_add_tmp_var (savar); + ctx->cb.decl_map->put (iavar, savar); + *rvar2 = build4 (ARRAY_REF, TREE_TYPE (new_var), savar, + sctx->idx, NULL_TREE, NULL_TREE); + TREE_THIS_NOTRAP (*rvar2) = 1; + } } ivar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar, sctx->idx, NULL_TREE, NULL_TREE); @@ -5185,14 +5205,15 @@ lower_rec_input_clauses (tree clauses, g new_vard = TREE_OPERAND (new_var, 0); gcc_assert (DECL_P (new_vard)); } - tree rvar = NULL_TREE, *rvarp = NULL; + tree rvar = NULL_TREE, *rvarp = NULL, rvar2 = NULL_TREE; if (is_simd && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION && OMP_CLAUSE_REDUCTION_INSCAN (c)) rvarp = &rvar; if (is_simd && lower_rec_simd_input_clauses (new_var, ctx, &sctx, - ivar, lvar, rvarp)) + ivar, lvar, rvarp, + &rvar2)) { if (new_vard == new_var) { @@ -5220,6 +5241,14 @@ lower_rec_input_clauses (tree clauses, g (c, ivar2, build_outer_var_ref (var, ctx)); gimplify_and_add (x, &llist[0]); + if (rvar2) + { + x = lang_hooks.decls.omp_clause_default_ctor + (c, unshare_expr (rvar2), + build_outer_var_ref (var, ctx)); + gimplify_and_add (x, &llist[0]); + } + /* For types that need construction, add another private var which will be default constructed and optionally initialized with @@ -5229,7 +5258,9 @@ lower_rec_input_clauses (tree clauses, g iteration. */ tree nv = create_tmp_var_raw (TREE_TYPE (ivar)); gimple_add_tmp_var (nv); - ctx->cb.decl_map->put (TREE_OPERAND (ivar, 0), + ctx->cb.decl_map->put (TREE_OPERAND (rvar2 + ? rvar2 + : ivar, 0), nv); x = lang_hooks.decls.omp_clause_default_ctor (c, nv, build_outer_var_ref (var, ctx)); @@ -5296,6 +5327,18 @@ lower_rec_input_clauses (tree clauses, g gimplify_stmt (&dtor, &tseq); gimple_seq_add_seq (&llist[1], tseq); } + + if (rvar2) + { + x = lang_hooks.decls.omp_clause_dtor (c, rvar2); + if (x) + { + tseq = NULL; + dtor = x; + gimplify_stmt (&dtor, &tseq); + gimple_seq_add_seq (&llist[1], tseq); + } + } break; } if (x) @@ -5390,6 +5433,24 @@ lower_rec_input_clauses (tree clauses, g gimple_seq_add_seq (ilist, tseq); } OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c) = NULL; + if (!ctx->scan_inclusive) + { + tree nv2 + = create_tmp_var_raw (TREE_TYPE (new_var)); + gimple_add_tmp_var (nv2); + ctx->cb.decl_map->put (nv, nv2); + x = lang_hooks.decls.omp_clause_default_ctor + (c, nv2, build_outer_var_ref (var, ctx)); + gimplify_and_add (x, ilist); + x = lang_hooks.decls.omp_clause_dtor (c, nv2); + if (x) + { + tseq = NULL; + dtor = x; + gimplify_stmt (&dtor, &tseq); + gimple_seq_add_seq (dlist, tseq); + } + } x = lang_hooks.decls.omp_clause_dtor (c, nv); if (x) { @@ -5399,6 +5460,21 @@ lower_rec_input_clauses (tree clauses, g gimple_seq_add_seq (dlist, tseq); } } + else if (!ctx->scan_inclusive + && TREE_ADDRESSABLE (TREE_TYPE (new_var))) + { + tree nv2 = create_tmp_var_raw (TREE_TYPE (new_var)); + gimple_add_tmp_var (nv2); + ctx->cb.decl_map->put (new_vard, nv2); + x = lang_hooks.decls.omp_clause_dtor (c, nv2); + if (x) + { + tseq = NULL; + dtor = x; + gimplify_stmt (&dtor, &tseq); + gimple_seq_add_seq (dlist, tseq); + } + } DECL_HAS_VALUE_EXPR_P (placeholder) = 0; goto do_dtor; } @@ -5487,14 +5563,15 @@ lower_rec_input_clauses (tree clauses, g new_vard = TREE_OPERAND (new_var, 0); gcc_assert (DECL_P (new_vard)); } - tree rvar = NULL_TREE, *rvarp = NULL; + tree rvar = NULL_TREE, *rvarp = NULL, rvar2 = NULL_TREE; if (is_simd && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION && OMP_CLAUSE_REDUCTION_INSCAN (c)) rvarp = &rvar; if (is_simd && lower_rec_simd_input_clauses (new_var, ctx, &sctx, - ivar, lvar, rvarp)) + ivar, lvar, rvarp, + &rvar2)) { if (new_vard != new_var) { @@ -8573,18 +8650,40 @@ lower_omp_scan (gimple_stmt_iterator *gs gimple_seq before = NULL; omp_context *octx = ctx->outer; gcc_assert (octx); + if (!octx->scan_inclusive && !has_clauses) + { + gimple_stmt_iterator gsi2 = *gsi_p; + gsi_next (&gsi2); + gimple *stmt2 = gsi_stmt (gsi2); + /* For exclusive scan, swap GIMPLE_OMP_SCAN without clauses + with following GIMPLE_OMP_SCAN with clauses, so that input_phase, + the one with exclusive clause(s), comes first. */ + if (stmt2 + && gimple_code (stmt2) == GIMPLE_OMP_SCAN + && gimple_omp_scan_clauses (as_a <gomp_scan *> (stmt2)) != NULL) + { + gsi_remove (gsi_p, false); + gsi_insert_after (gsi_p, stmt, GSI_SAME_STMT); + ctx = maybe_lookup_ctx (stmt2); + gcc_assert (ctx); + lower_omp_scan (gsi_p, ctx); + return; + } + } + bool input_phase = has_clauses ^ octx->scan_inclusive; if (gimple_code (octx->stmt) == GIMPLE_OMP_FOR && (gimple_omp_for_kind (octx->stmt) & GF_OMP_FOR_SIMD) - && !gimple_omp_for_combined_into_p (octx->stmt) - && octx->scan_inclusive) + && !gimple_omp_for_combined_into_p (octx->stmt)) { if (tree c = omp_find_clause (gimple_omp_for_clauses (octx->stmt), OMP_CLAUSE__SIMDUID_)) { tree uid = OMP_CLAUSE__SIMDUID__DECL (c); lane = create_tmp_var (unsigned_type_node); - tree t = build_int_cst (integer_type_node, 1 + !input_phase); + tree t = build_int_cst (integer_type_node, + input_phase ? 1 + : octx->scan_inclusive ? 2 : 3); gimple *g = gimple_build_call_internal (IFN_GOMP_SIMD_LANE, 2, uid, t); gimple_call_set_lhs (g, lane); @@ -8601,6 +8700,8 @@ lower_omp_scan (gimple_stmt_iterator *gs tree val = new_var; tree var2 = NULL_TREE; tree var3 = NULL_TREE; + tree var4 = NULL_TREE; + tree lane0 = NULL_TREE; tree new_vard = new_var; if (omp_is_reference (var)) { @@ -8623,16 +8724,26 @@ lower_omp_scan (gimple_stmt_iterator *gs DECL_ATTRIBUTES (v))) { val = unshare_expr (val); + lane0 = TREE_OPERAND (val, 1); TREE_OPERAND (val, 1) = lane; var2 = lookup_decl (v, octx); + if (!octx->scan_inclusive) + var4 = lookup_decl (var2, octx); if (input_phase && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) - var3 = maybe_lookup_decl (var2, octx); + var3 = maybe_lookup_decl (var4 ? var4 : var2, octx); if (!input_phase) { var2 = build4 (ARRAY_REF, TREE_TYPE (val), var2, lane, NULL_TREE, NULL_TREE); TREE_THIS_NOTRAP (var2) = 1; + if (!octx->scan_inclusive) + { + var4 = build4 (ARRAY_REF, TREE_TYPE (val), + var4, lane, NULL_TREE, + NULL_TREE); + TREE_THIS_NOTRAP (var4) = 1; + } } else var2 = val; @@ -8643,12 +8754,28 @@ lower_omp_scan (gimple_stmt_iterator *gs else { var2 = build_outer_var_ref (var, octx); - if (input_phase && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) + if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) { var3 = maybe_lookup_decl (new_vard, octx); - if (var3 == new_vard) + if (var3 == new_vard || var3 == NULL_TREE) var3 = NULL_TREE; + else if (!octx->scan_inclusive && !input_phase) + { + var4 = maybe_lookup_decl (var3, octx); + if (var4 == var3 || var4 == NULL_TREE) + { + if (TREE_ADDRESSABLE (TREE_TYPE (new_var))) + { + var4 = var3; + var3 = NULL_TREE; + } + else + var4 = NULL_TREE; + } + } } + if (!octx->scan_inclusive && !input_phase && var4 == NULL_TREE) + var4 = create_tmp_var (TREE_TYPE (val)); } if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) { @@ -8689,9 +8816,17 @@ lower_omp_scan (gimple_stmt_iterator *gs } else { + tree x; + if (!octx->scan_inclusive) + { + tree v4 = unshare_expr (var4); + tree v2 = unshare_expr (var2); + x = lang_hooks.decls.omp_clause_assign_op (c, v4, v2); + gimplify_and_add (x, &before); + } gimple_seq tseq = OMP_CLAUSE_REDUCTION_GIMPLE_MERGE (c); - tree x = (DECL_HAS_VALUE_EXPR_P (new_vard) - ? DECL_VALUE_EXPR (new_vard) : NULL_TREE); + x = (DECL_HAS_VALUE_EXPR_P (new_vard) + ? DECL_VALUE_EXPR (new_vard) : NULL_TREE); tree vexpr = val; if (x && omp_is_reference (var)) vexpr = build_fold_addr_expr_loc (clause_loc, val); @@ -8706,8 +8841,18 @@ lower_omp_scan (gimple_stmt_iterator *gs SET_DECL_VALUE_EXPR (new_vard, x); SET_DECL_VALUE_EXPR (placeholder, NULL_TREE); DECL_HAS_VALUE_EXPR_P (placeholder) = 0; - x = lang_hooks.decls.omp_clause_assign_op (c, val, var2); - gimplify_and_add (x, &before); + if (octx->scan_inclusive) + { + x = lang_hooks.decls.omp_clause_assign_op (c, val, + var2); + gimplify_and_add (x, &before); + } + else if (lane0 == NULL_TREE) + { + x = lang_hooks.decls.omp_clause_assign_op (c, val, + var4); + gimplify_and_add (x, &before); + } } } else @@ -8728,10 +8873,29 @@ lower_omp_scan (gimple_stmt_iterator *gs tree x = build2 (code, TREE_TYPE (var2), unshare_expr (var2), unshare_expr (val)); - gimplify_assign (unshare_expr (var2), x, &before); - gimplify_assign (val, var2, &before); + if (octx->scan_inclusive) + { + gimplify_assign (unshare_expr (var2), x, &before); + gimplify_assign (val, var2, &before); + } + else + { + gimplify_assign (unshare_expr (var4), + unshare_expr (var2), &before); + gimplify_assign (var2, x, &before); + if (lane0 == NULL_TREE) + gimplify_assign (val, var4, &before); + } } } + if (!octx->scan_inclusive && !input_phase && lane0) + { + tree vexpr = unshare_expr (var4); + TREE_OPERAND (vexpr, 1) = lane0; + if (omp_is_reference (var)) + vexpr = build_fold_addr_expr_loc (clause_loc, vexpr); + SET_DECL_VALUE_EXPR (new_vard, vexpr); + } } } else if (has_clauses) --- gcc/tree-vectorizer.h.jj 2019-06-20 13:26:29.078150879 +0200 +++ gcc/tree-vectorizer.h 2019-06-20 14:18:04.241075200 +0200 @@ -917,7 +917,7 @@ struct _stmt_vec_info { bool strided_p; /* For both loads and stores. */ - unsigned simd_lane_access_p : 2; + unsigned simd_lane_access_p : 3; /* Classifies how the load or store is going to be implemented for loop vectorization. */ --- gcc/tree-vect-data-refs.c.jj 2019-06-20 13:55:35.421150589 +0200 +++ gcc/tree-vect-data-refs.c 2019-06-20 14:18:04.240075216 +0200 @@ -4223,7 +4223,8 @@ vect_analyze_data_refs (vec_info *vinfo, /* See if this was detected as SIMD lane access. */ if (dr->aux == (void *)-1 || dr->aux == (void *)-2 - || dr->aux == (void *)-3) + || dr->aux == (void *)-3 + || dr->aux == (void *)-4) { if (nested_in_vect_loop_p (loop, stmt_info)) return opt_result::failure_at (stmt_info->stmt, --- gcc/tree-vect-stmts.c.jj 2019-06-20 13:26:29.084150785 +0200 +++ gcc/tree-vect-stmts.c 2019-06-20 14:18:04.239075231 +0200 @@ -6512,7 +6512,37 @@ check_scan_store (stmt_vec_info stmt_inf kinds are there in order to allow optimizing the initializer store and combiner sequence, e.g. if it is originally some C++ish user defined reduction, but allow the vectorizer to pattern recognize it - and turn into the appropriate vectorized scan. */ + and turn into the appropriate vectorized scan. + + For exclusive scan, this is slightly different: + #pragma omp simd reduction(inscan,+:r) + for (...) + { + use (r); + #pragma omp scan exclusive (r) + r += something (); + } + shall have body with: + // Initialization for input phase, store the reduction initializer: + _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0); + _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1); + D.2042[_21] = 0; + // Actual input phase: + ... + r.0_5 = D.2042[_20]; + _6 = _4 + r.0_5; + D.2042[_20] = _6; + // Initialization for scan phase: + _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 3); + _26 = D.2043[_25]; + D.2044[_25] = _26; + _27 = D.2042[_25]; + _28 = _26 + _27; + D.2043[_25] = _28; + // Actual scan phase: + ... + r.1_8 = D.2044[_20]; + ... */ if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 2) { @@ -6553,26 +6583,52 @@ check_scan_store (stmt_vec_info stmt_inf if (TREE_CODE (rhs) != SSA_NAME) goto fail; - use_operand_p use_p; - imm_use_iterator iter; gimple *other_store_stmt = NULL; - FOR_EACH_IMM_USE_FAST (use_p, iter, rhs) + tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0); + bool inscan_var_store + = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL; + + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4) { - gimple *use_stmt = USE_STMT (use_p); - if (use_stmt == stmt || is_gimple_debug (use_stmt)) - continue; - if (gimple_bb (use_stmt) != gimple_bb (stmt) - || !gimple_store_p (use_stmt) - || other_store_stmt) - goto fail; - other_store_stmt = use_stmt; + if (!inscan_var_store) + { + use_operand_p use_p; + imm_use_iterator iter; + FOR_EACH_IMM_USE_FAST (use_p, iter, rhs) + { + gimple *use_stmt = USE_STMT (use_p); + if (use_stmt == stmt || is_gimple_debug (use_stmt)) + continue; + if (gimple_bb (use_stmt) != gimple_bb (stmt) + || !is_gimple_assign (use_stmt) + || gimple_assign_rhs_class (use_stmt) != GIMPLE_BINARY_RHS + || other_store_stmt + || TREE_CODE (gimple_assign_lhs (use_stmt)) != SSA_NAME) + goto fail; + other_store_stmt = use_stmt; + } + if (other_store_stmt == NULL) + goto fail; + rhs = gimple_assign_lhs (other_store_stmt); + if (!single_imm_use (rhs, &use_p, &other_store_stmt)) + goto fail; + } } - if (other_store_stmt == NULL) - goto fail; - stmt_vec_info other_store_stmt_info - = loop_vinfo->lookup_stmt (other_store_stmt); - if (other_store_stmt_info == NULL - || STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info) != 3) + else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3) + { + use_operand_p use_p; + imm_use_iterator iter; + FOR_EACH_IMM_USE_FAST (use_p, iter, rhs) + { + gimple *use_stmt = USE_STMT (use_p); + if (use_stmt == stmt || is_gimple_debug (use_stmt)) + continue; + if (other_store_stmt) + goto fail; + other_store_stmt = use_stmt; + } + } + else goto fail; gimple *def_stmt = SSA_NAME_DEF_STMT (rhs); @@ -6599,8 +6655,7 @@ check_scan_store (stmt_vec_info stmt_inf tree rhs1 = gimple_assign_rhs1 (def_stmt); tree rhs2 = gimple_assign_rhs2 (def_stmt); - if (TREE_CODE (rhs1) != SSA_NAME - || TREE_CODE (rhs2) != SSA_NAME) + if (TREE_CODE (rhs1) != SSA_NAME || TREE_CODE (rhs2) != SSA_NAME) goto fail; gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1); @@ -6615,22 +6670,83 @@ check_scan_store (stmt_vec_info stmt_inf stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt); if (load1_stmt_info == NULL || load2_stmt_info == NULL - || STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info) != 3 - || STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info) != 3) + || (STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info) + != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)) + || (STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info) + != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info))) goto fail; - if (scan_operand_equal_p (gimple_assign_lhs (stmt), + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && inscan_var_store) + { + dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info); + if (TREE_CODE (DR_BASE_ADDRESS (load1_dr_info->dr)) != ADDR_EXPR + || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0))) + goto fail; + tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0); + tree lrhs; + if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1))) + lrhs = rhs1; + else + lrhs = rhs2; + use_operand_p use_p; + imm_use_iterator iter; + FOR_EACH_IMM_USE_FAST (use_p, iter, lrhs) + { + gimple *use_stmt = USE_STMT (use_p); + if (use_stmt == def_stmt || is_gimple_debug (use_stmt)) + continue; + if (other_store_stmt) + goto fail; + other_store_stmt = use_stmt; + } + } + + if (other_store_stmt == NULL) + goto fail; + if (gimple_bb (other_store_stmt) != gimple_bb (stmt) + || !gimple_store_p (other_store_stmt)) + goto fail; + + stmt_vec_info other_store_stmt_info + = loop_vinfo->lookup_stmt (other_store_stmt); + if (other_store_stmt_info == NULL + || (STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info) + != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info))) + goto fail; + + gimple *stmt1 = stmt; + gimple *stmt2 = other_store_stmt; + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store) + std::swap (stmt1, stmt2); + if (scan_operand_equal_p (gimple_assign_lhs (stmt1), gimple_assign_rhs1 (load2_stmt))) { std::swap (rhs1, rhs2); std::swap (load1_stmt, load2_stmt); std::swap (load1_stmt_info, load2_stmt_info); } - if (!scan_operand_equal_p (gimple_assign_lhs (stmt), - gimple_assign_rhs1 (load1_stmt)) - || !scan_operand_equal_p (gimple_assign_lhs (other_store_stmt), + if (!scan_operand_equal_p (gimple_assign_lhs (stmt1), + gimple_assign_rhs1 (load1_stmt))) + goto fail; + + tree var3 = NULL_TREE; + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3 + && !scan_operand_equal_p (gimple_assign_lhs (stmt2), gimple_assign_rhs1 (load2_stmt))) goto fail; + else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4) + { + dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info); + if (TREE_CODE (DR_BASE_ADDRESS (load2_dr_info->dr)) != ADDR_EXPR + || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0))) + goto fail; + var3 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0); + if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var3)) + || lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var3)) + || lookup_attribute ("omp simd inscan exclusive", + DECL_ATTRIBUTES (var3))) + goto fail; + } dr_vec_info *other_dr_info = STMT_VINFO_DR_INFO (other_store_stmt_info); if (TREE_CODE (DR_BASE_ADDRESS (other_dr_info->dr)) != ADDR_EXPR @@ -6648,6 +6764,14 @@ check_scan_store (stmt_vec_info stmt_inf if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1))) std::swap (var1, var2); + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4) + { + if (!lookup_attribute ("omp simd inscan exclusive", + DECL_ATTRIBUTES (var1))) + goto fail; + var1 = var3; + } + if (loop_vinfo->scan_map == NULL) goto fail; tree *init = loop_vinfo->scan_map->get (var1); @@ -6655,6 +6779,7 @@ check_scan_store (stmt_vec_info stmt_inf goto fail; /* The IL is as expected, now check if we can actually vectorize it. + Inclusive scan: _26 = D.2043[_25]; _27 = D.2042[_25]; _28 = _26 + _27; @@ -6664,21 +6789,49 @@ check_scan_store (stmt_vec_info stmt_inf from the D.2042[_21] = 0; store): _30 = MEM <vector(8) int> [(int *)&D.2043]; _31 = MEM <vector(8) int> [(int *)&D.2042]; - _32 = VEC_PERM_EXPR <_31, _40, { 8, 0, 1, 2, 3, 4, 5, 6 }>; + _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>; _33 = _31 + _32; // _33 = { _31[0], _31[0]+_31[1], _31[1]+_31[2], ..., _31[6]+_31[7] }; - _34 = VEC_PERM_EXPR <_33, _40, { 8, 9, 0, 1, 2, 3, 4, 5 }>; + _34 = VEC_PERM_EXPR <_40, _33, { 0, 1, 8, 9, 10, 11, 12, 13 }>; _35 = _33 + _34; // _35 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3], // _31[1]+.._31[4], ... _31[4]+.._31[7] }; - _36 = VEC_PERM_EXPR <_35, _40, { 8, 9, 10, 11, 0, 1, 2, 3 }>; + _36 = VEC_PERM_EXPR <_40, _35, { 0, 1, 2, 3, 8, 9, 10, 11 }>; _37 = _35 + _36; // _37 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3], // _31[0]+.._31[4], ... _31[0]+.._31[7] }; _38 = _30 + _37; _39 = VEC_PERM_EXPR <_38, _38, { 7, 7, 7, 7, 7, 7, 7, 7 }>; MEM <vector(8) int> [(int *)&D.2043] = _39; - MEM <vector(8) int> [(int *)&D.2042] = _38; */ + MEM <vector(8) int> [(int *)&D.2042] = _38; + Exclusive scan: + _26 = D.2043[_25]; + D.2044[_25] = _26; + _27 = D.2042[_25]; + _28 = _26 + _27; + D.2043[_25] = _28; + should be vectorized as (where _40 is the vectorized rhs + from the D.2042[_21] = 0; store): + _30 = MEM <vector(8) int> [(int *)&D.2043]; + _31 = MEM <vector(8) int> [(int *)&D.2042]; + _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>; + _33 = VEC_PERM_EXPR <_40, _32, { 0, 8, 9, 10, 11, 12, 13, 14 }>; + _34 = _32 + _33; + // _34 = { 0, _31[0], _31[0]+_31[1], _31[1]+_31[2], _31[2]+_31[3], + // _31[3]+_31[4], ... _31[5]+.._31[6] }; + _35 = VEC_PERM_EXPR <_40, _34, { 0, 1, 8, 9, 10, 11, 12, 13 }>; + _36 = _34 + _35; + // _36 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3], + // _31[1]+.._31[4], ... _31[3]+.._31[6] }; + _37 = VEC_PERM_EXPR <_40, _36, { 0, 1, 2, 3, 8, 9, 10, 11 }>; + _38 = _36 + _37; + // _38 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3], + // _31[0]+.._31[4], ... _31[0]+.._31[6] }; + _39 = _30 + _38; + _50 = _31 + _39; + _51 = VEC_PERM_EXPR <_50, _50, { 7, 7, 7, 7, 7, 7, 7, 7 }>; + MEM <vector(8) int> [(int *)&D.2044] = _39; + MEM <vector(8) int> [(int *)&D.2042] = _51; */ enum machine_mode vec_mode = TYPE_MODE (vectype); optab optab = optab_for_tree_code (code, vectype, optab_default); if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing) @@ -6715,6 +6868,24 @@ vectorizable_scan_store (stmt_vec_info s tree rhs = gimple_assign_rhs1 (stmt); gcc_assert (TREE_CODE (rhs) == SSA_NAME); + tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0); + bool inscan_var_store + = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL; + + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store) + { + use_operand_p use_p; + imm_use_iterator iter; + FOR_EACH_IMM_USE_FAST (use_p, iter, rhs) + { + gimple *use_stmt = USE_STMT (use_p); + if (use_stmt == stmt || is_gimple_debug (use_stmt)) + continue; + rhs = gimple_assign_lhs (use_stmt); + break; + } + } + gimple *def_stmt = SSA_NAME_DEF_STMT (rhs); enum tree_code code = gimple_assign_rhs_code (def_stmt); if (code == POINTER_PLUS_EXPR) @@ -6737,15 +6908,12 @@ vectorizable_scan_store (stmt_vec_info s { std::swap (rhs1, rhs2); std::swap (var1, var2); + std::swap (load1_dr_info, load2_dr_info); } tree *init = loop_vinfo->scan_map->get (var1); gcc_assert (init); - tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0); - bool inscan_var_store - = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL; - unsigned HOST_WIDE_INT nunits; if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)) gcc_unreachable (); @@ -6789,29 +6957,50 @@ vectorizable_scan_store (stmt_vec_info s tree vec_oprnd1 = NULL_TREE; tree vec_oprnd2 = NULL_TREE; tree vec_oprnd3 = NULL_TREE; - tree dataref_ptr = unshare_expr (DR_BASE_ADDRESS (dr_info->dr)); + tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr); tree dataref_offset = build_int_cst (ref_type, 0); tree bump = vect_get_data_ptr_increment (dr_info, vectype, VMAT_CONTIGUOUS); + tree ldataref_ptr = NULL_TREE; tree orig = NULL_TREE; + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store) + ldataref_ptr = DR_BASE_ADDRESS (load1_dr_info->dr); for (int j = 0; j < ncopies; j++) { stmt_vec_info new_stmt_info; if (j == 0) { vec_oprnd1 = vect_get_vec_def_for_operand (*init, stmt_info); - vec_oprnd2 = vect_get_vec_def_for_operand (rhs1, stmt_info); + if (ldataref_ptr == NULL) + vec_oprnd2 = vect_get_vec_def_for_operand (rhs1, stmt_info); vec_oprnd3 = vect_get_vec_def_for_operand (rhs2, stmt_info); orig = vec_oprnd3; } else { vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); - vec_oprnd2 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd2); + if (ldataref_ptr == NULL) + vec_oprnd2 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd2); vec_oprnd3 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd3); if (!inscan_var_store) dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump); } + if (ldataref_ptr) + { + vec_oprnd2 = make_ssa_name (vectype); + tree data_ref = fold_build2 (MEM_REF, vectype, + unshare_expr (ldataref_ptr), + dataref_offset); + vect_copy_ref_info (data_ref, DR_REF (load1_dr_info->dr)); + gimple *g = gimple_build_assign (vec_oprnd2, data_ref); + new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); + if (prev_stmt_info == NULL) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; + else + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; + prev_stmt_info = new_stmt_info; + } + tree v = vec_oprnd2; for (int i = 0; i < units_log2; ++i) { @@ -6848,6 +7037,17 @@ vectorizable_scan_store (stmt_vec_info s new_temp = new_temp2; } + /* For exclusive scan, perform the perms[i] permutation once + more. */ + if (i == 0 + && STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 + && v == vec_oprnd2) + { + v = new_temp; + --i; + continue; + } + tree new_temp2 = make_ssa_name (vectype); g = gimple_build_assign (new_temp2, code, v, new_temp); new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); @@ -6863,16 +7063,30 @@ vectorizable_scan_store (stmt_vec_info s STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; prev_stmt_info = new_stmt_info; + tree last_perm_arg = new_temp; + /* For exclusive scan, new_temp computed above is the exclusive scan + prefix sum. Turn it into inclusive prefix sum for the broadcast + of the last element into orig. */ + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4) + { + last_perm_arg = make_ssa_name (vectype); + g = gimple_build_assign (last_perm_arg, code, new_temp, vec_oprnd2); + new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; + prev_stmt_info = new_stmt_info; + } + orig = make_ssa_name (vectype); - g = gimple_build_assign (orig, VEC_PERM_EXPR, new_temp, new_temp, - perms[units_log2]); + g = gimple_build_assign (orig, VEC_PERM_EXPR, last_perm_arg, + last_perm_arg, perms[units_log2]); new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; prev_stmt_info = new_stmt_info; if (!inscan_var_store) { - tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, + tree data_ref = fold_build2 (MEM_REF, vectype, + unshare_expr (dataref_ptr), dataref_offset); vect_copy_ref_info (data_ref, DR_REF (dr_info->dr)); g = gimple_build_assign (data_ref, new_temp); @@ -6888,7 +7102,8 @@ vectorizable_scan_store (stmt_vec_info s if (j != 0) dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump); - tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr, + tree data_ref = fold_build2 (MEM_REF, vectype, + unshare_expr (dataref_ptr), dataref_offset); vect_copy_ref_info (data_ref, DR_REF (dr_info->dr)); gimple *g = gimple_build_assign (data_ref, orig); @@ -7325,7 +7540,7 @@ vectorizable_store (stmt_vec_info stmt_i } return true; } - else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3) + else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3) return vectorizable_scan_store (stmt_info, gsi, vec_stmt, ncopies); if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) --- gcc/testsuite/gcc.dg/vect/vect-simd-12.c.jj 2019-06-20 15:08:50.260400440 +0200 +++ gcc/testsuite/gcc.dg/vect/vect-simd-12.c 2019-06-20 15:08:24.332805239 +0200 @@ -0,0 +1,122 @@ +/* { dg-require-effective-target size32plus } */ +/* { dg-additional-options "-fopenmp-simd" } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ + +#ifndef main +#include "tree-vect.h" +#endif + +int r, a[1024], b[1024]; + +__attribute__((noipa)) void +foo (int *a, int *b) +{ + #pragma omp simd reduction (inscan, +:r) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r += a[i]; + } +} + +__attribute__((noipa)) int +bar (void) +{ + int s = 0; + #pragma omp simd reduction (inscan, +:s) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s += 2 * a[i]; + } + return s; +} + +__attribute__((noipa)) void +baz (int *a, int *b) +{ + #pragma omp simd reduction (inscan, +:r) if (simd: 0) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r += a[i]; + } +} + +__attribute__((noipa)) int +qux (void) +{ + int s = 0; + #pragma omp simd reduction (inscan, +:s) simdlen (1) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s += 2 * a[i]; + } + return s; +} + +int +main () +{ + int s = 0; +#ifndef main + check_vect (); +#endif + for (int i = 0; i < 1024; ++i) + { + a[i] = i; + b[i] = -1; + asm ("" : "+g" (i)); + } + foo (a, b); + if (r != 1024 * 1023 / 2) + abort (); + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = 25; + s += i; + } + if (bar () != 1024 * 1023) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = -1; + s += 2 * i; + } + r = 0; + baz (a, b); + if (r != 1024 * 1023 / 2) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = -25; + s += i; + } + if (qux () != 1024 * 1023) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + s += 2 * i; + } + return 0; +} --- gcc/testsuite/gcc.dg/vect/vect-simd-13.c.jj 2019-06-20 15:47:23.580359715 +0200 +++ gcc/testsuite/gcc.dg/vect/vect-simd-13.c 2019-06-20 15:13:23.500134387 +0200 @@ -0,0 +1,124 @@ +/* { dg-require-effective-target size32plus } */ +/* { dg-additional-options "-fopenmp-simd" } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ + +#ifndef main +#include "tree-vect.h" +#endif + +int r, a[1024], b[1024]; + +#pragma omp declare reduction (foo: int: omp_out += omp_in) initializer (omp_priv = 0) + +__attribute__((noipa)) void +foo (int *a, int *b) +{ + #pragma omp simd reduction (inscan, foo:r) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r += a[i]; + } +} + +__attribute__((noipa)) int +bar (void) +{ + int s = 0; + #pragma omp simd reduction (inscan, foo:s) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s += 2 * a[i]; + } + return s; +} + +__attribute__((noipa)) void +baz (int *a, int *b) +{ + #pragma omp simd reduction (inscan, foo:r) if (simd: 0) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r += a[i]; + } +} + +__attribute__((noipa)) int +qux (void) +{ + int s = 0; + #pragma omp simd reduction (inscan, foo:s) simdlen (1) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s += 2 * a[i]; + } + return s; +} + +int +main () +{ + int s = 0; +#ifndef main + check_vect (); +#endif + for (int i = 0; i < 1024; ++i) + { + a[i] = i; + b[i] = -1; + asm ("" : "+g" (i)); + } + foo (a, b); + if (r != 1024 * 1023 / 2) + abort (); + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = 25; + s += i; + } + if (bar () != 1024 * 1023) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = -1; + s += 2 * i; + } + r = 0; + baz (a, b); + if (r != 1024 * 1023 / 2) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = -25; + s += i; + } + if (qux () != 1024 * 1023) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + s += 2 * i; + } + return 0; +} --- gcc/testsuite/gcc.dg/vect/vect-simd-14.c.jj 2019-06-20 15:48:30.536321539 +0200 +++ gcc/testsuite/gcc.dg/vect/vect-simd-14.c 2019-06-20 15:54:39.291617792 +0200 @@ -0,0 +1,94 @@ +/* { dg-require-effective-target size32plus } */ +/* { dg-additional-options "-fopenmp-simd" } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ + +#ifndef main +#include "tree-vect.h" +#endif + +float r = 1.0f, a[1024], b[1024]; + +__attribute__((noipa)) void +foo (float *a, float *b) +{ + #pragma omp simd reduction (inscan, *:r) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r *= a[i]; + } +} + +__attribute__((noipa)) float +bar (void) +{ + float s = -__builtin_inff (); + #pragma omp simd reduction (inscan, max:s) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s = s > a[i] ? s : a[i]; + } + return s; +} + +int +main () +{ + float s = 1.0f; +#ifndef main + check_vect (); +#endif + for (int i = 0; i < 1024; ++i) + { + if (i < 80) + a[i] = (i & 1) ? 0.25f : 0.5f; + else if (i < 200) + a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f; + else if (i < 280) + a[i] = (i & 1) ? 0.25f : 0.5f; + else if (i < 380) + a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f; + else + switch (i % 6) + { + case 0: a[i] = 0.25f; break; + case 1: a[i] = 2.0f; break; + case 2: a[i] = -1.0f; break; + case 3: a[i] = -4.0f; break; + case 4: a[i] = 0.5f; break; + case 5: a[i] = 1.0f; break; + default: a[i] = 0.0f; break; + } + b[i] = -19.0f; + asm ("" : "+g" (i)); + } + foo (a, b); + if (r * 16384.0f != 0.125f) + abort (); + float m = -175.25f; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = -231.75f; + s *= a[i]; + a[i] = m - ((i % 3) == 1 ? 2.0f : (i % 3) == 2 ? 4.0f : 0.0f); + m += 0.75f; + } + if (bar () != 592.0f) + abort (); + s = -__builtin_inff (); + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + if (s < a[i]) + s = a[i]; + } + return 0; +} --- gcc/testsuite/gcc.dg/vect/vect-simd-15.c.jj 2019-06-20 15:50:34.483399705 +0200 +++ gcc/testsuite/gcc.dg/vect/vect-simd-15.c 2019-06-20 15:52:09.976919050 +0200 @@ -0,0 +1,186 @@ +/* { dg-require-effective-target size32plus } */ +/* { dg-additional-options "-fopenmp-simd" } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ + +#ifndef main +#include "tree-vect.h" +#endif + +int r, a[1024], b[1024]; +unsigned short r2, b2[1024]; +unsigned char r3, b3[1024]; + +__attribute__((noipa)) void +foo (int *a, int *b, unsigned short *b2, unsigned char *b3) +{ + #pragma omp simd reduction (inscan, +:r, r2, r3) + for (int i = 0; i < 1024; i++) + { + { + b[i] = r; + b2[i] = r2; + b3[i] = r3; + } + #pragma omp scan exclusive(r, r2, r3) + { r += a[i]; r2 += a[i]; r3 += a[i]; } + } +} + +__attribute__((noipa)) int +bar (unsigned short *s2p, unsigned char *s3p) +{ + int s = 0; + unsigned short s2 = 0; + unsigned char s3 = 0; + #pragma omp simd reduction (inscan, +:s, s2, s3) + for (int i = 0; i < 1024; i++) + { + { b[i] = s; b2[i] = s2; b3[i] = s3; } + #pragma omp scan exclusive(s, s2, s3) + { + s += 2 * a[i]; + s2 += 2 * a[i]; + s3 += 2 * a[i]; + } + } + *s2p = s2; + *s3p = s3; + return s; +} + +__attribute__((noipa)) void +baz (int *a, int *b, unsigned short *b2, unsigned char *b3) +{ + #pragma omp simd reduction (inscan, +:r, r2, r3) if (simd: 0) + for (int i = 0; i < 1024; i++) + { + { + b[i] = r; + b2[i] = r2; + b3[i] = r3; + } + #pragma omp scan exclusive(r, r2, r3) + { + r += a[i]; + r2 += a[i]; + r3 += a[i]; + } + } +} + +__attribute__((noipa)) int +qux (unsigned short *s2p, unsigned char *s3p) +{ + int s = 0; + unsigned short s2 = 0; + unsigned char s3 = 0; + #pragma omp simd reduction (inscan, +:s, s2, s3) simdlen (1) + for (int i = 0; i < 1024; i++) + { + { b[i] = s; b2[i] = s2; b3[i] = s3; } + #pragma omp scan exclusive(s, s2, s3) + { s += 2 * a[i]; s2 += 2 * a[i]; s3 += 2 * a[i]; } + } + *s2p = s2; + *s3p = s3; + return s; +} + +int +main () +{ + int s = 0; + unsigned short s2; + unsigned char s3; +#ifndef main + check_vect (); +#endif + for (int i = 0; i < 1024; ++i) + { + a[i] = i; + b[i] = -1; + b2[i] = -1; + b3[i] = -1; + asm ("" : "+g" (i)); + } + foo (a, b, b2, b3); + if (r != 1024 * 1023 / 2 + || r2 != (unsigned short) r + || r3 != (unsigned char) r) + abort (); + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s + || b2[i] != (unsigned short) s + || b3[i] != (unsigned char) s) + abort (); + else + { + b[i] = 25; + b2[i] = 24; + b3[i] = 26; + } + s += i; + } + if (bar (&s2, &s3) != 1024 * 1023) + abort (); + if (s2 != (unsigned short) (1024 * 1023) + || s3 != (unsigned char) (1024 * 1023)) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s + || b2[i] != (unsigned short) s + || b3[i] != (unsigned char) s) + abort (); + else + { + b[i] = -1; + b2[i] = -1; + b3[i] = -1; + } + s += 2 * i; + } + r = 0; + r2 = 0; + r3 = 0; + baz (a, b, b2, b3); + if (r != 1024 * 1023 / 2 + || r2 != (unsigned short) r + || r3 != (unsigned char) r) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s + || b2[i] != (unsigned short) s + || b3[i] != (unsigned char) s) + abort (); + else + { + b[i] = 25; + b2[i] = 24; + b3[i] = 26; + } + s += i; + } + s2 = 0; + s3 = 0; + if (qux (&s2, &s3) != 1024 * 1023) + abort (); + if (s2 != (unsigned short) (1024 * 1023) + || s3 != (unsigned char) (1024 * 1023)) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s + || b2[i] != (unsigned short) s + || b3[i] != (unsigned char) s) + abort (); + s += 2 * i; + } + return 0; +} --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-12.c.jj 2019-06-20 15:58:35.276983324 +0200 +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-12.c 2019-06-20 15:58:35.274983355 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "sse2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-12.c" + +static void +sse2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-13.c.jj 2019-06-20 15:58:35.283983216 +0200 +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-13.c 2019-06-20 15:58:35.281983247 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "sse2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-13.c" + +static void +sse2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-14.c.jj 2019-06-20 15:58:35.288983139 +0200 +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-14.c 2019-06-20 15:58:35.287983154 +0200 @@ -0,0 +1,15 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target sse2 } */ + +#include "sse2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-14.c" + +static void +sse2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-15.c.jj 2019-06-20 15:58:35.293983061 +0200 +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-15.c 2019-06-20 15:58:35.292983077 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "sse2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-15.c" + +static void +sse2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-12.c.jj 2019-06-20 15:58:35.299982969 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-12.c 2019-06-20 15:58:35.297982999 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-12.c" + +static void +avx2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-13.c.jj 2019-06-20 15:58:35.305982876 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-13.c 2019-06-20 15:58:35.303982907 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-13.c" + +static void +avx2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-14.c.jj 2019-06-20 15:58:35.310982799 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-14.c 2019-06-20 15:58:35.309982815 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-14.c" + +static void +avx2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-15.c.jj 2019-06-20 15:58:35.316982707 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-15.c 2019-06-20 15:58:35.314982738 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-15.c" + +static void +avx2_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-12.c.jj 2019-06-20 15:58:35.323982599 +0200 +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-12.c 2019-06-20 15:58:35.321982630 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx512f-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-12.c" + +static void +avx512f_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-13.c.jj 2019-06-20 15:58:35.328982522 +0200 +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-13.c 2019-06-20 15:58:35.326982553 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx512f-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-13.c" + +static void +avx512f_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-14.c.jj 2019-06-20 15:58:35.333982445 +0200 +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-14.c 2019-06-20 15:58:35.332982461 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx512f-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-14.c" + +static void +avx512f_test (void) +{ + do_main (); +} --- gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-15.c.jj 2019-06-20 15:58:35.347982230 +0200 +++ gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-15.c 2019-06-20 15:58:35.346982245 +0200 @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512bw -mprefer-vector-width=512 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx512bw } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx512bw-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-15.c" + +static void +avx512bw_test (void) +{ + do_main (); +} --- gcc/testsuite/g++.dg/vect/simd-6.cc.jj 2019-06-20 16:00:34.800142524 +0200 +++ gcc/testsuite/g++.dg/vect/simd-6.cc 2019-06-20 16:07:41.722559826 +0200 @@ -0,0 +1,161 @@ +// { dg-require-effective-target size32plus } +// { dg-additional-options "-fopenmp-simd" } +// { dg-additional-options "-mavx" { target avx_runtime } } +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { xfail *-*-* } } } + +#include "../../gcc.dg/vect/tree-vect.h" + +template <typename T> +struct S { + inline S (); + inline ~S (); + inline S (const S &); + inline S & operator= (const S &); + T s; +}; + +template <typename T> +S<T>::S () : s (0) +{ +} + +template <typename T> +S<T>::~S () +{ +} + +template <typename T> +S<T>::S (const S &x) +{ + s = x.s; +} + +template <typename T> +S<T> & +S<T>::operator= (const S &x) +{ + s = x.s; + return *this; +} + +template <typename T> +static inline void +ini (S<T> &x) +{ + x.s = 0; +} + +S<int> r, a[1024], b[1024]; + +#pragma omp declare reduction (+: S<int>: omp_out.s += omp_in.s) +#pragma omp declare reduction (plus: S<int>: omp_out.s += omp_in.s) initializer (ini (omp_priv)) + +template <typename T> +__attribute__((noipa)) void +foo (S<T> *a, S<T> *b) +{ + #pragma omp simd reduction (inscan, +:r) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r.s += a[i].s; + } +} + +template <typename T> +__attribute__((noipa)) S<T> +bar (void) +{ + S<T> s; + #pragma omp simd reduction (inscan, plus:s) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s.s += 2 * a[i].s; + } + return S<T> (s); +} + +__attribute__((noipa)) void +baz (S<int> *a, S<int> *b) +{ + #pragma omp simd reduction (inscan, +:r) simdlen(1) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r.s += a[i].s; + } +} + +__attribute__((noipa)) S<int> +qux (void) +{ + S<int> s; + #pragma omp simd if (0) reduction (inscan, plus:s) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s.s += 2 * a[i].s; + } + return S<int> (s); +} + +int +main () +{ + S<int> s; + check_vect (); + for (int i = 0; i < 1024; ++i) + { + a[i].s = i; + b[i].s = -1; + asm ("" : "+g" (i)); + } + foo (a, b); + if (r.s != 1024 * 1023 / 2) + abort (); + for (int i = 0; i < 1024; ++i) + { + if (b[i].s != s.s) + abort (); + else + b[i].s = 25; + s.s += i; + } + if (bar<int> ().s != 1024 * 1023) + abort (); + s.s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i].s != s.s) + abort (); + s.s += 2 * i; + } + r.s = 0; + baz (a, b); + if (r.s != 1024 * 1023 / 2) + abort (); + s.s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i].s != s.s) + abort (); + else + b[i].s = 25; + s.s += i; + } + if (qux ().s != 1024 * 1023) + abort (); + s.s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i].s != s.s) + abort (); + s.s += 2 * i; + } + return 0; +} --- gcc/testsuite/g++.dg/vect/simd-7.cc.jj 2019-06-20 16:00:51.095891542 +0200 +++ gcc/testsuite/g++.dg/vect/simd-7.cc 2019-06-20 16:12:50.222747875 +0200 @@ -0,0 +1,124 @@ +// { dg-require-effective-target size32plus } +// { dg-additional-options "-fopenmp-simd" } +// { dg-additional-options "-mavx" { target avx_runtime } } +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ + +#include "../../gcc.dg/vect/tree-vect.h" + +int r, a[1024], b[1024], q; + +template <typename T, typename U> +__attribute__((noipa)) void +foo (T a, T b, U r) +{ + #pragma omp simd reduction (inscan, +:r) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r += a[i]; + } +} + +template <typename T> +__attribute__((noipa)) T +bar (void) +{ + T &s = q; + q = 0; + #pragma omp simd reduction (inscan, +:s) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s += 2 * a[i]; + } + return s; +} + +template <typename T> +__attribute__((noipa)) void +baz (T *a, T *b, T &r) +{ + #pragma omp simd reduction (inscan, +:r) if (simd: 0) + for (T i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r += a[i]; + } +} + +template <typename T> +__attribute__((noipa)) int +qux (void) +{ + T s = q; + q = 0; + #pragma omp simd reduction (inscan, +:s) simdlen (1) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s += 2 * a[i]; + } + return s; +} + +int +main () +{ + int s = 0; + check_vect (); + for (int i = 0; i < 1024; ++i) + { + a[i] = i; + b[i] = -1; + asm ("" : "+g" (i)); + } + foo<int *, int &> (a, b, r); + if (r != 1024 * 1023 / 2) + abort (); + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = 25; + s += i; + } + if (bar<int> () != 1024 * 1023) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = -1; + s += 2 * i; + } + r = 0; + baz<int> (a, b, r); + if (r != 1024 * 1023 / 2) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = -25; + s += i; + } + if (qux<int &> () != 1024 * 1023) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + s += 2 * i; + } + return 0; +} --- gcc/testsuite/g++.dg/vect/simd-8.cc.jj 2019-06-20 16:00:54.154844430 +0200 +++ gcc/testsuite/g++.dg/vect/simd-8.cc 2019-06-20 16:15:37.994133891 +0200 @@ -0,0 +1,122 @@ +// { dg-require-effective-target size32plus } +// { dg-additional-options "-fopenmp-simd" } +// { dg-additional-options "-mavx" { target avx_runtime } } +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } + +#include "../../gcc.dg/vect/tree-vect.h" + +int r, a[1024], b[1024], q; + +#pragma omp declare reduction (foo: int: omp_out += omp_in) initializer (omp_priv = 0) + +__attribute__((noipa)) void +foo (int *a, int *b, int &r) +{ + #pragma omp simd reduction (inscan, foo:r) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r += a[i]; + } +} + +__attribute__((noipa)) int +bar (void) +{ + int &s = q; + q = 0; + #pragma omp simd reduction (inscan, foo:s) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s += 2 * a[i]; + } + return s; +} + +__attribute__((noipa)) void +baz (int *a, int *b, int &r) +{ + #pragma omp simd reduction (inscan, foo:r) if (simd: 0) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r += a[i]; + } +} + +__attribute__((noipa)) int +qux (void) +{ + int &s = q; + q = 0; + #pragma omp simd reduction (inscan, foo:s) simdlen (1) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s += 2 * a[i]; + } + return s; +} + +int +main () +{ + int s = 0; + check_vect (); + for (int i = 0; i < 1024; ++i) + { + a[i] = i; + b[i] = -1; + asm ("" : "+g" (i)); + } + foo (a, b, r); + if (r != 1024 * 1023 / 2) + abort (); + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = 25; + s += i; + } + if (bar () != 1024 * 1023) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = -1; + s += 2 * i; + } + r = 0; + baz (a, b, r); + if (r != 1024 * 1023 / 2) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + else + b[i] = -25; + s += i; + } + if (qux () != 1024 * 1023) + abort (); + s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i] != s) + abort (); + s += 2 * i; + } + return 0; +} --- gcc/testsuite/g++.dg/vect/simd-9.cc.jj 2019-06-20 16:00:57.197797566 +0200 +++ gcc/testsuite/g++.dg/vect/simd-9.cc 2019-06-20 16:17:27.484427949 +0200 @@ -0,0 +1,153 @@ +// { dg-require-effective-target size32plus } +// { dg-additional-options "-fopenmp-simd" } +// { dg-additional-options "-mavx" { target avx_runtime } } +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { xfail *-*-* } } } + +#include "../../gcc.dg/vect/tree-vect.h" + +struct S { + inline S (); + inline ~S (); + inline S (const S &); + inline S & operator= (const S &); + int s; +}; + +S::S () : s (0) +{ +} + +S::~S () +{ +} + +S::S (const S &x) +{ + s = x.s; +} + +S & +S::operator= (const S &x) +{ + s = x.s; + return *this; +} + +static inline void +ini (S &x) +{ + x.s = 0; +} + +S r, a[1024], b[1024]; + +#pragma omp declare reduction (+: S: omp_out.s += omp_in.s) +#pragma omp declare reduction (plus: S: omp_out.s += omp_in.s) initializer (ini (omp_priv)) + +__attribute__((noipa)) void +foo (S *a, S *b, S &r) +{ + #pragma omp simd reduction (inscan, +:r) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r.s += a[i].s; + } +} + +__attribute__((noipa)) S +bar (void) +{ + S s; + #pragma omp simd reduction (inscan, plus:s) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s.s += 2 * a[i].s; + } + return s; +} + +__attribute__((noipa)) void +baz (S *a, S *b, S &r) +{ + #pragma omp simd reduction (inscan, +:r) simdlen(1) + for (int i = 0; i < 1024; i++) + { + b[i] = r; + #pragma omp scan exclusive(r) + r.s += a[i].s; + } +} + +__attribute__((noipa)) S +qux (void) +{ + S s; + #pragma omp simd if (0) reduction (inscan, plus:s) + for (int i = 0; i < 1024; i++) + { + b[i] = s; + #pragma omp scan exclusive(s) + s.s += 2 * a[i].s; + } + return s; +} + +int +main () +{ + S s; + check_vect (); + for (int i = 0; i < 1024; ++i) + { + a[i].s = i; + b[i].s = -1; + asm ("" : "+g" (i)); + } + foo (a, b, r); + if (r.s != 1024 * 1023 / 2) + abort (); + for (int i = 0; i < 1024; ++i) + { + if (b[i].s != s.s) + abort (); + else + b[i].s = 25; + s.s += i; + } + if (bar ().s != 1024 * 1023) + abort (); + s.s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i].s != s.s) + abort (); + s.s += 2 * i; + } + r.s = 0; + baz (a, b, r); + if (r.s != 1024 * 1023 / 2) + abort (); + s.s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i].s != s.s) + abort (); + else + b[i].s = 25; + s.s += i; + } + if (qux ().s != 1024 * 1023) + abort (); + s.s = 0; + for (int i = 0; i < 1024; ++i) + { + if (b[i].s != s.s) + abort (); + s.s += 2 * i; + } + return 0; +} --- gcc/testsuite/c-c++-common/gomp/scan-2.c.jj 2019-06-10 14:18:17.461525669 +0200 +++ gcc/testsuite/c-c++-common/gomp/scan-2.c 2019-06-20 23:54:03.615422149 +0200 @@ -8,7 +8,7 @@ f1 (int *c, int *d) for (i = 0; i < 64; i++) { d[i] = a; - #pragma omp scan exclusive (a) /* { dg-message "sorry, unimplemented: '#pragma omp scan' not supported yet" } */ + #pragma omp scan exclusive (a) a += c[i]; } }