From ecde6210ff483ad899a2eeba91aa1b623c49173a Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Fri, 22 Mar 2024 19:57:45 +0800
Subject: [PATCH] vect: Support multiple lane-reducing operations for loop
reduction [PR114440]
gcc/
PR tree-optimization/114440
* tree-vectorizer.h (struct _stmt_vec_info): Add a new field
reduc_result_pos.
(vectorizable_lane_reducing): New function declaration.
* tree-vect-stmts.cc (vectorizable_condition): Treat the condition
statement that is pointed to by stmt_vec_info of reduction PHI as the
real "for_reduction" statement.
(vect_analyze_stmt): Call new function vectorizable_lane_reducing
to analyze lane-reducing operation.
* tree-vect-loop.cc (vect_is_emulated_mixed_dot_prod): Remove parameter
loop_vinfo. Get input vectype from stmt_info instead of reduction PHI.
(vect_model_reduction_cost): Remove cost computation code related to
emulated_mixed_dot_prod.
(vect_reduction_use_partial_vector): New function.
(vectorizable_lane_reducing): New function.
(vectorizable_reduction): Allow multiple lane-reducing operations in
loop reduction. Move some original lane-reducing related code to
vectorizable_lane_reducing, and move partial vectorization checking
code to vect_reduction_use_partial_vector.
(vect_transform_reduction): Extend transformation to support reduction
statements with mixed input vectypes.
gcc/testsuite/
PR tree-optimization/114440
* gcc.dg/vect/vect-reduc-chain-1.c: New test.
* gcc.dg/vect/vect-reduc-chain-2.c: New test.
* gcc.dg/vect/vect-reduc-chain-3.c: New test.
* gcc.dg/vect/vect-reduc-dot-slp-1.c: New test.
* gcc.dg/vect/vect-reduc-dot-slp-2.c: New test.
---
.../gcc.dg/vect/vect-reduc-chain-1.c | 62 ++
.../gcc.dg/vect/vect-reduc-chain-2.c | 77 ++
.../gcc.dg/vect/vect-reduc-chain-3.c | 66 ++
.../gcc.dg/vect/vect-reduc-dot-slp-1.c | 97 +++
.../gcc.dg/vect/vect-reduc-dot-slp-2.c | 81 +++
gcc/tree-vect-loop.cc | 668 ++++++++++++------
gcc/tree-vect-stmts.cc | 13 +-
gcc/tree-vectorizer.h | 8 +
8 files changed, 863 insertions(+), 209 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-1.c
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-2.c
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-3.c
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-1.c
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-dot-slp-2.c
new file mode 100644
@@ -0,0 +1,62 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 char *restrict a,
+ SIGNEDNESS_2 char *restrict b,
+ SIGNEDNESS_2 char *restrict c,
+ SIGNEDNESS_2 char *restrict d,
+ SIGNEDNESS_1 int *restrict e)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ res += a[i] * b[i];
+ res += c[i] * d[i];
+ res += e[i];
+ }
+ return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+ check_vect ();
+
+ SIGNEDNESS_2 char a[N], b[N];
+ SIGNEDNESS_2 char c[N], d[N];
+ SIGNEDNESS_1 int e[N];
+ int expected = 0x12345;
+ for (int i = 0; i < N; ++i)
+ {
+ a[i] = BASE + i * 5;
+ b[i] = BASE + OFFSET + i * 4;
+ c[i] = BASE + i * 2;
+ d[i] = BASE + OFFSET + i * 3;
+ e[i] = i;
+ asm volatile ("" ::: "memory");
+ expected += a[i] * b[i];
+ expected += c[i] * d[i];
+ expected += e[i];
+ }
+ if (f (0x12345, a, b, c, d, e) != expected)
+ __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 2 "vect" { target vect_sdot_qi } } } */
new file mode 100644
@@ -0,0 +1,77 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+fn (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 char *restrict a,
+ SIGNEDNESS_2 char *restrict b,
+ SIGNEDNESS_3 char *restrict c,
+ SIGNEDNESS_3 char *restrict d,
+ SIGNEDNESS_4 short *restrict e,
+ SIGNEDNESS_4 short *restrict f,
+ SIGNEDNESS_1 int *restrict g)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ res += a[i] * b[i];
+ res += i + 1;
+ res += c[i] * d[i];
+ res += e[i] * f[i];
+ res += g[i];
+ }
+ return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -126 : 4)
+#define BASE4 ((SIGNEDNESS_4 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+ check_vect ();
+
+ SIGNEDNESS_2 char a[N], b[N];
+ SIGNEDNESS_3 char c[N], d[N];
+ SIGNEDNESS_4 short e[N], f[N];
+ SIGNEDNESS_1 int g[N];
+ int expected = 0x12345;
+ for (int i = 0; i < N; ++i)
+ {
+ a[i] = BASE2 + i * 5;
+ b[i] = BASE2 + OFFSET + i * 4;
+ c[i] = BASE3 + i * 2;
+ d[i] = BASE3 + OFFSET + i * 3;
+ e[i] = BASE4 + i * 6;
+ f[i] = BASE4 + OFFSET + i * 5;
+ g[i] = i;
+ asm volatile ("" ::: "memory");
+ expected += a[i] * b[i];
+ expected += i + 1;
+ expected += c[i] * d[i];
+ expected += e[i] * f[i];
+ expected += g[i];
+ }
+ if (fn (0x12345, a, b, c, d, e, f, g) != expected)
+ __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_udot_qi } } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target { vect_sdot_hi } } } } */
new file mode 100644
@@ -0,0 +1,66 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 unsigned
+#define SIGNEDNESS_3 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 char *restrict a,
+ SIGNEDNESS_2 char *restrict b,
+ SIGNEDNESS_3 short *restrict c,
+ SIGNEDNESS_3 short *restrict d,
+ SIGNEDNESS_1 int *restrict e)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ short diff = a[i] - b[i];
+ SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+ res += abs;
+ res += c[i] * d[i];
+ res += e[i];
+ }
+ return res;
+}
+
+#define BASE2 ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define BASE3 ((SIGNEDNESS_3 int) -1 < 0 ? -1236 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+ check_vect ();
+
+ SIGNEDNESS_2 char a[N], b[N];
+ SIGNEDNESS_3 short c[N], d[N];
+ SIGNEDNESS_1 int e[N];
+ int expected = 0x12345;
+ for (int i = 0; i < N; ++i)
+ {
+ a[i] = BASE2 + i * 5;
+ b[i] = BASE2 - i * 4;
+ c[i] = BASE3 + i * 2;
+ d[i] = BASE3 + OFFSET + i * 3;
+ e[i] = i;
+ asm volatile ("" ::: "memory");
+ short diff = a[i] - b[i];
+ SIGNEDNESS_2 short abs = diff < 0 ? -diff : diff;
+ expected += abs;
+ expected += c[i] * d[i];
+ expected += e[i];
+ }
+ if (f (0x12345, a, b, c, d, e) != expected)
+ __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = SAD_EXPR" "vect" { target vect_udot_qi } } } */
+/* { dg-final { scan-tree-dump "vectorizing statement: \\S+ = DOT_PROD_EXPR" "vect" { target vect_sdot_hi } } } */
new file mode 100644
@@ -0,0 +1,97 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 char *a,
+ SIGNEDNESS_2 char *b,
+ int step, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ res += a[0] * b[0];
+ res += a[1] * b[1];
+ res += a[2] * b[2];
+ res += a[3] * b[3];
+ res += a[4] * b[4];
+ res += a[5] * b[5];
+ res += a[6] * b[6];
+ res += a[7] * b[7];
+ res += a[8] * b[8];
+ res += a[9] * b[9];
+ res += a[10] * b[10];
+ res += a[11] * b[11];
+ res += a[12] * b[12];
+ res += a[13] * b[13];
+ res += a[14] * b[14];
+ res += a[15] * b[15];
+
+ a += step;
+ b += step;
+ }
+
+ return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -126 : 4)
+#define OFFSET 20
+
+int
+main (void)
+{
+ check_vect ();
+
+ SIGNEDNESS_2 char a[100], b[100];
+ int expected = 0x12345;
+ int step = 16;
+ int n = 2;
+ int t = 0;
+
+ for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+ {
+ a[i] = BASE + i * 5;
+ b[i] = BASE + OFFSET + i * 4;
+ asm volatile ("" ::: "memory");
+ }
+
+ for (int i = 0; i < n; i++)
+ {
+ asm volatile ("" ::: "memory");
+ expected += a[t + 0] * b[t + 0];
+ expected += a[t + 1] * b[t + 1];
+ expected += a[t + 2] * b[t + 2];
+ expected += a[t + 3] * b[t + 3];
+ expected += a[t + 4] * b[t + 4];
+ expected += a[t + 5] * b[t + 5];
+ expected += a[t + 6] * b[t + 6];
+ expected += a[t + 7] * b[t + 7];
+ expected += a[t + 8] * b[t + 8];
+ expected += a[t + 9] * b[t + 9];
+ expected += a[t + 10] * b[t + 10];
+ expected += a[t + 11] * b[t + 11];
+ expected += a[t + 12] * b[t + 12];
+ expected += a[t + 13] * b[t + 13];
+ expected += a[t + 14] * b[t + 14];
+ expected += a[t + 15] * b[t + 15];
+ t += step;
+ }
+
+ if (f (0x12345, a, b, step, n) != expected)
+ __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 16 "vect" } } */
new file mode 100644
@@ -0,0 +1,81 @@
+/* Disabling epilogues until we find a better way to deal with scans. */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw { target { aarch64*-*-* || arm*-*-* } } } */
+/* { dg-add-options arm_v8_2a_dotprod_neon } */
+
+#include "tree-vect.h"
+
+#define N 50
+
+#ifndef SIGNEDNESS_1
+#define SIGNEDNESS_1 signed
+#define SIGNEDNESS_2 signed
+#endif
+
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res,
+ SIGNEDNESS_2 short *a,
+ SIGNEDNESS_2 short *b,
+ int step, int n)
+{
+ for (int i = 0; i < n; i++)
+ {
+ res += a[0] * b[0];
+ res += a[1] * b[1];
+ res += a[2] * b[2];
+ res += a[3] * b[3];
+ res += a[4] * b[4];
+ res += a[5] * b[5];
+ res += a[6] * b[6];
+ res += a[7] * b[7];
+
+ a += step;
+ b += step;
+ }
+
+ return res;
+}
+
+#define BASE ((SIGNEDNESS_2 int) -1 < 0 ? -1026 : 373)
+#define OFFSET 20
+
+int
+main (void)
+{
+ check_vect ();
+
+ SIGNEDNESS_2 short a[100], b[100];
+ int expected = 0x12345;
+ int step = 8;
+ int n = 2;
+ int t = 0;
+
+ for (int i = 0; i < sizeof (a) / sizeof (a[0]); ++i)
+ {
+ a[i] = BASE + i * 5;
+ b[i] = BASE + OFFSET + i * 4;
+ asm volatile ("" ::: "memory");
+ }
+
+ for (int i = 0; i < n; i++)
+ {
+ asm volatile ("" ::: "memory");
+ expected += a[t + 0] * b[t + 0];
+ expected += a[t + 1] * b[t + 1];
+ expected += a[t + 2] * b[t + 2];
+ expected += a[t + 3] * b[t + 3];
+ expected += a[t + 4] * b[t + 4];
+ expected += a[t + 5] * b[t + 5];
+ expected += a[t + 6] * b[t + 6];
+ expected += a[t + 7] * b[t + 7];
+ t += step;
+ }
+
+ if (f (0x12345, a, b, step, n) != expected)
+ __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_dot_prod_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorizing statement: \\S+ = DOT_PROD_EXPR" 8 "vect" { target vect_sdot_hi } } } */
@@ -5269,8 +5269,7 @@ have_whole_vector_shift (machine_mode mode)
See vect_emulate_mixed_dot_prod for the actual sequence used. */
static bool
-vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
- stmt_vec_info stmt_info)
+vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info)
{
gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
@@ -5281,10 +5280,9 @@ vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
return false;
- stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
- gcc_assert (reduc_info->is_reduc_info);
+ gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info));
return !directly_supported_p (DOT_PROD_EXPR,
- STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
+ STMT_VINFO_REDUC_VECTYPE_IN (stmt_info),
optab_vector_mixed_sign);
}
@@ -5323,8 +5321,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
if (!gimple_extract_op (orig_stmt_info->stmt, &op))
gcc_unreachable ();
- bool emulated_mixed_dot_prod
- = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
if (reduction_type == EXTRACT_LAST_REDUCTION)
/* No extra instructions are needed in the prologue. The loop body
operations are costed in vectorizable_condition. */
@@ -5359,12 +5355,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
initial result of the data reduction, initial value of the index
reduction. */
prologue_stmts = 4;
- else if (emulated_mixed_dot_prod)
- /* We need the initial reduction value and two invariants:
- one that contains the minimum signed value and one that
- contains half of its negative. */
- prologue_stmts = 3;
else
+ /* We need the initial reduction value. */
prologue_stmts = 1;
prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
scalar_to_vec, stmt_info, 0,
@@ -7376,6 +7368,244 @@ build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
}
}
+/* Check whether, and how, partial vectorization can be performed in the
+ context of LOOP_VINFO on an operation with CODE that lies in a loop
+ reduction path whose reduction PHI is specified by REDUC_INFO. TYPE is the
+ scalar result type of the operation and VECTYPE_IN is its input vectype.
+ The vectype of the vectorized result may differ from VECTYPE_IN, either in
+ base type or in number of lanes, as is the case for a lane-reducing
+ operation. */
+
+static void
+vect_reduction_use_partial_vector (loop_vec_info loop_vinfo,
+ stmt_vec_info reduc_info,
+ slp_tree slp_node, code_helper code,
+ tree type, tree vectype_in)
+{
+ if (!LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ return;
+
+ enum vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (reduc_info);
+ internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
+ internal_fn cond_fn = get_conditional_internal_fn (code, type);
+
+ if (reduc_type != FOLD_LEFT_REDUCTION
+ && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
+ && (cond_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (cond_fn, vectype_in,
+ OPTIMIZE_FOR_SPEED)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't operate on partial vectors because"
+ " no conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else if (reduc_type == FOLD_LEFT_REDUCTION
+ && reduc_fn == IFN_LAST
+ && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in),
+ SSA_NAME))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't operate on partial vectors because"
+ " no conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else if (reduc_type == FOLD_LEFT_REDUCTION
+ && internal_fn_mask_index (reduc_fn) == -1
+ && FLOAT_TYPE_P (vectype_in)
+ && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't operate on partial vectors because"
+ " signed zeros cannot be preserved.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else
+ {
+ internal_fn mask_reduc_fn
+ = get_masked_reduction_fn (reduc_fn, vectype_in);
+ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+ unsigned nvectors;
+
+ if (slp_node)
+ nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ else
+ nvectors = vect_get_num_copies (loop_vinfo, vectype_in);
+
+ if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
+ vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
+ else
+ vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
+ }
+}
+
+/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
+ the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC.
+ Now there are three such kinds of operations: dot-prod/widen-sum/sad
+ (sum-of-absolute-differences).
+
+ The loop reduction path that a lane-reducing operation lies in may also
+ contain normal operations, or other lane-reducing operations with a
+ different input type size, for example:
+
+ int sum = 0;
+ for (i)
+ {
+ ...
+ sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
+ sum += w[i]; // widen-sum <vector(16) char>
+ sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+ sum += n[i]; // normal <vector(4) int>
+ ...
+ }
+
+ Vectorization factor is essentially determined by operation whose input
+ vectype has the most lanes ("vector(16) char" in the example), while we
+ need to choose input vectype with the least lanes ("vector(4) int" in the
+ example) for the reduction PHI statement. */
+
+bool
+vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
+ slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+ gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
+ if (!stmt)
+ return false;
+
+ enum tree_code code = gimple_assign_rhs_code (stmt);
+
+ if (code != DOT_PROD_EXPR && code != WIDEN_SUM_EXPR && code != SAD_EXPR)
+ return false;
+
+ tree type = TREE_TYPE (gimple_assign_lhs (stmt));
+
+ if (!INTEGRAL_TYPE_P (type) && !SCALAR_FLOAT_TYPE_P (type))
+ return false;
+
+ /* Do not try to vectorize bit-precision reductions. */
+ if (!type_has_mode_precision_p (type))
+ return false;
+
+ tree vectype_in = NULL_TREE;
+
+ for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
+ {
+ stmt_vec_info def_stmt_info;
+ slp_tree slp_op;
+ tree op;
+ tree vectype;
+ enum vect_def_type dt;
+
+ if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
+ &slp_op, &dt, &vectype, &def_stmt_info))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "use not simple.\n");
+ return false;
+ }
+
+ if (!vectype)
+ {
+ vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
+ slp_op);
+ if (!vectype)
+ return false;
+ }
+
+ if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
+
+ if (i == STMT_VINFO_REDUC_IDX (stmt_info))
+ continue;
+
+ /* There should be at most one cycle def in the stmt. */
+ if (VECTORIZABLE_CYCLE_DEF (dt))
+ return false;
+
+ /* To properly compute ncopies we are interested in the widest
+ non-reduction input type in case we're looking at a widening
+ accumulation that we later handle in vect transformation. */
+ if (!vectype_in
+ || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+ < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype)))))
+ vectype_in = vectype;
+ }
+
+ STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
+
+ stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
+
+ /* TODO: Support lane-reducing operation that does not directly participate
+ in loop reduction. */
+ if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
+ return false;
+
+ /* Lane-reducing patterns inside any inner loop of LOOP_VINFO are not
+ recognized. */
+ gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
+ gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
+
+ tree vphi_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+
+ /* To accommodate lane-reducing operations of mixed input vectypes, choose
+ input vectype with the least lanes for the reduction PHI statement, which
+ would result in the most ncopies for vectorized reduction results. */
+ if (!vphi_vectype_in
+ || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+ > GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vphi_vectype_in)))))
+ STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
+ int ncopies_for_cost;
+
+ if (slp_node)
+ {
+ /* Now lane-reducing operations in a slp node should only come from
+ the same loop reduction path. */
+ gcc_assert (REDUC_GROUP_FIRST_ELEMENT (stmt_info));
+ ncopies_for_cost = 1;
+ }
+ else
+ {
+ ncopies_for_cost = vect_get_num_copies (loop_vinfo, vectype_in);
+ gcc_assert (ncopies_for_cost >= 1);
+ }
+
+ if (vect_is_emulated_mixed_dot_prod (stmt_info))
+ {
+ /* We need two extra invariants: one that contains the minimum signed
+ value and one that contains half of its negative. */
+ int prologue_stmts = 2;
+ unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
+ scalar_to_vec, stmt_info, 0,
+ vect_prologue);
+ if (dump_enabled_p ())
+ dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
+ "extra prologue_cost = %d .\n", cost);
+
+ /* Three dot-products and a subtraction. */
+ ncopies_for_cost *= 4;
+ }
+
+ record_stmt_cost (cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0,
+ vect_body);
+
+ vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node, code,
+ type, vectype_in);
+
+ STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
+ return true;
+}
+
/* Function vectorizable_reduction.
Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -7441,7 +7671,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
bool single_defuse_cycle = false;
bool nested_cycle = false;
bool double_reduc = false;
- int vec_num;
tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
tree cond_reduc_val = NULL_TREE;
@@ -7522,6 +7751,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
(gimple_bb (reduc_def_phi)->loop_father));
unsigned reduc_chain_length = 0;
bool only_slp_reduc_chain = true;
+ bool only_lane_reduc_code_p = true;
stmt_info = NULL;
slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
while (reduc_def != PHI_RESULT (reduc_def_phi))
@@ -7543,14 +7773,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
all lanes here - even though we only will vectorize from
the SLP node with live lane zero the other live lanes also
need to be identified as part of a reduction to be able
- to skip code generation for them. */
+ to skip code generation for them. The vectorizable analysis of a
+ lane-reducing operation also needs the reduction PHI information. */
if (slp_for_stmt_info)
{
for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
if (STMT_VINFO_LIVE_P (s))
STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
}
- else if (STMT_VINFO_LIVE_P (vdef))
+ else
STMT_VINFO_REDUC_DEF (def) = phi_info;
gimple_match_op op;
if (!gimple_extract_op (vdef->stmt, &op))
@@ -7571,9 +7802,18 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
return false;
}
}
- else if (!stmt_info)
- /* First non-conversion stmt. */
- stmt_info = vdef;
+ else
+ {
+ /* First non-conversion stmt. */
+ if (!stmt_info)
+ stmt_info = vdef;
+
+ if (op.code != DOT_PROD_EXPR
+ && op.code != WIDEN_SUM_EXPR
+ && op.code != SAD_EXPR)
+ only_lane_reduc_code_p = false;
+ }
+
reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
reduc_chain_length++;
if (!stmt_info && slp_node)
@@ -7647,18 +7887,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
if (!type_has_mode_precision_p (op.type))
return false;
- /* For lane-reducing ops we're reducing the number of reduction PHIs
- which means the only use of that may be in the lane-reducing operation. */
- if (lane_reduc_code_p
- && reduc_chain_length != 1
- && !only_slp_reduc_chain)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "lane-reducing reduction with extra stmts.\n");
- return false;
- }
-
/* All uses but the last are expected to be defined in the loop.
The last use is the reduction variable. In case of nested cycle this
assumption is not true: we use reduc_index to record the index of the
@@ -7687,9 +7915,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
"use not simple.\n");
return false;
}
- if (i == STMT_VINFO_REDUC_IDX (stmt_info))
- continue;
-
/* For an IFN_COND_OP we might hit the reduction definition operand
twice (once as definition, once as else). */
if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
@@ -7735,12 +7960,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
}
if (!vectype_in)
vectype_in = STMT_VINFO_VECTYPE (phi_info);
- STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
- enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
- STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
+ /* If there is a normal (non-lane-reducing) operation in the loop reduction
+ path, then to ensure there will be enough copies to hold the vectorized
+ results of that operation, the input vectype of the reduction PHI must be
+ set to the reduction output vectype; this is a suitable place to do so.
+ Otherwise the input vectype is set to the one with the fewest lanes, which
+ can only be determined in the vectorizable analysis routine of the
+ lane-reducing operation. */
+ if (!only_lane_reduc_code_p)
+ STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = STMT_VINFO_VECTYPE (phi_info);
+
+ enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
+ STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
/* If we have a condition reduction, see if we can simplify it further. */
- if (v_reduc_type == COND_REDUCTION)
+ if (reduction_type == COND_REDUCTION)
{
if (slp_node)
return false;
@@ -7906,8 +8140,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
}
STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
+ reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
- vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
if (reduction_type == TREE_CODE_REDUCTION)
{
/* Check whether it's ok to change the order of the computation.
@@ -8181,14 +8415,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
&& loop_vinfo->suggested_unroll_factor == 1)
single_defuse_cycle = true;
- if (single_defuse_cycle || lane_reduc_code_p)
+ if (single_defuse_cycle && !lane_reduc_code_p)
{
gcc_assert (op.code != COND_EXPR);
- /* 4. Supportable by target? */
- bool ok = true;
-
- /* 4.1. check support for the operation in the loop
+ /* 4. check support for the operation in the loop
This isn't necessary for the lane reduction codes, since they
can only be produced by pattern matching, and it's up to the
@@ -8197,14 +8428,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
mixed-sign dot-products can be implemented using signed
dot-products. */
machine_mode vec_mode = TYPE_MODE (vectype_in);
- if (!lane_reduc_code_p
- && !directly_supported_p (op.code, vectype_in, optab_vector))
+ if (!directly_supported_p (op.code, vectype_in, optab_vector))
{
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "op not supported by target.\n");
if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
|| !vect_can_vectorize_without_simd_p (op.code))
- ok = false;
+ single_defuse_cycle = false;
else
if (dump_enabled_p ())
dump_printf (MSG_NOTE, "proceeding using word mode.\n");
@@ -8217,35 +8447,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
dump_printf (MSG_NOTE, "using word mode not possible.\n");
return false;
}
-
- /* lane-reducing operations have to go through vect_transform_reduction.
- For the other cases try without the single cycle optimization. */
- if (!ok)
- {
- if (lane_reduc_code_p)
- return false;
- else
- single_defuse_cycle = false;
- }
}
STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
- /* If the reduction stmt is one of the patterns that have lane
- reduction embedded we cannot handle the case of ! single_defuse_cycle. */
- if ((ncopies > 1 && ! single_defuse_cycle)
- && lane_reduc_code_p)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "multi def-use cycle not possible for lane-reducing "
- "reduction operation\n");
- return false;
- }
-
- if (slp_node
- && !(!single_defuse_cycle
- && !lane_reduc_code_p
- && reduction_type != FOLD_LEFT_REDUCTION))
+ /* The reduction type of a lane-reducing operation is TREE_CODE_REDUCTION,
+ so the processing below is done in its own vectorizable function. */
+ if (slp_node && reduction_type == FOLD_LEFT_REDUCTION)
for (i = 0; i < (int) op.num_ops; i++)
if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
{
@@ -8255,36 +8462,24 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
return false;
}
- if (slp_node)
- vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
- else
- vec_num = 1;
-
vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
reduction_type, ncopies, cost_vec);
/* Cost the reduction op inside the loop if transformed via
- vect_transform_reduction. Otherwise this is costed by the
- separate vectorizable_* routines. */
- if (single_defuse_cycle || lane_reduc_code_p)
- {
- int factor = 1;
- if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
- /* Three dot-products and a subtraction. */
- factor = 4;
- record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
- stmt_info, 0, vect_body);
- }
+ vect_transform_reduction for non-lane-reducing operation. Otherwise
+ this is costed by the separate vectorizable_* routines. */
+ if (single_defuse_cycle && !lane_reduc_code_p)
+ record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
if (dump_enabled_p ()
&& reduction_type == FOLD_LEFT_REDUCTION)
dump_printf_loc (MSG_NOTE, vect_location,
"using an in-order (fold-left) reduction.\n");
STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
- /* All but single defuse-cycle optimized, lane-reducing and fold-left
- reductions go through their own vectorizable_* routines. */
- if (!single_defuse_cycle
- && !lane_reduc_code_p
- && reduction_type != FOLD_LEFT_REDUCTION)
+
+ /* All but single defuse-cycle optimized and fold-left reductions go
+ through their own vectorizable_* routines. */
+ if ((!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
+ || lane_reduc_code_p)
{
stmt_vec_info tem
= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
@@ -8296,60 +8491,10 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
}
- else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
- {
- vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
- vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
- internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
-
- if (reduction_type != FOLD_LEFT_REDUCTION
- && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
- && (cond_fn == IFN_LAST
- || !direct_internal_fn_supported_p (cond_fn, vectype_in,
- OPTIMIZE_FOR_SPEED)))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't operate on partial vectors because"
- " no conditional operation is available.\n");
- LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
- }
- else if (reduction_type == FOLD_LEFT_REDUCTION
- && reduc_fn == IFN_LAST
- && !expand_vec_cond_expr_p (vectype_in,
- truth_type_for (vectype_in),
- SSA_NAME))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't operate on partial vectors because"
- " no conditional operation is available.\n");
- LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
- }
- else if (reduction_type == FOLD_LEFT_REDUCTION
- && internal_fn_mask_index (reduc_fn) == -1
- && FLOAT_TYPE_P (vectype_in)
- && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't operate on partial vectors because"
- " signed zeros cannot be preserved.\n");
- LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
- }
- else
- {
- internal_fn mask_reduc_fn
- = get_masked_reduction_fn (reduc_fn, vectype_in);
+ else
+ vect_reduction_use_partial_vector (loop_vinfo, reduc_info, slp_node,
+ op.code, op.type, vectype_in);
- if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
- vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
- vectype_in, 1);
- else
- vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
- vectype_in, NULL);
- }
- }
return true;
}
@@ -8440,6 +8585,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
int i;
int ncopies;
+ int stmt_ncopies;
int vec_num;
stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
@@ -8463,15 +8609,28 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+ tree stmt_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
+
+ /* Get input vectypes from the reduction PHI and the statement to be
+ transformed; these two vectypes may have different numbers of lanes
+ when a lane-reducing operation is present. */
+ if (!vectype_in)
+ vectype_in = STMT_VINFO_REDUC_VECTYPE (reduc_info);
+
+ if (!stmt_vectype_in)
+ stmt_vectype_in = STMT_VINFO_VECTYPE (stmt_info);
if (slp_node)
{
ncopies = 1;
+ stmt_ncopies = 1;
vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
}
else
{
ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
+ stmt_ncopies = vect_get_num_copies (loop_vinfo, stmt_vectype_in);
+ gcc_assert (stmt_ncopies >= 1 && stmt_ncopies <= ncopies);
vec_num = 1;
}
@@ -8480,14 +8639,10 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
- bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
-
+ bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
+ stmt_vectype_in);
/* Transform. */
- tree new_temp = NULL_TREE;
- auto_vec<tree> vec_oprnds0;
- auto_vec<tree> vec_oprnds1;
- auto_vec<tree> vec_oprnds2;
- tree def0;
+ auto_vec<tree> vec_oprnds[3];
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
@@ -8510,8 +8665,6 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
== op.ops[internal_fn_else_index ((internal_fn) code)]));
}
- bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
-
vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
if (reduction_type == FOLD_LEFT_REDUCTION)
{
@@ -8519,7 +8672,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
gcc_assert (code.is_tree_code () || cond_fn_p);
return vectorize_fold_left_reduction
(loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
- code, reduc_fn, op.ops, op.num_ops, vectype_in,
+ code, reduc_fn, op.ops, op.num_ops, stmt_vectype_in,
reduc_index, masks, lens);
}
@@ -8533,55 +8686,160 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
- /* Get NCOPIES vector definitions for all operands except the reduction
- definition. */
- if (!cond_fn_p)
+ gcc_assert (reduc_index < 3);
+
+ if (slp_node)
{
- vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
- single_defuse_cycle && reduc_index == 0
- ? NULL_TREE : op.ops[0], &vec_oprnds0,
- single_defuse_cycle && reduc_index == 1
- ? NULL_TREE : op.ops[1], &vec_oprnds1,
- op.num_ops == 3
- && !(single_defuse_cycle && reduc_index == 2)
- ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+ gcc_assert (!single_defuse_cycle && op.num_ops <= 3);
+
+ for (i = 0; i < (int) op.num_ops; i++)
+ vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds[i]);
}
else
{
- /* For a conditional operation pass the truth type as mask
- vectype. */
- gcc_assert (single_defuse_cycle
- && (reduc_index == 1 || reduc_index == 2));
- vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
- op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
- reduc_index == 1 ? NULL_TREE : op.ops[1],
- NULL_TREE, &vec_oprnds1,
- reduc_index == 2 ? NULL_TREE : op.ops[2],
- NULL_TREE, &vec_oprnds2);
- }
+ int result_pos = 0;
+
+ /* The input vectype of the reduction PHI determines the number of
+ copies of vectorized def-use cycles, which might exceed the number of
+ effective copies of a vectorized lane-reducing reduction statement.
+ The difference is made up by generating extra trivial pass-through
+ copies. For example:
+
+ int sum = 0;
+ for (i)
+ {
+ sum += d0[i] * d1[i]; // dot-prod <vector(16) char>
+ sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+ sum += n[i]; // normal <vector(4) int>
+ }
+
+ The vector size is 128-bit, vectorization factor is 16. Reduction
+ statements would be transformed as:
+
+ vector<4> int sum_v0 = { 0, 0, 0, 0 };
+ vector<4> int sum_v1 = { 0, 0, 0, 0 };
+ vector<4> int sum_v2 = { 0, 0, 0, 0 };
+ vector<4> int sum_v3 = { 0, 0, 0, 0 };
+
+ for (i / 16)
+ {
+ sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
+ sum_v1 = sum_v1; // copy
+ sum_v2 = sum_v2; // copy
+ sum_v3 = sum_v3; // copy
+
+ sum_v0 = sum_v0; // copy
+ sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
+ sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
+ sum_v3 = sum_v3; // copy
+
+ sum_v0 += n_v0[i: 0 ~ 3 ];
+ sum_v1 += n_v1[i: 4 ~ 7 ];
+ sum_v2 += n_v2[i: 8 ~ 11];
+ sum_v3 += n_v3[i: 12 ~ 15];
+ }
+
+ Moreover, for higher instruction parallelism in the final vectorized
+ loop, the effective vectorized lane-reducing statements are
+ distributed evenly among all def-use cycles. In the above example,
+ the SADs are generated into def-use cycles other than the one used
+ by the DOT_PROD. */
+
+ if (stmt_ncopies < ncopies)
+ {
+ gcc_assert (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR
+ || code == SAD_EXPR);
+ result_pos = reduc_info->reduc_result_pos;
+ reduc_info->reduc_result_pos = (result_pos + stmt_ncopies) % ncopies;
+ gcc_assert (result_pos >= 0 && result_pos < ncopies);
+ }
+
+ for (i = 0; i < MIN (3, (int) op.num_ops); i++)
+ {
+ tree vectype = NULL_TREE;
+ int used_ncopies = ncopies;
+
+ if (cond_fn_p && i == 0)
+ {
+ /* For a conditional operation pass the truth type as mask
+ vectype. */
+ gcc_assert (single_defuse_cycle && reduc_index > 0);
+ vectype = truth_type_for (vectype_in);
+ }
- /* For single def-use cycles get one copy of the vectorized reduction
- definition. */
- if (single_defuse_cycle)
- {
- gcc_assert (!slp_node);
- vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
- op.ops[reduc_index],
- reduc_index == 0 ? &vec_oprnds0
- : (reduc_index == 1 ? &vec_oprnds1
- : &vec_oprnds2));
+ if (i != reduc_index)
+ {
+ /* For a non-reduction operand, deduce the effective copies that
+ are involved in vectorized def-use cycles based on the input
+ vectype of the reduction statement. */
+ used_ncopies = stmt_ncopies;
+ }
+ else if (single_defuse_cycle)
+ {
+ /* For single def-use cycles get one copy of the vectorized
+ reduction definition. */
+ used_ncopies = 1;
+ }
+
+ vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, used_ncopies,
+ op.ops[i], &vec_oprnds[i], vectype);
+
+ if (used_ncopies < ncopies)
+ {
+ vec_oprnds[i].safe_grow_cleared (ncopies);
+
+ /* Find suitable def-use cycles to generate vectorized
+ statements into, and reorder operands based on the
+ selection. */
+ if (i != reduc_index && result_pos)
+ {
+ int count = ncopies - used_ncopies;
+ int start = result_pos - count;
+
+ if (start < 0)
+ {
+ count = result_pos;
+ start = 0;
+ }
+
+ for (int j = used_ncopies - 1; j >= start; j--)
+ {
+ std::swap (vec_oprnds[i][j], vec_oprnds[i][j + count]);
+ gcc_assert (!vec_oprnds[i][j]);
+ }
+ }
+ }
+ }
}
- bool emulated_mixed_dot_prod
- = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
- FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+ bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+ bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
+ tree def0;
+
+ FOR_EACH_VEC_ELT (vec_oprnds[0], i, def0)
{
gimple *new_stmt;
- tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
- if (masked_loop_p && !mask_by_cond_expr)
+ tree new_temp = NULL_TREE;
+ tree vop[3] = { def0, vec_oprnds[1][i], NULL_TREE };
+
+ if (!vop[0] || !vop[1])
+ {
+ tree reduc_vop = vec_oprnds[reduc_index][i];
+
+ /* Insert trivial copy if no need to generate vectorized
+ statement. */
+ gcc_assert (reduc_vop && stmt_ncopies < ncopies);
+
+ new_stmt = gimple_build_assign (vec_dest, reduc_vop);
+ new_temp = make_ssa_name (vec_dest, new_stmt);
+ gimple_set_lhs (new_stmt, new_temp);
+ vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+ }
+ else if (masked_loop_p && !mask_by_cond_expr)
{
- /* No conditional ifns have been defined for dot-product yet. */
- gcc_assert (code != DOT_PROD_EXPR);
+ /* No conditional ifns have been defined for dot-product and sad
+ yet. */
+ gcc_assert (code != DOT_PROD_EXPR && code != SAD_EXPR);
/* Make sure that the reduction accumulator is vop[0]. */
if (reduc_index == 1)
@@ -8590,7 +8848,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
std::swap (vop[0], vop[1]);
}
tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
- vec_num * ncopies, vectype_in, i);
+ vec_num * stmt_ncopies,
+ stmt_vectype_in, i);
gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
vop[0], vop[1], vop[0]);
new_temp = make_ssa_name (vec_dest, call);
@@ -8602,12 +8861,13 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
else
{
if (op.num_ops >= 3)
- vop[2] = vec_oprnds2[i];
+ vop[2] = vec_oprnds[2][i];
if (masked_loop_p && mask_by_cond_expr)
{
tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
- vec_num * ncopies, vectype_in, i);
+ vec_num * stmt_ncopies,
+ stmt_vectype_in, i);
build_vect_cond_expr (code, vop, mask, gsi);
}
@@ -8634,16 +8894,8 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
if (slp_node)
slp_node->push_vec_def (new_stmt);
- else if (single_defuse_cycle
- && i < ncopies - 1)
- {
- if (reduc_index == 0)
- vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
- else if (reduc_index == 1)
- vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
- else if (reduc_index == 2)
- vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
- }
+ else if (single_defuse_cycle && i < ncopies - 1)
+ vec_oprnds[reduc_index][i + 1] = gimple_get_lhs (new_stmt);
else
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
@@ -12093,11 +12093,20 @@ vectorizable_condition (vec_info *vinfo,
vect_reduction_type reduction_type = TREE_CODE_REDUCTION;
bool for_reduction
= STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL;
+ if (for_reduction)
+ {
+ reduc_info = info_for_reduction (vinfo, stmt_info);
+ if (STMT_VINFO_REDUC_DEF (reduc_info) != vect_orig_stmt (stmt_info))
+ {
+ for_reduction = false;
+ reduc_info = NULL;
+ }
+ }
+
if (for_reduction)
{
if (slp_node)
return false;
- reduc_info = info_for_reduction (vinfo, stmt_info);
reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
gcc_assert (reduction_type != EXTRACT_LAST_REDUCTION
@@ -13273,6 +13282,8 @@ vect_analyze_stmt (vec_info *vinfo,
NULL, NULL, node, cost_vec)
|| vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
|| vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
+ || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
+ stmt_info, node, cost_vec)
|| vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
node, node_instance, cost_vec)
|| vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
@@ -1399,6 +1399,12 @@ public:
/* The vector type for performing the actual reduction. */
tree reduc_vectype;
+ /* For loop reduction with multiple vectorized results (ncopies > 1), a
+ lane-reducing operation participating in it may not use all of those
+ results. This field specifies the result index starting from which any
+ following lane-reducing operation would be assigned. */
+ int reduc_result_pos;
+
/* If IS_REDUC_INFO is true and if the vector code is performing
N scalar reductions in parallel, this variable gives the initial
scalar values of those N reductions. */
@@ -2430,6 +2436,8 @@ extern loop_vec_info vect_create_loop_vinfo (class loop *, vec_info_shared *,
extern bool vectorizable_live_operation (vec_info *, stmt_vec_info,
slp_tree, slp_instance, int,
bool, stmt_vector_for_cost *);
+extern bool vectorizable_lane_reducing (loop_vec_info, stmt_vec_info,
+ slp_tree, stmt_vector_for_cost *);
extern bool vectorizable_reduction (loop_vec_info, stmt_vec_info,
slp_tree, slp_instance,
stmt_vector_for_cost *);
--
2.17.1