Message ID | 20230811142448.2913622-1-juzhe.zhong@rivai.ai |
---|---|
State | New |
Headers | show |
Series | [V4] VECT: Support loop len control on EXTRACT_LAST vectorization | expand |
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote: > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> > > Hi, Richard and Richi. > > This patch add support live vectorization by VEC_EXTRACT for LEN loop control. OK. Thanks, Richard. > Consider this following case: > > #include <stdint.h> > > #define EXTRACT_LAST(TYPE) \ > TYPE __attribute__ ((noinline, noclone)) \ > test_##TYPE (TYPE *x, int n, TYPE value) \ > { \ > TYPE last; \ > for (int j = 0; j < n; ++j) \ > { \ > last = x[j]; \ > x[j] = last * value; \ > } \ > return last; \ > } > > #define TEST_ALL(T) \ > T (uint8_t) \ > > TEST_ALL (EXTRACT_LAST) > > ARM SVE IR: > > Preheader: > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... }); > > Loop: > ... > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> > ... > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22); > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27); > ... > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... }); > ... > > Epilogue: > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); > > For RVV since we prefer len in loop control, after this patch for RVV: > > Loop: > ... > loop_len_22 = SELECT_VL; > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22); > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27); > ... > > Epilogue: > _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23); > > Details of this approach: > > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization > for LEN loop control. > > This function we check whether target support: > - Use LEN as the loop control. > - Support VEC_EXTRACT optab. > > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true. > > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1). > > The only difference between mask and len is that len is using length generated by SELECT_VL and > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE. > > gcc/ChangeLog: > > * tree-vect-loop.cc (vectorizable_live_operation): Add loop len control. > > --- > gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++--------- > 1 file changed, 62 insertions(+), 16 deletions(-) > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index bf8d677b584..a011e2dacb2 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > /* No transformation required. */ > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > { > - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > - OPTIMIZE_FOR_SPEED)) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "can't operate on partial vectors " > - "because the target doesn't support extract " > - "last reduction.\n"); > - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > - } > - else if (slp_node) > + if (slp_node) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > else > { > gcc_assert (ncopies == 1 && !slp_node); > - vect_record_loop_mask (loop_vinfo, > - &LOOP_VINFO_MASKS (loop_vinfo), > - 1, vectype, NULL); > + if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > + OPTIMIZE_FOR_SPEED)) > + vect_record_loop_mask (loop_vinfo, > + &LOOP_VINFO_MASKS (loop_vinfo), > + 1, vectype, NULL); > + else if (convert_optab_handler (vec_extract_optab, > + TYPE_MODE (vectype), > + TYPE_MODE (TREE_TYPE (vectype))) > + != CODE_FOR_nothing) > + vect_record_loop_len (loop_vinfo, > + &LOOP_VINFO_LENS (loop_vinfo), > + 1, vectype, 1); > + else > + { > + if (dump_enabled_p ()) > + dump_printf_loc ( > + MSG_MISSED_OPTIMIZATION, vect_location, > + "can't operate on partial vectors " > + "because the target doesn't support extract " > + "last reduction.\n"); > + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > + } > } > } > /* ??? Enable for loop costing as well. */ > @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > gimple *vec_stmt; > if (slp_node) > { > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); > + gcc_assert (!loop_vinfo > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); > > /* Get the correct slp vectorized stmt. */ > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; > @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > gimple_seq stmts = NULL; > tree new_tree; > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > + { > + /* Emit: > + > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1> > + > + where VEC_LHS is the vectorized live-out result and MASK is > + the loop mask for the final iteration. */ > + gcc_assert (ncopies == 1 && !slp_node); > + gimple_seq tem = NULL; > + gimple_stmt_iterator gsi = gsi_last (tem); > + tree len > + = vect_get_loop_len (loop_vinfo, &gsi, > + &LOOP_VINFO_LENS (loop_vinfo), > + 1, vectype, 0, 0); > + > + /* BIAS - 1. */ > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + tree bias_minus_one > + = int_const_binop (MINUS_EXPR, > + build_int_cst (TREE_TYPE (len), biasval), > + build_one_cst (TREE_TYPE (len))); > + > + /* LAST_INDEX = LEN + (BIAS - 1). */ > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), > + len, bias_minus_one); > + > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */ > + tree scalar_res > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), > + vec_lhs_phi, last_index); > + > + /* Convert the extracted vector element to the scalar type. */ > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); > + } > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > { > /* Emit: > >
Thanks Richi. CC kewen to see whether this patch is suitable for powerpc and s390. juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-08-14 14:53 To: Ju-Zhe Zhong CC: gcc-patches; richard.sandiford Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote: > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> > > Hi, Richard and Richi. > > This patch add support live vectorization by VEC_EXTRACT for LEN loop control. OK. Thanks, Richard. > Consider this following case: > > #include <stdint.h> > > #define EXTRACT_LAST(TYPE) \ > TYPE __attribute__ ((noinline, noclone)) \ > test_##TYPE (TYPE *x, int n, TYPE value) \ > { \ > TYPE last; \ > for (int j = 0; j < n; ++j) \ > { \ > last = x[j]; \ > x[j] = last * value; \ > } \ > return last; \ > } > > #define TEST_ALL(T) \ > T (uint8_t) \ > > TEST_ALL (EXTRACT_LAST) > > ARM SVE IR: > > Preheader: > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... }); > > Loop: > ... > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> > ... > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22); > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27); > ... > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... }); > ... > > Epilogue: > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); > > For RVV since we prefer len in loop control, after this patch for RVV: > > Loop: > ... > loop_len_22 = SELECT_VL; > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22); > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27); > ... > > Epilogue: > _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23); > > Details of this approach: > > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization > for LEN loop control. > > This function we check whether target support: > - Use LEN as the loop control. > - Support VEC_EXTRACT optab. > > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true. > > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1). > > The only difference between mask and len is that len is using length generated by SELECT_VL and > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE. > > gcc/ChangeLog: > > * tree-vect-loop.cc (vectorizable_live_operation): Add loop len control. > > --- > gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++--------- > 1 file changed, 62 insertions(+), 16 deletions(-) > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index bf8d677b584..a011e2dacb2 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > /* No transformation required. */ > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > { > - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > - OPTIMIZE_FOR_SPEED)) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "can't operate on partial vectors " > - "because the target doesn't support extract " > - "last reduction.\n"); > - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > - } > - else if (slp_node) > + if (slp_node) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > else > { > gcc_assert (ncopies == 1 && !slp_node); > - vect_record_loop_mask (loop_vinfo, > - &LOOP_VINFO_MASKS (loop_vinfo), > - 1, vectype, NULL); > + if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > + OPTIMIZE_FOR_SPEED)) > + vect_record_loop_mask (loop_vinfo, > + &LOOP_VINFO_MASKS (loop_vinfo), > + 1, vectype, NULL); > + else if (convert_optab_handler (vec_extract_optab, > + TYPE_MODE (vectype), > + TYPE_MODE (TREE_TYPE (vectype))) > + != CODE_FOR_nothing) > + vect_record_loop_len (loop_vinfo, > + &LOOP_VINFO_LENS (loop_vinfo), > + 1, vectype, 1); > + else > + { > + if (dump_enabled_p ()) > + dump_printf_loc ( > + MSG_MISSED_OPTIMIZATION, vect_location, > + "can't operate on partial vectors " > + "because the target doesn't support extract " > + "last reduction.\n"); > + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > + } > } > } > /* ??? Enable for loop costing as well. */ > @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > gimple *vec_stmt; > if (slp_node) > { > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); > + gcc_assert (!loop_vinfo > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); > > /* Get the correct slp vectorized stmt. */ > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; > @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > gimple_seq stmts = NULL; > tree new_tree; > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > + { > + /* Emit: > + > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1> > + > + where VEC_LHS is the vectorized live-out result and MASK is > + the loop mask for the final iteration. */ > + gcc_assert (ncopies == 1 && !slp_node); > + gimple_seq tem = NULL; > + gimple_stmt_iterator gsi = gsi_last (tem); > + tree len > + = vect_get_loop_len (loop_vinfo, &gsi, > + &LOOP_VINFO_LENS (loop_vinfo), > + 1, vectype, 0, 0); > + > + /* BIAS - 1. */ > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + tree bias_minus_one > + = int_const_binop (MINUS_EXPR, > + build_int_cst (TREE_TYPE (len), biasval), > + build_one_cst (TREE_TYPE (len))); > + > + /* LAST_INDEX = LEN + (BIAS - 1). */ > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), > + len, bias_minus_one); > + > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */ > + tree scalar_res > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), > + vec_lhs_phi, last_index); > + > + /* Convert the extracted vector element to the scalar type. */ > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); > + } > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > { > /* Emit: > >
Hi Juzhe, on 2023/8/14 15:09, juzhe.zhong@rivai.ai wrote: > Thanks Richi. > > CC kewen to see whether this patch is suitable for powerpc and s390. I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures. A short list looks like: < FAIL: gcc.c-torture/compile/20150108.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.c-torture/compile/20150108.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess errors) < FAIL: gcc.c-torture/compile/20150108.c -O3 -g (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.c-torture/compile/20150108.c -O3 -g (test for excess errors) < FAIL: gcc.c-torture/execute/20011126-2.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.c-torture/execute/20011126-2.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess errors) < FAIL: gcc.c-torture/execute/20011126-2.c -O3 -g (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.c-torture/execute/20011126-2.c -O3 -g (test for excess errors) < FAIL: gcc.c-torture/execute/pr58419.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.c-torture/execute/pr58419.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess errors) < FAIL: gcc.c-torture/execute/pr58419.c -O3 -g (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.c-torture/execute/pr58419.c -O3 -g (test for excess errors) < FAIL: gcc.dg/pr84321.c (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.dg/pr84321.c (test for excess errors) < FAIL: gcc.dg/torture/pr108793.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.dg/torture/pr108793.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess errors) < FAIL: gcc.dg/torture/pr108793.c -O3 -g (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.dg/torture/pr108793.c -O3 -g (test for excess errors) < FAIL: gcc.dg/torture/pr51070-2.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.dg/torture/pr51070-2.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess errors) < FAIL: gcc.dg/torture/pr51070-2.c -O3 -g (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.dg/torture/pr51070-2.c -O3 -g (test for excess errors) < FAIL: gcc.dg/torture/pr51070.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) < FAIL: gcc.dg/torture/pr51070.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess errors) < FAIL: gcc.dg/torture/pr51070.c -O3 -g (internal compiler error: in expand_vec_extract_optab_fn, at internal-fn.cc:3164) .... > > ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ > juzhe.zhong@rivai.ai > > > *From:* Richard Biener <mailto:rguenther@suse.de> > *Date:* 2023-08-14 14:53 > *To:* Ju-Zhe Zhong <mailto:juzhe.zhong@rivai.ai> > *CC:* gcc-patches <mailto:gcc-patches@gcc.gnu.org>; richard.sandiford <mailto:richard.sandiford@arm.com> > *Subject:* Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization > On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote: > > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> > > > > Hi, Richard and Richi. > > > > This patch add support live vectorization by VEC_EXTRACT for LEN loop control. > > OK. > > Thanks, > Richard. > > > Consider this following case: > > > > #include <stdint.h> > > > > #define EXTRACT_LAST(TYPE) \ > > TYPE __attribute__ ((noinline, noclone)) \ > > test_##TYPE (TYPE *x, int n, TYPE value) \ > > { \ > > TYPE last; \ > > for (int j = 0; j < n; ++j) \ > > { \ > > last = x[j]; \ > > x[j] = last * value; \ > > } \ > > return last; \ > > } > > > > #define TEST_ALL(T) \ > > T (uint8_t) \ > > > > TEST_ALL (EXTRACT_LAST) > > > > ARM SVE IR: > > > > Preheader: > > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... }); > > > > Loop: > > ... > > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> > > ... > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22); > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27); > > ... > > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... }); > > ... > > > > Epilogue: > > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); > > > > For RVV since we prefer len in loop control, after this patch for RVV: > > > > Loop: > > ... > > loop_len_22 = SELECT_VL; > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22); > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27); > > ... > > > > Epilogue: > > _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23); > > > > Details of this approach: > > > > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization > > for LEN loop control. > > > > This function we check whether target support: > > - Use LEN as the loop control. > > - Support VEC_EXTRACT optab. > > > > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true. > > > > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1). > > > > The only difference between mask and len is that len is using length generated by SELECT_VL and > > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE. > > > > gcc/ChangeLog: > > > > * tree-vect-loop.cc (vectorizable_live_operation): Add loop len control. > > > > --- > > gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++--------- > > 1 file changed, 62 insertions(+), 16 deletions(-) > > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index bf8d677b584..a011e2dacb2 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > /* No transformation required. */ > > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > > { > > - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > > - OPTIMIZE_FOR_SPEED)) > > - { > > - if (dump_enabled_p ()) > > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > - "can't operate on partial vectors " > > - "because the target doesn't support extract " > > - "last reduction.\n"); > > - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > > - } > > - else if (slp_node) > > + if (slp_node) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > else > > { > > gcc_assert (ncopies == 1 && !slp_node); > > - vect_record_loop_mask (loop_vinfo, > > - &LOOP_VINFO_MASKS (loop_vinfo), > > - 1, vectype, NULL); > > + if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > > + OPTIMIZE_FOR_SPEED)) > > + vect_record_loop_mask (loop_vinfo, > > + &LOOP_VINFO_MASKS (loop_vinfo), > > + 1, vectype, NULL); > > + else if (convert_optab_handler (vec_extract_optab, > > + TYPE_MODE (vectype), > > + TYPE_MODE (TREE_TYPE (vectype))) > > + != CODE_FOR_nothing) > > + vect_record_loop_len (loop_vinfo, > > + &LOOP_VINFO_LENS (loop_vinfo), > > + 1, vectype, 1); > > + else > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc ( > > + MSG_MISSED_OPTIMIZATION, vect_location, > > + "can't operate on partial vectors " > > + "because the target doesn't support extract " > > + "last reduction.\n"); > > + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > > + } > > } > > } > > /* ??? Enable for loop costing as well. */ > > @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > gimple *vec_stmt; > > if (slp_node) > > { > > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); > > + gcc_assert (!loop_vinfo > > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) > > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); > > > > /* Get the correct slp vectorized stmt. */ > > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; > > @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > > > gimple_seq stmts = NULL; > > tree new_tree; > > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > > + { > > + /* Emit: > > + > > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1> > > + > > + where VEC_LHS is the vectorized live-out result and MASK is > > + the loop mask for the final iteration. */ > > + gcc_assert (ncopies == 1 && !slp_node); > > + gimple_seq tem = NULL; > > + gimple_stmt_iterator gsi = gsi_last (tem); > > + tree len > > + = vect_get_loop_len (loop_vinfo, &gsi, > > + &LOOP_VINFO_LENS (loop_vinfo), > > + 1, vectype, 0, 0); > > + > > + /* BIAS - 1. */ > > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > > + tree bias_minus_one > > + = int_const_binop (MINUS_EXPR, > > + build_int_cst (TREE_TYPE (len), biasval), > > + build_one_cst (TREE_TYPE (len))); > > + > > + /* LAST_INDEX = LEN + (BIAS - 1). */ > > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), > > + len, bias_minus_one); > > + > > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */ > > + tree scalar_res > > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), > > + vec_lhs_phi, last_index); > > + > > + /* Convert the extracted vector element to the scalar type. */ > > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); > > + } > > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > > { > > /* Emit: > > > > > > -- > Richard Biener <rguenther@suse.de> > SUSE Software Solutions Germany GmbH, > Frankenstrasse 146, 90461 Nuernberg, Germany; > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) > >
Hi Kewen,
> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures.
I think the problem is that just like for vec_set we're expecting
the vec_extract expander not to fail. It is probably passed not a
const int here anymore and therefore fails to expand?
can_vec_extract_var_idx_p is supposed to check if the backend
supports extracting a variable index.
Regards
Robin
On Mon, 14 Aug 2023, Robin Dapp wrote: > Hi Kewen, > > > I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures. > > I think the problem is that just like for vec_set we're expecting > the vec_extract expander not to fail. It is probably passed not a > const int here anymore and therefore fails to expand? > > can_vec_extract_var_idx_p is supposed to check if the backend > supports extracting a variable index. expansion does enum insn_code icode = convert_optab_handler (optab, outermode, extract_mode); if (icode != CODE_FOR_nothing) { create_output_operand (&ops[0], target, extract_mode); create_input_operand (&ops[1], src, outermode); create_convert_operand_from (&ops[2], pos, TYPE_MODE (TREE_TYPE (op1)), true); if (maybe_expand_insn (icode, 3, ops)) { if (!rtx_equal_p (target, ops[0].value)) emit_move_insn (target, ops[0].value); return; } <--- here } gcc_unreachable (); so if maybe_expand_insn fails that would need to be sth we need to cover in the predicate to check. But that looks possibly target dependent? What does actually fail here? Richard.
Hi Robin, on 2023/8/14 16:58, Robin Dapp wrote: > Hi Kewen, > >> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures. > > I think the problem is that just like for vec_set we're expecting > the vec_extract expander not to fail. It is probably passed not a > const int here anymore and therefore fails to expand? Thanks for the comments! Yeah, I think the expectation doesn't hold on Power, as our vec_extract optab only support const index, that is: (define_expand "vec_extract<mode><VEC_base_l>" [(match_operand:<VEC_base> 0 "register_operand") (match_operand:VEC_E 1 "vlogical_operand") (match_operand 2 "const_int_operand")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" { rs6000_expand_vector_extract (operands[0], operands[1], operands[2]); DONE; }) > > can_vec_extract_var_idx_p is supposed to check if the backend > supports extracting a variable index. OK, it sounds that this new capability needs to further check with function can_vec_extract_var_idx_p to ensure the ifn expanding work as expected. I re-spined by adding the below as your comments: diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 07f3717ed9d..80ba5cae84a 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, else if (convert_optab_handler (vec_extract_optab, TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))) - != CODE_FOR_nothing) + != CODE_FOR_nothing + && can_vec_extract_var_idx_p ( + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo), 1, vectype, 1); BR, Kewen
- != CODE_FOR_nothing) + != CODE_FOR_nothing + && can_vec_extract_var_idx_p ( + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) I think maybe 'can_vec_extract_var_idx_p' check is enough and remove convert_optab_handler (vec_extract_optab,... check. Looking forward Richi's more comments. Thanks. juzhe.zhong@rivai.ai From: Kewen.Lin Date: 2023-08-14 17:19 To: Robin Dapp CC: gcc-patches; richard.sandiford; rguenther; juzhe.zhong@rivai.ai Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization Hi Robin, on 2023/8/14 16:58, Robin Dapp wrote: > Hi Kewen, > >> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures. > > I think the problem is that just like for vec_set we're expecting > the vec_extract expander not to fail. It is probably passed not a > const int here anymore and therefore fails to expand? Thanks for the comments! Yeah, I think the expectation doesn't hold on Power, as our vec_extract optab only support const index, that is: (define_expand "vec_extract<mode><VEC_base_l>" [(match_operand:<VEC_base> 0 "register_operand") (match_operand:VEC_E 1 "vlogical_operand") (match_operand 2 "const_int_operand")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" { rs6000_expand_vector_extract (operands[0], operands[1], operands[2]); DONE; }) > > can_vec_extract_var_idx_p is supposed to check if the backend > supports extracting a variable index. OK, it sounds that this new capability needs to further check with function can_vec_extract_var_idx_p to ensure the ifn expanding work as expected. I re-spined by adding the below as your comments: diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 07f3717ed9d..80ba5cae84a 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, else if (convert_optab_handler (vec_extract_optab, TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))) - != CODE_FOR_nothing) + != CODE_FOR_nothing + && can_vec_extract_var_idx_p ( + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo), 1, vectype, 1); BR, Kewen
On Mon, 14 Aug 2023, juzhe.zhong@rivai.ai wrote: > - != CODE_FOR_nothing) > + != CODE_FOR_nothing > + && can_vec_extract_var_idx_p ( > + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) > > I think maybe 'can_vec_extract_var_idx_p' check is enough and remove convert_optab_handler (vec_extract_optab,... check. > Looking forward Richi's more comments. Yes, I think can_vec_extract_var_idx_p already does that so no need to duplicate it here. Richard. > Thanks. > > > juzhe.zhong@rivai.ai > > From: Kewen.Lin > Date: 2023-08-14 17:19 > To: Robin Dapp > CC: gcc-patches; richard.sandiford; rguenther; juzhe.zhong@rivai.ai > Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization > Hi Robin, > > on 2023/8/14 16:58, Robin Dapp wrote: > > Hi Kewen, > > > >> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures. > > > > I think the problem is that just like for vec_set we're expecting > > the vec_extract expander not to fail. It is probably passed not a > > const int here anymore and therefore fails to expand? > > Thanks for the comments! Yeah, I think the expectation doesn't hold > on Power, as our vec_extract optab only support const index, that > is: > > (define_expand "vec_extract<mode><VEC_base_l>" > [(match_operand:<VEC_base> 0 "register_operand") > (match_operand:VEC_E 1 "vlogical_operand") > (match_operand 2 "const_int_operand")] > "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" > { > rs6000_expand_vector_extract (operands[0], operands[1], operands[2]); > DONE; > }) > > > > > can_vec_extract_var_idx_p is supposed to check if the backend > > supports extracting a variable index. > > OK, it sounds that this new capability needs to further check with > function can_vec_extract_var_idx_p to ensure the ifn expanding work > as expected. I re-spined by adding the below as your comments: > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 07f3717ed9d..80ba5cae84a 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > else if (convert_optab_handler (vec_extract_optab, > TYPE_MODE (vectype), > TYPE_MODE (TREE_TYPE (vectype))) > - != CODE_FOR_nothing) > + != CODE_FOR_nothing > + && can_vec_extract_var_idx_p ( > + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) > vect_record_loop_len (loop_vinfo, > &LOOP_VINFO_LENS (loop_vinfo), > 1, vectype, 1); > > BR, > Kewen > >
Hi, Kewin. Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing? Thanks. juzhe.zhong@rivai.ai From: Kewen.Lin Date: 2023-08-14 17:19 To: Robin Dapp CC: gcc-patches; richard.sandiford; rguenther; juzhe.zhong@rivai.ai Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization Hi Robin, on 2023/8/14 16:58, Robin Dapp wrote: > Hi Kewen, > >> I did a bootstrapping and regression testing on Power10 (LE) and found a lot of failures. > > I think the problem is that just like for vec_set we're expecting > the vec_extract expander not to fail. It is probably passed not a > const int here anymore and therefore fails to expand? Thanks for the comments! Yeah, I think the expectation doesn't hold on Power, as our vec_extract optab only support const index, that is: (define_expand "vec_extract<mode><VEC_base_l>" [(match_operand:<VEC_base> 0 "register_operand") (match_operand:VEC_E 1 "vlogical_operand") (match_operand 2 "const_int_operand")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" { rs6000_expand_vector_extract (operands[0], operands[1], operands[2]); DONE; }) > > can_vec_extract_var_idx_p is supposed to check if the backend > supports extracting a variable index. OK, it sounds that this new capability needs to further check with function can_vec_extract_var_idx_p to ensure the ifn expanding work as expected. I re-spined by adding the below as your comments: diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 07f3717ed9d..80ba5cae84a 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10328,7 +10328,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, else if (convert_optab_handler (vec_extract_optab, TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))) - != CODE_FOR_nothing) + != CODE_FOR_nothing + && can_vec_extract_var_idx_p ( + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo), 1, vectype, 1); BR, Kewen
Hi Juzhe, on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote: > Hi, Kewin. > > Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing? The below diff was bootstrapped and regress-tested on Power10 LE. Comparing to the previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and its required new includes as below: +#include "memmodel.h" +#include "optabs.h" Could you have a double check? Since I just tested it on Power10, you have the full ownership on the patch, I'd leave the v5 posting to you. Thanks! BR, Kewen ----- diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index bc3063c3615..5ae9f69c7eb 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -32,6 +32,8 @@ along with GCC; see the file COPYING3. If not see #include "tree-pass.h" #include "ssa.h" #include "optabs-tree.h" +#include "memmodel.h" +#include "optabs.h" #include "diagnostic-core.h" #include "fold-const.h" #include "stor-layout.h" @@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, /* No transformation required. */ if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) { - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, - OPTIMIZE_FOR_SPEED)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors " - "because the target doesn't support extract " - "last reduction.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else if (slp_node) + if (slp_node) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, else { gcc_assert (ncopies == 1 && !slp_node); - vect_record_loop_mask (loop_vinfo, - &LOOP_VINFO_MASKS (loop_vinfo), - 1, vectype, NULL); + if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, + OPTIMIZE_FOR_SPEED)) + vect_record_loop_mask (loop_vinfo, + &LOOP_VINFO_MASKS (loop_vinfo), + 1, vectype, NULL); + else if (can_vec_extract_var_idx_p ( + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) + vect_record_loop_len (loop_vinfo, + &LOOP_VINFO_LENS (loop_vinfo), + 1, vectype, 1); + else + { + if (dump_enabled_p ()) + dump_printf_loc ( + MSG_MISSED_OPTIMIZATION, vect_location, + "can't operate on partial vectors " + "because the target doesn't support extract " + "last reduction.\n"); + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } } } /* ??? Enable for loop costing as well. */ @@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, gimple *vec_stmt; if (slp_node) { - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); + gcc_assert (!loop_vinfo + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); /* Get the correct slp vectorized stmt. */ vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; @@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, gimple_seq stmts = NULL; tree new_tree; - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + { + /* Emit: + + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1> + + where VEC_LHS is the vectorized live-out result and MASK is + the loop mask for the final iteration. */ + gcc_assert (ncopies == 1 && !slp_node); + gimple_seq tem = NULL; + gimple_stmt_iterator gsi = gsi_last (tem); + tree len + = vect_get_loop_len (loop_vinfo, &gsi, + &LOOP_VINFO_LENS (loop_vinfo), + 1, vectype, 0, 0); + + /* BIAS - 1. */ + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); + tree bias_minus_one + = int_const_binop (MINUS_EXPR, + build_int_cst (TREE_TYPE (len), biasval), + build_one_cst (TREE_TYPE (len))); + + /* LAST_INDEX = LEN + (BIAS - 1). */ + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), + len, bias_minus_one); + + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */ + tree scalar_res + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), + vec_lhs_phi, last_index); + + /* Convert the extracted vector element to the scalar type. */ + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); + } + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) { /* Emit:
Thanks Kewen. But I saw there is 2 more files include: +#include "memmodel.h" +#include "optabs.h" Not sure whether Richard and Richi ok with that change ? Thanks. juzhe.zhong@rivai.ai From: Kewen.Lin Date: 2023-08-14 20:45 To: juzhe.zhong@rivai.ai CC: Robin Dapp; richard.sandiford; rguenther; GCC Patches Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization Hi Juzhe, on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote: > Hi, Kewin. > > Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing? The below diff was bootstrapped and regress-tested on Power10 LE. Comparing to the previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and its required new includes as below: +#include "memmodel.h" +#include "optabs.h" Could you have a double check? Since I just tested it on Power10, you have the full ownership on the patch, I'd leave the v5 posting to you. Thanks! BR, Kewen ----- diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index bc3063c3615..5ae9f69c7eb 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -32,6 +32,8 @@ along with GCC; see the file COPYING3. If not see #include "tree-pass.h" #include "ssa.h" #include "optabs-tree.h" +#include "memmodel.h" +#include "optabs.h" #include "diagnostic-core.h" #include "fold-const.h" #include "stor-layout.h" @@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, /* No transformation required. */ if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) { - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, - OPTIMIZE_FOR_SPEED)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors " - "because the target doesn't support extract " - "last reduction.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else if (slp_node) + if (slp_node) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, else { gcc_assert (ncopies == 1 && !slp_node); - vect_record_loop_mask (loop_vinfo, - &LOOP_VINFO_MASKS (loop_vinfo), - 1, vectype, NULL); + if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, + OPTIMIZE_FOR_SPEED)) + vect_record_loop_mask (loop_vinfo, + &LOOP_VINFO_MASKS (loop_vinfo), + 1, vectype, NULL); + else if (can_vec_extract_var_idx_p ( + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) + vect_record_loop_len (loop_vinfo, + &LOOP_VINFO_LENS (loop_vinfo), + 1, vectype, 1); + else + { + if (dump_enabled_p ()) + dump_printf_loc ( + MSG_MISSED_OPTIMIZATION, vect_location, + "can't operate on partial vectors " + "because the target doesn't support extract " + "last reduction.\n"); + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } } } /* ??? Enable for loop costing as well. */ @@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, gimple *vec_stmt; if (slp_node) { - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); + gcc_assert (!loop_vinfo + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); /* Get the correct slp vectorized stmt. */ vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; @@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, gimple_seq stmts = NULL; tree new_tree; - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + { + /* Emit: + + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1> + + where VEC_LHS is the vectorized live-out result and MASK is + the loop mask for the final iteration. */ + gcc_assert (ncopies == 1 && !slp_node); + gimple_seq tem = NULL; + gimple_stmt_iterator gsi = gsi_last (tem); + tree len + = vect_get_loop_len (loop_vinfo, &gsi, + &LOOP_VINFO_LENS (loop_vinfo), + 1, vectype, 0, 0); + + /* BIAS - 1. */ + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); + tree bias_minus_one + = int_const_binop (MINUS_EXPR, + build_int_cst (TREE_TYPE (len), biasval), + build_one_cst (TREE_TYPE (len))); + + /* LAST_INDEX = LEN + (BIAS - 1). */ + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), + len, bias_minus_one); + + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */ + tree scalar_res + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), + vec_lhs_phi, last_index); + + /* Convert the extracted vector element to the scalar type. */ + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); + } + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) { /* Emit:
On Mon, 14 Aug 2023, ??? wrote: > Thanks Kewen. > > But I saw there is 2 more files include: > > +#include "memmodel.h" > +#include "optabs.h" > > Not sure whether Richard and Richi ok with that change ? Yes, please just apply some common sense. > Thanks. > > > > juzhe.zhong@rivai.ai > > From: Kewen.Lin > Date: 2023-08-14 20:45 > To: juzhe.zhong@rivai.ai > CC: Robin Dapp; richard.sandiford; rguenther; GCC Patches > Subject: Re: [PATCH V4] VECT: Support loop len control on EXTRACT_LAST vectorization > Hi Juzhe, > > on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote: > > Hi, Kewin. > > > > Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing? > > The below diff was bootstrapped and regress-tested on Power10 LE. Comparing to the > previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and > its required new includes as below: > > +#include "memmodel.h" > +#include "optabs.h" > Could you have a double check? > > Since I just tested it on Power10, you have the full ownership on the patch, I'd leave > the v5 posting to you. Thanks! > > BR, > Kewen > ----- > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index bc3063c3615..5ae9f69c7eb 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -32,6 +32,8 @@ along with GCC; see the file COPYING3. If not see > #include "tree-pass.h" > #include "ssa.h" > #include "optabs-tree.h" > +#include "memmodel.h" > +#include "optabs.h" > #include "diagnostic-core.h" > #include "fold-const.h" > #include "stor-layout.h" > @@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > /* No transformation required. */ > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > { > - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > - OPTIMIZE_FOR_SPEED)) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "can't operate on partial vectors " > - "because the target doesn't support extract " > - "last reduction.\n"); > - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > - } > - else if (slp_node) > + if (slp_node) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > else > { > gcc_assert (ncopies == 1 && !slp_node); > - vect_record_loop_mask (loop_vinfo, > - &LOOP_VINFO_MASKS (loop_vinfo), > - 1, vectype, NULL); > + if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > + OPTIMIZE_FOR_SPEED)) > + vect_record_loop_mask (loop_vinfo, > + &LOOP_VINFO_MASKS (loop_vinfo), > + 1, vectype, NULL); > + else if (can_vec_extract_var_idx_p ( > + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) > + vect_record_loop_len (loop_vinfo, > + &LOOP_VINFO_LENS (loop_vinfo), > + 1, vectype, 1); > + else > + { > + if (dump_enabled_p ()) > + dump_printf_loc ( > + MSG_MISSED_OPTIMIZATION, vect_location, > + "can't operate on partial vectors " > + "because the target doesn't support extract " > + "last reduction.\n"); > + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > + } > } > } > /* ??? Enable for loop costing as well. */ > @@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > gimple *vec_stmt; > if (slp_node) > { > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); > + gcc_assert (!loop_vinfo > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); > > /* Get the correct slp vectorized stmt. */ > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; > @@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > gimple_seq stmts = NULL; > tree new_tree; > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > + { > + /* Emit: > + > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1> > + > + where VEC_LHS is the vectorized live-out result and MASK is > + the loop mask for the final iteration. */ > + gcc_assert (ncopies == 1 && !slp_node); > + gimple_seq tem = NULL; > + gimple_stmt_iterator gsi = gsi_last (tem); > + tree len > + = vect_get_loop_len (loop_vinfo, &gsi, > + &LOOP_VINFO_LENS (loop_vinfo), > + 1, vectype, 0, 0); > + > + /* BIAS - 1. */ > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + tree bias_minus_one > + = int_const_binop (MINUS_EXPR, > + build_int_cst (TREE_TYPE (len), biasval), > + build_one_cst (TREE_TYPE (len))); > + > + /* LAST_INDEX = LEN + (BIAS - 1). */ > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), > + len, bias_minus_one); > + > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */ > + tree scalar_res > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), > + vec_lhs_phi, last_index); > + > + /* Convert the extracted vector element to the scalar type. */ > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); > + } > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > { > /* Emit: > >
Hi everyone, I have bootstrapped and regtested the patch below on s390. For the 64-bit target I do not see any changes regarding the testsuite. For the 31-bit target I see the following failures: FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors) FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr50451.c (test for excess errors) FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors) FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr53773.c (test for excess errors) FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors) FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr71407.c (test for excess errors) FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors) FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors) FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors) FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr94443.c (test for excess errors) FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors) FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr97558.c (test for excess errors) FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors) FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors) UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects scan-tree-dump-times optimized "\\* 10" 2 UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2 UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable I've randomely picked pr50451.c and ran gcc against it which results in: during GIMPLE pass: vect dump file: pr50451.c.174t.vect /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ‘foo’: /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313 0x1265d21 opt_mode<scalar_int_mode>::require() const /gcc-verify-workdir/patched/src/gcc/machmode.h:313 0x1d7e4e9 opt_mode<machine_mode>::require() const /gcc-verify-workdir/patched/src/gcc/vec.h:955 0x1d7e4e9 vect_verify_loop_lens /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471 0x1da29ab vect_analyze_loop_2 /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929 0x1da40c7 vect_analyze_loop_1 /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330 0x1da499d vect_analyze_loop(loop*, vec_info_shared*) /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484 0x1deed27 try_vectorize_loop_1 /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064 0x1deed27 try_vectorize_loop /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180 0x1def5c1 execute /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296 Please submit a full bug report, with preprocessed source (by using -freport-bug). Please include the complete backtrace with any bug report. See <https://gcc.gnu.org/bugs/> for instructions. I will come back to this tomorrow. Cheers, Stefan On Mon, Aug 14, 2023 at 08:45:21PM +0800, Kewen.Lin via Gcc-patches wrote: > Hi Juzhe, > > on 2023/8/14 20:08, juzhe.zhong@rivai.ai wrote: > > Hi, Kewin. > > > > Could you test 'can_vec_extract_var_idx_p' and send V5 patch when you pass the testing? > > The below diff was bootstrapped and regress-tested on Power10 LE. Comparing to the > previous v4, the only changes should be the proposed can_vec_extract_var_idx_p and > its required new includes as below: > > +#include "memmodel.h" > +#include "optabs.h" > > Could you have a double check? > > Since I just tested it on Power10, you have the full ownership on the patch, I'd leave > the v5 posting to you. Thanks! > > BR, > Kewen > ----- > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index bc3063c3615..5ae9f69c7eb 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -32,6 +32,8 @@ along with GCC; see the file COPYING3. If not see > #include "tree-pass.h" > #include "ssa.h" > #include "optabs-tree.h" > +#include "memmodel.h" > +#include "optabs.h" > #include "diagnostic-core.h" > #include "fold-const.h" > #include "stor-layout.h" > @@ -10300,17 +10302,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > /* No transformation required. */ > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > { > - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > - OPTIMIZE_FOR_SPEED)) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "can't operate on partial vectors " > - "because the target doesn't support extract " > - "last reduction.\n"); > - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > - } > - else if (slp_node) > + if (slp_node) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -10330,9 +10322,26 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > else > { > gcc_assert (ncopies == 1 && !slp_node); > - vect_record_loop_mask (loop_vinfo, > - &LOOP_VINFO_MASKS (loop_vinfo), > - 1, vectype, NULL); > + if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, > + OPTIMIZE_FOR_SPEED)) > + vect_record_loop_mask (loop_vinfo, > + &LOOP_VINFO_MASKS (loop_vinfo), > + 1, vectype, NULL); > + else if (can_vec_extract_var_idx_p ( > + TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) > + vect_record_loop_len (loop_vinfo, > + &LOOP_VINFO_LENS (loop_vinfo), > + 1, vectype, 1); > + else > + { > + if (dump_enabled_p ()) > + dump_printf_loc ( > + MSG_MISSED_OPTIMIZATION, vect_location, > + "can't operate on partial vectors " > + "because the target doesn't support extract " > + "last reduction.\n"); > + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > + } > } > } > /* ??? Enable for loop costing as well. */ > @@ -10358,7 +10367,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > gimple *vec_stmt; > if (slp_node) > { > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); > + gcc_assert (!loop_vinfo > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); > > /* Get the correct slp vectorized stmt. */ > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; > @@ -10402,7 +10413,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > > gimple_seq stmts = NULL; > tree new_tree; > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > + { > + /* Emit: > + > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1> > + > + where VEC_LHS is the vectorized live-out result and MASK is > + the loop mask for the final iteration. */ > + gcc_assert (ncopies == 1 && !slp_node); > + gimple_seq tem = NULL; > + gimple_stmt_iterator gsi = gsi_last (tem); > + tree len > + = vect_get_loop_len (loop_vinfo, &gsi, > + &LOOP_VINFO_LENS (loop_vinfo), > + 1, vectype, 0, 0); > + > + /* BIAS - 1. */ > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + tree bias_minus_one > + = int_const_binop (MINUS_EXPR, > + build_int_cst (TREE_TYPE (len), biasval), > + build_one_cst (TREE_TYPE (len))); > + > + /* LAST_INDEX = LEN + (BIAS - 1). */ > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), > + len, bias_minus_one); > + > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */ > + tree scalar_res > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), > + vec_lhs_phi, last_index); > + > + /* Convert the extracted vector element to the scalar type. */ > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); > + } > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > { > /* Emit:
Hi Stefan, on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote: > Hi everyone, > > I have bootstrapped and regtested the patch below on s390. For the > 64-bit target I do not see any changes regarding the testsuite. For the > 31-bit target I see the following failures: > > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors) > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr50451.c (test for excess errors) > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors) > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr53773.c (test for excess errors) > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors) > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr71407.c (test for excess errors) > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors) > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors) > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors) > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr94443.c (test for excess errors) > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors) > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr97558.c (test for excess errors) > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors) > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors) > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects scan-tree-dump-times optimized "\\* 10" 2 > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2 > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable > > I've randomely picked pr50451.c and ran gcc against it which results in: > > during GIMPLE pass: vect > dump file: pr50451.c.174t.vect > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ‘foo’: > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313 > 0x1265d21 opt_mode<scalar_int_mode>::require() const > /gcc-verify-workdir/patched/src/gcc/machmode.h:313 > 0x1d7e4e9 opt_mode<machine_mode>::require() const > /gcc-verify-workdir/patched/src/gcc/vec.h:955 > 0x1d7e4e9 vect_verify_loop_lens > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471 > 0x1da29ab vect_analyze_loop_2 > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929 > 0x1da40c7 vect_analyze_loop_1 > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330 > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*) > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484 > 0x1deed27 try_vectorize_loop_1 > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064 > 0x1deed27 try_vectorize_loop > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180 > 0x1def5c1 execute > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296 > Please submit a full bug report, with preprocessed source (by using -freport-bug). > Please include the complete backtrace with any bug report. > See <https://gcc.gnu.org/bugs/> for instructions. > It looks like s390 supports variable index vec_extract at -m31 but no vector with length. It seems we need to further check the vector with length capability, with something like: diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 5ae9f69c7eb..ef754467baf 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), 1, vectype, NULL); - else if (can_vec_extract_var_idx_p ( + else if (get_len_load_store_mode (TYPE_MODE (vectype), true) + .exists () + && can_vec_extract_var_idx_p ( TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) - vect_record_loop_len (loop_vinfo, - &LOOP_VINFO_LENS (loop_vinfo), + vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo), 1, vectype, 1); else { sigh, the formatting looks odd. BR, Kewen
On Tue, 15 Aug 2023, Kewen.Lin wrote: > Hi Stefan, > > on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote: > > Hi everyone, > > > > I have bootstrapped and regtested the patch below on s390. For the > > 64-bit target I do not see any changes regarding the testsuite. For the > > 31-bit target I see the following failures: > > > > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors) > > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr50451.c (test for excess errors) > > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors) > > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr53773.c (test for excess errors) > > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors) > > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr71407.c (test for excess errors) > > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors) > > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors) > > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors) > > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr94443.c (test for excess errors) > > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors) > > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr97558.c (test for excess errors) > > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors) > > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors) > > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable > > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects scan-tree-dump-times optimized "\\* 10" 2 > > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2 > > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable > > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable > > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable > > > > I've randomely picked pr50451.c and ran gcc against it which results in: > > > > during GIMPLE pass: vect > > dump file: pr50451.c.174t.vect > > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?: > > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313 > > 0x1265d21 opt_mode<scalar_int_mode>::require() const > > /gcc-verify-workdir/patched/src/gcc/machmode.h:313 > > 0x1d7e4e9 opt_mode<machine_mode>::require() const > > /gcc-verify-workdir/patched/src/gcc/vec.h:955 > > 0x1d7e4e9 vect_verify_loop_lens > > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471 > > 0x1da29ab vect_analyze_loop_2 > > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929 > > 0x1da40c7 vect_analyze_loop_1 > > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330 > > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*) > > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484 > > 0x1deed27 try_vectorize_loop_1 > > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064 > > 0x1deed27 try_vectorize_loop > > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180 > > 0x1def5c1 execute > > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296 > > Please submit a full bug report, with preprocessed source (by using -freport-bug). > > Please include the complete backtrace with any bug report. > > See <https://gcc.gnu.org/bugs/> for instructions. > > > > It looks like s390 supports variable index vec_extract at -m31 but > no vector with length. It seems we need to further check the vector > with length capability, with something like: > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 5ae9f69c7eb..ef754467baf 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > vect_record_loop_mask (loop_vinfo, > &LOOP_VINFO_MASKS (loop_vinfo), > 1, vectype, NULL); > - else if (can_vec_extract_var_idx_p ( > + else if (get_len_load_store_mode (TYPE_MODE (vectype), true) > + .exists () > + && can_vec_extract_var_idx_p ( > TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) > - vect_record_loop_len (loop_vinfo, > - &LOOP_VINFO_LENS (loop_vinfo), > + vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo), > 1, vectype, 1); > else > { > > sigh, the formatting looks odd. I think the error is in vect_verify_loop_lens which assumes that when we record _any_ length related op the target has to support both len_load and len_store. Now that we have many other _len functions that's certainly not true. Instead a vect_verify_loop_lens-local "fix" would be to not use .require () but instead when !.exists () simply return false. That would still effectively require both len-load and len-store for any -len predicated loop, but at least avoid the ICE. Richard.
Richard Biener <rguenther@suse.de> writes: > On Tue, 15 Aug 2023, Kewen.Lin wrote: > >> Hi Stefan, >> >> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote: >> > Hi everyone, >> > >> > I have bootstrapped and regtested the patch below on s390. For the >> > 64-bit target I do not see any changes regarding the testsuite. For the >> > 31-bit target I see the following failures: >> > >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors) >> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors) >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors) >> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors) >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors) >> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors) >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors) >> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors) >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors) >> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors) >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors) >> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors) >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors) >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors) >> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable >> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects scan-tree-dump-times optimized "\\* 10" 2 >> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2 >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable >> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable >> > >> > I've randomely picked pr50451.c and ran gcc against it which results in: >> > >> > during GIMPLE pass: vect >> > dump file: pr50451.c.174t.vect >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?: >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313 >> > 0x1265d21 opt_mode<scalar_int_mode>::require() const >> > /gcc-verify-workdir/patched/src/gcc/machmode.h:313 >> > 0x1d7e4e9 opt_mode<machine_mode>::require() const >> > /gcc-verify-workdir/patched/src/gcc/vec.h:955 >> > 0x1d7e4e9 vect_verify_loop_lens >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471 >> > 0x1da29ab vect_analyze_loop_2 >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929 >> > 0x1da40c7 vect_analyze_loop_1 >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330 >> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*) >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484 >> > 0x1deed27 try_vectorize_loop_1 >> > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064 >> > 0x1deed27 try_vectorize_loop >> > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180 >> > 0x1def5c1 execute >> > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296 >> > Please submit a full bug report, with preprocessed source (by using -freport-bug). >> > Please include the complete backtrace with any bug report. >> > See <https://gcc.gnu.org/bugs/> for instructions. >> > >> >> It looks like s390 supports variable index vec_extract at -m31 but >> no vector with length. It seems we need to further check the vector >> with length capability, with something like: >> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc >> index 5ae9f69c7eb..ef754467baf 100644 >> --- a/gcc/tree-vect-loop.cc >> +++ b/gcc/tree-vect-loop.cc >> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, >> vect_record_loop_mask (loop_vinfo, >> &LOOP_VINFO_MASKS (loop_vinfo), >> 1, vectype, NULL); >> - else if (can_vec_extract_var_idx_p ( >> + else if (get_len_load_store_mode (TYPE_MODE (vectype), true) >> + .exists () >> + && can_vec_extract_var_idx_p ( >> TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) >> - vect_record_loop_len (loop_vinfo, >> - &LOOP_VINFO_LENS (loop_vinfo), >> + vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo), >> 1, vectype, 1); >> else >> { >> >> sigh, the formatting looks odd. > > I think the error is in vect_verify_loop_lens which assumes that > when we record _any_ length related op the target has to support > both len_load and len_store. Now that we have many other _len > functions that's certainly not true. > > Instead a vect_verify_loop_lens-local "fix" would be to not use > .require () but instead when !.exists () simply return false. > That would still effectively require both len-load and len-store > for any -len predicated loop, but at least avoid the ICE. Yeah, agree that would be the simplest workaround. But I think instead we should require vectorizable_load and vectorizable_store to record the bias that they want to use (perhaps in a hash_set?). Then vect_verify_loop_lens can return false if the set has more than one element. It can use a bias of 0 if the set is empty. Thanks, Richard
On Tue, 15 Aug 2023, Richard Sandiford wrote: > Richard Biener <rguenther@suse.de> writes: > > On Tue, 15 Aug 2023, Kewen.Lin wrote: > > > >> Hi Stefan, > >> > >> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote: > >> > Hi everyone, > >> > > >> > I have bootstrapped and regtested the patch below on s390. For the > >> > 64-bit target I do not see any changes regarding the testsuite. For the > >> > 31-bit target I see the following failures: > >> > > >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors) > >> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors) > >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors) > >> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors) > >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors) > >> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors) > >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors) > >> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors) > >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors) > >> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors) > >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors) > >> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors) > >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors) > >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) > >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors) > >> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable > >> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects scan-tree-dump-times optimized "\\* 10" 2 > >> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2 > >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable > >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable > >> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable > >> > > >> > I've randomely picked pr50451.c and ran gcc against it which results in: > >> > > >> > during GIMPLE pass: vect > >> > dump file: pr50451.c.174t.vect > >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?: > >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313 > >> > 0x1265d21 opt_mode<scalar_int_mode>::require() const > >> > /gcc-verify-workdir/patched/src/gcc/machmode.h:313 > >> > 0x1d7e4e9 opt_mode<machine_mode>::require() const > >> > /gcc-verify-workdir/patched/src/gcc/vec.h:955 > >> > 0x1d7e4e9 vect_verify_loop_lens > >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471 > >> > 0x1da29ab vect_analyze_loop_2 > >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929 > >> > 0x1da40c7 vect_analyze_loop_1 > >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330 > >> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*) > >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484 > >> > 0x1deed27 try_vectorize_loop_1 > >> > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064 > >> > 0x1deed27 try_vectorize_loop > >> > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180 > >> > 0x1def5c1 execute > >> > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296 > >> > Please submit a full bug report, with preprocessed source (by using -freport-bug). > >> > Please include the complete backtrace with any bug report. > >> > See <https://gcc.gnu.org/bugs/> for instructions. > >> > > >> > >> It looks like s390 supports variable index vec_extract at -m31 but > >> no vector with length. It seems we need to further check the vector > >> with length capability, with something like: > >> > >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > >> index 5ae9f69c7eb..ef754467baf 100644 > >> --- a/gcc/tree-vect-loop.cc > >> +++ b/gcc/tree-vect-loop.cc > >> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, > >> vect_record_loop_mask (loop_vinfo, > >> &LOOP_VINFO_MASKS (loop_vinfo), > >> 1, vectype, NULL); > >> - else if (can_vec_extract_var_idx_p ( > >> + else if (get_len_load_store_mode (TYPE_MODE (vectype), true) > >> + .exists () > >> + && can_vec_extract_var_idx_p ( > >> TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) > >> - vect_record_loop_len (loop_vinfo, > >> - &LOOP_VINFO_LENS (loop_vinfo), > >> + vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo), > >> 1, vectype, 1); > >> else > >> { > >> > >> sigh, the formatting looks odd. > > > > I think the error is in vect_verify_loop_lens which assumes that > > when we record _any_ length related op the target has to support > > both len_load and len_store. Now that we have many other _len > > functions that's certainly not true. > > > > Instead a vect_verify_loop_lens-local "fix" would be to not use > > .require () but instead when !.exists () simply return false. > > That would still effectively require both len-load and len-store > > for any -len predicated loop, but at least avoid the ICE. > > Yeah, agree that would be the simplest workaround. But I think > instead we should require vectorizable_load and vectorizable_store > to record the bias that they want to use (perhaps in a hash_set?). > Then vect_verify_loop_lens can return false if the set has more > than one element. It can use a bias of 0 if the set is empty. But with all the other _LEN fns now also having a bias, never quering it but using the one from len_{load,store} having that supported looks like a requirement? OTOH if we would require querying it for each used _LEN fn then supporting multiple different biases wouldn't be an issue either? Richard.
Richard Biener <rguenther@suse.de> writes: > On Tue, 15 Aug 2023, Richard Sandiford wrote: > >> Richard Biener <rguenther@suse.de> writes: >> > On Tue, 15 Aug 2023, Kewen.Lin wrote: >> > >> >> Hi Stefan, >> >> >> >> on 2023/8/15 02:51, Stefan Schulze Frielinghaus wrote: >> >> > Hi everyone, >> >> > >> >> > I have bootstrapped and regtested the patch below on s390. For the >> >> > 64-bit target I do not see any changes regarding the testsuite. For the >> >> > 31-bit target I see the following failures: >> >> > >> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/no-scevccp-outer-14.c (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr50451.c (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr50451.c (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr50451.c -flto -ffat-lto-objects (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr53773.c (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr53773.c (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr71407.c (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr71407.c (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr71407.c -flto -ffat-lto-objects (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr71416-1.c (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr71416-1.c (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr94443.c (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr94443.c (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr94443.c -flto -ffat-lto-objects (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr97558.c (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr97558.c (test for excess errors) >> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/pr97558.c -flto -ffat-lto-objects (test for excess errors) >> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (internal compiler error: in require, at machmode.h:313) >> >> > FAIL: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects (test for excess errors) >> >> > UNRESOLVED: gcc.dg/vect/no-scevccp-outer-14.c compilation failed to produce executable >> >> > UNRESOLVED: gcc.dg/vect/pr53773.c -flto -ffat-lto-objects scan-tree-dump-times optimized "\\* 10" 2 >> >> > UNRESOLVED: gcc.dg/vect/pr53773.c scan-tree-dump-times optimized "\\* 10" 2 >> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c -flto -ffat-lto-objects compilation failed to produce executable >> >> > UNRESOLVED: gcc.dg/vect/pr71416-1.c compilation failed to produce executable >> >> > UNRESOLVED: gcc.dg/vect/vect-reduc-pattern-3.c -flto -ffat-lto-objects compilation failed to produce executable >> >> > >> >> > I've randomely picked pr50451.c and ran gcc against it which results in: >> >> > >> >> > during GIMPLE pass: vect >> >> > dump file: pr50451.c.174t.vect >> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c: In function ?foo?: >> >> > /gcc-verify-workdir/patched/src/gcc/testsuite/gcc.dg/vect/pr50451.c:5:1: internal compiler error: in require, at machmode.h:313 >> >> > 0x1265d21 opt_mode<scalar_int_mode>::require() const >> >> > /gcc-verify-workdir/patched/src/gcc/machmode.h:313 >> >> > 0x1d7e4e9 opt_mode<machine_mode>::require() const >> >> > /gcc-verify-workdir/patched/src/gcc/vec.h:955 >> >> > 0x1d7e4e9 vect_verify_loop_lens >> >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:1471 >> >> > 0x1da29ab vect_analyze_loop_2 >> >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:2929 >> >> > 0x1da40c7 vect_analyze_loop_1 >> >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3330 >> >> > 0x1da499d vect_analyze_loop(loop*, vec_info_shared*) >> >> > /gcc-verify-workdir/patched/src/gcc/tree-vect-loop.cc:3484 >> >> > 0x1deed27 try_vectorize_loop_1 >> >> > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1064 >> >> > 0x1deed27 try_vectorize_loop >> >> > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1180 >> >> > 0x1def5c1 execute >> >> > /gcc-verify-workdir/patched/src/gcc/tree-vectorizer.cc:1296 >> >> > Please submit a full bug report, with preprocessed source (by using -freport-bug). >> >> > Please include the complete backtrace with any bug report. >> >> > See <https://gcc.gnu.org/bugs/> for instructions. >> >> > >> >> >> >> It looks like s390 supports variable index vec_extract at -m31 but >> >> no vector with length. It seems we need to further check the vector >> >> with length capability, with something like: >> >> >> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc >> >> index 5ae9f69c7eb..ef754467baf 100644 >> >> --- a/gcc/tree-vect-loop.cc >> >> +++ b/gcc/tree-vect-loop.cc >> >> @@ -10327,10 +10327,11 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, >> >> vect_record_loop_mask (loop_vinfo, >> >> &LOOP_VINFO_MASKS (loop_vinfo), >> >> 1, vectype, NULL); >> >> - else if (can_vec_extract_var_idx_p ( >> >> + else if (get_len_load_store_mode (TYPE_MODE (vectype), true) >> >> + .exists () >> >> + && can_vec_extract_var_idx_p ( >> >> TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype)))) >> >> - vect_record_loop_len (loop_vinfo, >> >> - &LOOP_VINFO_LENS (loop_vinfo), >> >> + vect_record_loop_len (loop_vinfo, &LOOP_VINFO_LENS (loop_vinfo), >> >> 1, vectype, 1); >> >> else >> >> { >> >> >> >> sigh, the formatting looks odd. >> > >> > I think the error is in vect_verify_loop_lens which assumes that >> > when we record _any_ length related op the target has to support >> > both len_load and len_store. Now that we have many other _len >> > functions that's certainly not true. >> > >> > Instead a vect_verify_loop_lens-local "fix" would be to not use >> > .require () but instead when !.exists () simply return false. >> > That would still effectively require both len-load and len-store >> > for any -len predicated loop, but at least avoid the ICE. >> >> Yeah, agree that would be the simplest workaround. But I think >> instead we should require vectorizable_load and vectorizable_store >> to record the bias that they want to use (perhaps in a hash_set?). >> Then vect_verify_loop_lens can return false if the set has more >> than one element. It can use a bias of 0 if the set is empty. > > But with all the other _LEN fns now also having a bias, never > quering it but using the one from len_{load,store} having > that supported looks like a requirement? OTOH if we would require > querying it for each used _LEN fn then supporting multiple > different biases wouldn't be an issue either? Yeah, the OTOH is what I think we should do. The bias stuff dates back to when loads and stores were the only LEN_* operations. The reason it checks both loads and stores is to ensure consistency between "all" length operations. If instead we took the position that LEN_LOAD determines the biases for everything, we wouldn't need to check LEN_STORE biases. But the biases need to be consistent because they're built into the IVs. In principle we could create a different LEN input for each bias, but that's pointless when no target needs it. Thanks, Richard
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index bf8d677b584..a011e2dacb2 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10278,17 +10278,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, /* No transformation required. */ if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) { - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, - OPTIMIZE_FOR_SPEED)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "can't operate on partial vectors " - "because the target doesn't support extract " - "last reduction.\n"); - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; - } - else if (slp_node) + if (slp_node) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -10308,9 +10298,28 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, else { gcc_assert (ncopies == 1 && !slp_node); - vect_record_loop_mask (loop_vinfo, - &LOOP_VINFO_MASKS (loop_vinfo), - 1, vectype, NULL); + if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, + OPTIMIZE_FOR_SPEED)) + vect_record_loop_mask (loop_vinfo, + &LOOP_VINFO_MASKS (loop_vinfo), + 1, vectype, NULL); + else if (convert_optab_handler (vec_extract_optab, + TYPE_MODE (vectype), + TYPE_MODE (TREE_TYPE (vectype))) + != CODE_FOR_nothing) + vect_record_loop_len (loop_vinfo, + &LOOP_VINFO_LENS (loop_vinfo), + 1, vectype, 1); + else + { + if (dump_enabled_p ()) + dump_printf_loc ( + MSG_MISSED_OPTIMIZATION, vect_location, + "can't operate on partial vectors " + "because the target doesn't support extract " + "last reduction.\n"); + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; + } } } /* ??? Enable for loop costing as well. */ @@ -10336,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, gimple *vec_stmt; if (slp_node) { - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); + gcc_assert (!loop_vinfo + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))); /* Get the correct slp vectorized stmt. */ vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; @@ -10380,7 +10391,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, gimple_seq stmts = NULL; tree new_tree; - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + { + /* Emit: + + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1> + + where VEC_LHS is the vectorized live-out result and MASK is + the loop mask for the final iteration. */ + gcc_assert (ncopies == 1 && !slp_node); + gimple_seq tem = NULL; + gimple_stmt_iterator gsi = gsi_last (tem); + tree len + = vect_get_loop_len (loop_vinfo, &gsi, + &LOOP_VINFO_LENS (loop_vinfo), + 1, vectype, 0, 0); + + /* BIAS - 1. */ + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); + tree bias_minus_one + = int_const_binop (MINUS_EXPR, + build_int_cst (TREE_TYPE (len), biasval), + build_one_cst (TREE_TYPE (len))); + + /* LAST_INDEX = LEN + (BIAS - 1). */ + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len), + len, bias_minus_one); + + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */ + tree scalar_res + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype), + vec_lhs_phi, last_index); + + /* Convert the extracted vector element to the scalar type. */ + new_tree = gimple_convert (&stmts, lhs_type, scalar_res); + } + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) { /* Emit:
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai> Hi, Richard and Richi. This patch add support live vectorization by VEC_EXTRACT for LEN loop control. Consider this following case: #include <stdint.h> #define EXTRACT_LAST(TYPE) \ TYPE __attribute__ ((noinline, noclone)) \ test_##TYPE (TYPE *x, int n, TYPE value) \ { \ TYPE last; \ for (int j = 0; j < n; ++j) \ { \ last = x[j]; \ x[j] = last * value; \ } \ return last; \ } #define TEST_ALL(T) \ T (uint8_t) \ TEST_ALL (EXTRACT_LAST) ARM SVE IR: Preheader: max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... }); Loop: ... # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> ... vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22); vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27); ... next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... }); ... Epilogue: _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); For RVV since we prefer len in loop control, after this patch for RVV: Loop: ... loop_len_22 = SELECT_VL; vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22); vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27); ... Epilogue: _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23); Details of this approach: 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization for LEN loop control. This function we check whether target support: - Use LEN as the loop control. - Support VEC_EXTRACT optab. 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true. 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1). The only difference between mask and len is that len is using length generated by SELECT_VL and use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE. gcc/ChangeLog: * tree-vect-loop.cc (vectorizable_live_operation): Add loop len control. --- gcc/tree-vect-loop.cc | 78 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 16 deletions(-)