Message ID | 814943f9-df8a-3049-f726-fa17f6f8d3b3@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [V2] PR88497 - Extend reassoc for vector bit_field_ref | expand |
On 3/12/19 7:57 AM, Kewen.Lin wrote: > Hi, > > As the comments from Richard and Segher (Thanks!), I've made some > changes by adding some checks and extensions. > *) Check whether vector type available on target machine, > *) Check whether vector operation available on target machine, > *) Extend the vector operation support to MULT/BIT_AND/BIT_IOR/BIT_XOR. > *) Add more test cases for coverage. > > Re-bootstrapped/re-regtested successfully on powerpc64le-linux-gnu, > is it ok for GCC10? > > gcc/ChangeLog > > 2019-03-12 Kewen Lin <linkw@gcc.gnu.org> > > PR target/88497 > * tree-ssa-reassoc.c (reassociate_bb): Swap the positions of > GIMPLE_BINARY_RHS check and gimple_visited_p check, call new > function undistribute_bitref_for_vector. > (undistribute_bitref_for_vector): New function. > (cleanup_vinfo_map): Likewise. > (unsigned_cmp): Likewise. > > gcc/testsuite/ChangeLog > > 2019-03-12 Kewen Lin <linkw@gcc.gnu.org> > > * gcc.dg/tree-ssa/pr88497-1.c: New test. > * gcc.dg/tree-ssa/pr88497-2.c: Likewise. > * gcc.dg/tree-ssa/pr88497-3.c: Likewise. > * gcc.dg/tree-ssa/pr88497-4.c: Likewise. > * gcc.dg/tree-ssa/pr88497-5.c: Likewise. > --- > gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c | 42 ++++ > gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c | 31 +++ > gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c | 31 +++ > gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c | 31 +++ > gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c | 31 +++ > gcc/tree-ssa-reassoc.c | 309 +++++++++++++++++++++++++++++- > 6 files changed, 470 insertions(+), 5 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c > new file mode 100644 > index 0000000..c87caff > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c > @@ -0,0 +1,42 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_double } */ > +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ > + > +/* To test reassoc can undistribute vector bit_field_ref summation. > + > + arg1 and arg2 are two arrays whose elements of type vector double. > + Assuming: > + A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3], > + B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3], > + > + Then: > + V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3, > + > + reassoc transforms > + > + accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1] > + + V3[0] + V3[1]; > + > + into: > + > + T = V0 + V1 + V2 + V3 > + accumulator += T[0] + T[1]; > + > + Fewer bit_field_refs, only two for 128 or more bits vector. */ > + > +typedef double v2df __attribute__ ((vector_size (16))); > +double > +test (double accumulator, v2df arg1[], v2df arg2[]) > +{ > + v2df temp; > + temp = arg1[0] * arg2[0]; > + accumulator += temp[0] + temp[1]; > + temp = arg1[1] * arg2[1]; > + accumulator += temp[0] + temp[1]; > + temp = arg1[2] * arg2[2]; > + accumulator += temp[0] + temp[1]; > + temp = arg1[3] * arg2[3]; > + accumulator += temp[0] + temp[1]; > + return accumulator; > +} > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c > new file mode 100644 > index 0000000..d1851ff > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c > @@ -0,0 +1,31 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_float } */ > +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ > + > +/* To test reassoc can undistribute vector bit_field_ref on multiplication. > + > + v1, v2, v3, v4 of type vector float. > + > + reassoc transforms > + > + accumulator *= v1[0] * v1[1] * v1[2] * v1[3] * > + v2[0] * v2[1] * v2[2] * v2[3] * > + v3[0] * v3[1] * v3[2] * v3[3] * > + v4[0] * v4[1] * v4[2] * v4[3] ; > + > + into: > + > + T = v1 * v2 * v3 * v4; > + accumulator *= T[0] * T[1] * T[2] * T[3]; > + > + Fewer bit_field_refs, only four for 128 or more bits vector. */ > + > +typedef float v4si __attribute__((vector_size(16))); > +float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { > + accumulator *= v1[0] * v1[1] * v1[2] * v1[3]; > + accumulator *= v2[0] * v2[1] * v2[2] * v2[3]; > + accumulator *= v3[0] * v3[1] * v3[2] * v3[3]; > + accumulator *= v4[0] * v4[1] * v4[2] * v4[3]; > + return accumulator; > +} > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c > new file mode 100644 > index 0000000..e774d25 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c > @@ -0,0 +1,31 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-options "-O2 -fdump-tree-reassoc1" } */ > + > +/* To test reassoc can undistribute vector bit_field_ref on bitwise AND. > + > + v1, v2, v3, v4 of type vector int. > + > + reassoc transforms > + > + accumulator &= v1[0] & v1[1] & v1[2] & v1[3] & > + v2[0] & v2[1] & v2[2] & v2[3] & > + v3[0] & v3[1] & v3[2] & v3[3] & > + v4[0] & v4[1] & v4[2] & v4[3] ; > + > + into: > + > + T = v1 & v2 & v3 & v4; > + accumulator &= T[0] & T[1] & T[2] & T[3]; > + > + Fewer bit_field_refs, only four for 128 or more bits vector. */ > + > +typedef int v4si __attribute__((vector_size(16))); > +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { > + accumulator &= v1[0] & v1[1] & v1[2] & v1[3]; > + accumulator &= v2[0] & v2[1] & v2[2] & v2[3]; > + accumulator &= v3[0] & v3[1] & v3[2] & v3[3]; > + accumulator &= v4[0] & v4[1] & v4[2] & v4[3]; > + return accumulator; > +} > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c > new file mode 100644 > index 0000000..7b75404 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c > @@ -0,0 +1,31 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-options "-O2 -fdump-tree-reassoc1" } */ > + > +/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR. > + > + v1, v2, v3, v4 of type vector int. > + > + reassoc transforms > + > + accumulator |= v1[0] | v1[1] | v1[2] | v1[3] | > + v2[0] | v2[1] | v2[2] | v2[3] | > + v3[0] | v3[1] | v3[2] | v3[3] | > + v4[0] | v4[1] | v4[2] | v4[3] ; > + > + into: > + > + T = v1 | v2 | v3 | v4; > + accumulator |= T[0] | T[1] | T[2] | T[3]; > + > + Fewer bit_field_refs, only four for 128 or more bits vector. */ > + > +typedef int v4si __attribute__((vector_size(16))); > +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { > + accumulator |= v1[0] | v1[1] | v1[2] | v1[3]; > + accumulator |= v2[0] | v2[1] | v2[2] | v2[3]; > + accumulator |= v3[0] | v3[1] | v3[2] | v3[3]; > + accumulator |= v4[0] | v4[1] | v4[2] | v4[3]; > + return accumulator; > +} > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c > new file mode 100644 > index 0000000..d069725 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c > @@ -0,0 +1,31 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ > + > +/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR. > + > + v1, v2, v3, v4 of type vector int. > + > + reassoc transforms > + > + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^ > + v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^ > + v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^ > + v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ; > + > + into: > + > + T = v1 ^ v2 ^ v3 ^ v4; > + accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3]; > + > + Fewer bit_field_refs, only four for 128 or more bits vector. */ > + > +typedef int v4si __attribute__((vector_size(16))); > +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { > + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3]; > + accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3]; > + accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3]; > + accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3]; > + return accumulator; > +} > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ > diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c > index e1c4dfe..d755911 100644 > --- a/gcc/tree-ssa-reassoc.c > +++ b/gcc/tree-ssa-reassoc.c > @@ -1772,6 +1772,298 @@ undistribute_ops_list (enum tree_code opcode, > return changed; > } > > +/* Hold the information of one specific VECTOR_TYPE SSA_NAME. > + - offsets: for different BIT_FIELD_REF offsets accessing same VECTOR. > + - ops_indexes: the index of vec ops* for each relavant BIT_FIELD_REF. */ > +struct v_info > +{ > + auto_vec<unsigned HOST_WIDE_INT, 32> offsets; > + auto_vec<unsigned, 32> ops_indexes; > +}; > + > +typedef struct v_info *v_info_ptr; > + > +/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets. */ > +static int > +unsigned_cmp (const void *p_i, const void *p_j) > +{ > + if (*(const unsigned *) p_i >= *(const unsigned *) p_j) > + return 1; > + else > + return -1; > +} > + > +/* Cleanup hash map for VECTOR information. */ > +static void > +cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map) > +{ > + for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin (); > + it != info_map.end (); ++it) > + { > + v_info_ptr info = (*it).second; > + delete info; > + (*it).second = NULL; > + } > +} > + > +/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE. > + V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k] > + is transformed to > + Vs = (V1 + V2 + ... + Vn) > + Vs[0] + Vs[1] + ... + Vs[k] > + > + The basic steps are listed below: > + > + 1) Check the addition chain *OPS by looking those summands coming from > + VECTOR bit_field_ref on VECTOR type. Put the information into > + v_info_map for each satisfied summand, using VECTOR SSA_NAME as key. > + > + 2) For each key (VECTOR SSA_NAME), validate all its BIT_FIELD_REFs are > + continous, they can cover the whole VECTOR perfectly without any holes. > + Obtain one VECTOR list which contain candidates to be transformed. > + > + 3) Build the addition statements for all VECTOR candidates, generate > + BIT_FIELD_REFs accordingly. > + > + TODO: > + 1) The current implementation restrict all candidate VECTORs should have > + the same VECTOR type, but it can be extended into different groups by > + VECTOR types in future if any profitable cases found. > + 2) The current implementation requires the whole VECTORs should be fully > + covered, but it can be extended to support partial, checking adjacent > + but not fill the whole, it may need some cost model to define the > + boundary to do or not. > +*/ > +static bool > +undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops, > + struct loop *loop) > +{ > + if (ops->length () <= 1) > + return false; > + > + if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR > + && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR) > + return false; > + > + hash_map<tree, v_info_ptr> v_info_map; > + operand_entry *oe1; > + unsigned i; > + > + /* Find those summands from VECTOR BIT_FIELD_REF in addition chain, put the > + information into map. */ > + FOR_EACH_VEC_ELT (*ops, i, oe1) > + { > + enum tree_code dcode; > + gimple *oe1def; > + > + if (TREE_CODE (oe1->op) != SSA_NAME) > + continue; > + oe1def = SSA_NAME_DEF_STMT (oe1->op); > + if (!is_gimple_assign (oe1def)) > + continue; > + dcode = gimple_assign_rhs_code (oe1def); > + if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop)) > + continue; > + > + tree rhs = gimple_op (oe1def, 1); > + tree op0 = TREE_OPERAND (rhs, 0); > + tree vec_type = TREE_TYPE (op0); > + > + if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE) > + continue; > + > + tree op1 = TREE_OPERAND (rhs, 1); > + tree op2 = TREE_OPERAND (rhs, 2); > + > + tree elem_type = TREE_TYPE (vec_type); > + unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); > + if (size != TREE_INT_CST_LOW (op1)) > + continue; > + > + /* Ignore it if target machine can't support this VECTOR type. */ > + if (!VECTOR_MODE_P (TYPE_MODE (vec_type))) > + continue; I don't think this is sufficient. I think you need to use something like (targetm.vector_mode_supported_p (TYPE_MODE (vec_type)) && optab_handler (add_optab, TYPE_MODE (vec_type)) != CODE_FOR_nothing) I might not have spelled all of this right. There are examples of testing for target support in tree-vect-stmts.c. Thanks, Bill > + > + v_info_ptr *info_ptr = v_info_map.get (op0); > + if (info_ptr) > + { > + v_info_ptr info = *info_ptr; > + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); > + info->ops_indexes.safe_push (i); > + } > + else > + { > + v_info_ptr info = new v_info; > + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); > + info->ops_indexes.safe_push (i); > + v_info_map.put (op0, info); > + } > + } > + > + /* At least two VECTOR to combine. */ > + if (v_info_map.elements () <= 1) > + { > + cleanup_vinfo_map (v_info_map); > + return false; > + } > + > + /* Use the first VECTOR and its information as the reference. > + Firstly, we should validate it, that is: > + 1) required VECTOR operation supported on target machine. > + 2) sorted offsets are adjacent, no holes. > + 3) can fill the whole VECTOR perfectly. */ > + hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin (); > + tree ref_vec = (*it).first; > + tree vec_type = TREE_TYPE (ref_vec); > + tree elem_type = TREE_TYPE (vec_type); > + > + /* Check VECTOR operation available on target machine. */ > + optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector); > + if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing) > + { > + cleanup_vinfo_map (v_info_map); > + return false; > + } > + > + v_info_ptr ref_info = (*it).second; > + ref_info->offsets.qsort (unsigned_cmp); > + unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); > + unsigned HOST_WIDE_INT curr; > + unsigned HOST_WIDE_INT prev = ref_info->offsets[0]; > + > + /* Continous check. */ > + FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1) > + { > + if (curr != (prev + elem_size)) > + { > + cleanup_vinfo_map (v_info_map); > + return false; > + } > + prev = curr; > + } > + > + /* Check whether fill the whole. */ > + if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec)))) > + { > + cleanup_vinfo_map (v_info_map); > + return false; > + } > + > + auto_vec<tree> vectors (v_info_map.elements ()); > + vectors.quick_push (ref_vec); > + > + /* Use the ref_vec to filter others. */ > + for (++it; it != v_info_map.end (); ++it) > + { > + tree vec = (*it).first; > + v_info_ptr info = (*it).second; > + if (TREE_TYPE (ref_vec) != TREE_TYPE (vec)) > + continue; > + if (ref_info->offsets.length () != info->offsets.length ()) > + continue; > + bool same_offset = true; > + info->offsets.qsort (unsigned_cmp); > + for (unsigned i = 0; i < ref_info->offsets.length (); i++) > + { > + if (ref_info->offsets[i] != info->offsets[i]) > + { > + same_offset = false; > + break; > + } > + } > + if (!same_offset) > + continue; > + vectors.quick_push (vec); > + } > + > + if (vectors.length () < 2) > + { > + cleanup_vinfo_map (v_info_map); > + return false; > + } > + > + tree tr; > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "The bit_field_ref vector list for undistribute: "); > + FOR_EACH_VEC_ELT (vectors, i, tr) > + { > + print_generic_expr (dump_file, tr); > + fprintf (dump_file, " "); > + } > + fprintf (dump_file, "\n"); > + } > + > + /* Build the sum for all candidate VECTORs. */ > + unsigned idx; > + gimple *sum = NULL; > + v_info_ptr info; > + tree sum_vec = ref_vec; > + FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1) > + { > + sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode); > + info = *(v_info_map.get (tr)); > + unsigned j; > + FOR_EACH_VEC_ELT (info->ops_indexes, j, idx) > + { > + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); > + gimple_set_visited (def, true); > + if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR > + || opcode == BIT_IOR_EXPR) > + (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op)); > + else if (opcode == MULT_EXPR) > + (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op)); > + else > + { > + gcc_assert (opcode == BIT_AND_EXPR); > + (*ops)[idx]->op > + = build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op)); > + } > + (*ops)[idx]->rank = 0; > + } > + sum_vec = gimple_get_lhs (sum); > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "Generating addition -> "); > + print_gimple_stmt (dump_file, sum, 0); > + } > + } > + > + /* Referring to any good shape VECTOR (here using ref_vec), generate the > + BIT_FIELD_REF statements accordingly. */ > + info = *(v_info_map.get (ref_vec)); > + gcc_assert (sum); > + FOR_EACH_VEC_ELT (info->ops_indexes, i, idx) > + { > + tree dst = make_ssa_name (elem_type); > + gimple *gs > + = gimple_build_assign (dst, BIT_FIELD_REF, > + build3 (BIT_FIELD_REF, elem_type, sum_vec, > + TYPE_SIZE (elem_type), > + bitsize_int (info->offsets[i]))); > + insert_stmt_after (gs, sum); > + update_stmt (gs); > + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); > + gimple_set_visited (def, true); > + (*ops)[idx]->op = gimple_assign_lhs (gs); > + (*ops)[idx]->rank = get_rank ((*ops)[idx]->op); > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "Generating bit_field_ref -> "); > + print_gimple_stmt (dump_file, gs, 0); > + } > + } > + > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "undistributiong bit_field_ref for vector done.\n"); > + } > + > + cleanup_vinfo_map (v_info_map); > + > + return true; > +} > + > /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison > expression, examine the other OPS to see if any of them are comparisons > of the same values, which we may be able to combine or eliminate. > @@ -5880,11 +6172,6 @@ reassociate_bb (basic_block bb) > tree lhs, rhs1, rhs2; > enum tree_code rhs_code = gimple_assign_rhs_code (stmt); > > - /* If this is not a gimple binary expression, there is > - nothing for us to do with it. */ > - if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) > - continue; > - > /* If this was part of an already processed statement, > we don't need to touch it again. */ > if (gimple_visited_p (stmt)) > @@ -5911,6 +6198,11 @@ reassociate_bb (basic_block bb) > continue; > } > > + /* If this is not a gimple binary expression, there is > + nothing for us to do with it. */ > + if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) > + continue; > + > lhs = gimple_assign_lhs (stmt); > rhs1 = gimple_assign_rhs1 (stmt); > rhs2 = gimple_assign_rhs2 (stmt); > @@ -5950,6 +6242,13 @@ reassociate_bb (basic_block bb) > optimize_ops_list (rhs_code, &ops); > } > > + if (undistribute_bitref_for_vector (rhs_code, &ops, > + loop_containing_stmt (stmt))) > + { > + ops.qsort (sort_by_operand_rank); > + optimize_ops_list (rhs_code, &ops); > + } > + > if (rhs_code == PLUS_EXPR > && transform_add_to_multiply (&ops)) > ops.qsort (sort_by_operand_rank);
On 3/12/19 9:29 AM, Bill Schmidt wrote: > On 3/12/19 7:57 AM, Kewen.Lin wrote: >> Hi, >> >> As the comments from Richard and Segher (Thanks!), I've made some >> changes by adding some checks and extensions. >> *) Check whether vector type available on target machine, >> *) Check whether vector operation available on target machine, >> *) Extend the vector operation support to MULT/BIT_AND/BIT_IOR/BIT_XOR. >> *) Add more test cases for coverage. >> >> Re-bootstrapped/re-regtested successfully on powerpc64le-linux-gnu, >> is it ok for GCC10? >> >> gcc/ChangeLog >> >> 2019-03-12 Kewen Lin <linkw@gcc.gnu.org> >> >> PR target/88497 >> * tree-ssa-reassoc.c (reassociate_bb): Swap the positions of >> GIMPLE_BINARY_RHS check and gimple_visited_p check, call new >> function undistribute_bitref_for_vector. >> (undistribute_bitref_for_vector): New function. >> (cleanup_vinfo_map): Likewise. >> (unsigned_cmp): Likewise. >> >> gcc/testsuite/ChangeLog >> >> 2019-03-12 Kewen Lin <linkw@gcc.gnu.org> >> >> * gcc.dg/tree-ssa/pr88497-1.c: New test. >> * gcc.dg/tree-ssa/pr88497-2.c: Likewise. >> * gcc.dg/tree-ssa/pr88497-3.c: Likewise. >> * gcc.dg/tree-ssa/pr88497-4.c: Likewise. >> * gcc.dg/tree-ssa/pr88497-5.c: Likewise. >> --- >> gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c | 42 ++++ >> gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c | 31 +++ >> gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c | 31 +++ >> gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c | 31 +++ >> gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c | 31 +++ >> gcc/tree-ssa-reassoc.c | 309 +++++++++++++++++++++++++++++- >> 6 files changed, 470 insertions(+), 5 deletions(-) >> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c >> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c >> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c >> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c >> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c >> >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c >> new file mode 100644 >> index 0000000..c87caff >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c >> @@ -0,0 +1,42 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target vect_double } */ >> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ >> + >> +/* To test reassoc can undistribute vector bit_field_ref summation. >> + >> + arg1 and arg2 are two arrays whose elements of type vector double. >> + Assuming: >> + A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3], >> + B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3], >> + >> + Then: >> + V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3, >> + >> + reassoc transforms >> + >> + accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1] >> + + V3[0] + V3[1]; >> + >> + into: >> + >> + T = V0 + V1 + V2 + V3 >> + accumulator += T[0] + T[1]; >> + >> + Fewer bit_field_refs, only two for 128 or more bits vector. */ >> + >> +typedef double v2df __attribute__ ((vector_size (16))); >> +double >> +test (double accumulator, v2df arg1[], v2df arg2[]) >> +{ >> + v2df temp; >> + temp = arg1[0] * arg2[0]; >> + accumulator += temp[0] + temp[1]; >> + temp = arg1[1] * arg2[1]; >> + accumulator += temp[0] + temp[1]; >> + temp = arg1[2] * arg2[2]; >> + accumulator += temp[0] + temp[1]; >> + temp = arg1[3] * arg2[3]; >> + accumulator += temp[0] + temp[1]; >> + return accumulator; >> +} >> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */ >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c >> new file mode 100644 >> index 0000000..d1851ff >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c >> @@ -0,0 +1,31 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target vect_float } */ >> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ >> + >> +/* To test reassoc can undistribute vector bit_field_ref on multiplication. >> + >> + v1, v2, v3, v4 of type vector float. >> + >> + reassoc transforms >> + >> + accumulator *= v1[0] * v1[1] * v1[2] * v1[3] * >> + v2[0] * v2[1] * v2[2] * v2[3] * >> + v3[0] * v3[1] * v3[2] * v3[3] * >> + v4[0] * v4[1] * v4[2] * v4[3] ; >> + >> + into: >> + >> + T = v1 * v2 * v3 * v4; >> + accumulator *= T[0] * T[1] * T[2] * T[3]; >> + >> + Fewer bit_field_refs, only four for 128 or more bits vector. */ >> + >> +typedef float v4si __attribute__((vector_size(16))); >> +float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { >> + accumulator *= v1[0] * v1[1] * v1[2] * v1[3]; >> + accumulator *= v2[0] * v2[1] * v2[2] * v2[3]; >> + accumulator *= v3[0] * v3[1] * v3[2] * v3[3]; >> + accumulator *= v4[0] * v4[1] * v4[2] * v4[3]; >> + return accumulator; >> +} >> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c >> new file mode 100644 >> index 0000000..e774d25 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c >> @@ -0,0 +1,31 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */ >> + >> +/* To test reassoc can undistribute vector bit_field_ref on bitwise AND. >> + >> + v1, v2, v3, v4 of type vector int. >> + >> + reassoc transforms >> + >> + accumulator &= v1[0] & v1[1] & v1[2] & v1[3] & >> + v2[0] & v2[1] & v2[2] & v2[3] & >> + v3[0] & v3[1] & v3[2] & v3[3] & >> + v4[0] & v4[1] & v4[2] & v4[3] ; >> + >> + into: >> + >> + T = v1 & v2 & v3 & v4; >> + accumulator &= T[0] & T[1] & T[2] & T[3]; >> + >> + Fewer bit_field_refs, only four for 128 or more bits vector. */ >> + >> +typedef int v4si __attribute__((vector_size(16))); >> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { >> + accumulator &= v1[0] & v1[1] & v1[2] & v1[3]; >> + accumulator &= v2[0] & v2[1] & v2[2] & v2[3]; >> + accumulator &= v3[0] & v3[1] & v3[2] & v3[3]; >> + accumulator &= v4[0] & v4[1] & v4[2] & v4[3]; >> + return accumulator; >> +} >> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c >> new file mode 100644 >> index 0000000..7b75404 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c >> @@ -0,0 +1,31 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */ >> + >> +/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR. >> + >> + v1, v2, v3, v4 of type vector int. >> + >> + reassoc transforms >> + >> + accumulator |= v1[0] | v1[1] | v1[2] | v1[3] | >> + v2[0] | v2[1] | v2[2] | v2[3] | >> + v3[0] | v3[1] | v3[2] | v3[3] | >> + v4[0] | v4[1] | v4[2] | v4[3] ; >> + >> + into: >> + >> + T = v1 | v2 | v3 | v4; >> + accumulator |= T[0] | T[1] | T[2] | T[3]; >> + >> + Fewer bit_field_refs, only four for 128 or more bits vector. */ >> + >> +typedef int v4si __attribute__((vector_size(16))); >> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { >> + accumulator |= v1[0] | v1[1] | v1[2] | v1[3]; >> + accumulator |= v2[0] | v2[1] | v2[2] | v2[3]; >> + accumulator |= v3[0] | v3[1] | v3[2] | v3[3]; >> + accumulator |= v4[0] | v4[1] | v4[2] | v4[3]; >> + return accumulator; >> +} >> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c >> new file mode 100644 >> index 0000000..d069725 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c >> @@ -0,0 +1,31 @@ >> +/* { dg-do compile } */ >> +/* { dg-require-effective-target vect_int } */ >> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ >> + >> +/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR. >> + >> + v1, v2, v3, v4 of type vector int. >> + >> + reassoc transforms >> + >> + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^ >> + v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^ >> + v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^ >> + v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ; >> + >> + into: >> + >> + T = v1 ^ v2 ^ v3 ^ v4; >> + accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3]; >> + >> + Fewer bit_field_refs, only four for 128 or more bits vector. */ >> + >> +typedef int v4si __attribute__((vector_size(16))); >> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { >> + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3]; >> + accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3]; >> + accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3]; >> + accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3]; >> + return accumulator; >> +} >> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ >> diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c >> index e1c4dfe..d755911 100644 >> --- a/gcc/tree-ssa-reassoc.c >> +++ b/gcc/tree-ssa-reassoc.c >> @@ -1772,6 +1772,298 @@ undistribute_ops_list (enum tree_code opcode, >> return changed; >> } >> >> +/* Hold the information of one specific VECTOR_TYPE SSA_NAME. >> + - offsets: for different BIT_FIELD_REF offsets accessing same VECTOR. >> + - ops_indexes: the index of vec ops* for each relavant BIT_FIELD_REF. */ >> +struct v_info >> +{ >> + auto_vec<unsigned HOST_WIDE_INT, 32> offsets; >> + auto_vec<unsigned, 32> ops_indexes; >> +}; >> + >> +typedef struct v_info *v_info_ptr; >> + >> +/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets. */ >> +static int >> +unsigned_cmp (const void *p_i, const void *p_j) >> +{ >> + if (*(const unsigned *) p_i >= *(const unsigned *) p_j) >> + return 1; >> + else >> + return -1; >> +} >> + >> +/* Cleanup hash map for VECTOR information. */ >> +static void >> +cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map) >> +{ >> + for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin (); >> + it != info_map.end (); ++it) >> + { >> + v_info_ptr info = (*it).second; >> + delete info; >> + (*it).second = NULL; >> + } >> +} >> + >> +/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE. >> + V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k] >> + is transformed to >> + Vs = (V1 + V2 + ... + Vn) >> + Vs[0] + Vs[1] + ... + Vs[k] >> + >> + The basic steps are listed below: >> + >> + 1) Check the addition chain *OPS by looking those summands coming from >> + VECTOR bit_field_ref on VECTOR type. Put the information into >> + v_info_map for each satisfied summand, using VECTOR SSA_NAME as key. >> + >> + 2) For each key (VECTOR SSA_NAME), validate all its BIT_FIELD_REFs are >> + continous, they can cover the whole VECTOR perfectly without any holes. >> + Obtain one VECTOR list which contain candidates to be transformed. >> + >> + 3) Build the addition statements for all VECTOR candidates, generate >> + BIT_FIELD_REFs accordingly. >> + >> + TODO: >> + 1) The current implementation restrict all candidate VECTORs should have >> + the same VECTOR type, but it can be extended into different groups by >> + VECTOR types in future if any profitable cases found. >> + 2) The current implementation requires the whole VECTORs should be fully >> + covered, but it can be extended to support partial, checking adjacent >> + but not fill the whole, it may need some cost model to define the >> + boundary to do or not. >> +*/ >> +static bool >> +undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops, >> + struct loop *loop) >> +{ >> + if (ops->length () <= 1) >> + return false; >> + >> + if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR >> + && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR) >> + return false; >> + >> + hash_map<tree, v_info_ptr> v_info_map; >> + operand_entry *oe1; >> + unsigned i; >> + >> + /* Find those summands from VECTOR BIT_FIELD_REF in addition chain, put the >> + information into map. */ >> + FOR_EACH_VEC_ELT (*ops, i, oe1) >> + { >> + enum tree_code dcode; >> + gimple *oe1def; >> + >> + if (TREE_CODE (oe1->op) != SSA_NAME) >> + continue; >> + oe1def = SSA_NAME_DEF_STMT (oe1->op); >> + if (!is_gimple_assign (oe1def)) >> + continue; >> + dcode = gimple_assign_rhs_code (oe1def); >> + if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop)) >> + continue; >> + >> + tree rhs = gimple_op (oe1def, 1); >> + tree op0 = TREE_OPERAND (rhs, 0); >> + tree vec_type = TREE_TYPE (op0); >> + >> + if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE) >> + continue; >> + >> + tree op1 = TREE_OPERAND (rhs, 1); >> + tree op2 = TREE_OPERAND (rhs, 2); >> + >> + tree elem_type = TREE_TYPE (vec_type); >> + unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); >> + if (size != TREE_INT_CST_LOW (op1)) >> + continue; >> + >> + /* Ignore it if target machine can't support this VECTOR type. */ >> + if (!VECTOR_MODE_P (TYPE_MODE (vec_type))) >> + continue; > I don't think this is sufficient. I think you need to use something like > > (targetm.vector_mode_supported_p (TYPE_MODE (vec_type)) > && optab_handler (add_optab, TYPE_MODE (vec_type)) != CODE_FOR_nothing) > > I might not have spelled all of this right. There are examples of testing > for target support in tree-vect-stmts.c. As we discussed offline, I see now that what you have is correct -- it's just not as clear or efficient as I would like. I think it's better to check the type support explicitly, and do the optab check up front also, so that you can exit quickly if this doesn't apply. But please test! I haven't tried this at all. ;-) Thanks, Bill > > Thanks, > Bill >> + >> + v_info_ptr *info_ptr = v_info_map.get (op0); >> + if (info_ptr) >> + { >> + v_info_ptr info = *info_ptr; >> + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); >> + info->ops_indexes.safe_push (i); >> + } >> + else >> + { >> + v_info_ptr info = new v_info; >> + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); >> + info->ops_indexes.safe_push (i); >> + v_info_map.put (op0, info); >> + } >> + } >> + >> + /* At least two VECTOR to combine. */ >> + if (v_info_map.elements () <= 1) >> + { >> + cleanup_vinfo_map (v_info_map); >> + return false; >> + } >> + >> + /* Use the first VECTOR and its information as the reference. >> + Firstly, we should validate it, that is: >> + 1) required VECTOR operation supported on target machine. >> + 2) sorted offsets are adjacent, no holes. >> + 3) can fill the whole VECTOR perfectly. */ >> + hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin (); >> + tree ref_vec = (*it).first; >> + tree vec_type = TREE_TYPE (ref_vec); >> + tree elem_type = TREE_TYPE (vec_type); >> + >> + /* Check VECTOR operation available on target machine. */ >> + optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector); >> + if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing) >> + { >> + cleanup_vinfo_map (v_info_map); >> + return false; >> + } >> + >> + v_info_ptr ref_info = (*it).second; >> + ref_info->offsets.qsort (unsigned_cmp); >> + unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); >> + unsigned HOST_WIDE_INT curr; >> + unsigned HOST_WIDE_INT prev = ref_info->offsets[0]; >> + >> + /* Continous check. */ >> + FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1) >> + { >> + if (curr != (prev + elem_size)) >> + { >> + cleanup_vinfo_map (v_info_map); >> + return false; >> + } >> + prev = curr; >> + } >> + >> + /* Check whether fill the whole. */ >> + if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec)))) >> + { >> + cleanup_vinfo_map (v_info_map); >> + return false; >> + } >> + >> + auto_vec<tree> vectors (v_info_map.elements ()); >> + vectors.quick_push (ref_vec); >> + >> + /* Use the ref_vec to filter others. */ >> + for (++it; it != v_info_map.end (); ++it) >> + { >> + tree vec = (*it).first; >> + v_info_ptr info = (*it).second; >> + if (TREE_TYPE (ref_vec) != TREE_TYPE (vec)) >> + continue; >> + if (ref_info->offsets.length () != info->offsets.length ()) >> + continue; >> + bool same_offset = true; >> + info->offsets.qsort (unsigned_cmp); >> + for (unsigned i = 0; i < ref_info->offsets.length (); i++) >> + { >> + if (ref_info->offsets[i] != info->offsets[i]) >> + { >> + same_offset = false; >> + break; >> + } >> + } >> + if (!same_offset) >> + continue; >> + vectors.quick_push (vec); >> + } >> + >> + if (vectors.length () < 2) >> + { >> + cleanup_vinfo_map (v_info_map); >> + return false; >> + } >> + >> + tree tr; >> + if (dump_file && (dump_flags & TDF_DETAILS)) >> + { >> + fprintf (dump_file, "The bit_field_ref vector list for undistribute: "); >> + FOR_EACH_VEC_ELT (vectors, i, tr) >> + { >> + print_generic_expr (dump_file, tr); >> + fprintf (dump_file, " "); >> + } >> + fprintf (dump_file, "\n"); >> + } >> + >> + /* Build the sum for all candidate VECTORs. */ >> + unsigned idx; >> + gimple *sum = NULL; >> + v_info_ptr info; >> + tree sum_vec = ref_vec; >> + FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1) >> + { >> + sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode); >> + info = *(v_info_map.get (tr)); >> + unsigned j; >> + FOR_EACH_VEC_ELT (info->ops_indexes, j, idx) >> + { >> + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); >> + gimple_set_visited (def, true); >> + if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR >> + || opcode == BIT_IOR_EXPR) >> + (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op)); >> + else if (opcode == MULT_EXPR) >> + (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op)); >> + else >> + { >> + gcc_assert (opcode == BIT_AND_EXPR); >> + (*ops)[idx]->op >> + = build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op)); >> + } >> + (*ops)[idx]->rank = 0; >> + } >> + sum_vec = gimple_get_lhs (sum); >> + if (dump_file && (dump_flags & TDF_DETAILS)) >> + { >> + fprintf (dump_file, "Generating addition -> "); >> + print_gimple_stmt (dump_file, sum, 0); >> + } >> + } >> + >> + /* Referring to any good shape VECTOR (here using ref_vec), generate the >> + BIT_FIELD_REF statements accordingly. */ >> + info = *(v_info_map.get (ref_vec)); >> + gcc_assert (sum); >> + FOR_EACH_VEC_ELT (info->ops_indexes, i, idx) >> + { >> + tree dst = make_ssa_name (elem_type); >> + gimple *gs >> + = gimple_build_assign (dst, BIT_FIELD_REF, >> + build3 (BIT_FIELD_REF, elem_type, sum_vec, >> + TYPE_SIZE (elem_type), >> + bitsize_int (info->offsets[i]))); >> + insert_stmt_after (gs, sum); >> + update_stmt (gs); >> + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); >> + gimple_set_visited (def, true); >> + (*ops)[idx]->op = gimple_assign_lhs (gs); >> + (*ops)[idx]->rank = get_rank ((*ops)[idx]->op); >> + if (dump_file && (dump_flags & TDF_DETAILS)) >> + { >> + fprintf (dump_file, "Generating bit_field_ref -> "); >> + print_gimple_stmt (dump_file, gs, 0); >> + } >> + } >> + >> + if (dump_file && (dump_flags & TDF_DETAILS)) >> + { >> + fprintf (dump_file, "undistributiong bit_field_ref for vector done.\n"); >> + } >> + >> + cleanup_vinfo_map (v_info_map); >> + >> + return true; >> +} >> + >> /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison >> expression, examine the other OPS to see if any of them are comparisons >> of the same values, which we may be able to combine or eliminate. >> @@ -5880,11 +6172,6 @@ reassociate_bb (basic_block bb) >> tree lhs, rhs1, rhs2; >> enum tree_code rhs_code = gimple_assign_rhs_code (stmt); >> >> - /* If this is not a gimple binary expression, there is >> - nothing for us to do with it. */ >> - if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) >> - continue; >> - >> /* If this was part of an already processed statement, >> we don't need to touch it again. */ >> if (gimple_visited_p (stmt)) >> @@ -5911,6 +6198,11 @@ reassociate_bb (basic_block bb) >> continue; >> } >> >> + /* If this is not a gimple binary expression, there is >> + nothing for us to do with it. */ >> + if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) >> + continue; >> + >> lhs = gimple_assign_lhs (stmt); >> rhs1 = gimple_assign_rhs1 (stmt); >> rhs2 = gimple_assign_rhs2 (stmt); >> @@ -5950,6 +6242,13 @@ reassociate_bb (basic_block bb) >> optimize_ops_list (rhs_code, &ops); >> } >> >> + if (undistribute_bitref_for_vector (rhs_code, &ops, >> + loop_containing_stmt (stmt))) >> + { >> + ops.qsort (sort_by_operand_rank); >> + optimize_ops_list (rhs_code, &ops); >> + } >> + >> if (rhs_code == PLUS_EXPR >> && transform_add_to_multiply (&ops)) >> ops.qsort (sort_by_operand_rank); >
on 2019/3/12 下午11:22, Bill Schmidt wrote: > On 3/12/19 9:29 AM, Bill Schmidt wrote: >> On 3/12/19 7:57 AM, Kewen.Lin wrote: >>> Hi, >>> >>> As the comments from Richard and Segher (Thanks!), I've made some >>> changes by adding some checks and extensions. >>> *) Check whether vector type available on target machine, >>> *) Check whether vector operation available on target machine, >>> *) Extend the vector operation support to MULT/BIT_AND/BIT_IOR/BIT_XOR. >>> *) Add more test cases for coverage. >>> >>> Re-bootstrapped/re-regtested successfully on powerpc64le-linux-gnu, >>> is it ok for GCC10? >>> >>> gcc/ChangeLog >>> >>> 2019-03-12 Kewen Lin <linkw@gcc.gnu.org> >>> >>> PR target/88497 >>> * tree-ssa-reassoc.c (reassociate_bb): Swap the positions of >>> GIMPLE_BINARY_RHS check and gimple_visited_p check, call new >>> function undistribute_bitref_for_vector. >>> (undistribute_bitref_for_vector): New function. >>> (cleanup_vinfo_map): Likewise. >>> (unsigned_cmp): Likewise. >>> >>> gcc/testsuite/ChangeLog >>> >>> 2019-03-12 Kewen Lin <linkw@gcc.gnu.org> >>> >>> * gcc.dg/tree-ssa/pr88497-1.c: New test. >>> * gcc.dg/tree-ssa/pr88497-2.c: Likewise. >>> * gcc.dg/tree-ssa/pr88497-3.c: Likewise. >>> * gcc.dg/tree-ssa/pr88497-4.c: Likewise. >>> * gcc.dg/tree-ssa/pr88497-5.c: Likewise. >>> --- >>> gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c | 42 ++++ >>> gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c | 31 +++ >>> gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c | 31 +++ >>> gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c | 31 +++ >>> gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c | 31 +++ >>> gcc/tree-ssa-reassoc.c | 309 +++++++++++++++++++++++++++++- >>> 6 files changed, 470 insertions(+), 5 deletions(-) >>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c >>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c >>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c >>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c >>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c >>> >>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c >>> new file mode 100644 >>> index 0000000..c87caff >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c >>> @@ -0,0 +1,42 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-require-effective-target vect_double } */ >>> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ >>> + >>> +/* To test reassoc can undistribute vector bit_field_ref summation. >>> + >>> + arg1 and arg2 are two arrays whose elements of type vector double. >>> + Assuming: >>> + A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3], >>> + B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3], >>> + >>> + Then: >>> + V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3, >>> + >>> + reassoc transforms >>> + >>> + accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1] >>> + + V3[0] + V3[1]; >>> + >>> + into: >>> + >>> + T = V0 + V1 + V2 + V3 >>> + accumulator += T[0] + T[1]; >>> + >>> + Fewer bit_field_refs, only two for 128 or more bits vector. */ >>> + >>> +typedef double v2df __attribute__ ((vector_size (16))); >>> +double >>> +test (double accumulator, v2df arg1[], v2df arg2[]) >>> +{ >>> + v2df temp; >>> + temp = arg1[0] * arg2[0]; >>> + accumulator += temp[0] + temp[1]; >>> + temp = arg1[1] * arg2[1]; >>> + accumulator += temp[0] + temp[1]; >>> + temp = arg1[2] * arg2[2]; >>> + accumulator += temp[0] + temp[1]; >>> + temp = arg1[3] * arg2[3]; >>> + accumulator += temp[0] + temp[1]; >>> + return accumulator; >>> +} >>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */ >>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c >>> new file mode 100644 >>> index 0000000..d1851ff >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c >>> @@ -0,0 +1,31 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-require-effective-target vect_float } */ >>> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ >>> + >>> +/* To test reassoc can undistribute vector bit_field_ref on multiplication. >>> + >>> + v1, v2, v3, v4 of type vector float. >>> + >>> + reassoc transforms >>> + >>> + accumulator *= v1[0] * v1[1] * v1[2] * v1[3] * >>> + v2[0] * v2[1] * v2[2] * v2[3] * >>> + v3[0] * v3[1] * v3[2] * v3[3] * >>> + v4[0] * v4[1] * v4[2] * v4[3] ; >>> + >>> + into: >>> + >>> + T = v1 * v2 * v3 * v4; >>> + accumulator *= T[0] * T[1] * T[2] * T[3]; >>> + >>> + Fewer bit_field_refs, only four for 128 or more bits vector. */ >>> + >>> +typedef float v4si __attribute__((vector_size(16))); >>> +float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { >>> + accumulator *= v1[0] * v1[1] * v1[2] * v1[3]; >>> + accumulator *= v2[0] * v2[1] * v2[2] * v2[3]; >>> + accumulator *= v3[0] * v3[1] * v3[2] * v3[3]; >>> + accumulator *= v4[0] * v4[1] * v4[2] * v4[3]; >>> + return accumulator; >>> +} >>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ >>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c >>> new file mode 100644 >>> index 0000000..e774d25 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c >>> @@ -0,0 +1,31 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-require-effective-target vect_int } */ >>> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */ >>> + >>> +/* To test reassoc can undistribute vector bit_field_ref on bitwise AND. >>> + >>> + v1, v2, v3, v4 of type vector int. >>> + >>> + reassoc transforms >>> + >>> + accumulator &= v1[0] & v1[1] & v1[2] & v1[3] & >>> + v2[0] & v2[1] & v2[2] & v2[3] & >>> + v3[0] & v3[1] & v3[2] & v3[3] & >>> + v4[0] & v4[1] & v4[2] & v4[3] ; >>> + >>> + into: >>> + >>> + T = v1 & v2 & v3 & v4; >>> + accumulator &= T[0] & T[1] & T[2] & T[3]; >>> + >>> + Fewer bit_field_refs, only four for 128 or more bits vector. */ >>> + >>> +typedef int v4si __attribute__((vector_size(16))); >>> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { >>> + accumulator &= v1[0] & v1[1] & v1[2] & v1[3]; >>> + accumulator &= v2[0] & v2[1] & v2[2] & v2[3]; >>> + accumulator &= v3[0] & v3[1] & v3[2] & v3[3]; >>> + accumulator &= v4[0] & v4[1] & v4[2] & v4[3]; >>> + return accumulator; >>> +} >>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ >>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c >>> new file mode 100644 >>> index 0000000..7b75404 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c >>> @@ -0,0 +1,31 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-require-effective-target vect_int } */ >>> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */ >>> + >>> +/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR. >>> + >>> + v1, v2, v3, v4 of type vector int. >>> + >>> + reassoc transforms >>> + >>> + accumulator |= v1[0] | v1[1] | v1[2] | v1[3] | >>> + v2[0] | v2[1] | v2[2] | v2[3] | >>> + v3[0] | v3[1] | v3[2] | v3[3] | >>> + v4[0] | v4[1] | v4[2] | v4[3] ; >>> + >>> + into: >>> + >>> + T = v1 | v2 | v3 | v4; >>> + accumulator |= T[0] | T[1] | T[2] | T[3]; >>> + >>> + Fewer bit_field_refs, only four for 128 or more bits vector. */ >>> + >>> +typedef int v4si __attribute__((vector_size(16))); >>> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { >>> + accumulator |= v1[0] | v1[1] | v1[2] | v1[3]; >>> + accumulator |= v2[0] | v2[1] | v2[2] | v2[3]; >>> + accumulator |= v3[0] | v3[1] | v3[2] | v3[3]; >>> + accumulator |= v4[0] | v4[1] | v4[2] | v4[3]; >>> + return accumulator; >>> +} >>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ >>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c >>> new file mode 100644 >>> index 0000000..d069725 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c >>> @@ -0,0 +1,31 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-require-effective-target vect_int } */ >>> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ >>> + >>> +/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR. >>> + >>> + v1, v2, v3, v4 of type vector int. >>> + >>> + reassoc transforms >>> + >>> + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^ >>> + v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^ >>> + v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^ >>> + v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ; >>> + >>> + into: >>> + >>> + T = v1 ^ v2 ^ v3 ^ v4; >>> + accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3]; >>> + >>> + Fewer bit_field_refs, only four for 128 or more bits vector. */ >>> + >>> +typedef int v4si __attribute__((vector_size(16))); >>> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { >>> + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3]; >>> + accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3]; >>> + accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3]; >>> + accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3]; >>> + return accumulator; >>> +} >>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ >>> diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c >>> index e1c4dfe..d755911 100644 >>> --- a/gcc/tree-ssa-reassoc.c >>> +++ b/gcc/tree-ssa-reassoc.c >>> @@ -1772,6 +1772,298 @@ undistribute_ops_list (enum tree_code opcode, >>> return changed; >>> } >>> >>> +/* Hold the information of one specific VECTOR_TYPE SSA_NAME. >>> + - offsets: for different BIT_FIELD_REF offsets accessing same VECTOR. >>> + - ops_indexes: the index of vec ops* for each relavant BIT_FIELD_REF. */ >>> +struct v_info >>> +{ >>> + auto_vec<unsigned HOST_WIDE_INT, 32> offsets; >>> + auto_vec<unsigned, 32> ops_indexes; >>> +}; >>> + >>> +typedef struct v_info *v_info_ptr; >>> + >>> +/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets. */ >>> +static int >>> +unsigned_cmp (const void *p_i, const void *p_j) >>> +{ >>> + if (*(const unsigned *) p_i >= *(const unsigned *) p_j) >>> + return 1; >>> + else >>> + return -1; >>> +} >>> + >>> +/* Cleanup hash map for VECTOR information. */ >>> +static void >>> +cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map) >>> +{ >>> + for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin (); >>> + it != info_map.end (); ++it) >>> + { >>> + v_info_ptr info = (*it).second; >>> + delete info; >>> + (*it).second = NULL; >>> + } >>> +} >>> + >>> +/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE. >>> + V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k] >>> + is transformed to >>> + Vs = (V1 + V2 + ... + Vn) >>> + Vs[0] + Vs[1] + ... + Vs[k] >>> + >>> + The basic steps are listed below: >>> + >>> + 1) Check the addition chain *OPS by looking those summands coming from >>> + VECTOR bit_field_ref on VECTOR type. Put the information into >>> + v_info_map for each satisfied summand, using VECTOR SSA_NAME as key. >>> + >>> + 2) For each key (VECTOR SSA_NAME), validate all its BIT_FIELD_REFs are >>> + continous, they can cover the whole VECTOR perfectly without any holes. >>> + Obtain one VECTOR list which contain candidates to be transformed. >>> + >>> + 3) Build the addition statements for all VECTOR candidates, generate >>> + BIT_FIELD_REFs accordingly. >>> + >>> + TODO: >>> + 1) The current implementation restrict all candidate VECTORs should have >>> + the same VECTOR type, but it can be extended into different groups by >>> + VECTOR types in future if any profitable cases found. >>> + 2) The current implementation requires the whole VECTORs should be fully >>> + covered, but it can be extended to support partial, checking adjacent >>> + but not fill the whole, it may need some cost model to define the >>> + boundary to do or not. >>> +*/ >>> +static bool >>> +undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops, >>> + struct loop *loop) >>> +{ >>> + if (ops->length () <= 1) >>> + return false; >>> + >>> + if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR >>> + && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR) >>> + return false; >>> + >>> + hash_map<tree, v_info_ptr> v_info_map; >>> + operand_entry *oe1; >>> + unsigned i; >>> + >>> + /* Find those summands from VECTOR BIT_FIELD_REF in addition chain, put the >>> + information into map. */ >>> + FOR_EACH_VEC_ELT (*ops, i, oe1) >>> + { >>> + enum tree_code dcode; >>> + gimple *oe1def; >>> + >>> + if (TREE_CODE (oe1->op) != SSA_NAME) >>> + continue; >>> + oe1def = SSA_NAME_DEF_STMT (oe1->op); >>> + if (!is_gimple_assign (oe1def)) >>> + continue; >>> + dcode = gimple_assign_rhs_code (oe1def); >>> + if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop)) >>> + continue; >>> + >>> + tree rhs = gimple_op (oe1def, 1); >>> + tree op0 = TREE_OPERAND (rhs, 0); >>> + tree vec_type = TREE_TYPE (op0); >>> + >>> + if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE) >>> + continue; >>> + >>> + tree op1 = TREE_OPERAND (rhs, 1); >>> + tree op2 = TREE_OPERAND (rhs, 2); >>> + >>> + tree elem_type = TREE_TYPE (vec_type); >>> + unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); >>> + if (size != TREE_INT_CST_LOW (op1)) >>> + continue; >>> + >>> + /* Ignore it if target machine can't support this VECTOR type. */ >>> + if (!VECTOR_MODE_P (TYPE_MODE (vec_type))) >>> + continue; >> I don't think this is sufficient. I think you need to use something like >> >> (targetm.vector_mode_supported_p (TYPE_MODE (vec_type)) >> && optab_handler (add_optab, TYPE_MODE (vec_type)) != CODE_FOR_nothing) >> >> I might not have spelled all of this right. There are examples of testing >> for target support in tree-vect-stmts.c. > > As we discussed offline, I see now that what you have is correct -- it's just > not as clear or efficient as I would like. I think it's better to check the > type support explicitly, and do the optab check up front also, so that you > can exit quickly if this doesn't apply. > > But please test! I haven't tried this at all. ;-) > > Thanks, > Bill > Thanks Bill! I put the vector operation check as part of validating ref vector. You are right, it's better to move it up to vector type check. >> Thanks, >> Bill >>> + >>> + v_info_ptr *info_ptr = v_info_map.get (op0); >>> + if (info_ptr) >>> + { >>> + v_info_ptr info = *info_ptr; >>> + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); >>> + info->ops_indexes.safe_push (i); >>> + } >>> + else >>> + { >>> + v_info_ptr info = new v_info; >>> + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); >>> + info->ops_indexes.safe_push (i); >>> + v_info_map.put (op0, info); >>> + } >>> + } >>> + >>> + /* At least two VECTOR to combine. */ >>> + if (v_info_map.elements () <= 1) >>> + { >>> + cleanup_vinfo_map (v_info_map); >>> + return false; >>> + } >>> + >>> + /* Use the first VECTOR and its information as the reference. >>> + Firstly, we should validate it, that is: >>> + 1) required VECTOR operation supported on target machine. >>> + 2) sorted offsets are adjacent, no holes. >>> + 3) can fill the whole VECTOR perfectly. */ >>> + hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin (); >>> + tree ref_vec = (*it).first; >>> + tree vec_type = TREE_TYPE (ref_vec); >>> + tree elem_type = TREE_TYPE (vec_type); >>> + >>> + /* Check VECTOR operation available on target machine. */ >>> + optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector); >>> + if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ >>> + { >>> + cleanup_vinfo_map (v_info_map); >>> + return false; >>> + } >>> + >>> + v_info_ptr ref_info = (*it).second; >>> + ref_info->offsets.qsort (unsigned_cmp); >>> + unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); >>> + unsigned HOST_WIDE_INT curr; >>> + unsigned HOST_WIDE_INT prev = ref_info->offsets[0]; >>> + >>> + /* Continous check. */ >>> + FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1) >>> + { >>> + if (curr != (prev + elem_size)) >>> + { >>> + cleanup_vinfo_map (v_info_map); >>> + return false; >>> + } >>> + prev = curr; >>> + } >>> + >>> + /* Check whether fill the whole. */ >>> + if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec)))) >>> + { >>> + cleanup_vinfo_map (v_info_map); >>> + return false; >>> + } >>> + >>> + auto_vec<tree> vectors (v_info_map.elements ()); >>> + vectors.quick_push (ref_vec); >>> + >>> + /* Use the ref_vec to filter others. */ >>> + for (++it; it != v_info_map.end (); ++it) >>> + { >>> + tree vec = (*it).first; >>> + v_info_ptr info = (*it).second; >>> + if (TREE_TYPE (ref_vec) != TREE_TYPE (vec)) >>> + continue; >>> + if (ref_info->offsets.length () != info->offsets.length ()) >>> + continue; >>> + bool same_offset = true; >>> + info->offsets.qsort (unsigned_cmp); >>> + for (unsigned i = 0; i < ref_info->offsets.length (); i++) >>> + { >>> + if (ref_info->offsets[i] != info->offsets[i]) >>> + { >>> + same_offset = false; >>> + break; >>> + } >>> + } >>> + if (!same_offset) >>> + continue; >>> + vectors.quick_push (vec); >>> + } >>> + >>> + if (vectors.length () < 2) >>> + { >>> + cleanup_vinfo_map (v_info_map); >>> + return false; >>> + } >>> + >>> + tree tr; >>> + if (dump_file && (dump_flags & TDF_DETAILS)) >>> + { >>> + fprintf (dump_file, "The bit_field_ref vector list for undistribute: "); >>> + FOR_EACH_VEC_ELT (vectors, i, tr) >>> + { >>> + print_generic_expr (dump_file, tr); >>> + fprintf (dump_file, " "); >>> + } >>> + fprintf (dump_file, "\n"); >>> + } >>> + >>> + /* Build the sum for all candidate VECTORs. */ >>> + unsigned idx; >>> + gimple *sum = NULL; >>> + v_info_ptr info; >>> + tree sum_vec = ref_vec; >>> + FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1) >>> + { >>> + sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode); >>> + info = *(v_info_map.get (tr)); >>> + unsigned j; >>> + FOR_EACH_VEC_ELT (info->ops_indexes, j, idx) >>> + { >>> + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); >>> + gimple_set_visited (def, true); >>> + if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR >>> + || opcode == BIT_IOR_EXPR) >>> + (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op)); >>> + else if (opcode == MULT_EXPR) >>> + (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op)); >>> + else >>> + { >>> + gcc_assert (opcode == BIT_AND_EXPR); >>> + (*ops)[idx]->op >>> + = build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op)); >>> + } >>> + (*ops)[idx]->rank = 0; >>> + } >>> + sum_vec = gimple_get_lhs (sum); >>> + if (dump_file && (dump_flags & TDF_DETAILS)) >>> + { >>> + fprintf (dump_file, "Generating addition -> "); >>> + print_gimple_stmt (dump_file, sum, 0); >>> + } >>> + } >>> + >>> + /* Referring to any good shape VECTOR (here using ref_vec), generate the >>> + BIT_FIELD_REF statements accordingly. */ >>> + info = *(v_info_map.get (ref_vec)); >>> + gcc_assert (sum); >>> + FOR_EACH_VEC_ELT (info->ops_indexes, i, idx) >>> + { >>> + tree dst = make_ssa_name (elem_type); >>> + gimple *gs >>> + = gimple_build_assign (dst, BIT_FIELD_REF, >>> + build3 (BIT_FIELD_REF, elem_type, sum_vec, >>> + TYPE_SIZE (elem_type), >>> + bitsize_int (info->offsets[i]))); >>> + insert_stmt_after (gs, sum); >>> + update_stmt (gs); >>> + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); >>> + gimple_set_visited (def, true); >>> + (*ops)[idx]->op = gimple_assign_lhs (gs); >>> + (*ops)[idx]->rank = get_rank ((*ops)[idx]->op); >>> + if (dump_file && (dump_flags & TDF_DETAILS)) >>> + { >>> + fprintf (dump_file, "Generating bit_field_ref -> "); >>> + print_gimple_stmt (dump_file, gs, 0); >>> + } >>> + } >>> + >>> + if (dump_file && (dump_flags & TDF_DETAILS)) >>> + { >>> + fprintf (dump_file, "undistributiong bit_field_ref for vector done.\n"); >>> + } >>> + >>> + cleanup_vinfo_map (v_info_map); >>> + >>> + return true; >>> +} >>> + >>> /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison >>> expression, examine the other OPS to see if any of them are comparisons >>> of the same values, which we may be able to combine or eliminate. >>> @@ -5880,11 +6172,6 @@ reassociate_bb (basic_block bb) >>> tree lhs, rhs1, rhs2; >>> enum tree_code rhs_code = gimple_assign_rhs_code (stmt); >>> >>> - /* If this is not a gimple binary expression, there is >>> - nothing for us to do with it. */ >>> - if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) >>> - continue; >>> - >>> /* If this was part of an already processed statement, >>> we don't need to touch it again. */ >>> if (gimple_visited_p (stmt)) >>> @@ -5911,6 +6198,11 @@ reassociate_bb (basic_block bb) >>> continue; >>> } >>> >>> + /* If this is not a gimple binary expression, there is >>> + nothing for us to do with it. */ >>> + if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) >>> + continue; >>> + >>> lhs = gimple_assign_lhs (stmt); >>> rhs1 = gimple_assign_rhs1 (stmt); >>> rhs2 = gimple_assign_rhs2 (stmt); >>> @@ -5950,6 +6242,13 @@ reassociate_bb (basic_block bb) >>> optimize_ops_list (rhs_code, &ops); >>> } >>> >>> + if (undistribute_bitref_for_vector (rhs_code, &ops, >>> + loop_containing_stmt (stmt))) >>> + { >>> + ops.qsort (sort_by_operand_rank); >>> + optimize_ops_list (rhs_code, &ops); >>> + } >>> + >>> if (rhs_code == PLUS_EXPR >>> && transform_add_to_multiply (&ops)) >>> ops.qsort (sort_by_operand_rank); >> >
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c new file mode 100644 index 0000000..c87caff --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ + +/* To test reassoc can undistribute vector bit_field_ref summation. + + arg1 and arg2 are two arrays whose elements of type vector double. + Assuming: + A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3], + B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3], + + Then: + V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3, + + reassoc transforms + + accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1] + + V3[0] + V3[1]; + + into: + + T = V0 + V1 + V2 + V3 + accumulator += T[0] + T[1]; + + Fewer bit_field_refs, only two for 128 or more bits vector. */ + +typedef double v2df __attribute__ ((vector_size (16))); +double +test (double accumulator, v2df arg1[], v2df arg2[]) +{ + v2df temp; + temp = arg1[0] * arg2[0]; + accumulator += temp[0] + temp[1]; + temp = arg1[1] * arg2[1]; + accumulator += temp[0] + temp[1]; + temp = arg1[2] * arg2[2]; + accumulator += temp[0] + temp[1]; + temp = arg1[3] * arg2[3]; + accumulator += temp[0] + temp[1]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c new file mode 100644 index 0000000..d1851ff --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_float } */ +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ + +/* To test reassoc can undistribute vector bit_field_ref on multiplication. + + v1, v2, v3, v4 of type vector float. + + reassoc transforms + + accumulator *= v1[0] * v1[1] * v1[2] * v1[3] * + v2[0] * v2[1] * v2[2] * v2[3] * + v3[0] * v3[1] * v3[2] * v3[3] * + v4[0] * v4[1] * v4[2] * v4[3] ; + + into: + + T = v1 * v2 * v3 * v4; + accumulator *= T[0] * T[1] * T[2] * T[3]; + + Fewer bit_field_refs, only four for 128 or more bits vector. */ + +typedef float v4si __attribute__((vector_size(16))); +float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { + accumulator *= v1[0] * v1[1] * v1[2] * v1[3]; + accumulator *= v2[0] * v2[1] * v2[2] * v2[3]; + accumulator *= v3[0] * v3[1] * v3[2] * v3[3]; + accumulator *= v4[0] * v4[1] * v4[2] * v4[3]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c new file mode 100644 index 0000000..e774d25 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-options "-O2 -fdump-tree-reassoc1" } */ + +/* To test reassoc can undistribute vector bit_field_ref on bitwise AND. + + v1, v2, v3, v4 of type vector int. + + reassoc transforms + + accumulator &= v1[0] & v1[1] & v1[2] & v1[3] & + v2[0] & v2[1] & v2[2] & v2[3] & + v3[0] & v3[1] & v3[2] & v3[3] & + v4[0] & v4[1] & v4[2] & v4[3] ; + + into: + + T = v1 & v2 & v3 & v4; + accumulator &= T[0] & T[1] & T[2] & T[3]; + + Fewer bit_field_refs, only four for 128 or more bits vector. */ + +typedef int v4si __attribute__((vector_size(16))); +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { + accumulator &= v1[0] & v1[1] & v1[2] & v1[3]; + accumulator &= v2[0] & v2[1] & v2[2] & v2[3]; + accumulator &= v3[0] & v3[1] & v3[2] & v3[3]; + accumulator &= v4[0] & v4[1] & v4[2] & v4[3]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c new file mode 100644 index 0000000..7b75404 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-options "-O2 -fdump-tree-reassoc1" } */ + +/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR. + + v1, v2, v3, v4 of type vector int. + + reassoc transforms + + accumulator |= v1[0] | v1[1] | v1[2] | v1[3] | + v2[0] | v2[1] | v2[2] | v2[3] | + v3[0] | v3[1] | v3[2] | v3[3] | + v4[0] | v4[1] | v4[2] | v4[3] ; + + into: + + T = v1 | v2 | v3 | v4; + accumulator |= T[0] | T[1] | T[2] | T[3]; + + Fewer bit_field_refs, only four for 128 or more bits vector. */ + +typedef int v4si __attribute__((vector_size(16))); +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { + accumulator |= v1[0] | v1[1] | v1[2] | v1[3]; + accumulator |= v2[0] | v2[1] | v2[2] | v2[3]; + accumulator |= v3[0] | v3[1] | v3[2] | v3[3]; + accumulator |= v4[0] | v4[1] | v4[2] | v4[3]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c new file mode 100644 index 0000000..d069725 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */ + +/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR. + + v1, v2, v3, v4 of type vector int. + + reassoc transforms + + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^ + v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^ + v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^ + v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ; + + into: + + T = v1 ^ v2 ^ v3 ^ v4; + accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3]; + + Fewer bit_field_refs, only four for 128 or more bits vector. */ + +typedef int v4si __attribute__((vector_size(16))); +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) { + accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3]; + accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3]; + accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3]; + accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3]; + return accumulator; +} +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */ diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c index e1c4dfe..d755911 100644 --- a/gcc/tree-ssa-reassoc.c +++ b/gcc/tree-ssa-reassoc.c @@ -1772,6 +1772,298 @@ undistribute_ops_list (enum tree_code opcode, return changed; } +/* Hold the information of one specific VECTOR_TYPE SSA_NAME. + - offsets: for different BIT_FIELD_REF offsets accessing same VECTOR. + - ops_indexes: the index of vec ops* for each relavant BIT_FIELD_REF. */ +struct v_info +{ + auto_vec<unsigned HOST_WIDE_INT, 32> offsets; + auto_vec<unsigned, 32> ops_indexes; +}; + +typedef struct v_info *v_info_ptr; + +/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets. */ +static int +unsigned_cmp (const void *p_i, const void *p_j) +{ + if (*(const unsigned *) p_i >= *(const unsigned *) p_j) + return 1; + else + return -1; +} + +/* Cleanup hash map for VECTOR information. */ +static void +cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map) +{ + for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin (); + it != info_map.end (); ++it) + { + v_info_ptr info = (*it).second; + delete info; + (*it).second = NULL; + } +} + +/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE. + V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k] + is transformed to + Vs = (V1 + V2 + ... + Vn) + Vs[0] + Vs[1] + ... + Vs[k] + + The basic steps are listed below: + + 1) Check the addition chain *OPS by looking those summands coming from + VECTOR bit_field_ref on VECTOR type. Put the information into + v_info_map for each satisfied summand, using VECTOR SSA_NAME as key. + + 2) For each key (VECTOR SSA_NAME), validate all its BIT_FIELD_REFs are + continous, they can cover the whole VECTOR perfectly without any holes. + Obtain one VECTOR list which contain candidates to be transformed. + + 3) Build the addition statements for all VECTOR candidates, generate + BIT_FIELD_REFs accordingly. + + TODO: + 1) The current implementation restrict all candidate VECTORs should have + the same VECTOR type, but it can be extended into different groups by + VECTOR types in future if any profitable cases found. + 2) The current implementation requires the whole VECTORs should be fully + covered, but it can be extended to support partial, checking adjacent + but not fill the whole, it may need some cost model to define the + boundary to do or not. +*/ +static bool +undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops, + struct loop *loop) +{ + if (ops->length () <= 1) + return false; + + if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR + && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR) + return false; + + hash_map<tree, v_info_ptr> v_info_map; + operand_entry *oe1; + unsigned i; + + /* Find those summands from VECTOR BIT_FIELD_REF in addition chain, put the + information into map. */ + FOR_EACH_VEC_ELT (*ops, i, oe1) + { + enum tree_code dcode; + gimple *oe1def; + + if (TREE_CODE (oe1->op) != SSA_NAME) + continue; + oe1def = SSA_NAME_DEF_STMT (oe1->op); + if (!is_gimple_assign (oe1def)) + continue; + dcode = gimple_assign_rhs_code (oe1def); + if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop)) + continue; + + tree rhs = gimple_op (oe1def, 1); + tree op0 = TREE_OPERAND (rhs, 0); + tree vec_type = TREE_TYPE (op0); + + if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE) + continue; + + tree op1 = TREE_OPERAND (rhs, 1); + tree op2 = TREE_OPERAND (rhs, 2); + + tree elem_type = TREE_TYPE (vec_type); + unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); + if (size != TREE_INT_CST_LOW (op1)) + continue; + + /* Ignore it if target machine can't support this VECTOR type. */ + if (!VECTOR_MODE_P (TYPE_MODE (vec_type))) + continue; + + v_info_ptr *info_ptr = v_info_map.get (op0); + if (info_ptr) + { + v_info_ptr info = *info_ptr; + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); + info->ops_indexes.safe_push (i); + } + else + { + v_info_ptr info = new v_info; + info->offsets.safe_push (TREE_INT_CST_LOW (op2)); + info->ops_indexes.safe_push (i); + v_info_map.put (op0, info); + } + } + + /* At least two VECTOR to combine. */ + if (v_info_map.elements () <= 1) + { + cleanup_vinfo_map (v_info_map); + return false; + } + + /* Use the first VECTOR and its information as the reference. + Firstly, we should validate it, that is: + 1) required VECTOR operation supported on target machine. + 2) sorted offsets are adjacent, no holes. + 3) can fill the whole VECTOR perfectly. */ + hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin (); + tree ref_vec = (*it).first; + tree vec_type = TREE_TYPE (ref_vec); + tree elem_type = TREE_TYPE (vec_type); + + /* Check VECTOR operation available on target machine. */ + optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector); + if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing) + { + cleanup_vinfo_map (v_info_map); + return false; + } + + v_info_ptr ref_info = (*it).second; + ref_info->offsets.qsort (unsigned_cmp); + unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); + unsigned HOST_WIDE_INT curr; + unsigned HOST_WIDE_INT prev = ref_info->offsets[0]; + + /* Continous check. */ + FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1) + { + if (curr != (prev + elem_size)) + { + cleanup_vinfo_map (v_info_map); + return false; + } + prev = curr; + } + + /* Check whether fill the whole. */ + if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec)))) + { + cleanup_vinfo_map (v_info_map); + return false; + } + + auto_vec<tree> vectors (v_info_map.elements ()); + vectors.quick_push (ref_vec); + + /* Use the ref_vec to filter others. */ + for (++it; it != v_info_map.end (); ++it) + { + tree vec = (*it).first; + v_info_ptr info = (*it).second; + if (TREE_TYPE (ref_vec) != TREE_TYPE (vec)) + continue; + if (ref_info->offsets.length () != info->offsets.length ()) + continue; + bool same_offset = true; + info->offsets.qsort (unsigned_cmp); + for (unsigned i = 0; i < ref_info->offsets.length (); i++) + { + if (ref_info->offsets[i] != info->offsets[i]) + { + same_offset = false; + break; + } + } + if (!same_offset) + continue; + vectors.quick_push (vec); + } + + if (vectors.length () < 2) + { + cleanup_vinfo_map (v_info_map); + return false; + } + + tree tr; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "The bit_field_ref vector list for undistribute: "); + FOR_EACH_VEC_ELT (vectors, i, tr) + { + print_generic_expr (dump_file, tr); + fprintf (dump_file, " "); + } + fprintf (dump_file, "\n"); + } + + /* Build the sum for all candidate VECTORs. */ + unsigned idx; + gimple *sum = NULL; + v_info_ptr info; + tree sum_vec = ref_vec; + FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1) + { + sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode); + info = *(v_info_map.get (tr)); + unsigned j; + FOR_EACH_VEC_ELT (info->ops_indexes, j, idx) + { + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); + gimple_set_visited (def, true); + if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR + || opcode == BIT_IOR_EXPR) + (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op)); + else if (opcode == MULT_EXPR) + (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op)); + else + { + gcc_assert (opcode == BIT_AND_EXPR); + (*ops)[idx]->op + = build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op)); + } + (*ops)[idx]->rank = 0; + } + sum_vec = gimple_get_lhs (sum); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Generating addition -> "); + print_gimple_stmt (dump_file, sum, 0); + } + } + + /* Referring to any good shape VECTOR (here using ref_vec), generate the + BIT_FIELD_REF statements accordingly. */ + info = *(v_info_map.get (ref_vec)); + gcc_assert (sum); + FOR_EACH_VEC_ELT (info->ops_indexes, i, idx) + { + tree dst = make_ssa_name (elem_type); + gimple *gs + = gimple_build_assign (dst, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, elem_type, sum_vec, + TYPE_SIZE (elem_type), + bitsize_int (info->offsets[i]))); + insert_stmt_after (gs, sum); + update_stmt (gs); + gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op); + gimple_set_visited (def, true); + (*ops)[idx]->op = gimple_assign_lhs (gs); + (*ops)[idx]->rank = get_rank ((*ops)[idx]->op); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Generating bit_field_ref -> "); + print_gimple_stmt (dump_file, gs, 0); + } + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "undistributiong bit_field_ref for vector done.\n"); + } + + cleanup_vinfo_map (v_info_map); + + return true; +} + /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison expression, examine the other OPS to see if any of them are comparisons of the same values, which we may be able to combine or eliminate. @@ -5880,11 +6172,6 @@ reassociate_bb (basic_block bb) tree lhs, rhs1, rhs2; enum tree_code rhs_code = gimple_assign_rhs_code (stmt); - /* If this is not a gimple binary expression, there is - nothing for us to do with it. */ - if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) - continue; - /* If this was part of an already processed statement, we don't need to touch it again. */ if (gimple_visited_p (stmt)) @@ -5911,6 +6198,11 @@ reassociate_bb (basic_block bb) continue; } + /* If this is not a gimple binary expression, there is + nothing for us to do with it. */ + if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS) + continue; + lhs = gimple_assign_lhs (stmt); rhs1 = gimple_assign_rhs1 (stmt); rhs2 = gimple_assign_rhs2 (stmt); @@ -5950,6 +6242,13 @@ reassociate_bb (basic_block bb) optimize_ops_list (rhs_code, &ops); } + if (undistribute_bitref_for_vector (rhs_code, &ops, + loop_containing_stmt (stmt))) + { + ops.qsort (sort_by_operand_rank); + optimize_ops_list (rhs_code, &ops); + } + if (rhs_code == PLUS_EXPR && transform_add_to_multiply (&ops)) ops.qsort (sort_by_operand_rank);