diff mbox series

[V2] PR88497 - Extend reassoc for vector bit_field_ref

Message ID 814943f9-df8a-3049-f726-fa17f6f8d3b3@linux.ibm.com
State New
Headers show
Series [V2] PR88497 - Extend reassoc for vector bit_field_ref | expand

Commit Message

Kewen.Lin March 12, 2019, 12:57 p.m. UTC
Hi,

As the comments from Richard and Segher (Thanks!), I've made some 
changes by adding some checks and extensions. 
  *) Check whether vector type available on target machine,
  *) Check whether vector operation available on target machine, 
  *) Extend the vector operation support to MULT/BIT_AND/BIT_IOR/BIT_XOR.
  *) Add more test cases for coverage.

Re-bootstrapped/re-regtested successfully on powerpc64le-linux-gnu, 
is it ok for GCC10?

gcc/ChangeLog

2019-03-12  Kewen Lin  <linkw@gcc.gnu.org>

	PR target/88497
	* tree-ssa-reassoc.c (reassociate_bb): Swap the positions of 
	GIMPLE_BINARY_RHS check and gimple_visited_p check, call new 
	function undistribute_bitref_for_vector.
	(undistribute_bitref_for_vector): New function.
	(cleanup_vinfo_map): Likewise.
	(unsigned_cmp): Likewise.

gcc/testsuite/ChangeLog

2019-03-12  Kewen Lin  <linkw@gcc.gnu.org>

	* gcc.dg/tree-ssa/pr88497-1.c: New test.
	* gcc.dg/tree-ssa/pr88497-2.c: Likewise.
	* gcc.dg/tree-ssa/pr88497-3.c: Likewise.
	* gcc.dg/tree-ssa/pr88497-4.c: Likewise.
	* gcc.dg/tree-ssa/pr88497-5.c: Likewise.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c |  42 ++++
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c |  31 +++
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c |  31 +++
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c |  31 +++
 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c |  31 +++
 gcc/tree-ssa-reassoc.c                    | 309 +++++++++++++++++++++++++++++-
 6 files changed, 470 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c

Comments

Bill Schmidt March 12, 2019, 2:29 p.m. UTC | #1
On 3/12/19 7:57 AM, Kewen.Lin wrote:
> Hi,
>
> As the comments from Richard and Segher (Thanks!), I've made some 
> changes by adding some checks and extensions. 
>   *) Check whether vector type available on target machine,
>   *) Check whether vector operation available on target machine, 
>   *) Extend the vector operation support to MULT/BIT_AND/BIT_IOR/BIT_XOR.
>   *) Add more test cases for coverage.
>
> Re-bootstrapped/re-regtested successfully on powerpc64le-linux-gnu, 
> is it ok for GCC10?
>
> gcc/ChangeLog
>
> 2019-03-12  Kewen Lin  <linkw@gcc.gnu.org>
>
> 	PR target/88497
> 	* tree-ssa-reassoc.c (reassociate_bb): Swap the positions of 
> 	GIMPLE_BINARY_RHS check and gimple_visited_p check, call new 
> 	function undistribute_bitref_for_vector.
> 	(undistribute_bitref_for_vector): New function.
> 	(cleanup_vinfo_map): Likewise.
> 	(unsigned_cmp): Likewise.
>
> gcc/testsuite/ChangeLog
>
> 2019-03-12  Kewen Lin  <linkw@gcc.gnu.org>
>
> 	* gcc.dg/tree-ssa/pr88497-1.c: New test.
> 	* gcc.dg/tree-ssa/pr88497-2.c: Likewise.
> 	* gcc.dg/tree-ssa/pr88497-3.c: Likewise.
> 	* gcc.dg/tree-ssa/pr88497-4.c: Likewise.
> 	* gcc.dg/tree-ssa/pr88497-5.c: Likewise.
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c |  42 ++++
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c |  31 +++
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c |  31 +++
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c |  31 +++
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c |  31 +++
>  gcc/tree-ssa-reassoc.c                    | 309 +++++++++++++++++++++++++++++-
>  6 files changed, 470 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
> new file mode 100644
> index 0000000..c87caff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
> @@ -0,0 +1,42 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
> +
> +/* To test reassoc can undistribute vector bit_field_ref summation.
> +
> +   arg1 and arg2 are two arrays whose elements of type vector double.
> +   Assuming:
> +     A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3],
> +     B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3],
> +
> +   Then:
> +     V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3,
> +
> +   reassoc transforms
> +
> +     accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1]
> +		  + V3[0] + V3[1];
> +
> +   into:
> +
> +     T = V0 + V1 + V2 + V3
> +     accumulator += T[0] + T[1];
> +
> +   Fewer bit_field_refs, only two for 128 or more bits vector.  */
> +
> +typedef double v2df __attribute__ ((vector_size (16)));
> +double
> +test (double accumulator, v2df arg1[], v2df arg2[])
> +{
> +  v2df temp;
> +  temp = arg1[0] * arg2[0];
> +  accumulator += temp[0] + temp[1];
> +  temp = arg1[1] * arg2[1];
> +  accumulator += temp[0] + temp[1];
> +  temp = arg1[2] * arg2[2];
> +  accumulator += temp[0] + temp[1];
> +  temp = arg1[3] * arg2[3];
> +  accumulator += temp[0] + temp[1];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
> new file mode 100644
> index 0000000..d1851ff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_float } */
> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
> +
> +/* To test reassoc can undistribute vector bit_field_ref on multiplication.
> +
> +   v1, v2, v3, v4 of type vector float.
> +
> +   reassoc transforms
> +
> +     accumulator *= v1[0] * v1[1] * v1[2] * v1[3] *
> +                    v2[0] * v2[1] * v2[2] * v2[3] *
> +                    v3[0] * v3[1] * v3[2] * v3[3] *
> +                    v4[0] * v4[1] * v4[2] * v4[3] ;
> +
> +   into:
> +
> +     T = v1 * v2 * v3 * v4;
> +     accumulator *= T[0] * T[1] * T[2] * T[3];
> +
> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
> +
> +typedef float v4si __attribute__((vector_size(16)));
> +float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
> +  accumulator *= v1[0] * v1[1] * v1[2] * v1[3];
> +  accumulator *= v2[0] * v2[1] * v2[2] * v2[3];
> +  accumulator *= v3[0] * v3[1] * v3[2] * v3[3];
> +  accumulator *= v4[0] * v4[1] * v4[2] * v4[3];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
> new file mode 100644
> index 0000000..e774d25
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */
> +
> +/* To test reassoc can undistribute vector bit_field_ref on bitwise AND.
> +
> +   v1, v2, v3, v4 of type vector int.
> +
> +   reassoc transforms
> +
> +     accumulator &= v1[0] & v1[1] & v1[2] & v1[3] &
> +                    v2[0] & v2[1] & v2[2] & v2[3] &
> +                    v3[0] & v3[1] & v3[2] & v3[3] &
> +                    v4[0] & v4[1] & v4[2] & v4[3] ;
> +
> +   into:
> +
> +     T = v1 & v2 & v3 & v4;
> +     accumulator &= T[0] & T[1] & T[2] & T[3];
> +
> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
> +
> +typedef int v4si __attribute__((vector_size(16)));
> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
> +  accumulator &= v1[0] & v1[1] & v1[2] & v1[3];
> +  accumulator &= v2[0] & v2[1] & v2[2] & v2[3];
> +  accumulator &= v3[0] & v3[1] & v3[2] & v3[3];
> +  accumulator &= v4[0] & v4[1] & v4[2] & v4[3];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
> new file mode 100644
> index 0000000..7b75404
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */
> +
> +/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR.
> +
> +   v1, v2, v3, v4 of type vector int.
> +
> +   reassoc transforms
> +
> +     accumulator |= v1[0] | v1[1] | v1[2] | v1[3] |
> +                    v2[0] | v2[1] | v2[2] | v2[3] |
> +                    v3[0] | v3[1] | v3[2] | v3[3] |
> +                    v4[0] | v4[1] | v4[2] | v4[3] ;
> +
> +   into:
> +
> +     T = v1 | v2 | v3 | v4;
> +     accumulator |= T[0] | T[1] | T[2] | T[3];
> +
> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
> +
> +typedef int v4si __attribute__((vector_size(16)));
> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
> +  accumulator |= v1[0] | v1[1] | v1[2] | v1[3];
> +  accumulator |= v2[0] | v2[1] | v2[2] | v2[3];
> +  accumulator |= v3[0] | v3[1] | v3[2] | v3[3];
> +  accumulator |= v4[0] | v4[1] | v4[2] | v4[3];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
> new file mode 100644
> index 0000000..d069725
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
> +
> +/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR.
> +
> +   v1, v2, v3, v4 of type vector int.
> +
> +   reassoc transforms
> +
> +     accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^
> +                    v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^
> +                    v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^
> +                    v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ;
> +
> +   into:
> +
> +     T = v1 ^ v2 ^ v3 ^ v4;
> +     accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3];
> +
> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
> +
> +typedef int v4si __attribute__((vector_size(16)));
> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
> +  accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3];
> +  accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3];
> +  accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3];
> +  accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
> index e1c4dfe..d755911 100644
> --- a/gcc/tree-ssa-reassoc.c
> +++ b/gcc/tree-ssa-reassoc.c
> @@ -1772,6 +1772,298 @@ undistribute_ops_list (enum tree_code opcode,
>    return changed;
>  }
>  
> +/* Hold the information of one specific VECTOR_TYPE SSA_NAME.
> +    - offsets: for different BIT_FIELD_REF offsets accessing same VECTOR.
> +    - ops_indexes: the index of vec ops* for each relavant BIT_FIELD_REF.  */
> +struct v_info
> +{
> +  auto_vec<unsigned HOST_WIDE_INT, 32> offsets;
> +  auto_vec<unsigned, 32> ops_indexes;
> +};
> +
> +typedef struct v_info *v_info_ptr;
> +
> +/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets.  */
> +static int
> +unsigned_cmp (const void *p_i, const void *p_j)
> +{
> +  if (*(const unsigned *) p_i >= *(const unsigned *) p_j)
> +    return 1;
> +  else
> +    return -1;
> +}
> +
> +/* Cleanup hash map for VECTOR information.  */
> +static void
> +cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map)
> +{
> +  for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin ();
> +       it != info_map.end (); ++it)
> +    {
> +      v_info_ptr info = (*it).second;
> +      delete info;
> +      (*it).second = NULL;
> +    }
> +}
> +
> +/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE.
> +     V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k]
> +   is transformed to
> +     Vs = (V1 + V2 + ... + Vn)
> +     Vs[0] + Vs[1] + ... + Vs[k]
> +
> +   The basic steps are listed below:
> +
> +    1) Check the addition chain *OPS by looking those summands coming from
> +       VECTOR bit_field_ref on VECTOR type. Put the information into
> +       v_info_map for each satisfied summand, using VECTOR SSA_NAME as key.
> +
> +    2) For each key (VECTOR SSA_NAME), validate all its BIT_FIELD_REFs are
> +       continous, they can cover the whole VECTOR perfectly without any holes.
> +       Obtain one VECTOR list which contain candidates to be transformed.
> +
> +    3) Build the addition statements for all VECTOR candidates, generate
> +       BIT_FIELD_REFs accordingly.
> +
> +   TODO:
> +    1) The current implementation restrict all candidate VECTORs should have
> +       the same VECTOR type, but it can be extended into different groups by
> +       VECTOR types in future if any profitable cases found.
> +    2) The current implementation requires the whole VECTORs should be fully
> +       covered, but it can be extended to support partial, checking adjacent
> +       but not fill the whole, it may need some cost model to define the
> +       boundary to do or not.
> +*/
> +static bool
> +undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops,
> +			     struct loop *loop)
> +{
> +  if (ops->length () <= 1)
> +    return false;
> +
> +  if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR
> +      && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR)
> +    return false;
> +
> +  hash_map<tree, v_info_ptr> v_info_map;
> +  operand_entry *oe1;
> +  unsigned i;
> +
> +  /* Find those summands from VECTOR BIT_FIELD_REF in addition chain, put the
> +     information into map.  */
> +  FOR_EACH_VEC_ELT (*ops, i, oe1)
> +    {
> +      enum tree_code dcode;
> +      gimple *oe1def;
> +
> +      if (TREE_CODE (oe1->op) != SSA_NAME)
> +	continue;
> +      oe1def = SSA_NAME_DEF_STMT (oe1->op);
> +      if (!is_gimple_assign (oe1def))
> +	continue;
> +      dcode = gimple_assign_rhs_code (oe1def);
> +      if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop))
> +	continue;
> +
> +      tree rhs = gimple_op (oe1def, 1);
> +      tree op0 = TREE_OPERAND (rhs, 0);
> +      tree vec_type = TREE_TYPE (op0);
> +
> +      if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE)
> +	continue;
> +
> +      tree op1 = TREE_OPERAND (rhs, 1);
> +      tree op2 = TREE_OPERAND (rhs, 2);
> +
> +      tree elem_type = TREE_TYPE (vec_type);
> +      unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> +      if (size != TREE_INT_CST_LOW (op1))
> +	continue;
> +
> +      /* Ignore it if target machine can't support this VECTOR type.  */
> +      if (!VECTOR_MODE_P (TYPE_MODE (vec_type)))
> +	continue;

I don't think this is sufficient.  I think you need to use something like

  (targetm.vector_mode_supported_p (TYPE_MODE (vec_type))
   && optab_handler (add_optab, TYPE_MODE (vec_type)) != CODE_FOR_nothing)

I might not have spelled all of this right.  There are examples of testing
for target support in tree-vect-stmts.c.

Thanks,
Bill

> +
> +      v_info_ptr *info_ptr = v_info_map.get (op0);
> +      if (info_ptr)
> +	{
> +	  v_info_ptr info = *info_ptr;
> +	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
> +	  info->ops_indexes.safe_push (i);
> +	}
> +      else
> +	{
> +	  v_info_ptr info = new v_info;
> +	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
> +	  info->ops_indexes.safe_push (i);
> +	  v_info_map.put (op0, info);
> +	}
> +    }
> +
> +  /* At least two VECTOR to combine.  */
> +  if (v_info_map.elements () <= 1)
> +    {
> +      cleanup_vinfo_map (v_info_map);
> +      return false;
> +    }
> +
> +  /* Use the first VECTOR and its information as the reference.
> +     Firstly, we should validate it, that is:
> +       1) required VECTOR operation supported on target machine.
> +       2) sorted offsets are adjacent, no holes.
> +       3) can fill the whole VECTOR perfectly.  */
> +  hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin ();
> +  tree ref_vec = (*it).first;
> +  tree vec_type = TREE_TYPE (ref_vec);
> +  tree elem_type = TREE_TYPE (vec_type);
> +
> +  /* Check VECTOR operation available on target machine.  */
> +  optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector);
> +  if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing)
> +    {
> +      cleanup_vinfo_map (v_info_map);
> +      return false;
> +    }
> +
> +  v_info_ptr ref_info = (*it).second;
> +  ref_info->offsets.qsort (unsigned_cmp);
> +  unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> +  unsigned HOST_WIDE_INT curr;
> +  unsigned HOST_WIDE_INT prev = ref_info->offsets[0];
> +
> +  /* Continous check.  */
> +  FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1)
> +    {
> +      if (curr != (prev + elem_size))
> +	{
> +	  cleanup_vinfo_map (v_info_map);
> +	  return false;
> +	}
> +      prev = curr;
> +    }
> +
> +  /* Check whether fill the whole.  */
> +  if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec))))
> +    {
> +      cleanup_vinfo_map (v_info_map);
> +      return false;
> +    }
> +
> +  auto_vec<tree> vectors (v_info_map.elements ());
> +  vectors.quick_push (ref_vec);
> +
> +  /* Use the ref_vec to filter others.  */
> +  for (++it; it != v_info_map.end (); ++it)
> +    {
> +      tree vec = (*it).first;
> +      v_info_ptr info = (*it).second;
> +      if (TREE_TYPE (ref_vec) != TREE_TYPE (vec))
> +	continue;
> +      if (ref_info->offsets.length () != info->offsets.length ())
> +	continue;
> +      bool same_offset = true;
> +      info->offsets.qsort (unsigned_cmp);
> +      for (unsigned i = 0; i < ref_info->offsets.length (); i++)
> +	{
> +	  if (ref_info->offsets[i] != info->offsets[i])
> +	    {
> +	      same_offset = false;
> +	      break;
> +	    }
> +	}
> +      if (!same_offset)
> +	continue;
> +      vectors.quick_push (vec);
> +    }
> +
> +  if (vectors.length () < 2)
> +    {
> +      cleanup_vinfo_map (v_info_map);
> +      return false;
> +    }
> +
> +  tree tr;
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "The bit_field_ref vector list for undistribute: ");
> +      FOR_EACH_VEC_ELT (vectors, i, tr)
> +	{
> +	  print_generic_expr (dump_file, tr);
> +	  fprintf (dump_file, "  ");
> +	}
> +      fprintf (dump_file, "\n");
> +    }
> +
> +  /* Build the sum for all candidate VECTORs.  */
> +  unsigned idx;
> +  gimple *sum = NULL;
> +  v_info_ptr info;
> +  tree sum_vec = ref_vec;
> +  FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1)
> +    {
> +      sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode);
> +      info = *(v_info_map.get (tr));
> +      unsigned j;
> +      FOR_EACH_VEC_ELT (info->ops_indexes, j, idx)
> +	{
> +	  gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
> +	  gimple_set_visited (def, true);
> +	  if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR
> +	      || opcode == BIT_IOR_EXPR)
> +	    (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op));
> +	  else if (opcode == MULT_EXPR)
> +	    (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op));
> +	  else
> +	    {
> +	      gcc_assert (opcode == BIT_AND_EXPR);
> +	      (*ops)[idx]->op
> +		= build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op));
> +	    }
> +	  (*ops)[idx]->rank = 0;
> +	}
> +      sum_vec = gimple_get_lhs (sum);
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	{
> +	  fprintf (dump_file, "Generating addition -> ");
> +	  print_gimple_stmt (dump_file, sum, 0);
> +	}
> +    }
> +
> +  /* Referring to any good shape VECTOR (here using ref_vec), generate the
> +     BIT_FIELD_REF statements accordingly.  */
> +  info = *(v_info_map.get (ref_vec));
> +  gcc_assert (sum);
> +  FOR_EACH_VEC_ELT (info->ops_indexes, i, idx)
> +    {
> +      tree dst = make_ssa_name (elem_type);
> +      gimple *gs
> +	= gimple_build_assign (dst, BIT_FIELD_REF,
> +			       build3 (BIT_FIELD_REF, elem_type, sum_vec,
> +				       TYPE_SIZE (elem_type),
> +				       bitsize_int (info->offsets[i])));
> +      insert_stmt_after (gs, sum);
> +      update_stmt (gs);
> +      gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
> +      gimple_set_visited (def, true);
> +      (*ops)[idx]->op = gimple_assign_lhs (gs);
> +      (*ops)[idx]->rank = get_rank ((*ops)[idx]->op);
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	{
> +	  fprintf (dump_file, "Generating bit_field_ref -> ");
> +	  print_gimple_stmt (dump_file, gs, 0);
> +	}
> +    }
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "undistributiong bit_field_ref for vector done.\n");
> +    }
> +
> +  cleanup_vinfo_map (v_info_map);
> +
> +  return true;
> +}
> +
>  /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison
>     expression, examine the other OPS to see if any of them are comparisons
>     of the same values, which we may be able to combine or eliminate.
> @@ -5880,11 +6172,6 @@ reassociate_bb (basic_block bb)
>  	  tree lhs, rhs1, rhs2;
>  	  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
>  
> -	  /* If this is not a gimple binary expression, there is
> -	     nothing for us to do with it.  */
> -	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
> -	    continue;
> -
>  	  /* If this was part of an already processed statement,
>  	     we don't need to touch it again. */
>  	  if (gimple_visited_p (stmt))
> @@ -5911,6 +6198,11 @@ reassociate_bb (basic_block bb)
>  	      continue;
>  	    }
>  
> +	  /* If this is not a gimple binary expression, there is
> +	     nothing for us to do with it.  */
> +	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
> +	    continue;
> +
>  	  lhs = gimple_assign_lhs (stmt);
>  	  rhs1 = gimple_assign_rhs1 (stmt);
>  	  rhs2 = gimple_assign_rhs2 (stmt);
> @@ -5950,6 +6242,13 @@ reassociate_bb (basic_block bb)
>  		  optimize_ops_list (rhs_code, &ops);
>  		}
>  
> +	      if (undistribute_bitref_for_vector (rhs_code, &ops,
> +						  loop_containing_stmt (stmt)))
> +		{
> +		  ops.qsort (sort_by_operand_rank);
> +		  optimize_ops_list (rhs_code, &ops);
> +		}
> +
>  	      if (rhs_code == PLUS_EXPR
>  		  && transform_add_to_multiply (&ops))
>  		ops.qsort (sort_by_operand_rank);
Bill Schmidt March 12, 2019, 3:22 p.m. UTC | #2
On 3/12/19 9:29 AM, Bill Schmidt wrote:
> On 3/12/19 7:57 AM, Kewen.Lin wrote:
>> Hi,
>>
>> As the comments from Richard and Segher (Thanks!), I've made some 
>> changes by adding some checks and extensions. 
>>   *) Check whether vector type available on target machine,
>>   *) Check whether vector operation available on target machine, 
>>   *) Extend the vector operation support to MULT/BIT_AND/BIT_IOR/BIT_XOR.
>>   *) Add more test cases for coverage.
>>
>> Re-bootstrapped/re-regtested successfully on powerpc64le-linux-gnu, 
>> is it ok for GCC10?
>>
>> gcc/ChangeLog
>>
>> 2019-03-12  Kewen Lin  <linkw@gcc.gnu.org>
>>
>> 	PR target/88497
>> 	* tree-ssa-reassoc.c (reassociate_bb): Swap the positions of 
>> 	GIMPLE_BINARY_RHS check and gimple_visited_p check, call new 
>> 	function undistribute_bitref_for_vector.
>> 	(undistribute_bitref_for_vector): New function.
>> 	(cleanup_vinfo_map): Likewise.
>> 	(unsigned_cmp): Likewise.
>>
>> gcc/testsuite/ChangeLog
>>
>> 2019-03-12  Kewen Lin  <linkw@gcc.gnu.org>
>>
>> 	* gcc.dg/tree-ssa/pr88497-1.c: New test.
>> 	* gcc.dg/tree-ssa/pr88497-2.c: Likewise.
>> 	* gcc.dg/tree-ssa/pr88497-3.c: Likewise.
>> 	* gcc.dg/tree-ssa/pr88497-4.c: Likewise.
>> 	* gcc.dg/tree-ssa/pr88497-5.c: Likewise.
>> ---
>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c |  42 ++++
>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c |  31 +++
>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c |  31 +++
>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c |  31 +++
>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c |  31 +++
>>  gcc/tree-ssa-reassoc.c                    | 309 +++++++++++++++++++++++++++++-
>>  6 files changed, 470 insertions(+), 5 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
>>
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
>> new file mode 100644
>> index 0000000..c87caff
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
>> @@ -0,0 +1,42 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target vect_double } */
>> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
>> +
>> +/* To test reassoc can undistribute vector bit_field_ref summation.
>> +
>> +   arg1 and arg2 are two arrays whose elements of type vector double.
>> +   Assuming:
>> +     A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3],
>> +     B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3],
>> +
>> +   Then:
>> +     V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3,
>> +
>> +   reassoc transforms
>> +
>> +     accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1]
>> +		  + V3[0] + V3[1];
>> +
>> +   into:
>> +
>> +     T = V0 + V1 + V2 + V3
>> +     accumulator += T[0] + T[1];
>> +
>> +   Fewer bit_field_refs, only two for 128 or more bits vector.  */
>> +
>> +typedef double v2df __attribute__ ((vector_size (16)));
>> +double
>> +test (double accumulator, v2df arg1[], v2df arg2[])
>> +{
>> +  v2df temp;
>> +  temp = arg1[0] * arg2[0];
>> +  accumulator += temp[0] + temp[1];
>> +  temp = arg1[1] * arg2[1];
>> +  accumulator += temp[0] + temp[1];
>> +  temp = arg1[2] * arg2[2];
>> +  accumulator += temp[0] + temp[1];
>> +  temp = arg1[3] * arg2[3];
>> +  accumulator += temp[0] + temp[1];
>> +  return accumulator;
>> +}
>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
>> new file mode 100644
>> index 0000000..d1851ff
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
>> @@ -0,0 +1,31 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target vect_float } */
>> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
>> +
>> +/* To test reassoc can undistribute vector bit_field_ref on multiplication.
>> +
>> +   v1, v2, v3, v4 of type vector float.
>> +
>> +   reassoc transforms
>> +
>> +     accumulator *= v1[0] * v1[1] * v1[2] * v1[3] *
>> +                    v2[0] * v2[1] * v2[2] * v2[3] *
>> +                    v3[0] * v3[1] * v3[2] * v3[3] *
>> +                    v4[0] * v4[1] * v4[2] * v4[3] ;
>> +
>> +   into:
>> +
>> +     T = v1 * v2 * v3 * v4;
>> +     accumulator *= T[0] * T[1] * T[2] * T[3];
>> +
>> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
>> +
>> +typedef float v4si __attribute__((vector_size(16)));
>> +float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
>> +  accumulator *= v1[0] * v1[1] * v1[2] * v1[3];
>> +  accumulator *= v2[0] * v2[1] * v2[2] * v2[3];
>> +  accumulator *= v3[0] * v3[1] * v3[2] * v3[3];
>> +  accumulator *= v4[0] * v4[1] * v4[2] * v4[3];
>> +  return accumulator;
>> +}
>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
>> new file mode 100644
>> index 0000000..e774d25
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
>> @@ -0,0 +1,31 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target vect_int } */
>> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */
>> +
>> +/* To test reassoc can undistribute vector bit_field_ref on bitwise AND.
>> +
>> +   v1, v2, v3, v4 of type vector int.
>> +
>> +   reassoc transforms
>> +
>> +     accumulator &= v1[0] & v1[1] & v1[2] & v1[3] &
>> +                    v2[0] & v2[1] & v2[2] & v2[3] &
>> +                    v3[0] & v3[1] & v3[2] & v3[3] &
>> +                    v4[0] & v4[1] & v4[2] & v4[3] ;
>> +
>> +   into:
>> +
>> +     T = v1 & v2 & v3 & v4;
>> +     accumulator &= T[0] & T[1] & T[2] & T[3];
>> +
>> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
>> +
>> +typedef int v4si __attribute__((vector_size(16)));
>> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
>> +  accumulator &= v1[0] & v1[1] & v1[2] & v1[3];
>> +  accumulator &= v2[0] & v2[1] & v2[2] & v2[3];
>> +  accumulator &= v3[0] & v3[1] & v3[2] & v3[3];
>> +  accumulator &= v4[0] & v4[1] & v4[2] & v4[3];
>> +  return accumulator;
>> +}
>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
>> new file mode 100644
>> index 0000000..7b75404
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
>> @@ -0,0 +1,31 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target vect_int } */
>> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */
>> +
>> +/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR.
>> +
>> +   v1, v2, v3, v4 of type vector int.
>> +
>> +   reassoc transforms
>> +
>> +     accumulator |= v1[0] | v1[1] | v1[2] | v1[3] |
>> +                    v2[0] | v2[1] | v2[2] | v2[3] |
>> +                    v3[0] | v3[1] | v3[2] | v3[3] |
>> +                    v4[0] | v4[1] | v4[2] | v4[3] ;
>> +
>> +   into:
>> +
>> +     T = v1 | v2 | v3 | v4;
>> +     accumulator |= T[0] | T[1] | T[2] | T[3];
>> +
>> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
>> +
>> +typedef int v4si __attribute__((vector_size(16)));
>> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
>> +  accumulator |= v1[0] | v1[1] | v1[2] | v1[3];
>> +  accumulator |= v2[0] | v2[1] | v2[2] | v2[3];
>> +  accumulator |= v3[0] | v3[1] | v3[2] | v3[3];
>> +  accumulator |= v4[0] | v4[1] | v4[2] | v4[3];
>> +  return accumulator;
>> +}
>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
>> new file mode 100644
>> index 0000000..d069725
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
>> @@ -0,0 +1,31 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target vect_int } */
>> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
>> +
>> +/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR.
>> +
>> +   v1, v2, v3, v4 of type vector int.
>> +
>> +   reassoc transforms
>> +
>> +     accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^
>> +                    v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^
>> +                    v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^
>> +                    v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ;
>> +
>> +   into:
>> +
>> +     T = v1 ^ v2 ^ v3 ^ v4;
>> +     accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3];
>> +
>> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
>> +
>> +typedef int v4si __attribute__((vector_size(16)));
>> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
>> +  accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3];
>> +  accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3];
>> +  accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3];
>> +  accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
>> +  return accumulator;
>> +}
>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
>> diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
>> index e1c4dfe..d755911 100644
>> --- a/gcc/tree-ssa-reassoc.c
>> +++ b/gcc/tree-ssa-reassoc.c
>> @@ -1772,6 +1772,298 @@ undistribute_ops_list (enum tree_code opcode,
>>    return changed;
>>  }
>>  
>> +/* Hold the information of one specific VECTOR_TYPE SSA_NAME.
>> +    - offsets: for different BIT_FIELD_REF offsets accessing same VECTOR.
>> +    - ops_indexes: the index of vec ops* for each relavant BIT_FIELD_REF.  */
>> +struct v_info
>> +{
>> +  auto_vec<unsigned HOST_WIDE_INT, 32> offsets;
>> +  auto_vec<unsigned, 32> ops_indexes;
>> +};
>> +
>> +typedef struct v_info *v_info_ptr;
>> +
>> +/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets.  */
>> +static int
>> +unsigned_cmp (const void *p_i, const void *p_j)
>> +{
>> +  if (*(const unsigned *) p_i >= *(const unsigned *) p_j)
>> +    return 1;
>> +  else
>> +    return -1;
>> +}
>> +
>> +/* Cleanup hash map for VECTOR information.  */
>> +static void
>> +cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map)
>> +{
>> +  for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin ();
>> +       it != info_map.end (); ++it)
>> +    {
>> +      v_info_ptr info = (*it).second;
>> +      delete info;
>> +      (*it).second = NULL;
>> +    }
>> +}
>> +
>> +/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE.
>> +     V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k]
>> +   is transformed to
>> +     Vs = (V1 + V2 + ... + Vn)
>> +     Vs[0] + Vs[1] + ... + Vs[k]
>> +
>> +   The basic steps are listed below:
>> +
>> +    1) Check the addition chain *OPS by looking those summands coming from
>> +       VECTOR bit_field_ref on VECTOR type. Put the information into
>> +       v_info_map for each satisfied summand, using VECTOR SSA_NAME as key.
>> +
>> +    2) For each key (VECTOR SSA_NAME), validate all its BIT_FIELD_REFs are
>> +       continous, they can cover the whole VECTOR perfectly without any holes.
>> +       Obtain one VECTOR list which contain candidates to be transformed.
>> +
>> +    3) Build the addition statements for all VECTOR candidates, generate
>> +       BIT_FIELD_REFs accordingly.
>> +
>> +   TODO:
>> +    1) The current implementation restrict all candidate VECTORs should have
>> +       the same VECTOR type, but it can be extended into different groups by
>> +       VECTOR types in future if any profitable cases found.
>> +    2) The current implementation requires the whole VECTORs should be fully
>> +       covered, but it can be extended to support partial, checking adjacent
>> +       but not fill the whole, it may need some cost model to define the
>> +       boundary to do or not.
>> +*/
>> +static bool
>> +undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops,
>> +			     struct loop *loop)
>> +{
>> +  if (ops->length () <= 1)
>> +    return false;
>> +
>> +  if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR
>> +      && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR)
>> +    return false;
>> +
>> +  hash_map<tree, v_info_ptr> v_info_map;
>> +  operand_entry *oe1;
>> +  unsigned i;
>> +
>> +  /* Find those summands from VECTOR BIT_FIELD_REF in addition chain, put the
>> +     information into map.  */
>> +  FOR_EACH_VEC_ELT (*ops, i, oe1)
>> +    {
>> +      enum tree_code dcode;
>> +      gimple *oe1def;
>> +
>> +      if (TREE_CODE (oe1->op) != SSA_NAME)
>> +	continue;
>> +      oe1def = SSA_NAME_DEF_STMT (oe1->op);
>> +      if (!is_gimple_assign (oe1def))
>> +	continue;
>> +      dcode = gimple_assign_rhs_code (oe1def);
>> +      if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop))
>> +	continue;
>> +
>> +      tree rhs = gimple_op (oe1def, 1);
>> +      tree op0 = TREE_OPERAND (rhs, 0);
>> +      tree vec_type = TREE_TYPE (op0);
>> +
>> +      if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE)
>> +	continue;
>> +
>> +      tree op1 = TREE_OPERAND (rhs, 1);
>> +      tree op2 = TREE_OPERAND (rhs, 2);
>> +
>> +      tree elem_type = TREE_TYPE (vec_type);
>> +      unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
>> +      if (size != TREE_INT_CST_LOW (op1))
>> +	continue;
>> +
>> +      /* Ignore it if target machine can't support this VECTOR type.  */
>> +      if (!VECTOR_MODE_P (TYPE_MODE (vec_type)))
>> +	continue;
> I don't think this is sufficient.  I think you need to use something like
>
>   (targetm.vector_mode_supported_p (TYPE_MODE (vec_type))
>    && optab_handler (add_optab, TYPE_MODE (vec_type)) != CODE_FOR_nothing)
>
> I might not have spelled all of this right.  There are examples of testing
> for target support in tree-vect-stmts.c.

As we discussed offline, I see now that what you have is correct -- it's just
not as clear or efficient as I would like.  I think it's better to check the
type support explicitly, and do the optab check up front also, so that you
can exit quickly if this doesn't apply.

But please test!  I haven't tried this at all. ;-)

Thanks,
Bill

>
> Thanks,
> Bill
>> +
>> +      v_info_ptr *info_ptr = v_info_map.get (op0);
>> +      if (info_ptr)
>> +	{
>> +	  v_info_ptr info = *info_ptr;
>> +	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
>> +	  info->ops_indexes.safe_push (i);
>> +	}
>> +      else
>> +	{
>> +	  v_info_ptr info = new v_info;
>> +	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
>> +	  info->ops_indexes.safe_push (i);
>> +	  v_info_map.put (op0, info);
>> +	}
>> +    }
>> +
>> +  /* At least two VECTOR to combine.  */
>> +  if (v_info_map.elements () <= 1)
>> +    {
>> +      cleanup_vinfo_map (v_info_map);
>> +      return false;
>> +    }
>> +
>> +  /* Use the first VECTOR and its information as the reference.
>> +     Firstly, we should validate it, that is:
>> +       1) required VECTOR operation supported on target machine.
>> +       2) sorted offsets are adjacent, no holes.
>> +       3) can fill the whole VECTOR perfectly.  */
>> +  hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin ();
>> +  tree ref_vec = (*it).first;
>> +  tree vec_type = TREE_TYPE (ref_vec);
>> +  tree elem_type = TREE_TYPE (vec_type);
>> +
>> +  /* Check VECTOR operation available on target machine.  */
>> +  optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector);
>> +  if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing)
>> +    {
>> +      cleanup_vinfo_map (v_info_map);
>> +      return false;
>> +    }
>> +
>> +  v_info_ptr ref_info = (*it).second;
>> +  ref_info->offsets.qsort (unsigned_cmp);
>> +  unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
>> +  unsigned HOST_WIDE_INT curr;
>> +  unsigned HOST_WIDE_INT prev = ref_info->offsets[0];
>> +
>> +  /* Continous check.  */
>> +  FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1)
>> +    {
>> +      if (curr != (prev + elem_size))
>> +	{
>> +	  cleanup_vinfo_map (v_info_map);
>> +	  return false;
>> +	}
>> +      prev = curr;
>> +    }
>> +
>> +  /* Check whether fill the whole.  */
>> +  if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec))))
>> +    {
>> +      cleanup_vinfo_map (v_info_map);
>> +      return false;
>> +    }
>> +
>> +  auto_vec<tree> vectors (v_info_map.elements ());
>> +  vectors.quick_push (ref_vec);
>> +
>> +  /* Use the ref_vec to filter others.  */
>> +  for (++it; it != v_info_map.end (); ++it)
>> +    {
>> +      tree vec = (*it).first;
>> +      v_info_ptr info = (*it).second;
>> +      if (TREE_TYPE (ref_vec) != TREE_TYPE (vec))
>> +	continue;
>> +      if (ref_info->offsets.length () != info->offsets.length ())
>> +	continue;
>> +      bool same_offset = true;
>> +      info->offsets.qsort (unsigned_cmp);
>> +      for (unsigned i = 0; i < ref_info->offsets.length (); i++)
>> +	{
>> +	  if (ref_info->offsets[i] != info->offsets[i])
>> +	    {
>> +	      same_offset = false;
>> +	      break;
>> +	    }
>> +	}
>> +      if (!same_offset)
>> +	continue;
>> +      vectors.quick_push (vec);
>> +    }
>> +
>> +  if (vectors.length () < 2)
>> +    {
>> +      cleanup_vinfo_map (v_info_map);
>> +      return false;
>> +    }
>> +
>> +  tree tr;
>> +  if (dump_file && (dump_flags & TDF_DETAILS))
>> +    {
>> +      fprintf (dump_file, "The bit_field_ref vector list for undistribute: ");
>> +      FOR_EACH_VEC_ELT (vectors, i, tr)
>> +	{
>> +	  print_generic_expr (dump_file, tr);
>> +	  fprintf (dump_file, "  ");
>> +	}
>> +      fprintf (dump_file, "\n");
>> +    }
>> +
>> +  /* Build the sum for all candidate VECTORs.  */
>> +  unsigned idx;
>> +  gimple *sum = NULL;
>> +  v_info_ptr info;
>> +  tree sum_vec = ref_vec;
>> +  FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1)
>> +    {
>> +      sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode);
>> +      info = *(v_info_map.get (tr));
>> +      unsigned j;
>> +      FOR_EACH_VEC_ELT (info->ops_indexes, j, idx)
>> +	{
>> +	  gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
>> +	  gimple_set_visited (def, true);
>> +	  if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR
>> +	      || opcode == BIT_IOR_EXPR)
>> +	    (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op));
>> +	  else if (opcode == MULT_EXPR)
>> +	    (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op));
>> +	  else
>> +	    {
>> +	      gcc_assert (opcode == BIT_AND_EXPR);
>> +	      (*ops)[idx]->op
>> +		= build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op));
>> +	    }
>> +	  (*ops)[idx]->rank = 0;
>> +	}
>> +      sum_vec = gimple_get_lhs (sum);
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	{
>> +	  fprintf (dump_file, "Generating addition -> ");
>> +	  print_gimple_stmt (dump_file, sum, 0);
>> +	}
>> +    }
>> +
>> +  /* Referring to any good shape VECTOR (here using ref_vec), generate the
>> +     BIT_FIELD_REF statements accordingly.  */
>> +  info = *(v_info_map.get (ref_vec));
>> +  gcc_assert (sum);
>> +  FOR_EACH_VEC_ELT (info->ops_indexes, i, idx)
>> +    {
>> +      tree dst = make_ssa_name (elem_type);
>> +      gimple *gs
>> +	= gimple_build_assign (dst, BIT_FIELD_REF,
>> +			       build3 (BIT_FIELD_REF, elem_type, sum_vec,
>> +				       TYPE_SIZE (elem_type),
>> +				       bitsize_int (info->offsets[i])));
>> +      insert_stmt_after (gs, sum);
>> +      update_stmt (gs);
>> +      gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
>> +      gimple_set_visited (def, true);
>> +      (*ops)[idx]->op = gimple_assign_lhs (gs);
>> +      (*ops)[idx]->rank = get_rank ((*ops)[idx]->op);
>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>> +	{
>> +	  fprintf (dump_file, "Generating bit_field_ref -> ");
>> +	  print_gimple_stmt (dump_file, gs, 0);
>> +	}
>> +    }
>> +
>> +  if (dump_file && (dump_flags & TDF_DETAILS))
>> +    {
>> +      fprintf (dump_file, "undistributiong bit_field_ref for vector done.\n");
>> +    }
>> +
>> +  cleanup_vinfo_map (v_info_map);
>> +
>> +  return true;
>> +}
>> +
>>  /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison
>>     expression, examine the other OPS to see if any of them are comparisons
>>     of the same values, which we may be able to combine or eliminate.
>> @@ -5880,11 +6172,6 @@ reassociate_bb (basic_block bb)
>>  	  tree lhs, rhs1, rhs2;
>>  	  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
>>  
>> -	  /* If this is not a gimple binary expression, there is
>> -	     nothing for us to do with it.  */
>> -	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
>> -	    continue;
>> -
>>  	  /* If this was part of an already processed statement,
>>  	     we don't need to touch it again. */
>>  	  if (gimple_visited_p (stmt))
>> @@ -5911,6 +6198,11 @@ reassociate_bb (basic_block bb)
>>  	      continue;
>>  	    }
>>  
>> +	  /* If this is not a gimple binary expression, there is
>> +	     nothing for us to do with it.  */
>> +	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
>> +	    continue;
>> +
>>  	  lhs = gimple_assign_lhs (stmt);
>>  	  rhs1 = gimple_assign_rhs1 (stmt);
>>  	  rhs2 = gimple_assign_rhs2 (stmt);
>> @@ -5950,6 +6242,13 @@ reassociate_bb (basic_block bb)
>>  		  optimize_ops_list (rhs_code, &ops);
>>  		}
>>  
>> +	      if (undistribute_bitref_for_vector (rhs_code, &ops,
>> +						  loop_containing_stmt (stmt)))
>> +		{
>> +		  ops.qsort (sort_by_operand_rank);
>> +		  optimize_ops_list (rhs_code, &ops);
>> +		}
>> +
>>  	      if (rhs_code == PLUS_EXPR
>>  		  && transform_add_to_multiply (&ops))
>>  		ops.qsort (sort_by_operand_rank);
>
Kewen.Lin March 12, 2019, 3:43 p.m. UTC | #3
on 2019/3/12 下午11:22, Bill Schmidt wrote:
> On 3/12/19 9:29 AM, Bill Schmidt wrote:
>> On 3/12/19 7:57 AM, Kewen.Lin wrote:
>>> Hi,
>>>
>>> As the comments from Richard and Segher (Thanks!), I've made some 
>>> changes by adding some checks and extensions. 
>>>   *) Check whether vector type available on target machine,
>>>   *) Check whether vector operation available on target machine, 
>>>   *) Extend the vector operation support to MULT/BIT_AND/BIT_IOR/BIT_XOR.
>>>   *) Add more test cases for coverage.
>>>
>>> Re-bootstrapped/re-regtested successfully on powerpc64le-linux-gnu, 
>>> is it ok for GCC10?
>>>
>>> gcc/ChangeLog
>>>
>>> 2019-03-12  Kewen Lin  <linkw@gcc.gnu.org>
>>>
>>> 	PR target/88497
>>> 	* tree-ssa-reassoc.c (reassociate_bb): Swap the positions of 
>>> 	GIMPLE_BINARY_RHS check and gimple_visited_p check, call new 
>>> 	function undistribute_bitref_for_vector.
>>> 	(undistribute_bitref_for_vector): New function.
>>> 	(cleanup_vinfo_map): Likewise.
>>> 	(unsigned_cmp): Likewise.
>>>
>>> gcc/testsuite/ChangeLog
>>>
>>> 2019-03-12  Kewen Lin  <linkw@gcc.gnu.org>
>>>
>>> 	* gcc.dg/tree-ssa/pr88497-1.c: New test.
>>> 	* gcc.dg/tree-ssa/pr88497-2.c: Likewise.
>>> 	* gcc.dg/tree-ssa/pr88497-3.c: Likewise.
>>> 	* gcc.dg/tree-ssa/pr88497-4.c: Likewise.
>>> 	* gcc.dg/tree-ssa/pr88497-5.c: Likewise.
>>> ---
>>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c |  42 ++++
>>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c |  31 +++
>>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c |  31 +++
>>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c |  31 +++
>>>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c |  31 +++
>>>  gcc/tree-ssa-reassoc.c                    | 309 +++++++++++++++++++++++++++++-
>>>  6 files changed, 470 insertions(+), 5 deletions(-)
>>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
>>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
>>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
>>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
>>>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
>>>
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
>>> new file mode 100644
>>> index 0000000..c87caff
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
>>> @@ -0,0 +1,42 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target vect_double } */
>>> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
>>> +
>>> +/* To test reassoc can undistribute vector bit_field_ref summation.
>>> +
>>> +   arg1 and arg2 are two arrays whose elements of type vector double.
>>> +   Assuming:
>>> +     A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3],
>>> +     B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3],
>>> +
>>> +   Then:
>>> +     V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3,
>>> +
>>> +   reassoc transforms
>>> +
>>> +     accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1]
>>> +		  + V3[0] + V3[1];
>>> +
>>> +   into:
>>> +
>>> +     T = V0 + V1 + V2 + V3
>>> +     accumulator += T[0] + T[1];
>>> +
>>> +   Fewer bit_field_refs, only two for 128 or more bits vector.  */
>>> +
>>> +typedef double v2df __attribute__ ((vector_size (16)));
>>> +double
>>> +test (double accumulator, v2df arg1[], v2df arg2[])
>>> +{
>>> +  v2df temp;
>>> +  temp = arg1[0] * arg2[0];
>>> +  accumulator += temp[0] + temp[1];
>>> +  temp = arg1[1] * arg2[1];
>>> +  accumulator += temp[0] + temp[1];
>>> +  temp = arg1[2] * arg2[2];
>>> +  accumulator += temp[0] + temp[1];
>>> +  temp = arg1[3] * arg2[3];
>>> +  accumulator += temp[0] + temp[1];
>>> +  return accumulator;
>>> +}
>>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
>>> new file mode 100644
>>> index 0000000..d1851ff
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
>>> @@ -0,0 +1,31 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target vect_float } */
>>> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
>>> +
>>> +/* To test reassoc can undistribute vector bit_field_ref on multiplication.
>>> +
>>> +   v1, v2, v3, v4 of type vector float.
>>> +
>>> +   reassoc transforms
>>> +
>>> +     accumulator *= v1[0] * v1[1] * v1[2] * v1[3] *
>>> +                    v2[0] * v2[1] * v2[2] * v2[3] *
>>> +                    v3[0] * v3[1] * v3[2] * v3[3] *
>>> +                    v4[0] * v4[1] * v4[2] * v4[3] ;
>>> +
>>> +   into:
>>> +
>>> +     T = v1 * v2 * v3 * v4;
>>> +     accumulator *= T[0] * T[1] * T[2] * T[3];
>>> +
>>> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
>>> +
>>> +typedef float v4si __attribute__((vector_size(16)));
>>> +float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
>>> +  accumulator *= v1[0] * v1[1] * v1[2] * v1[3];
>>> +  accumulator *= v2[0] * v2[1] * v2[2] * v2[3];
>>> +  accumulator *= v3[0] * v3[1] * v3[2] * v3[3];
>>> +  accumulator *= v4[0] * v4[1] * v4[2] * v4[3];
>>> +  return accumulator;
>>> +}
>>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
>>> new file mode 100644
>>> index 0000000..e774d25
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
>>> @@ -0,0 +1,31 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target vect_int } */
>>> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */
>>> +
>>> +/* To test reassoc can undistribute vector bit_field_ref on bitwise AND.
>>> +
>>> +   v1, v2, v3, v4 of type vector int.
>>> +
>>> +   reassoc transforms
>>> +
>>> +     accumulator &= v1[0] & v1[1] & v1[2] & v1[3] &
>>> +                    v2[0] & v2[1] & v2[2] & v2[3] &
>>> +                    v3[0] & v3[1] & v3[2] & v3[3] &
>>> +                    v4[0] & v4[1] & v4[2] & v4[3] ;
>>> +
>>> +   into:
>>> +
>>> +     T = v1 & v2 & v3 & v4;
>>> +     accumulator &= T[0] & T[1] & T[2] & T[3];
>>> +
>>> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
>>> +
>>> +typedef int v4si __attribute__((vector_size(16)));
>>> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
>>> +  accumulator &= v1[0] & v1[1] & v1[2] & v1[3];
>>> +  accumulator &= v2[0] & v2[1] & v2[2] & v2[3];
>>> +  accumulator &= v3[0] & v3[1] & v3[2] & v3[3];
>>> +  accumulator &= v4[0] & v4[1] & v4[2] & v4[3];
>>> +  return accumulator;
>>> +}
>>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
>>> new file mode 100644
>>> index 0000000..7b75404
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
>>> @@ -0,0 +1,31 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target vect_int } */
>>> +/* { dg-options "-O2 -fdump-tree-reassoc1" } */
>>> +
>>> +/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR.
>>> +
>>> +   v1, v2, v3, v4 of type vector int.
>>> +
>>> +   reassoc transforms
>>> +
>>> +     accumulator |= v1[0] | v1[1] | v1[2] | v1[3] |
>>> +                    v2[0] | v2[1] | v2[2] | v2[3] |
>>> +                    v3[0] | v3[1] | v3[2] | v3[3] |
>>> +                    v4[0] | v4[1] | v4[2] | v4[3] ;
>>> +
>>> +   into:
>>> +
>>> +     T = v1 | v2 | v3 | v4;
>>> +     accumulator |= T[0] | T[1] | T[2] | T[3];
>>> +
>>> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
>>> +
>>> +typedef int v4si __attribute__((vector_size(16)));
>>> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
>>> +  accumulator |= v1[0] | v1[1] | v1[2] | v1[3];
>>> +  accumulator |= v2[0] | v2[1] | v2[2] | v2[3];
>>> +  accumulator |= v3[0] | v3[1] | v3[2] | v3[3];
>>> +  accumulator |= v4[0] | v4[1] | v4[2] | v4[3];
>>> +  return accumulator;
>>> +}
>>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
>>> new file mode 100644
>>> index 0000000..d069725
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
>>> @@ -0,0 +1,31 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target vect_int } */
>>> +/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
>>> +
>>> +/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR.
>>> +
>>> +   v1, v2, v3, v4 of type vector int.
>>> +
>>> +   reassoc transforms
>>> +
>>> +     accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^
>>> +                    v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^
>>> +                    v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^
>>> +                    v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ;
>>> +
>>> +   into:
>>> +
>>> +     T = v1 ^ v2 ^ v3 ^ v4;
>>> +     accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3];
>>> +
>>> +   Fewer bit_field_refs, only four for 128 or more bits vector.  */
>>> +
>>> +typedef int v4si __attribute__((vector_size(16)));
>>> +int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
>>> +  accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3];
>>> +  accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3];
>>> +  accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3];
>>> +  accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
>>> +  return accumulator;
>>> +}
>>> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
>>> diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
>>> index e1c4dfe..d755911 100644
>>> --- a/gcc/tree-ssa-reassoc.c
>>> +++ b/gcc/tree-ssa-reassoc.c
>>> @@ -1772,6 +1772,298 @@ undistribute_ops_list (enum tree_code opcode,
>>>    return changed;
>>>  }
>>>  
>>> +/* Hold the information of one specific VECTOR_TYPE SSA_NAME.
>>> +    - offsets: for different BIT_FIELD_REF offsets accessing same VECTOR.
>>> +    - ops_indexes: the index of vec ops* for each relavant BIT_FIELD_REF.  */
>>> +struct v_info
>>> +{
>>> +  auto_vec<unsigned HOST_WIDE_INT, 32> offsets;
>>> +  auto_vec<unsigned, 32> ops_indexes;
>>> +};
>>> +
>>> +typedef struct v_info *v_info_ptr;
>>> +
>>> +/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets.  */
>>> +static int
>>> +unsigned_cmp (const void *p_i, const void *p_j)
>>> +{
>>> +  if (*(const unsigned *) p_i >= *(const unsigned *) p_j)
>>> +    return 1;
>>> +  else
>>> +    return -1;
>>> +}
>>> +
>>> +/* Cleanup hash map for VECTOR information.  */
>>> +static void
>>> +cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map)
>>> +{
>>> +  for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin ();
>>> +       it != info_map.end (); ++it)
>>> +    {
>>> +      v_info_ptr info = (*it).second;
>>> +      delete info;
>>> +      (*it).second = NULL;
>>> +    }
>>> +}
>>> +
>>> +/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE.
>>> +     V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k]
>>> +   is transformed to
>>> +     Vs = (V1 + V2 + ... + Vn)
>>> +     Vs[0] + Vs[1] + ... + Vs[k]
>>> +
>>> +   The basic steps are listed below:
>>> +
>>> +    1) Check the addition chain *OPS by looking those summands coming from
>>> +       VECTOR bit_field_ref on VECTOR type. Put the information into
>>> +       v_info_map for each satisfied summand, using VECTOR SSA_NAME as key.
>>> +
>>> +    2) For each key (VECTOR SSA_NAME), validate all its BIT_FIELD_REFs are
>>> +       continous, they can cover the whole VECTOR perfectly without any holes.
>>> +       Obtain one VECTOR list which contain candidates to be transformed.
>>> +
>>> +    3) Build the addition statements for all VECTOR candidates, generate
>>> +       BIT_FIELD_REFs accordingly.
>>> +
>>> +   TODO:
>>> +    1) The current implementation restrict all candidate VECTORs should have
>>> +       the same VECTOR type, but it can be extended into different groups by
>>> +       VECTOR types in future if any profitable cases found.
>>> +    2) The current implementation requires the whole VECTORs should be fully
>>> +       covered, but it can be extended to support partial, checking adjacent
>>> +       but not fill the whole, it may need some cost model to define the
>>> +       boundary to do or not.
>>> +*/
>>> +static bool
>>> +undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops,
>>> +			     struct loop *loop)
>>> +{
>>> +  if (ops->length () <= 1)
>>> +    return false;
>>> +
>>> +  if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR
>>> +      && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR)
>>> +    return false;
>>> +
>>> +  hash_map<tree, v_info_ptr> v_info_map;
>>> +  operand_entry *oe1;
>>> +  unsigned i;
>>> +
>>> +  /* Find those summands from VECTOR BIT_FIELD_REF in addition chain, put the
>>> +     information into map.  */
>>> +  FOR_EACH_VEC_ELT (*ops, i, oe1)
>>> +    {
>>> +      enum tree_code dcode;
>>> +      gimple *oe1def;
>>> +
>>> +      if (TREE_CODE (oe1->op) != SSA_NAME)
>>> +	continue;
>>> +      oe1def = SSA_NAME_DEF_STMT (oe1->op);
>>> +      if (!is_gimple_assign (oe1def))
>>> +	continue;
>>> +      dcode = gimple_assign_rhs_code (oe1def);
>>> +      if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop))
>>> +	continue;
>>> +
>>> +      tree rhs = gimple_op (oe1def, 1);
>>> +      tree op0 = TREE_OPERAND (rhs, 0);
>>> +      tree vec_type = TREE_TYPE (op0);
>>> +
>>> +      if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE)
>>> +	continue;
>>> +
>>> +      tree op1 = TREE_OPERAND (rhs, 1);
>>> +      tree op2 = TREE_OPERAND (rhs, 2);
>>> +
>>> +      tree elem_type = TREE_TYPE (vec_type);
>>> +      unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
>>> +      if (size != TREE_INT_CST_LOW (op1))
>>> +	continue;
>>> +
>>> +      /* Ignore it if target machine can't support this VECTOR type.  */
>>> +      if (!VECTOR_MODE_P (TYPE_MODE (vec_type)))
>>> +	continue;
>> I don't think this is sufficient.  I think you need to use something like
>>
>>   (targetm.vector_mode_supported_p (TYPE_MODE (vec_type))
>>    && optab_handler (add_optab, TYPE_MODE (vec_type)) != CODE_FOR_nothing)
>>
>> I might not have spelled all of this right.  There are examples of testing
>> for target support in tree-vect-stmts.c.
> 
> As we discussed offline, I see now that what you have is correct -- it's just
> not as clear or efficient as I would like.  I think it's better to check the
> type support explicitly, and do the optab check up front also, so that you
> can exit quickly if this doesn't apply.
> 
> But please test!  I haven't tried this at all. ;-)
> 
> Thanks,
> Bill
> 

Thanks Bill!  I put the vector operation check as part of validating ref 
vector.  You are right, it's better to move it up to vector type check.

>> Thanks,
>> Bill
>>> +
>>> +      v_info_ptr *info_ptr = v_info_map.get (op0);
>>> +      if (info_ptr)
>>> +	{
>>> +	  v_info_ptr info = *info_ptr;
>>> +	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
>>> +	  info->ops_indexes.safe_push (i);
>>> +	}
>>> +      else
>>> +	{
>>> +	  v_info_ptr info = new v_info;
>>> +	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
>>> +	  info->ops_indexes.safe_push (i);
>>> +	  v_info_map.put (op0, info);
>>> +	}
>>> +    }
>>> +
>>> +  /* At least two VECTOR to combine.  */
>>> +  if (v_info_map.elements () <= 1)
>>> +    {
>>> +      cleanup_vinfo_map (v_info_map);
>>> +      return false;
>>> +    }
>>> +
>>> +  /* Use the first VECTOR and its information as the reference.
>>> +     Firstly, we should validate it, that is:
>>> +       1) required VECTOR operation supported on target machine.
>>> +       2) sorted offsets are adjacent, no holes.
>>> +       3) can fill the whole VECTOR perfectly.  */
>>> +  hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin ();
>>> +  tree ref_vec = (*it).first;
>>> +  tree vec_type = TREE_TYPE (ref_vec);
>>> +  tree elem_type = TREE_TYPE (vec_type);
>>> +
>>> +  /* Check VECTOR operation available on target machine.  */
>>> +  optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector);
>>> +  if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing)
     
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

>>> +    {
>>> +      cleanup_vinfo_map (v_info_map);
>>> +      return false;
>>> +    }
>>> +
>>> +  v_info_ptr ref_info = (*it).second;
>>> +  ref_info->offsets.qsort (unsigned_cmp);
>>> +  unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
>>> +  unsigned HOST_WIDE_INT curr;
>>> +  unsigned HOST_WIDE_INT prev = ref_info->offsets[0];
>>> +
>>> +  /* Continous check.  */
>>> +  FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1)
>>> +    {
>>> +      if (curr != (prev + elem_size))
>>> +	{
>>> +	  cleanup_vinfo_map (v_info_map);
>>> +	  return false;
>>> +	}
>>> +      prev = curr;
>>> +    }
>>> +
>>> +  /* Check whether fill the whole.  */
>>> +  if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec))))
>>> +    {
>>> +      cleanup_vinfo_map (v_info_map);
>>> +      return false;
>>> +    }
>>> +
>>> +  auto_vec<tree> vectors (v_info_map.elements ());
>>> +  vectors.quick_push (ref_vec);
>>> +
>>> +  /* Use the ref_vec to filter others.  */
>>> +  for (++it; it != v_info_map.end (); ++it)
>>> +    {
>>> +      tree vec = (*it).first;
>>> +      v_info_ptr info = (*it).second;
>>> +      if (TREE_TYPE (ref_vec) != TREE_TYPE (vec))
>>> +	continue;
>>> +      if (ref_info->offsets.length () != info->offsets.length ())
>>> +	continue;
>>> +      bool same_offset = true;
>>> +      info->offsets.qsort (unsigned_cmp);
>>> +      for (unsigned i = 0; i < ref_info->offsets.length (); i++)
>>> +	{
>>> +	  if (ref_info->offsets[i] != info->offsets[i])
>>> +	    {
>>> +	      same_offset = false;
>>> +	      break;
>>> +	    }
>>> +	}
>>> +      if (!same_offset)
>>> +	continue;
>>> +      vectors.quick_push (vec);
>>> +    }
>>> +
>>> +  if (vectors.length () < 2)
>>> +    {
>>> +      cleanup_vinfo_map (v_info_map);
>>> +      return false;
>>> +    }
>>> +
>>> +  tree tr;
>>> +  if (dump_file && (dump_flags & TDF_DETAILS))
>>> +    {
>>> +      fprintf (dump_file, "The bit_field_ref vector list for undistribute: ");
>>> +      FOR_EACH_VEC_ELT (vectors, i, tr)
>>> +	{
>>> +	  print_generic_expr (dump_file, tr);
>>> +	  fprintf (dump_file, "  ");
>>> +	}
>>> +      fprintf (dump_file, "\n");
>>> +    }
>>> +
>>> +  /* Build the sum for all candidate VECTORs.  */
>>> +  unsigned idx;
>>> +  gimple *sum = NULL;
>>> +  v_info_ptr info;
>>> +  tree sum_vec = ref_vec;
>>> +  FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1)
>>> +    {
>>> +      sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode);
>>> +      info = *(v_info_map.get (tr));
>>> +      unsigned j;
>>> +      FOR_EACH_VEC_ELT (info->ops_indexes, j, idx)
>>> +	{
>>> +	  gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
>>> +	  gimple_set_visited (def, true);
>>> +	  if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR
>>> +	      || opcode == BIT_IOR_EXPR)
>>> +	    (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op));
>>> +	  else if (opcode == MULT_EXPR)
>>> +	    (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op));
>>> +	  else
>>> +	    {
>>> +	      gcc_assert (opcode == BIT_AND_EXPR);
>>> +	      (*ops)[idx]->op
>>> +		= build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op));
>>> +	    }
>>> +	  (*ops)[idx]->rank = 0;
>>> +	}
>>> +      sum_vec = gimple_get_lhs (sum);
>>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>>> +	{
>>> +	  fprintf (dump_file, "Generating addition -> ");
>>> +	  print_gimple_stmt (dump_file, sum, 0);
>>> +	}
>>> +    }
>>> +
>>> +  /* Referring to any good shape VECTOR (here using ref_vec), generate the
>>> +     BIT_FIELD_REF statements accordingly.  */
>>> +  info = *(v_info_map.get (ref_vec));
>>> +  gcc_assert (sum);
>>> +  FOR_EACH_VEC_ELT (info->ops_indexes, i, idx)
>>> +    {
>>> +      tree dst = make_ssa_name (elem_type);
>>> +      gimple *gs
>>> +	= gimple_build_assign (dst, BIT_FIELD_REF,
>>> +			       build3 (BIT_FIELD_REF, elem_type, sum_vec,
>>> +				       TYPE_SIZE (elem_type),
>>> +				       bitsize_int (info->offsets[i])));
>>> +      insert_stmt_after (gs, sum);
>>> +      update_stmt (gs);
>>> +      gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
>>> +      gimple_set_visited (def, true);
>>> +      (*ops)[idx]->op = gimple_assign_lhs (gs);
>>> +      (*ops)[idx]->rank = get_rank ((*ops)[idx]->op);
>>> +      if (dump_file && (dump_flags & TDF_DETAILS))
>>> +	{
>>> +	  fprintf (dump_file, "Generating bit_field_ref -> ");
>>> +	  print_gimple_stmt (dump_file, gs, 0);
>>> +	}
>>> +    }
>>> +
>>> +  if (dump_file && (dump_flags & TDF_DETAILS))
>>> +    {
>>> +      fprintf (dump_file, "undistributiong bit_field_ref for vector done.\n");
>>> +    }
>>> +
>>> +  cleanup_vinfo_map (v_info_map);
>>> +
>>> +  return true;
>>> +}
>>> +
>>>  /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison
>>>     expression, examine the other OPS to see if any of them are comparisons
>>>     of the same values, which we may be able to combine or eliminate.
>>> @@ -5880,11 +6172,6 @@ reassociate_bb (basic_block bb)
>>>  	  tree lhs, rhs1, rhs2;
>>>  	  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
>>>  
>>> -	  /* If this is not a gimple binary expression, there is
>>> -	     nothing for us to do with it.  */
>>> -	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
>>> -	    continue;
>>> -
>>>  	  /* If this was part of an already processed statement,
>>>  	     we don't need to touch it again. */
>>>  	  if (gimple_visited_p (stmt))
>>> @@ -5911,6 +6198,11 @@ reassociate_bb (basic_block bb)
>>>  	      continue;
>>>  	    }
>>>  
>>> +	  /* If this is not a gimple binary expression, there is
>>> +	     nothing for us to do with it.  */
>>> +	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
>>> +	    continue;
>>> +
>>>  	  lhs = gimple_assign_lhs (stmt);
>>>  	  rhs1 = gimple_assign_rhs1 (stmt);
>>>  	  rhs2 = gimple_assign_rhs2 (stmt);
>>> @@ -5950,6 +6242,13 @@ reassociate_bb (basic_block bb)
>>>  		  optimize_ops_list (rhs_code, &ops);
>>>  		}
>>>  
>>> +	      if (undistribute_bitref_for_vector (rhs_code, &ops,
>>> +						  loop_containing_stmt (stmt)))
>>> +		{
>>> +		  ops.qsort (sort_by_operand_rank);
>>> +		  optimize_ops_list (rhs_code, &ops);
>>> +		}
>>> +
>>>  	      if (rhs_code == PLUS_EXPR
>>>  		  && transform_add_to_multiply (&ops))
>>>  		ops.qsort (sort_by_operand_rank);
>>
>
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
new file mode 100644
index 0000000..c87caff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
@@ -0,0 +1,42 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
+
+/* To test reassoc can undistribute vector bit_field_ref summation.
+
+   arg1 and arg2 are two arrays whose elements of type vector double.
+   Assuming:
+     A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3],
+     B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3],
+
+   Then:
+     V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3,
+
+   reassoc transforms
+
+     accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1]
+		  + V3[0] + V3[1];
+
+   into:
+
+     T = V0 + V1 + V2 + V3
+     accumulator += T[0] + T[1];
+
+   Fewer bit_field_refs, only two for 128 or more bits vector.  */
+
+typedef double v2df __attribute__ ((vector_size (16)));
+double
+test (double accumulator, v2df arg1[], v2df arg2[])
+{
+  v2df temp;
+  temp = arg1[0] * arg2[0];
+  accumulator += temp[0] + temp[1];
+  temp = arg1[1] * arg2[1];
+  accumulator += temp[0] + temp[1];
+  temp = arg1[2] * arg2[2];
+  accumulator += temp[0] + temp[1];
+  temp = arg1[3] * arg2[3];
+  accumulator += temp[0] + temp[1];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
new file mode 100644
index 0000000..d1851ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
+
+/* To test reassoc can undistribute vector bit_field_ref on multiplication.
+
+   v1, v2, v3, v4 of type vector float.
+
+   reassoc transforms
+
+     accumulator *= v1[0] * v1[1] * v1[2] * v1[3] *
+                    v2[0] * v2[1] * v2[2] * v2[3] *
+                    v3[0] * v3[1] * v3[2] * v3[3] *
+                    v4[0] * v4[1] * v4[2] * v4[3] ;
+
+   into:
+
+     T = v1 * v2 * v3 * v4;
+     accumulator *= T[0] * T[1] * T[2] * T[3];
+
+   Fewer bit_field_refs, only four for 128 or more bits vector.  */
+
+typedef float v4si __attribute__((vector_size(16)));
+float test(float accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+  accumulator *= v1[0] * v1[1] * v1[2] * v1[3];
+  accumulator *= v2[0] * v2[1] * v2[2] * v2[3];
+  accumulator *= v3[0] * v3[1] * v3[2] * v3[3];
+  accumulator *= v4[0] * v4[1] * v4[2] * v4[3];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
new file mode 100644
index 0000000..e774d25
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -fdump-tree-reassoc1" } */
+
+/* To test reassoc can undistribute vector bit_field_ref on bitwise AND.
+
+   v1, v2, v3, v4 of type vector int.
+
+   reassoc transforms
+
+     accumulator &= v1[0] & v1[1] & v1[2] & v1[3] &
+                    v2[0] & v2[1] & v2[2] & v2[3] &
+                    v3[0] & v3[1] & v3[2] & v3[3] &
+                    v4[0] & v4[1] & v4[2] & v4[3] ;
+
+   into:
+
+     T = v1 & v2 & v3 & v4;
+     accumulator &= T[0] & T[1] & T[2] & T[3];
+
+   Fewer bit_field_refs, only four for 128 or more bits vector.  */
+
+typedef int v4si __attribute__((vector_size(16)));
+int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+  accumulator &= v1[0] & v1[1] & v1[2] & v1[3];
+  accumulator &= v2[0] & v2[1] & v2[2] & v2[3];
+  accumulator &= v3[0] & v3[1] & v3[2] & v3[3];
+  accumulator &= v4[0] & v4[1] & v4[2] & v4[3];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
new file mode 100644
index 0000000..7b75404
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -fdump-tree-reassoc1" } */
+
+/* To test reassoc can undistribute vector bit_field_ref on bitwise IOR.
+
+   v1, v2, v3, v4 of type vector int.
+
+   reassoc transforms
+
+     accumulator |= v1[0] | v1[1] | v1[2] | v1[3] |
+                    v2[0] | v2[1] | v2[2] | v2[3] |
+                    v3[0] | v3[1] | v3[2] | v3[3] |
+                    v4[0] | v4[1] | v4[2] | v4[3] ;
+
+   into:
+
+     T = v1 | v2 | v3 | v4;
+     accumulator |= T[0] | T[1] | T[2] | T[3];
+
+   Fewer bit_field_refs, only four for 128 or more bits vector.  */
+
+typedef int v4si __attribute__((vector_size(16)));
+int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+  accumulator |= v1[0] | v1[1] | v1[2] | v1[3];
+  accumulator |= v2[0] | v2[1] | v2[2] | v2[3];
+  accumulator |= v3[0] | v3[1] | v3[2] | v3[3];
+  accumulator |= v4[0] | v4[1] | v4[2] | v4[3];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
new file mode 100644
index 0000000..d069725
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
+
+/* To test reassoc can undistribute vector bit_field_ref on bitwise XOR.
+
+   v1, v2, v3, v4 of type vector int.
+
+   reassoc transforms
+
+     accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^
+                    v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^
+                    v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^
+                    v4[0] ^ v4[1] ^ v4[2] ^ v4[3] ;
+
+   into:
+
+     T = v1 ^ v2 ^ v3 ^ v4;
+     accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3];
+
+   Fewer bit_field_refs, only four for 128 or more bits vector.  */
+
+typedef int v4si __attribute__((vector_size(16)));
+int test(int accumulator, v4si v1, v4si v2, v4si v3, v4si v4) {
+  accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3];
+  accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3];
+  accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3];
+  accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
+  return accumulator;
+}
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
index e1c4dfe..d755911 100644
--- a/gcc/tree-ssa-reassoc.c
+++ b/gcc/tree-ssa-reassoc.c
@@ -1772,6 +1772,298 @@  undistribute_ops_list (enum tree_code opcode,
   return changed;
 }
 
+/* Hold the information of one specific VECTOR_TYPE SSA_NAME.
+    - offsets: for different BIT_FIELD_REF offsets accessing same VECTOR.
+    - ops_indexes: the index of vec ops* for each relavant BIT_FIELD_REF.  */
+struct v_info
+{
+  auto_vec<unsigned HOST_WIDE_INT, 32> offsets;
+  auto_vec<unsigned, 32> ops_indexes;
+};
+
+typedef struct v_info *v_info_ptr;
+
+/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets.  */
+static int
+unsigned_cmp (const void *p_i, const void *p_j)
+{
+  if (*(const unsigned *) p_i >= *(const unsigned *) p_j)
+    return 1;
+  else
+    return -1;
+}
+
+/* Cleanup hash map for VECTOR information.  */
+static void
+cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map)
+{
+  for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin ();
+       it != info_map.end (); ++it)
+    {
+      v_info_ptr info = (*it).second;
+      delete info;
+      (*it).second = NULL;
+    }
+}
+
+/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE.
+     V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k]
+   is transformed to
+     Vs = (V1 + V2 + ... + Vn)
+     Vs[0] + Vs[1] + ... + Vs[k]
+
+   The basic steps are listed below:
+
+    1) Check the addition chain *OPS by looking those summands coming from
+       VECTOR bit_field_ref on VECTOR type. Put the information into
+       v_info_map for each satisfied summand, using VECTOR SSA_NAME as key.
+
+    2) For each key (VECTOR SSA_NAME), validate all its BIT_FIELD_REFs are
+       continous, they can cover the whole VECTOR perfectly without any holes.
+       Obtain one VECTOR list which contain candidates to be transformed.
+
+    3) Build the addition statements for all VECTOR candidates, generate
+       BIT_FIELD_REFs accordingly.
+
+   TODO:
+    1) The current implementation restrict all candidate VECTORs should have
+       the same VECTOR type, but it can be extended into different groups by
+       VECTOR types in future if any profitable cases found.
+    2) The current implementation requires the whole VECTORs should be fully
+       covered, but it can be extended to support partial, checking adjacent
+       but not fill the whole, it may need some cost model to define the
+       boundary to do or not.
+*/
+static bool
+undistribute_bitref_for_vector (enum tree_code opcode, vec<operand_entry *> *ops,
+			     struct loop *loop)
+{
+  if (ops->length () <= 1)
+    return false;
+
+  if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR
+      && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR)
+    return false;
+
+  hash_map<tree, v_info_ptr> v_info_map;
+  operand_entry *oe1;
+  unsigned i;
+
+  /* Find those summands from VECTOR BIT_FIELD_REF in addition chain, put the
+     information into map.  */
+  FOR_EACH_VEC_ELT (*ops, i, oe1)
+    {
+      enum tree_code dcode;
+      gimple *oe1def;
+
+      if (TREE_CODE (oe1->op) != SSA_NAME)
+	continue;
+      oe1def = SSA_NAME_DEF_STMT (oe1->op);
+      if (!is_gimple_assign (oe1def))
+	continue;
+      dcode = gimple_assign_rhs_code (oe1def);
+      if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop))
+	continue;
+
+      tree rhs = gimple_op (oe1def, 1);
+      tree op0 = TREE_OPERAND (rhs, 0);
+      tree vec_type = TREE_TYPE (op0);
+
+      if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE)
+	continue;
+
+      tree op1 = TREE_OPERAND (rhs, 1);
+      tree op2 = TREE_OPERAND (rhs, 2);
+
+      tree elem_type = TREE_TYPE (vec_type);
+      unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+      if (size != TREE_INT_CST_LOW (op1))
+	continue;
+
+      /* Ignore it if target machine can't support this VECTOR type.  */
+      if (!VECTOR_MODE_P (TYPE_MODE (vec_type)))
+	continue;
+
+      v_info_ptr *info_ptr = v_info_map.get (op0);
+      if (info_ptr)
+	{
+	  v_info_ptr info = *info_ptr;
+	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
+	  info->ops_indexes.safe_push (i);
+	}
+      else
+	{
+	  v_info_ptr info = new v_info;
+	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
+	  info->ops_indexes.safe_push (i);
+	  v_info_map.put (op0, info);
+	}
+    }
+
+  /* At least two VECTOR to combine.  */
+  if (v_info_map.elements () <= 1)
+    {
+      cleanup_vinfo_map (v_info_map);
+      return false;
+    }
+
+  /* Use the first VECTOR and its information as the reference.
+     Firstly, we should validate it, that is:
+       1) required VECTOR operation supported on target machine.
+       2) sorted offsets are adjacent, no holes.
+       3) can fill the whole VECTOR perfectly.  */
+  hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin ();
+  tree ref_vec = (*it).first;
+  tree vec_type = TREE_TYPE (ref_vec);
+  tree elem_type = TREE_TYPE (vec_type);
+
+  /* Check VECTOR operation available on target machine.  */
+  optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector);
+  if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing)
+    {
+      cleanup_vinfo_map (v_info_map);
+      return false;
+    }
+
+  v_info_ptr ref_info = (*it).second;
+  ref_info->offsets.qsort (unsigned_cmp);
+  unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+  unsigned HOST_WIDE_INT curr;
+  unsigned HOST_WIDE_INT prev = ref_info->offsets[0];
+
+  /* Continous check.  */
+  FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1)
+    {
+      if (curr != (prev + elem_size))
+	{
+	  cleanup_vinfo_map (v_info_map);
+	  return false;
+	}
+      prev = curr;
+    }
+
+  /* Check whether fill the whole.  */
+  if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec))))
+    {
+      cleanup_vinfo_map (v_info_map);
+      return false;
+    }
+
+  auto_vec<tree> vectors (v_info_map.elements ());
+  vectors.quick_push (ref_vec);
+
+  /* Use the ref_vec to filter others.  */
+  for (++it; it != v_info_map.end (); ++it)
+    {
+      tree vec = (*it).first;
+      v_info_ptr info = (*it).second;
+      if (TREE_TYPE (ref_vec) != TREE_TYPE (vec))
+	continue;
+      if (ref_info->offsets.length () != info->offsets.length ())
+	continue;
+      bool same_offset = true;
+      info->offsets.qsort (unsigned_cmp);
+      for (unsigned i = 0; i < ref_info->offsets.length (); i++)
+	{
+	  if (ref_info->offsets[i] != info->offsets[i])
+	    {
+	      same_offset = false;
+	      break;
+	    }
+	}
+      if (!same_offset)
+	continue;
+      vectors.quick_push (vec);
+    }
+
+  if (vectors.length () < 2)
+    {
+      cleanup_vinfo_map (v_info_map);
+      return false;
+    }
+
+  tree tr;
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "The bit_field_ref vector list for undistribute: ");
+      FOR_EACH_VEC_ELT (vectors, i, tr)
+	{
+	  print_generic_expr (dump_file, tr);
+	  fprintf (dump_file, "  ");
+	}
+      fprintf (dump_file, "\n");
+    }
+
+  /* Build the sum for all candidate VECTORs.  */
+  unsigned idx;
+  gimple *sum = NULL;
+  v_info_ptr info;
+  tree sum_vec = ref_vec;
+  FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1)
+    {
+      sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode);
+      info = *(v_info_map.get (tr));
+      unsigned j;
+      FOR_EACH_VEC_ELT (info->ops_indexes, j, idx)
+	{
+	  gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
+	  gimple_set_visited (def, true);
+	  if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR
+	      || opcode == BIT_IOR_EXPR)
+	    (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op));
+	  else if (opcode == MULT_EXPR)
+	    (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op));
+	  else
+	    {
+	      gcc_assert (opcode == BIT_AND_EXPR);
+	      (*ops)[idx]->op
+		= build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op));
+	    }
+	  (*ops)[idx]->rank = 0;
+	}
+      sum_vec = gimple_get_lhs (sum);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Generating addition -> ");
+	  print_gimple_stmt (dump_file, sum, 0);
+	}
+    }
+
+  /* Referring to any good shape VECTOR (here using ref_vec), generate the
+     BIT_FIELD_REF statements accordingly.  */
+  info = *(v_info_map.get (ref_vec));
+  gcc_assert (sum);
+  FOR_EACH_VEC_ELT (info->ops_indexes, i, idx)
+    {
+      tree dst = make_ssa_name (elem_type);
+      gimple *gs
+	= gimple_build_assign (dst, BIT_FIELD_REF,
+			       build3 (BIT_FIELD_REF, elem_type, sum_vec,
+				       TYPE_SIZE (elem_type),
+				       bitsize_int (info->offsets[i])));
+      insert_stmt_after (gs, sum);
+      update_stmt (gs);
+      gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
+      gimple_set_visited (def, true);
+      (*ops)[idx]->op = gimple_assign_lhs (gs);
+      (*ops)[idx]->rank = get_rank ((*ops)[idx]->op);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "Generating bit_field_ref -> ");
+	  print_gimple_stmt (dump_file, gs, 0);
+	}
+    }
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "undistributiong bit_field_ref for vector done.\n");
+    }
+
+  cleanup_vinfo_map (v_info_map);
+
+  return true;
+}
+
 /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison
    expression, examine the other OPS to see if any of them are comparisons
    of the same values, which we may be able to combine or eliminate.
@@ -5880,11 +6172,6 @@  reassociate_bb (basic_block bb)
 	  tree lhs, rhs1, rhs2;
 	  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
 
-	  /* If this is not a gimple binary expression, there is
-	     nothing for us to do with it.  */
-	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
-	    continue;
-
 	  /* If this was part of an already processed statement,
 	     we don't need to touch it again. */
 	  if (gimple_visited_p (stmt))
@@ -5911,6 +6198,11 @@  reassociate_bb (basic_block bb)
 	      continue;
 	    }
 
+	  /* If this is not a gimple binary expression, there is
+	     nothing for us to do with it.  */
+	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
+	    continue;
+
 	  lhs = gimple_assign_lhs (stmt);
 	  rhs1 = gimple_assign_rhs1 (stmt);
 	  rhs2 = gimple_assign_rhs2 (stmt);
@@ -5950,6 +6242,13 @@  reassociate_bb (basic_block bb)
 		  optimize_ops_list (rhs_code, &ops);
 		}
 
+	      if (undistribute_bitref_for_vector (rhs_code, &ops,
+						  loop_containing_stmt (stmt)))
+		{
+		  ops.qsort (sort_by_operand_rank);
+		  optimize_ops_list (rhs_code, &ops);
+		}
+
 	      if (rhs_code == PLUS_EXPR
 		  && transform_add_to_multiply (&ops))
 		ops.qsort (sort_by_operand_rank);