
Handle different bit_not_p in store merging (PR tree-optimization/78821)

Message ID 20171110135936.GM14653@tucnak
State New
Series Handle different bit_not_p in store merging (PR tree-optimization/78821)

Commit Message

Jakub Jelinek Nov. 10, 2017, 1:59 p.m. UTC
Hi!

This is something Uros requested in the PR, at least with BIT_NOT_EXPRs
it is easy.  Previous store merging changes required that bit_not_p
is equal on all stores in the group (in all 3 spots, i.e. on the result
of BIT_{AND,IOR,XOR}_EXPR and on both of the operands).

This patch handles mixed values of that flag.  If none of the
orig_stores have a particular bit_not_p set, then as before nothing
is inverted; if all of them have it set, then as before we apply
BIT_NOT_EXPR to the particular SSA_NAME; and, newly, if there is a mix
of false and true in a particular bit_not_p, we compute a mask and emit
a BIT_XOR_EXPR with it, which inverts only the bits that had bit_not_p
set and leaves the others untouched.
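
As an illustration (hand-written C sketching the idea, not the GIMPLE
the pass actually emits; struct S, f1_before and f1_after are made-up
names here, and the 0x00ff mask assumes little-endian byte order), two
adjacent narrow stores where only the first one is inverted can now be
merged into a single wider store xored with a constant mask:

struct S { unsigned char a, b; };

/* Before merging: only the store to a is inverted.  */
void
f1_before (struct S *__restrict p, struct S *__restrict q)
{
  p->a = ~q->a;
  p->b = q->b;
}

/* After merging, roughly: one 16-bit load, one BIT_XOR_EXPR with a
   mask that has all bits set for the inverted byte and clear for the
   other one (0xff00 instead of 0x00ff on big-endian), one 16-bit
   store.  */
void
f1_after (struct S *__restrict p, struct S *__restrict q)
{
  unsigned short tmp;
  __builtin_memcpy (&tmp, &q->a, sizeof tmp);  /* merged load */
  tmp ^= 0x00ff;                               /* xor with the computed mask */
  __builtin_memcpy (&p->a, &tmp, sizeof tmp);  /* merged store */
}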

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-11-10  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/78821
	* gimple-ssa-store-merging.c (compatible_load_p): Don't require
	that bit_not_p is the same.
	(imm_store_chain_info::coalesce_immediate_stores): Likewise.
	(split_group): Count precisely bit_not_p bits in each statement.
	(invert_op): New function.
	(imm_store_chain_info::output_merged_store): Use invert_op to
	emit BIT_XOR_EXPR with a xor_mask instead of BIT_NOT_EXPR if some
	but not all orig_stores have BIT_NOT_EXPR in the corresponding spots.

	* gcc.dg/store_merging_15.c: New test.


	Jakub

Comments

Kyrill Tkachov Nov. 10, 2017, 4:51 p.m. UTC | #1
Hi Jakub,

On 10/11/17 13:59, Jakub Jelinek wrote:
> Hi!
>
> This is something Uros requested in the PR, at least with BIT_NOT_EXPRs
> it is easy.  Previous store merging changes required that bit_not_p
> is equal on all stores in the group (in all 3 spots, i.e. on the result
> of BIT_{AND,IOR,XOR}_EXPR and on both of the operands).
>
> This patch handles mixed values of that flag.  If none of the
> orig_stores have a particular bit_not_p set, then as before nothing
> is inverted; if all of them have it set, then as before we apply
> BIT_NOT_EXPR to the particular SSA_NAME; and, newly, if there is a mix
> of false and true in a particular bit_not_p, we compute a mask and emit
> a BIT_XOR_EXPR with it, which inverts only the bits that had bit_not_p
> set and leaves the others untouched.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>

Might be worth doing a sanity check test run on a big-endian target
since there is a bit of BYTES_BIG_ENDIAN logic in the patch (though it 
looks correct to me).

Thanks,
Kyrill

> 2017-11-10  Jakub Jelinek  <jakub@redhat.com>
>
>         PR tree-optimization/78821
>         * gimple-ssa-store-merging.c (compatible_load_p): Don't require
>         that bit_not_p is the same.
>         (imm_store_chain_info::coalesce_immediate_stores): Likewise.
>         (split_group): Count precisely bit_not_p bits in each statement.
>         (invert_op): New function.
>         (imm_store_chain_info::output_merged_store): Use invert_op to
>         emit BIT_XOR_EXPR with a xor_mask instead of BIT_NOT_EXPR if some
>         but not all orig_stores have BIT_NOT_EXPR in the corresponding 
> spots.
>
>         * gcc.dg/store_merging_15.c: New test.
>
> --- gcc/gimple-ssa-store-merging.c.jj   2017-11-10 08:04:49.000000000 
> +0100
> +++ gcc/gimple-ssa-store-merging.c      2017-11-10 10:53:13.152731543 
> +0100
> @@ -1036,7 +1036,6 @@ compatible_load_p (merged_store_group *m
>  {
>    store_immediate_info *infof = merged_store->stores[0];
>    if (!info->ops[idx].base_addr
> -      || info->ops[idx].bit_not_p != infof->ops[idx].bit_not_p
>        || (info->ops[idx].bitpos - infof->ops[idx].bitpos
>            != info->bitpos - infof->bitpos)
>        || !operand_equal_p (info->ops[idx].base_addr,
> @@ -1176,8 +1175,7 @@ imm_store_chain_info::coalesce_immediate
>           Merge it into the current store group.  There can be gaps in 
> between
>           the stores, but there can't be gaps in between bitregions.  */
>        else if (info->bitregion_start <= merged_store->bitregion_end
> -              && info->rhs_code == merged_store->stores[0]->rhs_code
> -              && info->bit_not_p == merged_store->stores[0]->bit_not_p)
> +              && info->rhs_code == merged_store->stores[0]->rhs_code)
>          {
>            store_immediate_info *infof = merged_store->stores[0];
>
> @@ -1496,16 +1494,14 @@ split_group (merged_store_group *group,
>        total_orig[0] = 1; /* The orig store.  */
>        info = group->stores[0];
>        if (info->ops[0].base_addr)
> -       total_orig[0] += 1 + info->ops[0].bit_not_p;
> +       total_orig[0]++;
>        if (info->ops[1].base_addr)
> -       total_orig[0] += 1 + info->ops[1].bit_not_p;
> +       total_orig[0]++;
>        switch (info->rhs_code)
>          {
>          case BIT_AND_EXPR:
>          case BIT_IOR_EXPR:
>          case BIT_XOR_EXPR:
> -         if (info->bit_not_p)
> -           total_orig[0]++; /* The orig BIT_NOT_EXPR stmt. */
>            total_orig[0]++; /* The orig BIT_*_EXPR stmt.  */
>            break;
>          default:
> @@ -1514,7 +1510,12 @@ split_group (merged_store_group *group,
>        total_orig[0] *= group->stores.length ();
>
>        FOR_EACH_VEC_ELT (group->stores, i, info)
> -       total_new[0] += count_multiple_uses (info);
> +       {
> +         total_new[0] += count_multiple_uses (info);
> +         total_orig[0] += (info->bit_not_p
> +                           + info->ops[0].bit_not_p
> +                           + info->ops[1].bit_not_p);
> +       }
>      }
>
>    if (!allow_unaligned_load)
> @@ -1654,13 +1655,13 @@ split_group (merged_store_group *group,
>
>    if (total_orig)
>      {
> +      unsigned int i;
> +      struct split_store *store;
>        /* If we are reusing some original stores and any of the
>           original SSA_NAMEs had multiple uses, we need to subtract
>           those now before we add the new ones.  */
>        if (total_new[0] && any_orig)
>          {
> -         unsigned int i;
> -         struct split_store *store;
>            FOR_EACH_VEC_ELT (*split_stores, i, store)
>              if (store->orig)
>                total_new[0] -= count_multiple_uses 
> (store->orig_stores[0]);
> @@ -1668,26 +1669,105 @@ split_group (merged_store_group *group,
>        total_new[0] += ret; /* The new store.  */
>        store_immediate_info *info = group->stores[0];
>        if (info->ops[0].base_addr)
> -       total_new[0] += ret * (1 + info->ops[0].bit_not_p);
> +       total_new[0] += ret;
>        if (info->ops[1].base_addr)
> -       total_new[0] += ret * (1 + info->ops[1].bit_not_p);
> +       total_new[0] += ret;
>        switch (info->rhs_code)
>          {
>          case BIT_AND_EXPR:
>          case BIT_IOR_EXPR:
>          case BIT_XOR_EXPR:
> -         if (info->bit_not_p)
> -           total_new[0] += ret; /* The new BIT_NOT_EXPR stmt.  */
>            total_new[0] += ret; /* The new BIT_*_EXPR stmt. */
>            break;
>          default:
>            break;
>          }
> +      FOR_EACH_VEC_ELT (*split_stores, i, store)
> +       {
> +         unsigned int j;
> +         bool bit_not_p[3] = { false, false, false };
> +         /* If all orig_stores have certain bit_not_p set, then
> +            we'd use a BIT_NOT_EXPR stmt and need to account for it.
> +            If some orig_stores have certain bit_not_p set, then
> +            we'd use a BIT_XOR_EXPR with a mask and need to account for
> +            it.  */
> +         FOR_EACH_VEC_ELT (store->orig_stores, j, info)
> +           {
> +             if (info->ops[0].bit_not_p)
> +               bit_not_p[0] = true;
> +             if (info->ops[1].bit_not_p)
> +               bit_not_p[1] = true;
> +             if (info->bit_not_p)
> +               bit_not_p[2] = true;
> +           }
> +         total_new[0] += bit_not_p[0] + bit_not_p[1] + bit_not_p[2];
> +       }
> +
>      }
>
>    return ret;
>  }
>
> +/* Return the operation through which the operand IDX (if < 2) or
> +   result (IDX == 2) should be inverted.  If NOP_EXPR, no inversion
> +   is done, if BIT_NOT_EXPR, all bits are inverted, if BIT_XOR_EXPR,
> +   the bits should be xored with mask.  */
> +
> +static enum tree_code
> +invert_op (split_store *split_store, int idx, tree int_type, tree &mask)
> +{
> +  unsigned int i;
> +  store_immediate_info *info;
> +  unsigned int cnt = 0;
> +  FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
> +    {
> +      bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : 
> info->bit_not_p;
> +      if (bit_not_p)
> +       ++cnt;
> +    }
> +  mask = NULL_TREE;
> +  if (cnt == 0)
> +    return NOP_EXPR;
> +  if (cnt == split_store->orig_stores.length ())
> +    return BIT_NOT_EXPR;
> +
> +  unsigned HOST_WIDE_INT try_bitpos = split_store->bytepos * 
> BITS_PER_UNIT;
> +  unsigned buf_size = split_store->size / BITS_PER_UNIT;
> +  unsigned char *buf
> +    = XALLOCAVEC (unsigned char, buf_size);
> +  memset (buf, ~0U, buf_size);
> +  FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
> +    {
> +      bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : 
> info->bit_not_p;
> +      if (!bit_not_p)
> +       continue;
> +      /* Clear regions with bit_not_p and invert afterwards, rather than
> +        clear regions with !bit_not_p, so that gaps in between stores 
> aren't
> +        set in the mask.  */
> +      unsigned HOST_WIDE_INT bitsize = info->bitsize;
> +      unsigned int pos_in_buffer = 0;
> +      if (info->bitpos < try_bitpos)
> +       {
> +         gcc_assert (info->bitpos + bitsize > try_bitpos);
> +         bitsize -= (try_bitpos - info->bitpos);
> +       }
> +      else
> +       pos_in_buffer = info->bitpos - try_bitpos;
> +      if (pos_in_buffer + bitsize > split_store->size)
> +       bitsize = split_store->size - pos_in_buffer;
> +      unsigned char *p = buf + (pos_in_buffer / BITS_PER_UNIT);
> +      if (BYTES_BIG_ENDIAN)
> +       clear_bit_region_be (p, (BITS_PER_UNIT - 1
> +                                - (pos_in_buffer % BITS_PER_UNIT)), 
> bitsize);
> +      else
> +       clear_bit_region (p, pos_in_buffer % BITS_PER_UNIT, bitsize);
> +    }
> +  for (unsigned int i = 0; i < buf_size; ++i)
> +    buf[i] = ~buf[i];
> +  mask = native_interpret_expr (int_type, buf, buf_size);
> +  return BIT_XOR_EXPR;
> +}
> +
>  /* Given a merged store group GROUP output the widened version of it.
>     The store chain is against the base object BASE.
>     Try store sizes of at most MAX_STORE_BITSIZE bits wide and don't 
> output
> @@ -1894,10 +1974,13 @@ imm_store_chain_info::output_merged_stor
>                        gimple_seq_add_stmt_without_update (&seq, stmt);
>                      }
>                    ops[j] = gimple_assign_lhs (stmt);
> -                 if (op.bit_not_p)
> +                 tree xor_mask;
> +                 enum tree_code inv_op
> +                   = invert_op (split_store, j, int_type, xor_mask);
> +                 if (inv_op != NOP_EXPR)
>                      {
>                        stmt = gimple_build_assign (make_ssa_name 
> (int_type),
> - BIT_NOT_EXPR, ops[j]);
> +                                                 inv_op, ops[j], 
> xor_mask);
>                        gimple_set_location (stmt, load_loc);
>                        ops[j] = gimple_assign_lhs (stmt);
>
> @@ -1947,10 +2030,13 @@ imm_store_chain_info::output_merged_stor
>                else
>                  gimple_seq_add_stmt_without_update (&seq, stmt);
>                src = gimple_assign_lhs (stmt);
> -             if (split_store->orig_stores[0]->bit_not_p)
> +             tree xor_mask;
> +             enum tree_code inv_op;
> +             inv_op = invert_op (split_store, 2, int_type, xor_mask);
> +             if (inv_op != NOP_EXPR)
>                  {
>                    stmt = gimple_build_assign (make_ssa_name (int_type),
> -                                             BIT_NOT_EXPR, src);
> +                                             inv_op, src, xor_mask);
>                    gimple_set_location (stmt, bit_loc);
>                    if (load_addr[1] == NULL_TREE && gsi_bb (load_gsi[0]))
>                      gimple_seq_add_stmt_without_update (&load_seq[0], 
> stmt);
> --- gcc/testsuite/gcc.dg/store_merging_15.c.jj  2017-11-10 
> 10:57:54.481369637 +0100
> +++ gcc/testsuite/gcc.dg/store_merging_15.c     2017-11-10 
> 11:14:12.762646259 +0100
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target store_merge } */
> +/* { dg-options "-O2 -fdump-tree-store-merging" } */
> +
> +struct S { unsigned char a, b; unsigned short c; unsigned char d, e, 
> f, g; unsigned long long h; };
> +
> +__attribute__((noipa)) void
> +f1 (struct S *__restrict p, struct S *__restrict q)
> +{
> +  p->a = ~q->a;
> +  p->b = q->b;
> +  p->c = ~q->c;
> +  p->d = ~q->d;
> +  p->e = q->e;
> +  p->f = ~q->f;
> +  p->g = ~q->g;
> +}
> +
> +__attribute__((noipa)) void
> +f2 (struct S *__restrict p, struct S *__restrict q)
> +{
> +  p->a = ~(unsigned char) (p->a & q->a);
> +  p->b = ((unsigned char) ~p->b) & q->b;
> +  p->c = p->c & (unsigned short) ~q->c;
> +  p->d = p->d & q->d;
> +  p->e = p->e & (unsigned char) ~q->e;
> +  p->f = p->f & (unsigned char) ~q->f;
> +  p->g = ~(unsigned char) (p->g & q->g);
> +}
> +
> +struct S s = { 20, 21, 22, 23, 24, 25, 26, 27 };
> +struct S u = { 28, 29, 30, 31, 32, 33, 34, 35 };
> +struct S v = { 36, 37, 38, 39, 40, 41, 42, 43 };
> +
> +int
> +main ()
> +{
> +  asm volatile ("" : : : "memory");
> +  f1 (&s, &u);
> +  asm volatile ("" : : : "memory");
> +  if (s.a != (unsigned char) ~28 || s.b != 29
> +      || s.c != (unsigned short) ~30 || s.d != (unsigned char) ~31
> +      || s.e != 32 || s.f != (unsigned char) ~33 || s.g != (unsigned 
> char) ~34
> +      || s.h != 27)
> +    __builtin_abort ();
> +  f2 (&u, &v);
> +  asm volatile ("" : : : "memory");
> +  if (u.a != (unsigned char) ~(28 & 36) || u.b != (((unsigned char) 
> ~29) & 37)
> +      || u.c != (30 & (unsigned short) ~38) || u.d != (31 & 39)
> +      || u.e != (32 & (unsigned char) ~40) || u.f != (33 & (unsigned 
> char) ~41)
> +      || u.g != (unsigned char) ~(34 & 42) || u.h != 35)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Merging successful" 2 
> "store-merging" } } */
>
>         Jakub
Jakub Jelinek Nov. 12, 2017, 9:17 p.m. UTC | #2
On Fri, Nov 10, 2017 at 04:51:19PM +0000, Kyrill Tkachov wrote:
> Hi Jakub,
> 
> On 10/11/17 13:59, Jakub Jelinek wrote:
> > This is something Uros requested in the PR, at least with BIT_NOT_EXPRs
> > it is easy.  Previous store merging changes required that bit_not_p
> > is equal on all stores in the group (in all 3 spots, i.e. on the result
> > of BIT_{AND,IOR,XOR}_EXPR and on both of the operands).
> > 
> > This patch handles mixed values of that flag.  If none of the
> > orig_stores have a particular bit_not_p set, then as before nothing
> > is inverted; if all of them have it set, then as before we apply
> > BIT_NOT_EXPR to the particular SSA_NAME; and, newly, if there is a mix
> > of false and true in a particular bit_not_p, we compute a mask and emit
> > a BIT_XOR_EXPR with it, which inverts only the bits that had bit_not_p
> > set and leaves the others untouched.
> > 
> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> > 
> 
> Might be worth doing a sanity check test run on a big-endian target
> since there is a bit of BYTES_BIG_ENDIAN logic in the patch (though it looks
> correct to me).

Indeed.  Successfully bootstrapped/regtested also on powerpc64{,le}-linux
now.

	Jakub
Richard Biener Nov. 13, 2017, 10:18 a.m. UTC | #3
On Fri, 10 Nov 2017, Jakub Jelinek wrote:

> Hi!
> 
> This is something Uros requested in the PR, at least with BIT_NOT_EXPRs
> it is easy.  Previous store merging changes required that bit_not_p
> is equal on all stores in the group (in all 3 spots, i.e. on the result
> of BIT_{AND,IOR,XOR}_EXPR and on both of the operands).
> 
> This patch handles mixed values of that flag.  If none of the
> orig_stores have a particular bit_not_p set, then as before nothing
> is inverted; if all of them have it set, then as before we apply
> BIT_NOT_EXPR to the particular SSA_NAME; and, newly, if there is a mix
> of false and true in a particular bit_not_p, we compute a mask and emit
> a BIT_XOR_EXPR with it, which inverts only the bits that had bit_not_p
> set and leaves the others untouched.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Ok.

Thanks,
Richard.

> 2017-11-10  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR tree-optimization/78821
> 	* gimple-ssa-store-merging.c (compatible_load_p): Don't require
> 	that bit_not_p is the same.
> 	(imm_store_chain_info::coalesce_immediate_stores): Likewise.
> 	(split_group): Count precisely bit_not_p bits in each statement.
> 	(invert_op): New function.
> 	(imm_store_chain_info::output_merged_store): Use invert_op to
> 	emit BIT_XOR_EXPR with a xor_mask instead of BIT_NOT_EXPR if some
> 	but not all orig_stores have BIT_NOT_EXPR in the corresponding spots.
> 
> 	* gcc.dg/store_merging_15.c: New test.
> 
> --- gcc/gimple-ssa-store-merging.c.jj	2017-11-10 08:04:49.000000000 +0100
> +++ gcc/gimple-ssa-store-merging.c	2017-11-10 10:53:13.152731543 +0100
> @@ -1036,7 +1036,6 @@ compatible_load_p (merged_store_group *m
>  {
>    store_immediate_info *infof = merged_store->stores[0];
>    if (!info->ops[idx].base_addr
> -      || info->ops[idx].bit_not_p != infof->ops[idx].bit_not_p
>        || (info->ops[idx].bitpos - infof->ops[idx].bitpos
>  	  != info->bitpos - infof->bitpos)
>        || !operand_equal_p (info->ops[idx].base_addr,
> @@ -1176,8 +1175,7 @@ imm_store_chain_info::coalesce_immediate
>  	 Merge it into the current store group.  There can be gaps in between
>  	 the stores, but there can't be gaps in between bitregions.  */
>        else if (info->bitregion_start <= merged_store->bitregion_end
> -	       && info->rhs_code == merged_store->stores[0]->rhs_code
> -	       && info->bit_not_p == merged_store->stores[0]->bit_not_p)
> +	       && info->rhs_code == merged_store->stores[0]->rhs_code)
>  	{
>  	  store_immediate_info *infof = merged_store->stores[0];
>  
> @@ -1496,16 +1494,14 @@ split_group (merged_store_group *group,
>        total_orig[0] = 1; /* The orig store.  */
>        info = group->stores[0];
>        if (info->ops[0].base_addr)
> -	total_orig[0] += 1 + info->ops[0].bit_not_p;
> +	total_orig[0]++;
>        if (info->ops[1].base_addr)
> -	total_orig[0] += 1 + info->ops[1].bit_not_p;
> +	total_orig[0]++;
>        switch (info->rhs_code)
>  	{
>  	case BIT_AND_EXPR:
>  	case BIT_IOR_EXPR:
>  	case BIT_XOR_EXPR:
> -	  if (info->bit_not_p)
> -	    total_orig[0]++; /* The orig BIT_NOT_EXPR stmt.  */
>  	  total_orig[0]++; /* The orig BIT_*_EXPR stmt.  */
>  	  break;
>  	default:
> @@ -1514,7 +1510,12 @@ split_group (merged_store_group *group,
>        total_orig[0] *= group->stores.length ();
>  
>        FOR_EACH_VEC_ELT (group->stores, i, info)
> -	total_new[0] += count_multiple_uses (info);
> +	{
> +	  total_new[0] += count_multiple_uses (info);
> +	  total_orig[0] += (info->bit_not_p
> +			    + info->ops[0].bit_not_p
> +			    + info->ops[1].bit_not_p);
> +	}
>      }
>  
>    if (!allow_unaligned_load)
> @@ -1654,13 +1655,13 @@ split_group (merged_store_group *group,
>  
>    if (total_orig)
>      {
> +      unsigned int i;
> +      struct split_store *store;
>        /* If we are reusing some original stores and any of the
>  	 original SSA_NAMEs had multiple uses, we need to subtract
>  	 those now before we add the new ones.  */
>        if (total_new[0] && any_orig)
>  	{
> -	  unsigned int i;
> -	  struct split_store *store;
>  	  FOR_EACH_VEC_ELT (*split_stores, i, store)
>  	    if (store->orig)
>  	      total_new[0] -= count_multiple_uses (store->orig_stores[0]);
> @@ -1668,26 +1669,105 @@ split_group (merged_store_group *group,
>        total_new[0] += ret; /* The new store.  */
>        store_immediate_info *info = group->stores[0];
>        if (info->ops[0].base_addr)
> -	total_new[0] += ret * (1 + info->ops[0].bit_not_p);
> +	total_new[0] += ret;
>        if (info->ops[1].base_addr)
> -	total_new[0] += ret * (1 + info->ops[1].bit_not_p);
> +	total_new[0] += ret;
>        switch (info->rhs_code)
>  	{
>  	case BIT_AND_EXPR:
>  	case BIT_IOR_EXPR:
>  	case BIT_XOR_EXPR:
> -	  if (info->bit_not_p)
> -	    total_new[0] += ret; /* The new BIT_NOT_EXPR stmt.  */
>  	  total_new[0] += ret; /* The new BIT_*_EXPR stmt.  */
>  	  break;
>  	default:
>  	  break;
>  	}
> +      FOR_EACH_VEC_ELT (*split_stores, i, store)
> +	{
> +	  unsigned int j;
> +	  bool bit_not_p[3] = { false, false, false };
> +	  /* If all orig_stores have certain bit_not_p set, then
> +	     we'd use a BIT_NOT_EXPR stmt and need to account for it.
> +	     If some orig_stores have certain bit_not_p set, then
> +	     we'd use a BIT_XOR_EXPR with a mask and need to account for
> +	     it.  */
> +	  FOR_EACH_VEC_ELT (store->orig_stores, j, info)
> +	    {
> +	      if (info->ops[0].bit_not_p)
> +		bit_not_p[0] = true;
> +	      if (info->ops[1].bit_not_p)
> +		bit_not_p[1] = true;
> +	      if (info->bit_not_p)
> +		bit_not_p[2] = true;
> +	    }
> +	  total_new[0] += bit_not_p[0] + bit_not_p[1] + bit_not_p[2];
> +	}
> +
>      }
>  
>    return ret;
>  }
>  
> +/* Return the operation through which the operand IDX (if < 2) or
> +   result (IDX == 2) should be inverted.  If NOP_EXPR, no inversion
> +   is done, if BIT_NOT_EXPR, all bits are inverted, if BIT_XOR_EXPR,
> +   the bits should be xored with mask.  */
> +
> +static enum tree_code
> +invert_op (split_store *split_store, int idx, tree int_type, tree &mask)
> +{
> +  unsigned int i;
> +  store_immediate_info *info;
> +  unsigned int cnt = 0;
> +  FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
> +    {
> +      bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
> +      if (bit_not_p)
> +	++cnt;
> +    }
> +  mask = NULL_TREE;
> +  if (cnt == 0)
> +    return NOP_EXPR;
> +  if (cnt == split_store->orig_stores.length ())
> +    return BIT_NOT_EXPR;
> +
> +  unsigned HOST_WIDE_INT try_bitpos = split_store->bytepos * BITS_PER_UNIT;
> +  unsigned buf_size = split_store->size / BITS_PER_UNIT;
> +  unsigned char *buf
> +    = XALLOCAVEC (unsigned char, buf_size);
> +  memset (buf, ~0U, buf_size);
> +  FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
> +    {
> +      bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
> +      if (!bit_not_p)
> +	continue;
> +      /* Clear regions with bit_not_p and invert afterwards, rather than
> +	 clear regions with !bit_not_p, so that gaps in between stores aren't
> +	 set in the mask.  */
> +      unsigned HOST_WIDE_INT bitsize = info->bitsize;
> +      unsigned int pos_in_buffer = 0;
> +      if (info->bitpos < try_bitpos)
> +	{
> +	  gcc_assert (info->bitpos + bitsize > try_bitpos);
> +	  bitsize -= (try_bitpos - info->bitpos);
> +	}
> +      else
> +	pos_in_buffer = info->bitpos - try_bitpos;
> +      if (pos_in_buffer + bitsize > split_store->size)
> +	bitsize = split_store->size - pos_in_buffer;
> +      unsigned char *p = buf + (pos_in_buffer / BITS_PER_UNIT);
> +      if (BYTES_BIG_ENDIAN)
> +	clear_bit_region_be (p, (BITS_PER_UNIT - 1
> +				 - (pos_in_buffer % BITS_PER_UNIT)), bitsize);
> +      else
> +	clear_bit_region (p, pos_in_buffer % BITS_PER_UNIT, bitsize);
> +    }
> +  for (unsigned int i = 0; i < buf_size; ++i)
> +    buf[i] = ~buf[i];
> +  mask = native_interpret_expr (int_type, buf, buf_size);
> +  return BIT_XOR_EXPR;
> +}
> +
>  /* Given a merged store group GROUP output the widened version of it.
>     The store chain is against the base object BASE.
>     Try store sizes of at most MAX_STORE_BITSIZE bits wide and don't output
> @@ -1894,10 +1974,13 @@ imm_store_chain_info::output_merged_stor
>  		      gimple_seq_add_stmt_without_update (&seq, stmt);
>  		    }
>  		  ops[j] = gimple_assign_lhs (stmt);
> -		  if (op.bit_not_p)
> +		  tree xor_mask;
> +		  enum tree_code inv_op
> +		    = invert_op (split_store, j, int_type, xor_mask);
> +		  if (inv_op != NOP_EXPR)
>  		    {
>  		      stmt = gimple_build_assign (make_ssa_name (int_type),
> -						  BIT_NOT_EXPR, ops[j]);
> +						  inv_op, ops[j], xor_mask);
>  		      gimple_set_location (stmt, load_loc);
>  		      ops[j] = gimple_assign_lhs (stmt);
>  
> @@ -1947,10 +2030,13 @@ imm_store_chain_info::output_merged_stor
>  	      else
>  		gimple_seq_add_stmt_without_update (&seq, stmt);
>  	      src = gimple_assign_lhs (stmt);
> -	      if (split_store->orig_stores[0]->bit_not_p)
> +	      tree xor_mask;
> +	      enum tree_code inv_op;
> +	      inv_op = invert_op (split_store, 2, int_type, xor_mask);
> +	      if (inv_op != NOP_EXPR)
>  		{
>  		  stmt = gimple_build_assign (make_ssa_name (int_type),
> -					      BIT_NOT_EXPR, src);
> +					      inv_op, src, xor_mask);
>  		  gimple_set_location (stmt, bit_loc);
>  		  if (load_addr[1] == NULL_TREE && gsi_bb (load_gsi[0]))
>  		    gimple_seq_add_stmt_without_update (&load_seq[0], stmt);
> --- gcc/testsuite/gcc.dg/store_merging_15.c.jj	2017-11-10 10:57:54.481369637 +0100
> +++ gcc/testsuite/gcc.dg/store_merging_15.c	2017-11-10 11:14:12.762646259 +0100
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target store_merge } */
> +/* { dg-options "-O2 -fdump-tree-store-merging" } */
> +
> +struct S { unsigned char a, b; unsigned short c; unsigned char d, e, f, g; unsigned long long h; };
> +
> +__attribute__((noipa)) void
> +f1 (struct S *__restrict p, struct S *__restrict q)
> +{
> +  p->a = ~q->a;
> +  p->b = q->b;
> +  p->c = ~q->c;
> +  p->d = ~q->d;
> +  p->e = q->e;
> +  p->f = ~q->f;
> +  p->g = ~q->g;
> +}
> +
> +__attribute__((noipa)) void
> +f2 (struct S *__restrict p, struct S *__restrict q)
> +{
> +  p->a = ~(unsigned char) (p->a & q->a);
> +  p->b = ((unsigned char) ~p->b) & q->b;
> +  p->c = p->c & (unsigned short) ~q->c;
> +  p->d = p->d & q->d;
> +  p->e = p->e & (unsigned char) ~q->e;
> +  p->f = p->f & (unsigned char) ~q->f;
> +  p->g = ~(unsigned char) (p->g & q->g);
> +}
> +
> +struct S s = { 20, 21, 22, 23, 24, 25, 26, 27 };
> +struct S u = { 28, 29, 30, 31, 32, 33, 34, 35 };
> +struct S v = { 36, 37, 38, 39, 40, 41, 42, 43 };
> +
> +int
> +main ()
> +{
> +  asm volatile ("" : : : "memory");
> +  f1 (&s, &u);
> +  asm volatile ("" : : : "memory");
> +  if (s.a != (unsigned char) ~28 || s.b != 29
> +      || s.c != (unsigned short) ~30 || s.d != (unsigned char) ~31
> +      || s.e != 32 || s.f != (unsigned char) ~33 || s.g != (unsigned char) ~34
> +      || s.h != 27)
> +    __builtin_abort ();
> +  f2 (&u, &v);
> +  asm volatile ("" : : : "memory");
> +  if (u.a != (unsigned char) ~(28 & 36) || u.b != (((unsigned char) ~29) & 37)
> +      || u.c != (30 & (unsigned short) ~38) || u.d != (31 & 39)
> +      || u.e != (32 & (unsigned char) ~40) || u.f != (33 & (unsigned char) ~41)
> +      || u.g != (unsigned char) ~(34 & 42) || u.h != 35)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Merging successful" 2 "store-merging" } } */
> 
> 	Jakub
> 
>

Patch

--- gcc/gimple-ssa-store-merging.c.jj	2017-11-10 08:04:49.000000000 +0100
+++ gcc/gimple-ssa-store-merging.c	2017-11-10 10:53:13.152731543 +0100
@@ -1036,7 +1036,6 @@  compatible_load_p (merged_store_group *m
 {
   store_immediate_info *infof = merged_store->stores[0];
   if (!info->ops[idx].base_addr
-      || info->ops[idx].bit_not_p != infof->ops[idx].bit_not_p
       || (info->ops[idx].bitpos - infof->ops[idx].bitpos
 	  != info->bitpos - infof->bitpos)
       || !operand_equal_p (info->ops[idx].base_addr,
@@ -1176,8 +1175,7 @@  imm_store_chain_info::coalesce_immediate
 	 Merge it into the current store group.  There can be gaps in between
 	 the stores, but there can't be gaps in between bitregions.  */
       else if (info->bitregion_start <= merged_store->bitregion_end
-	       && info->rhs_code == merged_store->stores[0]->rhs_code
-	       && info->bit_not_p == merged_store->stores[0]->bit_not_p)
+	       && info->rhs_code == merged_store->stores[0]->rhs_code)
 	{
 	  store_immediate_info *infof = merged_store->stores[0];
 
@@ -1496,16 +1494,14 @@  split_group (merged_store_group *group,
       total_orig[0] = 1; /* The orig store.  */
       info = group->stores[0];
       if (info->ops[0].base_addr)
-	total_orig[0] += 1 + info->ops[0].bit_not_p;
+	total_orig[0]++;
       if (info->ops[1].base_addr)
-	total_orig[0] += 1 + info->ops[1].bit_not_p;
+	total_orig[0]++;
       switch (info->rhs_code)
 	{
 	case BIT_AND_EXPR:
 	case BIT_IOR_EXPR:
 	case BIT_XOR_EXPR:
-	  if (info->bit_not_p)
-	    total_orig[0]++; /* The orig BIT_NOT_EXPR stmt.  */
 	  total_orig[0]++; /* The orig BIT_*_EXPR stmt.  */
 	  break;
 	default:
@@ -1514,7 +1510,12 @@  split_group (merged_store_group *group,
       total_orig[0] *= group->stores.length ();
 
       FOR_EACH_VEC_ELT (group->stores, i, info)
-	total_new[0] += count_multiple_uses (info);
+	{
+	  total_new[0] += count_multiple_uses (info);
+	  total_orig[0] += (info->bit_not_p
+			    + info->ops[0].bit_not_p
+			    + info->ops[1].bit_not_p);
+	}
     }
 
   if (!allow_unaligned_load)
@@ -1654,13 +1655,13 @@  split_group (merged_store_group *group,
 
   if (total_orig)
     {
+      unsigned int i;
+      struct split_store *store;
       /* If we are reusing some original stores and any of the
 	 original SSA_NAMEs had multiple uses, we need to subtract
 	 those now before we add the new ones.  */
       if (total_new[0] && any_orig)
 	{
-	  unsigned int i;
-	  struct split_store *store;
 	  FOR_EACH_VEC_ELT (*split_stores, i, store)
 	    if (store->orig)
 	      total_new[0] -= count_multiple_uses (store->orig_stores[0]);
@@ -1668,26 +1669,105 @@  split_group (merged_store_group *group,
       total_new[0] += ret; /* The new store.  */
       store_immediate_info *info = group->stores[0];
       if (info->ops[0].base_addr)
-	total_new[0] += ret * (1 + info->ops[0].bit_not_p);
+	total_new[0] += ret;
       if (info->ops[1].base_addr)
-	total_new[0] += ret * (1 + info->ops[1].bit_not_p);
+	total_new[0] += ret;
       switch (info->rhs_code)
 	{
 	case BIT_AND_EXPR:
 	case BIT_IOR_EXPR:
 	case BIT_XOR_EXPR:
-	  if (info->bit_not_p)
-	    total_new[0] += ret; /* The new BIT_NOT_EXPR stmt.  */
 	  total_new[0] += ret; /* The new BIT_*_EXPR stmt.  */
 	  break;
 	default:
 	  break;
 	}
+      FOR_EACH_VEC_ELT (*split_stores, i, store)
+	{
+	  unsigned int j;
+	  bool bit_not_p[3] = { false, false, false };
+	  /* If all orig_stores have certain bit_not_p set, then
+	     we'd use a BIT_NOT_EXPR stmt and need to account for it.
+	     If some orig_stores have certain bit_not_p set, then
+	     we'd use a BIT_XOR_EXPR with a mask and need to account for
+	     it.  */
+	  FOR_EACH_VEC_ELT (store->orig_stores, j, info)
+	    {
+	      if (info->ops[0].bit_not_p)
+		bit_not_p[0] = true;
+	      if (info->ops[1].bit_not_p)
+		bit_not_p[1] = true;
+	      if (info->bit_not_p)
+		bit_not_p[2] = true;
+	    }
+	  total_new[0] += bit_not_p[0] + bit_not_p[1] + bit_not_p[2];
+	}
+
     }
 
   return ret;
 }
 
+/* Return the operation through which the operand IDX (if < 2) or
+   result (IDX == 2) should be inverted.  If NOP_EXPR, no inversion
+   is done, if BIT_NOT_EXPR, all bits are inverted, if BIT_XOR_EXPR,
+   the bits should be xored with mask.  */
+
+static enum tree_code
+invert_op (split_store *split_store, int idx, tree int_type, tree &mask)
+{
+  unsigned int i;
+  store_immediate_info *info;
+  unsigned int cnt = 0;
+  FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
+    {
+      bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
+      if (bit_not_p)
+	++cnt;
+    }
+  mask = NULL_TREE;
+  if (cnt == 0)
+    return NOP_EXPR;
+  if (cnt == split_store->orig_stores.length ())
+    return BIT_NOT_EXPR;
+
+  unsigned HOST_WIDE_INT try_bitpos = split_store->bytepos * BITS_PER_UNIT;
+  unsigned buf_size = split_store->size / BITS_PER_UNIT;
+  unsigned char *buf
+    = XALLOCAVEC (unsigned char, buf_size);
+  memset (buf, ~0U, buf_size);
+  FOR_EACH_VEC_ELT (split_store->orig_stores, i, info)
+    {
+      bool bit_not_p = idx < 2 ? info->ops[idx].bit_not_p : info->bit_not_p;
+      if (!bit_not_p)
+	continue;
+      /* Clear regions with bit_not_p and invert afterwards, rather than
+	 clear regions with !bit_not_p, so that gaps in between stores aren't
+	 set in the mask.  */
+      unsigned HOST_WIDE_INT bitsize = info->bitsize;
+      unsigned int pos_in_buffer = 0;
+      if (info->bitpos < try_bitpos)
+	{
+	  gcc_assert (info->bitpos + bitsize > try_bitpos);
+	  bitsize -= (try_bitpos - info->bitpos);
+	}
+      else
+	pos_in_buffer = info->bitpos - try_bitpos;
+      if (pos_in_buffer + bitsize > split_store->size)
+	bitsize = split_store->size - pos_in_buffer;
+      unsigned char *p = buf + (pos_in_buffer / BITS_PER_UNIT);
+      if (BYTES_BIG_ENDIAN)
+	clear_bit_region_be (p, (BITS_PER_UNIT - 1
+				 - (pos_in_buffer % BITS_PER_UNIT)), bitsize);
+      else
+	clear_bit_region (p, pos_in_buffer % BITS_PER_UNIT, bitsize);
+    }
+  for (unsigned int i = 0; i < buf_size; ++i)
+    buf[i] = ~buf[i];
+  mask = native_interpret_expr (int_type, buf, buf_size);
+  return BIT_XOR_EXPR;
+}
+
 /* Given a merged store group GROUP output the widened version of it.
    The store chain is against the base object BASE.
    Try store sizes of at most MAX_STORE_BITSIZE bits wide and don't output
@@ -1894,10 +1974,13 @@  imm_store_chain_info::output_merged_stor
 		      gimple_seq_add_stmt_without_update (&seq, stmt);
 		    }
 		  ops[j] = gimple_assign_lhs (stmt);
-		  if (op.bit_not_p)
+		  tree xor_mask;
+		  enum tree_code inv_op
+		    = invert_op (split_store, j, int_type, xor_mask);
+		  if (inv_op != NOP_EXPR)
 		    {
 		      stmt = gimple_build_assign (make_ssa_name (int_type),
-						  BIT_NOT_EXPR, ops[j]);
+						  inv_op, ops[j], xor_mask);
 		      gimple_set_location (stmt, load_loc);
 		      ops[j] = gimple_assign_lhs (stmt);
 
@@ -1947,10 +2030,13 @@  imm_store_chain_info::output_merged_stor
 	      else
 		gimple_seq_add_stmt_without_update (&seq, stmt);
 	      src = gimple_assign_lhs (stmt);
-	      if (split_store->orig_stores[0]->bit_not_p)
+	      tree xor_mask;
+	      enum tree_code inv_op;
+	      inv_op = invert_op (split_store, 2, int_type, xor_mask);
+	      if (inv_op != NOP_EXPR)
 		{
 		  stmt = gimple_build_assign (make_ssa_name (int_type),
-					      BIT_NOT_EXPR, src);
+					      inv_op, src, xor_mask);
 		  gimple_set_location (stmt, bit_loc);
 		  if (load_addr[1] == NULL_TREE && gsi_bb (load_gsi[0]))
 		    gimple_seq_add_stmt_without_update (&load_seq[0], stmt);
--- gcc/testsuite/gcc.dg/store_merging_15.c.jj	2017-11-10 10:57:54.481369637 +0100
+++ gcc/testsuite/gcc.dg/store_merging_15.c	2017-11-10 11:14:12.762646259 +0100
@@ -0,0 +1,56 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target store_merge } */
+/* { dg-options "-O2 -fdump-tree-store-merging" } */
+
+struct S { unsigned char a, b; unsigned short c; unsigned char d, e, f, g; unsigned long long h; };
+
+__attribute__((noipa)) void
+f1 (struct S *__restrict p, struct S *__restrict q)
+{
+  p->a = ~q->a;
+  p->b = q->b;
+  p->c = ~q->c;
+  p->d = ~q->d;
+  p->e = q->e;
+  p->f = ~q->f;
+  p->g = ~q->g;
+}
+
+__attribute__((noipa)) void
+f2 (struct S *__restrict p, struct S *__restrict q)
+{
+  p->a = ~(unsigned char) (p->a & q->a);
+  p->b = ((unsigned char) ~p->b) & q->b;
+  p->c = p->c & (unsigned short) ~q->c;
+  p->d = p->d & q->d;
+  p->e = p->e & (unsigned char) ~q->e;
+  p->f = p->f & (unsigned char) ~q->f;
+  p->g = ~(unsigned char) (p->g & q->g);
+}
+
+struct S s = { 20, 21, 22, 23, 24, 25, 26, 27 };
+struct S u = { 28, 29, 30, 31, 32, 33, 34, 35 };
+struct S v = { 36, 37, 38, 39, 40, 41, 42, 43 };
+
+int
+main ()
+{
+  asm volatile ("" : : : "memory");
+  f1 (&s, &u);
+  asm volatile ("" : : : "memory");
+  if (s.a != (unsigned char) ~28 || s.b != 29
+      || s.c != (unsigned short) ~30 || s.d != (unsigned char) ~31
+      || s.e != 32 || s.f != (unsigned char) ~33 || s.g != (unsigned char) ~34
+      || s.h != 27)
+    __builtin_abort ();
+  f2 (&u, &v);
+  asm volatile ("" : : : "memory");
+  if (u.a != (unsigned char) ~(28 & 36) || u.b != (((unsigned char) ~29) & 37)
+      || u.c != (30 & (unsigned short) ~38) || u.d != (31 & 39)
+      || u.e != (32 & (unsigned char) ~40) || u.f != (33 & (unsigned char) ~41)
+      || u.g != (unsigned char) ~(34 & 42) || u.h != 35)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "Merging successful" 2 "store-merging" } } */