diff mbox series

optabs: Fix up expand_doubleword_shift_condmove for shift_mask == 0 [PR108803]

Message ID Y+9TodOHE09e9Vwq@tucnak
State New
Headers show
Series optabs: Fix up expand_doubleword_shift_condmove for shift_mask == 0 [PR108803] | expand

Commit Message

Jakub Jelinek Feb. 17, 2023, 10:14 a.m. UTC
Hi!

The following testcase is miscompiled on aarch64.  The problem is that
aarch64 with TARGET_SIMD is !SHIFT_COUNT_TRUNCATED target with
targetm.shift_truncation_mask (DImode) == 0 which has HAVE_conditional_move
true.  If a doubleword shift (in this case TImode) doesn't have its own
expander (as is the case of e.g. x86) and is handled in generic code,
there are 3 possible expansions.  One is when the shift count is constant,
the code computes in that case superword_op1 as op1 - BITS_PER_WORD,
and chooses at compile time which of expand_superword_shift or
expand_subword_shift to call, which ensures that whatever is used
actually has its shift count (superword_op1 or op1) in [0, BITS_PER_WORD - 1]
range.  If !HAVE_conditional_move or that expansion fails, the function
handles non-constant op1 similarly (there are some special cases for
shift_mask >= BITS_PER_WORD - 1 but let's talk here just about
shift_mask < BITS_PER_WORD - 1), except that the selection is done at
runtime, with branches around the stuff.  While superword_op1 can be
[-BITS_PER_WORD, BITS_PER_WORD - 1] and op1 [0, 2 * BITS_PER_WORD - 1],
the runtime selection ensures that the instructions executed at runtime
have their corresponding shift count again in the right range of
[0, BITS_PER_WORD - 1].
The problematic is the HAVE_conditional_move case, which emits both
sequences into the actually executed code, so necessarily one of them
has an out of range shift count and then using 2 conditional moves
picks up a result.
Now, in the testcase because -Og doesn't perform VRP/EVRP the shift
count isn't optimized to constant during GIMPLE passes, but is determined
to be constant during/after expansion into RTL.  The shift count is
actually const0_rtx later, so superword_op1 is (const_int -64) and we end
up with miscompilation during combine because of that.
I'm afraid on targetm.shift_truncation_mask (DImode) == 0 targets we have
to mask the shift counts when the doubleshift count is in range but
one of subword_op1/superword_op1 is not, which is what the following
patch does and what fixes the wrong-code.  Now, targets like x86 or aarch64,
while they are !SHIFT_COUNT_TRUNCATED, have actually patterns to catch
shift with masked counter, so the ANDs can be optimized out.  On the other
side, when we know the result will be masked this way we can use the
simpler ~op1 instead of (BITS_PER_WORD - 1) - op1 in expand_subword_shift.
So, say on
__attribute__((noipa)) __int128
foo (__int128 a, unsigned k)
{
  return a << k;
}
on aarch64 at -Og the difference is:
 foo:
        subs    w5, w2, #64
-       lsl     x6, x0, x5
+       lsl     x4, x0, x2
        lsr     x3, x0, 1
-       mov     w4, 63
-       sub     w4, w4, w2
-       lsr     x3, x3, x4
+       mvn     w6, w2
+       and     w2, w2, 63
+       lsr     x3, x3, x6
        lsl     x1, x1, x2
        orr     x1, x3, x1
        lsl     x0, x0, x2
        csel    x0, xzr, x0, pl
-       csel    x1, x6, x1, pl
+       csel    x1, x4, x1, pl
        ret
We could do even better and optimize the and w2, w2, 63 instruction out,
but it is a matter of costs and so IMHO should be handled incrementally.
For that case consider say:
void corge (int, int, int);

void
qux (int x, int y, int z, int n)
{
  n &= 31;
  corge (x << n, y << n, z >> n);
}
with -O2 -fno-tree-vectorize, on x86_64 one gets
        sarl    %cl, %edx
        sall    %cl, %esi
        sall    %cl, %edi
        jmp     corge
but on aarch64
        and     w3, w3, 31
        lsl     w0, w0, w3
        lsl     w1, w1, w3
        asr     w2, w2, w3
        b       corge
The reason it works on x86 is that its rtx_costs hook recognizes
that the AND in shift+mask patterns is for free.
Trying 9 -> 11:
    9: {r85:SI=r96:SI&0x1f;clobber flags:CC;}
      REG_UNUSED flags:CC
   11: {r91:SI=r87:SI<<r85:SI#0;clobber flags:CC;}
      REG_DEAD r87:SI
      REG_UNUSED flags:CC
Failed to match this instruction:
...
Failed to match this instruction:
...
Successfully matched this instruction:
(set (reg/v:SI 85 [ n ])
    (and:SI (reg:SI 96)
        (const_int 31 [0x1f])))
Successfully matched this instruction:
(set (reg:SI 91)
    (ashift:SI (reg/v:SI 87 [ y ])
        (subreg:QI (and:SI (reg:SI 96)
                (const_int 31 [0x1f])) 0)))
allowing combination of insns 9 and 11
original costs 4 + 4 = 8
replacement costs 4 + 4 = 8
Compare that to the aarch64 case:
Trying 9 -> 11:
    9: r95:SI=r106:SI&0x1f
      REG_DEAD r106:SI
   11: r101:SI=r104:SI<<r95:SI#0
      REG_DEAD r104:SI
Failed to match this instruction:
...
Failed to match this instruction:
...
Successfully matched this instruction:
(set (reg/v:SI 95 [ n ])
    (and:SI (reg:SI 106)
        (const_int 31 [0x1f])))
Successfully matched this instruction:
(set (reg:SI 101)
    (ashift:SI (reg:SI 104)
        (subreg:QI (and:SI (reg:SI 106)
                (const_int 31 [0x1f])) 0)))
rejecting combination of insns 9 and 11
original costs 4 + 4 = 8
replacement costs 4 + 8 = 12
Now, if the rtx costs on the *aarch64_ashl_reg_si3_mask1
and similar would be the same as for normal shift without the
masking, this would succeed and we could get rid of the and.

Bootstrapped/regtested on aarch64-linux, ok for trunk?

2023-02-17  Jakub Jelinek  <jakub@redhat.com>

	PR middle-end/108803
	* optabs.cc (expand_subword_shift): Add MASK_COUNT argument,
	if true, use ~op1 & (BITS_PER_WORD - 1) instead of
	(BITS_PER_WORD - 1) - op1 as tmp and op1 & (BITS_PER_WORD - 1)
	instead of op1 on the other two shifts.
	(expand_doubleword_shift_condmove): Use
	superword_op1 & (BITS_PER_WORD - 1) instead of superword_op1.  Pass
	shift_mask < BITS_PER_WORD - 1 as MASK_COUNT to expand_subword_shift.
	(expand_doubleword_shift): Pass false as MASK_COUNT to
	expand_subword_shift.

	* gcc.dg/pr108803.c: New test.


	Jakub

Comments

Richard Sandiford Feb. 27, 2023, 3:34 p.m. UTC | #1
Jakub Jelinek <jakub@redhat.com> writes:
> Hi!
>
> The following testcase is miscompiled on aarch64.  The problem is that
> aarch64 with TARGET_SIMD is !SHIFT_COUNT_TRUNCATED target with
> targetm.shift_truncation_mask (DImode) == 0 which has HAVE_conditional_move
> true.  If a doubleword shift (in this case TImode) doesn't have its own
> expander (as is the case of e.g. x86) and is handled in generic code,
> there are 3 possible expansions.  One is when the shift count is constant,
> the code computes in that case superword_op1 as op1 - BITS_PER_WORD,
> and chooses at compile time which of expand_superword_shift or
> expand_subword_shift to call, which ensures that whatever is used
> actually has its shift count (superword_op1 or op1) in [0, BITS_PER_WORD - 1]
> range.  If !HAVE_conditional_move or that expansion fails, the function
> handles non-constant op1 similarly (there are some special cases for
> shift_mask >= BITS_PER_WORD - 1 but let's talk here just about
> shift_mask < BITS_PER_WORD - 1), except that the selection is done at
> runtime, with branches around the stuff.  While superword_op1 can be
> [-BITS_PER_WORD, BITS_PER_WORD - 1] and op1 [0, 2 * BITS_PER_WORD - 1],
> the runtime selection ensures that the instructions executed at runtime
> have their corresponding shift count again in the right range of
> [0, BITS_PER_WORD - 1].
> The problematic is the HAVE_conditional_move case, which emits both
> sequences into the actually executed code, so necessarily one of them
> has an out of range shift count and then using 2 conditional moves
> picks up a result.
> Now, in the testcase because -Og doesn't perform VRP/EVRP the shift
> count isn't optimized to constant during GIMPLE passes, but is determined
> to be constant during/after expansion into RTL.  The shift count is
> actually const0_rtx later, so superword_op1 is (const_int -64) and we end
> up with miscompilation during combine because of that.

I haven't worked through the testcase yet, but how does that happen?
Having shifts with out-of-range shift counts shouldn't be a problem
in itself, provided that the conditional move selects the one with
the in-range shift.

Thanks,
Richard

> I'm afraid on targetm.shift_truncation_mask (DImode) == 0 targets we have
> to mask the shift counts when the doubleshift count is in range but
> one of subword_op1/superword_op1 is not, which is what the following
> patch does and what fixes the wrong-code.  Now, targets like x86 or aarch64,
> while they are !SHIFT_COUNT_TRUNCATED, have actually patterns to catch
> shift with masked counter, so the ANDs can be optimized out.  On the other
> side, when we know the result will be masked this way we can use the
> simpler ~op1 instead of (BITS_PER_WORD - 1) - op1 in expand_subword_shift.
> So, say on
> __attribute__((noipa)) __int128
> foo (__int128 a, unsigned k)
> {
>   return a << k;
> }
> on aarch64 at -Og the difference is:
>  foo:
>         subs    w5, w2, #64
> -       lsl     x6, x0, x5
> +       lsl     x4, x0, x2
>         lsr     x3, x0, 1
> -       mov     w4, 63
> -       sub     w4, w4, w2
> -       lsr     x3, x3, x4
> +       mvn     w6, w2
> +       and     w2, w2, 63
> +       lsr     x3, x3, x6
>         lsl     x1, x1, x2
>         orr     x1, x3, x1
>         lsl     x0, x0, x2
>         csel    x0, xzr, x0, pl
> -       csel    x1, x6, x1, pl
> +       csel    x1, x4, x1, pl
>         ret
> We could do even better and optimize the and w2, w2, 63 instruction out,
> but it is a matter of costs and so IMHO should be handled incrementally.
> For that case consider say:
> void corge (int, int, int);
>
> void
> qux (int x, int y, int z, int n)
> {
>   n &= 31;
>   corge (x << n, y << n, z >> n);
> }
> with -O2 -fno-tree-vectorize, on x86_64 one gets
>         sarl    %cl, %edx
>         sall    %cl, %esi
>         sall    %cl, %edi
>         jmp     corge
> but on aarch64
>         and     w3, w3, 31
>         lsl     w0, w0, w3
>         lsl     w1, w1, w3
>         asr     w2, w2, w3
>         b       corge
> The reason it works on x86 is that its rtx_costs hook recognizes
> that the AND in shift+mask patterns is for free.
> Trying 9 -> 11:
>     9: {r85:SI=r96:SI&0x1f;clobber flags:CC;}
>       REG_UNUSED flags:CC
>    11: {r91:SI=r87:SI<<r85:SI#0;clobber flags:CC;}
>       REG_DEAD r87:SI
>       REG_UNUSED flags:CC
> Failed to match this instruction:
> ...
> Failed to match this instruction:
> ...
> Successfully matched this instruction:
> (set (reg/v:SI 85 [ n ])
>     (and:SI (reg:SI 96)
>         (const_int 31 [0x1f])))
> Successfully matched this instruction:
> (set (reg:SI 91)
>     (ashift:SI (reg/v:SI 87 [ y ])
>         (subreg:QI (and:SI (reg:SI 96)
>                 (const_int 31 [0x1f])) 0)))
> allowing combination of insns 9 and 11
> original costs 4 + 4 = 8
> replacement costs 4 + 4 = 8
> Compare that to the aarch64 case:
> Trying 9 -> 11:
>     9: r95:SI=r106:SI&0x1f
>       REG_DEAD r106:SI
>    11: r101:SI=r104:SI<<r95:SI#0
>       REG_DEAD r104:SI
> Failed to match this instruction:
> ...
> Failed to match this instruction:
> ...
> Successfully matched this instruction:
> (set (reg/v:SI 95 [ n ])
>     (and:SI (reg:SI 106)
>         (const_int 31 [0x1f])))
> Successfully matched this instruction:
> (set (reg:SI 101)
>     (ashift:SI (reg:SI 104)
>         (subreg:QI (and:SI (reg:SI 106)
>                 (const_int 31 [0x1f])) 0)))
> rejecting combination of insns 9 and 11
> original costs 4 + 4 = 8
> replacement costs 4 + 8 = 12
> Now, if the rtx costs on the *aarch64_ashl_reg_si3_mask1
> and similar would be the same as for normal shift without the
> masking, this would succeed and we could get rid of the and.
>
> Bootstrapped/regtested on aarch64-linux, ok for trunk?
>
> 2023-02-17  Jakub Jelinek  <jakub@redhat.com>
>
> 	PR middle-end/108803
> 	* optabs.cc (expand_subword_shift): Add MASK_COUNT argument,
> 	if true, use ~op1 & (BITS_PER_WORD - 1) instead of
> 	(BITS_PER_WORD - 1) - op1 as tmp and op1 & (BITS_PER_WORD - 1)
> 	instead of op1 on the other two shifts.
> 	(expand_doubleword_shift_condmove): Use
> 	superword_op1 & (BITS_PER_WORD - 1) instead of superword_op1.  Pass
> 	shift_mask < BITS_PER_WORD - 1 as MASK_COUNT to expand_subword_shift.
> 	(expand_doubleword_shift): Pass false as MASK_COUNT to
> 	expand_subword_shift.
>
> 	* gcc.dg/pr108803.c: New test.
>
> --- gcc/optabs.cc.jj	2023-01-02 09:32:53.309838465 +0100
> +++ gcc/optabs.cc	2023-02-16 20:44:21.862031535 +0100
> @@ -507,7 +507,7 @@ expand_subword_shift (scalar_int_mode op
>  		      rtx outof_input, rtx into_input, rtx op1,
>  		      rtx outof_target, rtx into_target,
>  		      int unsignedp, enum optab_methods methods,
> -		      unsigned HOST_WIDE_INT shift_mask)
> +		      unsigned HOST_WIDE_INT shift_mask, bool mask_count)
>  {
>    optab reverse_unsigned_shift, unsigned_shift;
>    rtx tmp, carries;
> @@ -535,7 +535,7 @@ expand_subword_shift (scalar_int_mode op
>  	 are truncated to the mode size.  */
>        carries = expand_binop (word_mode, reverse_unsigned_shift,
>  			      outof_input, const1_rtx, 0, unsignedp, methods);
> -      if (shift_mask == BITS_PER_WORD - 1)
> +      if (shift_mask == BITS_PER_WORD - 1 || mask_count)
>  	{
>  	  tmp = immed_wide_int_const
>  	    (wi::minus_one (GET_MODE_PRECISION (op1_mode)), op1_mode);
> @@ -549,6 +549,15 @@ expand_subword_shift (scalar_int_mode op
>  	  tmp = simplify_expand_binop (op1_mode, sub_optab, tmp, op1,
>  				       0, true, methods);
>  	}
> +      if (mask_count)
> +	{
> +	  rtx tmp2 = immed_wide_int_const (wi::shwi (BITS_PER_WORD - 1,
> +					   op1_mode), op1_mode);
> +	  tmp = simplify_expand_binop (op1_mode, and_optab, tmp, tmp2, 0,
> +				       true, methods);
> +	  op1 = simplify_expand_binop (op1_mode, and_optab, op1, tmp2, 0,
> +				       true, methods);
> +	}
>      }
>    if (tmp == 0 || carries == 0)
>      return false;
> @@ -596,6 +605,15 @@ expand_doubleword_shift_condmove (scalar
>  {
>    rtx outof_superword, into_superword;
>  
> +  if (shift_mask < BITS_PER_WORD - 1)
> +    {
> +      rtx tmp = immed_wide_int_const (wi::shwi (BITS_PER_WORD - 1, op1_mode),
> +				      op1_mode);
> +      superword_op1
> +	= simplify_expand_binop (op1_mode, and_optab, superword_op1, tmp,
> +				 0, true, methods);
> +    }
> +
>    /* Put the superword version of the output into OUTOF_SUPERWORD and
>       INTO_SUPERWORD.  */
>    outof_superword = outof_target != 0 ? gen_reg_rtx (word_mode) : 0;
> @@ -621,7 +639,8 @@ expand_doubleword_shift_condmove (scalar
>    if (!expand_subword_shift (op1_mode, binoptab,
>  			     outof_input, into_input, subword_op1,
>  			     outof_target, into_target,
> -			     unsignedp, methods, shift_mask))
> +			     unsignedp, methods, shift_mask,
> +			     shift_mask < BITS_PER_WORD - 1))
>      return false;
>  
>    /* Select between them.  Do the INTO half first because INTO_SUPERWORD
> @@ -742,7 +761,7 @@ expand_doubleword_shift (scalar_int_mode
>  	return expand_subword_shift (op1_mode, binoptab,
>  				     outof_input, into_input, op1,
>  				     outof_target, into_target,
> -				     unsignedp, methods, shift_mask);
> +				     unsignedp, methods, shift_mask, false);
>      }
>  
>    /* Try using conditional moves to generate straight-line code.  */
> @@ -781,7 +800,7 @@ expand_doubleword_shift (scalar_int_mode
>    if (!expand_subword_shift (op1_mode, binoptab,
>  			     outof_input, into_input, op1,
>  			     outof_target, into_target,
> -			     unsignedp, methods, shift_mask))
> +			     unsignedp, methods, shift_mask, false))
>      return false;
>  
>    emit_label (done_label);
> --- gcc/testsuite/gcc.dg/pr108803.c.jj	2023-02-16 20:48:53.517032422 +0100
> +++ gcc/testsuite/gcc.dg/pr108803.c	2023-02-16 20:48:45.971143498 +0100
> @@ -0,0 +1,24 @@
> +/* PR middle-end/108803 */
> +/* { dg-do run { target int128 } } */
> +/* { dg-options "-Og" } */
> +
> +unsigned i, j;
> +
> +__int128
> +foo (__int128 a)
> +{
> +  a &= 9;
> +  unsigned k = __builtin_add_overflow_p (i, j, a);
> +  __int128 b = (63 + a) << k | ((63 + a) >> (63 & k));
> +  __int128 c = a + b;
> +  return c;
> +}
> +
> +int
> +main (void)
> +{
> +  __int128 x = foo (0);
> +  if (x != 63)
> +    __builtin_abort ();
> +  return 0;
> +}
>
> 	Jakub
Jakub Jelinek Feb. 27, 2023, 7:11 p.m. UTC | #2
On Mon, Feb 27, 2023 at 03:34:11PM +0000, Richard Sandiford wrote:
> > The following testcase is miscompiled on aarch64.  The problem is that
> > aarch64 with TARGET_SIMD is !SHIFT_COUNT_TRUNCATED target with
> > targetm.shift_truncation_mask (DImode) == 0 which has HAVE_conditional_move
> > true.  If a doubleword shift (in this case TImode) doesn't have its own
> > expander (as is the case of e.g. x86) and is handled in generic code,
> > there are 3 possible expansions.  One is when the shift count is constant,
> > the code computes in that case superword_op1 as op1 - BITS_PER_WORD,
> > and chooses at compile time which of expand_superword_shift or
> > expand_subword_shift to call, which ensures that whatever is used
> > actually has its shift count (superword_op1 or op1) in [0, BITS_PER_WORD - 1]
> > range.  If !HAVE_conditional_move or that expansion fails, the function
> > handles non-constant op1 similarly (there are some special cases for
> > shift_mask >= BITS_PER_WORD - 1 but let's talk here just about
> > shift_mask < BITS_PER_WORD - 1), except that the selection is done at
> > runtime, with branches around the stuff.  While superword_op1 can be
> > [-BITS_PER_WORD, BITS_PER_WORD - 1] and op1 [0, 2 * BITS_PER_WORD - 1],
> > the runtime selection ensures that the instructions executed at runtime
> > have their corresponding shift count again in the right range of
> > [0, BITS_PER_WORD - 1].
> > The problematic is the HAVE_conditional_move case, which emits both
> > sequences into the actually executed code, so necessarily one of them
> > has an out of range shift count and then using 2 conditional moves
> > picks up a result.
> > Now, in the testcase because -Og doesn't perform VRP/EVRP the shift
> > count isn't optimized to constant during GIMPLE passes, but is determined
> > to be constant during/after expansion into RTL.  The shift count is
> > actually const0_rtx later, so superword_op1 is (const_int -64) and we end
> > up with miscompilation during combine because of that.
> 
> I haven't worked through the testcase yet, but how does that happen?
> Having shifts with out-of-range shift counts shouldn't be a problem
> in itself, provided that the conditional move selects the one with
> the in-range shift.

It depends on if we (hw or the compiler) treats out-of-range shifts in RTL
as undefined behavior or just some kind of constrained unspecified behavior
(destination register can be anything, but no traps/exceptions/program
termination etc.).  Of course SHIFT_COUNT_TRUNCATED makes it well defined,
but that is not the case here.
I've always thoughts we really treat it as undefined behavior, you shouldn't
reach this spot at runtime; we certainly treat it that way in GIMPLE.
If that is the case, then invoking UB and then having a conditional move
not to select its result is not well defined.
And the patch as posted fixes that problem while not really resulting in
worse code e.g. on aarch64.

Anyway, back to the particular testcase rather than the general idea,
it goes wrong during combine, CCing Segher.  I've added debug_bb_n after
every successful combine attempt,
--- gcc/combine.cc.jj	2023-01-02 09:32:43.720977011 +0100
+++ gcc/combine.cc	2023-02-27 19:27:28.151289055 +0100
@@ -4755,6 +4755,16 @@ try_combine (rtx_insn *i3, rtx_insn *i2,
   if (added_notes_insn && DF_INSN_LUID (added_notes_insn) < DF_INSN_LUID (ret))
     ret = added_notes_insn;
 
+{
+static int cnt = 0;
+char buf[64];
+sprintf (buf, "/tmp/combine.%d", cnt++);
+FILE *f = fopen (buf, "a");
+fprintf (f, "%d\n", combine_successes);
+basic_block bb = BASIC_BLOCK_FOR_FN (cfun, 2);
+dump_bb (f, bb, 0, dump_flags_t ());
+fclose (f);
+}
   return ret;
 }
 
and I see
(insn 52 48 53 2 (set (reg:CC 66 cc)
        (compare:CC (reg:SI 130)
            (const_int 0 [0]))) "pr108803.c":12:25 437 {cmpsi}
     (expr_list:REG_DEAD (reg:SI 130)
        (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
                (const_int 0 [0]))
            (nil))))
(insn 53 52 57 2 (set (reg:DI 152 [ _6+8 ])
        (if_then_else:DI (ge (reg:CC 66 cc)
                (const_int 0 [0]))
            (reg:DI 132)
            (const_int 0 [0]))) "pr108803.c":12:25 490 {*cmovdi_insn}
     (expr_list:REG_DEAD (reg:DI 132)
        (nil)))
(insn 57 53 59 2 (set (reg:DI 151 [ _6 ])
        (if_then_else:DI (ge (reg:CC 66 cc)
                (const_int 0 [0]))
            (const_int 0 [0])
            (reg:DI 126))) "pr108803.c":12:25 490 {*cmovdi_insn}
     (expr_list:REG_DEAD (reg:CC 66 cc)
        (nil)))
...
(insn 71 68 72 2 (set (reg:CC 66 cc)
        (compare:CC (reg:SI 137)
            (const_int 0 [0]))) "pr108803.c":12:42 437 {cmpsi}
     (expr_list:REG_DEAD (reg:SI 137)
        (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
                (const_int 0 [0]))
            (nil))))
(insn 72 71 76 2 (set (reg:DI 153 [ _8 ])
        (if_then_else:DI (ge (reg:CC 66 cc)
                (const_int 0 [0]))
            (reg:DI 139)
            (reg:DI 153 [ _8 ]))) "pr108803.c":12:42 490 {*cmovdi_insn}
     (expr_list:REG_DEAD (reg:DI 139)
        (nil)))
(insn 76 72 77 2 (set (reg:DI 154 [ _8+8 ])
        (if_then_else:DI (ge (reg:CC 66 cc)
                (const_int 0 [0]))
            (reg:DI 138)
            (reg:DI 127))) "pr108803.c":12:42 490 {*cmovdi_insn}
     (expr_list:REG_DEAD (reg:DI 138)
        (expr_list:REG_DEAD (reg:DI 127)
            (expr_list:REG_DEAD (reg:CC 66 cc)
                (nil)))))
(insn 77 76 78 2 (set (reg:DI 159 [ b ])
        (ior:DI (reg:DI 151 [ _6 ])
            (reg:DI 126))) "pr108803.c":12:12 537 {iordi3}
     (expr_list:REG_DEAD (reg:DI 126)
        (expr_list:REG_DEAD (reg:DI 151 [ _6 ])
            (nil))))
(insn 78 77 80 2 (set (reg:DI 160 [ b+8 ])
        (reg:DI 152 [ _6+8 ])) "pr108803.c":12:12 65 {*movdi_aarch64}
     (expr_list:REG_DEAD (reg:DI 152 [ _6+8 ])
        (nil)))
in one of the dumps which still looks correct from brief skimming,
r130 here is k - 64 aka -64 and it is the pair of conditional moves of the
first TImode shift, effectively (a + 63) << 0 but 0 is obfuscated at
expansion time, while the second pair of conditional moves are for the
second TImode shift, effectively (a + 63) >> (0 & 63) and then both ored
together.
When doing
Trying 53 -> 78:
   53: r152:DI={(cc:CC>=0)?r132:DI:0}
      REG_DEAD r132:DI
   78: r160:DI=r152:DI
      REG_DEAD r152:DI
Successfully matched this instruction:
(set (reg:DI 160 [ b+8 ])
    (if_then_else:DI (ge (reg:CC 66 cc)
            (const_int 0 [0]))
        (reg:DI 132)
        (const_int 0 [0])))
allowing combination of insns 53 and 78
original costs 4 + 4 = 8
replacement cost 4
deferring deletion of insn with uid = 53.
modifying insn i3    78: r160:DI={(cc:CC>=0)?r132:DI:0}
      REG_DEAD cc:CC
      REG_DEAD r132:DI
deferring rescan insn with uid = 78.
combination, the IL changes:
@@ -38,21 +38,15 @@
 (insn 52 48 53 2 (set (reg:CC 66 cc)
         (compare:CC (reg:SI 130)
             (const_int 0 [0]))) "pr108803.c":12:25 437 {cmpsi}
      (expr_list:REG_DEAD (reg:SI 130)
         (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
                 (const_int 0 [0]))
             (nil))))
-(insn 53 52 57 2 (set (reg:DI 152 [ _6+8 ])
-        (if_then_else:DI (ge (reg:CC 66 cc)
-                (const_int 0 [0]))
-            (reg:DI 132)
-            (const_int 0 [0]))) "pr108803.c":12:25 490 {*cmovdi_insn}
-     (expr_list:REG_DEAD (reg:DI 132)
-        (nil)))
+(note 53 52 57 2 NOTE_INSN_DELETED)
 (insn 57 53 59 2 (set (reg:DI 151 [ _6 ])
         (if_then_else:DI (ge (reg:CC 66 cc)
                 (const_int 0 [0]))
             (const_int 0 [0])
             (reg:DI 126))) "pr108803.c":12:25 490 {*cmovdi_insn}
      (expr_list:REG_DEAD (reg:CC 66 cc)
         (nil)))
@@ -72,17 +66,21 @@
 (insn 77 76 78 2 (set (reg:DI 159 [ b ])
         (ior:DI (reg:DI 151 [ _6 ])
             (reg:DI 126))) "pr108803.c":12:12 537 {iordi3}
      (expr_list:REG_DEAD (reg:DI 126)
         (expr_list:REG_DEAD (reg:DI 151 [ _6 ])
             (nil))))
 (insn 78 77 80 2 (set (reg:DI 160 [ b+8 ])
-        (reg:DI 152 [ _6+8 ])) "pr108803.c":12:12 65 {*movdi_aarch64}
-     (expr_list:REG_DEAD (reg:DI 152 [ _6+8 ])
-        (nil)))
+        (if_then_else:DI (ge (reg:CC 66 cc)
+                (const_int 0 [0]))
+            (reg:DI 132)
+            (const_int 0 [0]))) "pr108803.c":12:12 490 {*cmovdi_insn}
+     (expr_list:REG_DEAD (reg:CC 66 cc)
+        (expr_list:REG_DEAD (reg:DI 132)
+            (nil))))
 (insn 80 78 82 2 (parallel [
             (set (reg:CC_C 66 cc)
                 (compare:CC_C (plus:DI (reg:DI 155 [ a ])
                         (reg:DI 159 [ b ]))
                     (reg:DI 155 [ a ])))
             (set (reg:DI 144)
                 (plus:DI (reg:DI 155 [ a ])
but as you can see, because cc reg has been REG_DEAD before on insn 57
rather than on insn 53, nothing really removed REG_DEAD note from there
and just adds it on insn 78 (note, besides this REG_DEAD issue the
IL is otherwise still sane, the previous cc setter 71 and its previous
uses 72 and 76 in between the move have been optimized away already in
an earlier successful combination).
And things go wild with the next successful combination:
Trying 52 -> 57:
   52: cc:CC=cmp(0xffffffffffffffc0,0)
      REG_DEAD r130:SI
      REG_EQUAL cmp(0xffffffffffffffc0,0)
   57: r151:DI={(cc:CC>=0)?0:r126:DI}
      REG_DEAD cc:CC
Successfully matched this instruction:
(set (reg:DI 151 [ _6 ])
    (reg:DI 126))
allowing combination of insns 52 and 57
original costs 4 + 4 = 8
replacement cost 4
deferring deletion of insn with uid = 52.
modifying insn i3    57: r151:DI=r126:DI
deferring rescan insn with uid = 57.
where the IL changes:
@@ -29,27 +29,18 @@
 (insn 41 40 43 2 (set (reg:DI 132)
         (ashift:DI (reg:DI 126)
             (subreg:QI (reg:SI 130) 0))) "pr108803.c":12:25 781 {*aarch64_ashl_sisd_or_int_di3}
-     (expr_list:REG_EQUAL (ashift:DI (reg:DI 126)
-            (const_int -64 [0xffffffffffffffc0]))
-        (nil)))
+     (expr_list:REG_DEAD (reg:SI 130)
+        (expr_list:REG_EQUAL (ashift:DI (reg:DI 126)
+                (const_int -64 [0xffffffffffffffc0]))
+            (nil))))
 (note 43 41 46 2 NOTE_INSN_DELETED)
 (note 46 43 48 2 NOTE_INSN_DELETED)
 (note 48 46 52 2 NOTE_INSN_DELETED)
-(insn 52 48 53 2 (set (reg:CC 66 cc)
-        (compare:CC (reg:SI 130)
-            (const_int 0 [0]))) "pr108803.c":12:25 437 {cmpsi}
-     (expr_list:REG_DEAD (reg:SI 130)
-        (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
-                (const_int 0 [0]))
-            (nil))))
+(note 52 48 53 2 NOTE_INSN_DELETED)
 (note 53 52 57 2 NOTE_INSN_DELETED)
 (insn 57 53 59 2 (set (reg:DI 151 [ _6 ])
-        (if_then_else:DI (ge (reg:CC 66 cc)
-                (const_int 0 [0]))
-            (const_int 0 [0])
-            (reg:DI 126))) "pr108803.c":12:25 490 {*cmovdi_insn}
-     (expr_list:REG_DEAD (reg:CC 66 cc)
-        (nil)))
+        (reg:DI 126)) "pr108803.c":12:25 65 {*movdi_aarch64}
+     (nil))
 (note 59 57 60 2 NOTE_INSN_DELETED)
 (insn 60 59 61 2 (set (reg:DI 139)
         (const_int 0 [0])) "pr108803.c":12:42 65 {*movdi_aarch64}
This removes the cc setter that was used by insn 57 (which has been
updated) but is also needed for the former insn 53 which has been
merged into insn 78.

	Jakub
Richard Sandiford Feb. 27, 2023, 7:51 p.m. UTC | #3
Jakub Jelinek <jakub@redhat.com> writes:
> On Mon, Feb 27, 2023 at 03:34:11PM +0000, Richard Sandiford wrote:
>> > The following testcase is miscompiled on aarch64.  The problem is that
>> > aarch64 with TARGET_SIMD is !SHIFT_COUNT_TRUNCATED target with
>> > targetm.shift_truncation_mask (DImode) == 0 which has HAVE_conditional_move
>> > true.  If a doubleword shift (in this case TImode) doesn't have its own
>> > expander (as is the case of e.g. x86) and is handled in generic code,
>> > there are 3 possible expansions.  One is when the shift count is constant,
>> > the code computes in that case superword_op1 as op1 - BITS_PER_WORD,
>> > and chooses at compile time which of expand_superword_shift or
>> > expand_subword_shift to call, which ensures that whatever is used
>> > actually has its shift count (superword_op1 or op1) in [0, BITS_PER_WORD - 1]
>> > range.  If !HAVE_conditional_move or that expansion fails, the function
>> > handles non-constant op1 similarly (there are some special cases for
>> > shift_mask >= BITS_PER_WORD - 1 but let's talk here just about
>> > shift_mask < BITS_PER_WORD - 1), except that the selection is done at
>> > runtime, with branches around the stuff.  While superword_op1 can be
>> > [-BITS_PER_WORD, BITS_PER_WORD - 1] and op1 [0, 2 * BITS_PER_WORD - 1],
>> > the runtime selection ensures that the instructions executed at runtime
>> > have their corresponding shift count again in the right range of
>> > [0, BITS_PER_WORD - 1].
>> > The problematic is the HAVE_conditional_move case, which emits both
>> > sequences into the actually executed code, so necessarily one of them
>> > has an out of range shift count and then using 2 conditional moves
>> > picks up a result.
>> > Now, in the testcase because -Og doesn't perform VRP/EVRP the shift
>> > count isn't optimized to constant during GIMPLE passes, but is determined
>> > to be constant during/after expansion into RTL.  The shift count is
>> > actually const0_rtx later, so superword_op1 is (const_int -64) and we end
>> > up with miscompilation during combine because of that.
>> 
>> I haven't worked through the testcase yet, but how does that happen?
>> Having shifts with out-of-range shift counts shouldn't be a problem
>> in itself, provided that the conditional move selects the one with
>> the in-range shift.
>
> It depends on if we (hw or the compiler) treats out-of-range shifts in RTL
> as undefined behavior or just some kind of constrained unspecified behavior
> (destination register can be anything, but no traps/exceptions/program
> termination etc.).  Of course SHIFT_COUNT_TRUNCATED makes it well defined,
> but that is not the case here.
> I've always thoughts we really treat it as undefined behavior, you shouldn't
> reach this spot at runtime; we certainly treat it that way in GIMPLE.
> If that is the case, then invoking UB and then having a conditional move
> not to select its result is not well defined.
> And the patch as posted fixes that problem while not really resulting in
> worse code e.g. on aarch64.

I think RTL and gimple are different in that respect.
SHIFT_COUNT_TRUNCATED's effect on shifts is IMO a bit like
CTZ_DEFINED_VALUE_AT_ZERO's effect on CTZ: it enumerates common
target-specific behaviour, but doesn't turn invalid/should-not-be-evaluated
values into valid values.  Not defining SHIFT_COUNT_TRUNCATED is like
defining CTZ_DEFINED_VALUE_AT_ZERO to 0.

The docs say:

  Note that regardless of this macro the ``definedness'' of @code{clz}
  and @code{ctz} at zero do @emph{not} extend to the builtin functions
  visible to the user.  Thus one may be free to adjust the value at will
  to match the target expansion of these operations without fear of
  breaking the API@.

So for CTZ this really is an RTL thing, which can leak into gimple
through ifns.  I'd argue that the same is true for SHIFT_COUNT_TRUNCATED
and conditional shifts like COND_SHL: normal gimple shifts aren't guaranteed
to honour SHIFT_COUNT_TRUNCATED, but COND_SHL should be.

The arm part relies on being able to generate shifts that take
advantage of port-specific knowledge, see arm_emit_coreregs_64bit_shift.

Haven't had time to work through the combine thing yet, but yeah,
it does look suspiciously like something is going wrong in the cc
handling or death note updates.

Thanks,
Richard

> Anyway, back to the particular testcase rather than the general idea,
> it goes wrong during combine, CCing Segher.  I've added debug_bb_n after
> every successful combine attempt,
> --- gcc/combine.cc.jj	2023-01-02 09:32:43.720977011 +0100
> +++ gcc/combine.cc	2023-02-27 19:27:28.151289055 +0100
> @@ -4755,6 +4755,16 @@ try_combine (rtx_insn *i3, rtx_insn *i2,
>    if (added_notes_insn && DF_INSN_LUID (added_notes_insn) < DF_INSN_LUID (ret))
>      ret = added_notes_insn;
>  
> +{
> +static int cnt = 0;
> +char buf[64];
> +sprintf (buf, "/tmp/combine.%d", cnt++);
> +FILE *f = fopen (buf, "a");
> +fprintf (f, "%d\n", combine_successes);
> +basic_block bb = BASIC_BLOCK_FOR_FN (cfun, 2);
> +dump_bb (f, bb, 0, dump_flags_t ());
> +fclose (f);
> +}
>    return ret;
>  }
>  
> and I see
> (insn 52 48 53 2 (set (reg:CC 66 cc)
>         (compare:CC (reg:SI 130)
>             (const_int 0 [0]))) "pr108803.c":12:25 437 {cmpsi}
>      (expr_list:REG_DEAD (reg:SI 130)
>         (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
>                 (const_int 0 [0]))
>             (nil))))
> (insn 53 52 57 2 (set (reg:DI 152 [ _6+8 ])
>         (if_then_else:DI (ge (reg:CC 66 cc)
>                 (const_int 0 [0]))
>             (reg:DI 132)
>             (const_int 0 [0]))) "pr108803.c":12:25 490 {*cmovdi_insn}
>      (expr_list:REG_DEAD (reg:DI 132)
>         (nil)))
> (insn 57 53 59 2 (set (reg:DI 151 [ _6 ])
>         (if_then_else:DI (ge (reg:CC 66 cc)
>                 (const_int 0 [0]))
>             (const_int 0 [0])
>             (reg:DI 126))) "pr108803.c":12:25 490 {*cmovdi_insn}
>      (expr_list:REG_DEAD (reg:CC 66 cc)
>         (nil)))
> ...
> (insn 71 68 72 2 (set (reg:CC 66 cc)
>         (compare:CC (reg:SI 137)
>             (const_int 0 [0]))) "pr108803.c":12:42 437 {cmpsi}
>      (expr_list:REG_DEAD (reg:SI 137)
>         (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
>                 (const_int 0 [0]))
>             (nil))))
> (insn 72 71 76 2 (set (reg:DI 153 [ _8 ])
>         (if_then_else:DI (ge (reg:CC 66 cc)
>                 (const_int 0 [0]))
>             (reg:DI 139)
>             (reg:DI 153 [ _8 ]))) "pr108803.c":12:42 490 {*cmovdi_insn}
>      (expr_list:REG_DEAD (reg:DI 139)
>         (nil)))
> (insn 76 72 77 2 (set (reg:DI 154 [ _8+8 ])
>         (if_then_else:DI (ge (reg:CC 66 cc)
>                 (const_int 0 [0]))
>             (reg:DI 138)
>             (reg:DI 127))) "pr108803.c":12:42 490 {*cmovdi_insn}
>      (expr_list:REG_DEAD (reg:DI 138)
>         (expr_list:REG_DEAD (reg:DI 127)
>             (expr_list:REG_DEAD (reg:CC 66 cc)
>                 (nil)))))
> (insn 77 76 78 2 (set (reg:DI 159 [ b ])
>         (ior:DI (reg:DI 151 [ _6 ])
>             (reg:DI 126))) "pr108803.c":12:12 537 {iordi3}
>      (expr_list:REG_DEAD (reg:DI 126)
>         (expr_list:REG_DEAD (reg:DI 151 [ _6 ])
>             (nil))))
> (insn 78 77 80 2 (set (reg:DI 160 [ b+8 ])
>         (reg:DI 152 [ _6+8 ])) "pr108803.c":12:12 65 {*movdi_aarch64}
>      (expr_list:REG_DEAD (reg:DI 152 [ _6+8 ])
>         (nil)))
> in one of the dumps which still looks correct from brief skimming,
> r130 here is k - 64 aka -64 and it is the pair of conditional moves of the
> first TImode shift, effectively (a + 63) << 0 but 0 is obfuscated at
> expansion time, while the second pair of conditional moves are for the
> second TImode shift, effectively (a + 63) >> (0 & 63) and then both ored
> together.
> When doing
> Trying 53 -> 78:
>    53: r152:DI={(cc:CC>=0)?r132:DI:0}
>       REG_DEAD r132:DI
>    78: r160:DI=r152:DI
>       REG_DEAD r152:DI
> Successfully matched this instruction:
> (set (reg:DI 160 [ b+8 ])
>     (if_then_else:DI (ge (reg:CC 66 cc)
>             (const_int 0 [0]))
>         (reg:DI 132)
>         (const_int 0 [0])))
> allowing combination of insns 53 and 78
> original costs 4 + 4 = 8
> replacement cost 4
> deferring deletion of insn with uid = 53.
> modifying insn i3    78: r160:DI={(cc:CC>=0)?r132:DI:0}
>       REG_DEAD cc:CC
>       REG_DEAD r132:DI
> deferring rescan insn with uid = 78.
> combination, the IL changes:
> @@ -38,21 +38,15 @@
>  (insn 52 48 53 2 (set (reg:CC 66 cc)
>          (compare:CC (reg:SI 130)
>              (const_int 0 [0]))) "pr108803.c":12:25 437 {cmpsi}
>       (expr_list:REG_DEAD (reg:SI 130)
>          (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
>                  (const_int 0 [0]))
>              (nil))))
> -(insn 53 52 57 2 (set (reg:DI 152 [ _6+8 ])
> -        (if_then_else:DI (ge (reg:CC 66 cc)
> -                (const_int 0 [0]))
> -            (reg:DI 132)
> -            (const_int 0 [0]))) "pr108803.c":12:25 490 {*cmovdi_insn}
> -     (expr_list:REG_DEAD (reg:DI 132)
> -        (nil)))
> +(note 53 52 57 2 NOTE_INSN_DELETED)
>  (insn 57 53 59 2 (set (reg:DI 151 [ _6 ])
>          (if_then_else:DI (ge (reg:CC 66 cc)
>                  (const_int 0 [0]))
>              (const_int 0 [0])
>              (reg:DI 126))) "pr108803.c":12:25 490 {*cmovdi_insn}
>       (expr_list:REG_DEAD (reg:CC 66 cc)
>          (nil)))
> @@ -72,17 +66,21 @@
>  (insn 77 76 78 2 (set (reg:DI 159 [ b ])
>          (ior:DI (reg:DI 151 [ _6 ])
>              (reg:DI 126))) "pr108803.c":12:12 537 {iordi3}
>       (expr_list:REG_DEAD (reg:DI 126)
>          (expr_list:REG_DEAD (reg:DI 151 [ _6 ])
>              (nil))))
>  (insn 78 77 80 2 (set (reg:DI 160 [ b+8 ])
> -        (reg:DI 152 [ _6+8 ])) "pr108803.c":12:12 65 {*movdi_aarch64}
> -     (expr_list:REG_DEAD (reg:DI 152 [ _6+8 ])
> -        (nil)))
> +        (if_then_else:DI (ge (reg:CC 66 cc)
> +                (const_int 0 [0]))
> +            (reg:DI 132)
> +            (const_int 0 [0]))) "pr108803.c":12:12 490 {*cmovdi_insn}
> +     (expr_list:REG_DEAD (reg:CC 66 cc)
> +        (expr_list:REG_DEAD (reg:DI 132)
> +            (nil))))
>  (insn 80 78 82 2 (parallel [
>              (set (reg:CC_C 66 cc)
>                  (compare:CC_C (plus:DI (reg:DI 155 [ a ])
>                          (reg:DI 159 [ b ]))
>                      (reg:DI 155 [ a ])))
>              (set (reg:DI 144)
>                  (plus:DI (reg:DI 155 [ a ])
> but as you can see, because cc reg has been REG_DEAD before on insn 57
> rather than on insn 53, nothing really removed REG_DEAD note from there
> and just adds it on insn 78 (note, besides this REG_DEAD issue the
> IL is otherwise still sane, the previous cc setter 71 and its previous
> uses 72 and 76 in between the move have been optimized away already in
> an earlier successful combination).
> And things go wild with the next successful combination:
> Trying 52 -> 57:
>    52: cc:CC=cmp(0xffffffffffffffc0,0)
>       REG_DEAD r130:SI
>       REG_EQUAL cmp(0xffffffffffffffc0,0)
>    57: r151:DI={(cc:CC>=0)?0:r126:DI}
>       REG_DEAD cc:CC
> Successfully matched this instruction:
> (set (reg:DI 151 [ _6 ])
>     (reg:DI 126))
> allowing combination of insns 52 and 57
> original costs 4 + 4 = 8
> replacement cost 4
> deferring deletion of insn with uid = 52.
> modifying insn i3    57: r151:DI=r126:DI
> deferring rescan insn with uid = 57.
> where the IL changes:
> @@ -29,27 +29,18 @@
>  (insn 41 40 43 2 (set (reg:DI 132)
>          (ashift:DI (reg:DI 126)
>              (subreg:QI (reg:SI 130) 0))) "pr108803.c":12:25 781 {*aarch64_ashl_sisd_or_int_di3}
> -     (expr_list:REG_EQUAL (ashift:DI (reg:DI 126)
> -            (const_int -64 [0xffffffffffffffc0]))
> -        (nil)))
> +     (expr_list:REG_DEAD (reg:SI 130)
> +        (expr_list:REG_EQUAL (ashift:DI (reg:DI 126)
> +                (const_int -64 [0xffffffffffffffc0]))
> +            (nil))))
>  (note 43 41 46 2 NOTE_INSN_DELETED)
>  (note 46 43 48 2 NOTE_INSN_DELETED)
>  (note 48 46 52 2 NOTE_INSN_DELETED)
> -(insn 52 48 53 2 (set (reg:CC 66 cc)
> -        (compare:CC (reg:SI 130)
> -            (const_int 0 [0]))) "pr108803.c":12:25 437 {cmpsi}
> -     (expr_list:REG_DEAD (reg:SI 130)
> -        (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
> -                (const_int 0 [0]))
> -            (nil))))
> +(note 52 48 53 2 NOTE_INSN_DELETED)
>  (note 53 52 57 2 NOTE_INSN_DELETED)
>  (insn 57 53 59 2 (set (reg:DI 151 [ _6 ])
> -        (if_then_else:DI (ge (reg:CC 66 cc)
> -                (const_int 0 [0]))
> -            (const_int 0 [0])
> -            (reg:DI 126))) "pr108803.c":12:25 490 {*cmovdi_insn}
> -     (expr_list:REG_DEAD (reg:CC 66 cc)
> -        (nil)))
> +        (reg:DI 126)) "pr108803.c":12:25 65 {*movdi_aarch64}
> +     (nil))
>  (note 59 57 60 2 NOTE_INSN_DELETED)
>  (insn 60 59 61 2 (set (reg:DI 139)
>          (const_int 0 [0])) "pr108803.c":12:42 65 {*movdi_aarch64}
> This removes the cc setter that was used by insn 57 (which has been
> updated) but is also needed for the former insn 53 which has been
> merged into insn 78.
>
> 	Jakub
Jakub Jelinek Feb. 27, 2023, 8:02 p.m. UTC | #4
On Mon, Feb 27, 2023 at 07:51:21PM +0000, Richard Sandiford wrote:
> I think RTL and gimple are different in that respect.
> SHIFT_COUNT_TRUNCATED's effect on shifts is IMO a bit like
> CTZ_DEFINED_VALUE_AT_ZERO's effect on CTZ: it enumerates common
> target-specific behaviour, but doesn't turn invalid/should-not-be-evaluated
> values into valid values.  Not defining SHIFT_COUNT_TRUNCATED is like
> defining CTZ_DEFINED_VALUE_AT_ZERO to 0.
> 
> The docs say:
> 
>   Note that regardless of this macro the ``definedness'' of @code{clz}
>   and @code{ctz} at zero do @emph{not} extend to the builtin functions
>   visible to the user.  Thus one may be free to adjust the value at will
>   to match the target expansion of these operations without fear of
>   breaking the API@.
> 
> So for CTZ this really is an RTL thing, which can leak into gimple
> through ifns.  I'd argue that the same is true for SHIFT_COUNT_TRUNCATED
> and conditional shifts like COND_SHL: normal gimple shifts aren't guaranteed
> to honour SHIFT_COUNT_TRUNCATED, but COND_SHL should be.

I understand that if SHIFT_COUNT_TRUNCATED 1 is defined, then formerly
out of bounds shift is well defined on RTL. after all, for
SHIFT_COUNT_TRUNCATED the generic code removes shift count masking as
redundant, so code without UB in the source could otherwise appear to have
UB on RTL.
The question is what happens with SHIFT_COUNT_TRUNCATED 0 or
C?Z_DEFINED_VALUE_AT_ZERO 0, if encountering the RTL with invalid operand(s)
is undefined behavior, or simply undefined value but no other side-effects.
There are many RTL expressions which invoke on invalid values really
undefined behavior, it can crash the program etc.  The question is if
out of bounds shifts are like that too or not.  Ditto for CLZ/CTZ.

	Jakub
Richard Sandiford Feb. 27, 2023, 8:43 p.m. UTC | #5
Jakub Jelinek <jakub@redhat.com> writes:
> On Mon, Feb 27, 2023 at 07:51:21PM +0000, Richard Sandiford wrote:
>> I think RTL and gimple are different in that respect.
>> SHIFT_COUNT_TRUNCATED's effect on shifts is IMO a bit like
>> CTZ_DEFINED_VALUE_AT_ZERO's effect on CTZ: it enumerates common
>> target-specific behaviour, but doesn't turn invalid/should-not-be-evaluated
>> values into valid values.  Not defining SHIFT_COUNT_TRUNCATED is like
>> defining CTZ_DEFINED_VALUE_AT_ZERO to 0.
>> 
>> The docs say:
>> 
>>   Note that regardless of this macro the ``definedness'' of @code{clz}
>>   and @code{ctz} at zero do @emph{not} extend to the builtin functions
>>   visible to the user.  Thus one may be free to adjust the value at will
>>   to match the target expansion of these operations without fear of
>>   breaking the API@.
>> 
>> So for CTZ this really is an RTL thing, which can leak into gimple
>> through ifns.  I'd argue that the same is true for SHIFT_COUNT_TRUNCATED
>> and conditional shifts like COND_SHL: normal gimple shifts aren't guaranteed
>> to honour SHIFT_COUNT_TRUNCATED, but COND_SHL should be.
>
> I understand that if SHIFT_COUNT_TRUNCATED 1 is defined, then formerly
> out of bounds shift is well defined on RTL. after all, for
> SHIFT_COUNT_TRUNCATED the generic code removes shift count masking as
> redundant, so code without UB in the source could otherwise appear to have
> UB on RTL.
> The question is what happens with SHIFT_COUNT_TRUNCATED 0 or
> C?Z_DEFINED_VALUE_AT_ZERO 0, if encountering the RTL with invalid operand(s)
> is undefined behavior, or simply undefined value but no other side-effects.
> There are many RTL expressions which invoke on invalid values really
> undefined behavior, it can crash the program etc.  The question is if
> out of bounds shifts are like that too or not.  Ditto for CLZ/CTZ.

My argument was that !SHIFT_COUNT_TRUNCATED and
C?Z_DEFINED_VALUE_AT_ZERO==0 mean that the behaviour is undefined
only in the sense that target-independent code doesn't know what
the behaviour is.  !SHIFT_COUNT_TRUNCATED doesn't mean that
target-independent code can assume that out-of-range shift values
invoke program UB (and therefore target-independent code can optimise
shifts on the principle that all shifts are in-range).  Similarly
CTZ_DEFINED_VALUE_AT_ZERO==0 doesn't mean the corresponding thing for CTZ.

If !SHIFT_COUNT_TRUNCATED meant that all out-of-range shifts are UB then:

	    wide_int wop1 = pop1;
	    if (SHIFT_COUNT_TRUNCATED)
	      wop1 = wi::umod_trunc (wop1, GET_MODE_PRECISION (int_mode));
	    else if (wi::geu_p (wop1, GET_MODE_PRECISION (int_mode)))
	      return NULL_RTX;

in simplify_const_binary_operation wouldn't be necessary.  We could
just fold constant shifts in the SHIFT_COUNT_TRUNCATED way for all
values, like wide_int_binop folds all nonnegative shifts on trees.

As I say, arm_emit_coreregs_64bit_shift relies on being able to create
RTL shifts whose counts might be out-of-range (to a certain degree),
because the arm port knows how arm shifts behave.  Treating the
out-of-range shifts as UB would break the DI shift expansions.

Thanks,
Richard
Jakub Jelinek Feb. 27, 2023, 8:54 p.m. UTC | #6
On Mon, Feb 27, 2023 at 08:43:27PM +0000, Richard Sandiford wrote:
> My argument was that !SHIFT_COUNT_TRUNCATED and
> C?Z_DEFINED_VALUE_AT_ZERO==0 mean that the behaviour is undefined
> only in the sense that target-independent code doesn't know what
> the behaviour is.  !SHIFT_COUNT_TRUNCATED doesn't mean that
> target-independent code can assume that out-of-range shift values
> invoke program UB (and therefore target-independent code can optimise
> shifts on the principle that all shifts are in-range).  Similarly
> CTZ_DEFINED_VALUE_AT_ZERO==0 doesn't mean the corresponding thing for CTZ.

Even if the target-independent code doesn't know what the target dependent
code will do, I don't see how it could emit it safely.
In that case, I could understand some particular backend emitting such code
because it knows what to assume, and then generic code like simplify-rtx
simply trying to punt and not mess up with things which it doesn't know how
they'll behave.
But in this case it isn't aarch64 backend that emits this, but it is
the generic expansion code.  It even guarantees one of the two shifts
will be out of bounds...  And target didn't tell it that out of bound
shifts say won't trap or do something similar.
Sure, there could be a target hook or macro that would tell the expander
that the backend out of bound shift will just produce unspecified result
but not other effects.

	Jakub
Richard Sandiford Feb. 27, 2023, 9:01 p.m. UTC | #7
Jakub Jelinek <jakub@redhat.com> writes:
> On Mon, Feb 27, 2023 at 08:43:27PM +0000, Richard Sandiford wrote:
>> My argument was that !SHIFT_COUNT_TRUNCATED and
>> C?Z_DEFINED_VALUE_AT_ZERO==0 mean that the behaviour is undefined
>> only in the sense that target-independent code doesn't know what
>> the behaviour is.  !SHIFT_COUNT_TRUNCATED doesn't mean that
>> target-independent code can assume that out-of-range shift values
>> invoke program UB (and therefore target-independent code can optimise
>> shifts on the principle that all shifts are in-range).  Similarly
>> CTZ_DEFINED_VALUE_AT_ZERO==0 doesn't mean the corresponding thing for CTZ.
>
> Even if the target-independent code doesn't know what the target dependent
> code will do, I don't see how it could emit it safely.
> In that case, I could understand some particular backend emitting such code
> because it knows what to assume, and then generic code like simplify-rtx
> simply trying to punt and not mess up with things which it doesn't know how
> they'll behave.
> But in this case it isn't aarch64 backend that emits this, but it is
> the generic expansion code.  It even guarantees one of the two shifts
> will be out of bounds...  And target didn't tell it that out of bound
> shifts say won't trap or do something similar.

But we haven't had to handle trapping shifts because no supported target
behaves like that.  And yeah, even if you disagree with that being a
reasonable assumption...

> Sure, there could be a target hook or macro that would tell the expander
> that the backend out of bound shift will just produce unspecified result
> but not other effects.

...adding an explicit control, or putting the expansion in aarch64 code,
would allow us to generate the same RTL as we do now, and so hit the
same bug.

Richard
Jakub Jelinek Feb. 27, 2023, 9:15 p.m. UTC | #8
On Mon, Feb 27, 2023 at 09:01:15PM +0000, Richard Sandiford via Gcc-patches wrote:
> Jakub Jelinek <jakub@redhat.com> writes:
> > On Mon, Feb 27, 2023 at 08:43:27PM +0000, Richard Sandiford wrote:
> >> My argument was that !SHIFT_COUNT_TRUNCATED and
> >> C?Z_DEFINED_VALUE_AT_ZERO==0 mean that the behaviour is undefined
> >> only in the sense that target-independent code doesn't know what
> >> the behaviour is.  !SHIFT_COUNT_TRUNCATED doesn't mean that
> >> target-independent code can assume that out-of-range shift values
> >> invoke program UB (and therefore target-independent code can optimise
> >> shifts on the principle that all shifts are in-range).  Similarly
> >> CTZ_DEFINED_VALUE_AT_ZERO==0 doesn't mean the corresponding thing for CTZ.
> >
> > Even if the target-independent code doesn't know what the target dependent
> > code will do, I don't see how it could emit it safely.
> > In that case, I could understand some particular backend emitting such code
> > because it knows what to assume, and then generic code like simplify-rtx
> > simply trying to punt and not mess up with things which it doesn't know how
> > they'll behave.
> > But in this case it isn't aarch64 backend that emits this, but it is
> > the generic expansion code.  It even guarantees one of the two shifts
> > will be out of bounds...  And target didn't tell it that out of bound
> > shifts say won't trap or do something similar.
> 
> But we haven't had to handle trapping shifts because no supported target
> behaves like that.  And yeah, even if you disagree with that being a
> reasonable assumption...
> 
> > Sure, there could be a target hook or macro that would tell the expander
> > that the backend out of bound shift will just produce unspecified result
> > but not other effects.
> 
> ...adding an explicit control, or putting the expansion in aarch64 code,
> would allow us to generate the same RTL as we do now, and so hit the
> same bug.

Ok, but then we should document it (that at RTL an out of bounds shift
can only result in unspecified result value but not have other
side-effects).

Which would turn my patch from a correctness thing into an optimization
territory (especially when PR108840 will be fixed), because for
architectures which have these shifts with masking patterns it emits fewer
instructions.  Though, because for targets which don't do that it emits
more code, we'd need some target hook that could tell the expander
that is the case.  And of course make it a GCC14 material rather than GCC13.

	Jakub
Segher Boessenkool Feb. 27, 2023, 10:15 p.m. UTC | #9
On Mon, Feb 27, 2023 at 09:54:06PM +0100, Jakub Jelinek wrote:
> Even if the target-independent code doesn't know what the target dependent
> code will do, I don't see how it could emit it safely.

I always understood RTL to not have anything like C "undefined
behavior", but be closer in general (exceptions are integer division by
zero etc.) to C's "unspecified value".  This is quite close to what on
Power we call "boundedly undefined":
  The results of executing a given instruction are said to be boundedly
  undefined if they could have been achieved by executing an arbitrary
  finite sequence of instructions (none of which yields boundedly
  undefined results) in the state the processor was in before
  executing the given instruction. Boundedly undefined results may
  include the presentation of inconsistent state to the system error
  handler as described in Section 1.8.1 of Book II. Boundedly undefined
  results for a given instruction may vary between implementations, and
  between different executions on the same implementation.
So no trapping, everything writes all bits in all registers always, etc.

C undefined behaviour makes the *whole program* undefined, always: it
can have effects reaching unlimitedly far into the future, but also
unboundedly far into the past (well, not further back then program
start, there is that :-) ).  RTL unspecified stuff is way more limited
than that.  It also has a very different goal: C UB is to allow
compilers to optimise more programs and to optimise them better, while
for RTL we simply do not specify the operation result in some cases.
This does not mean such cases cannot happen!

And of course there are TARGET_SHIFT_COUNT_TRUNCATED and
TARGET_SHIFT_TRUNCATION_MASK, which allow to make more inputs result in
known (to the compiler) outputs.


Segher
Segher Boessenkool Feb. 27, 2023, 10:21 p.m. UTC | #10
Hi!

On Mon, Feb 27, 2023 at 08:11:09PM +0100, Jakub Jelinek wrote:
> (insn 52 48 53 2 (set (reg:CC 66 cc)
>         (compare:CC (reg:SI 130)
>             (const_int 0 [0]))) "pr108803.c":12:25 437 {cmpsi}
>      (expr_list:REG_DEAD (reg:SI 130)
>         (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
>                 (const_int 0 [0]))
>             (nil))))
> (insn 53 52 57 2 (set (reg:DI 152 [ _6+8 ])
>         (if_then_else:DI (ge (reg:CC 66 cc)
>                 (const_int 0 [0]))
>             (reg:DI 132)
>             (const_int 0 [0]))) "pr108803.c":12:25 490 {*cmovdi_insn}
>      (expr_list:REG_DEAD (reg:DI 132)
>         (nil)))
> (insn 57 53 59 2 (set (reg:DI 151 [ _6 ])
>         (if_then_else:DI (ge (reg:CC 66 cc)
>                 (const_int 0 [0]))
>             (const_int 0 [0])
>             (reg:DI 126))) "pr108803.c":12:25 490 {*cmovdi_insn}
>      (expr_list:REG_DEAD (reg:CC 66 cc)
>         (nil)))
> ...
> (insn 71 68 72 2 (set (reg:CC 66 cc)
>         (compare:CC (reg:SI 137)
>             (const_int 0 [0]))) "pr108803.c":12:42 437 {cmpsi}
>      (expr_list:REG_DEAD (reg:SI 137)
>         (expr_list:REG_EQUAL (compare:CC (const_int -64 [0xffffffffffffffc0])
>                 (const_int 0 [0]))
>             (nil))))
> (insn 72 71 76 2 (set (reg:DI 153 [ _8 ])
>         (if_then_else:DI (ge (reg:CC 66 cc)
>                 (const_int 0 [0]))
>             (reg:DI 139)
>             (reg:DI 153 [ _8 ]))) "pr108803.c":12:42 490 {*cmovdi_insn}
>      (expr_list:REG_DEAD (reg:DI 139)
>         (nil)))
> (insn 76 72 77 2 (set (reg:DI 154 [ _8+8 ])
>         (if_then_else:DI (ge (reg:CC 66 cc)
>                 (const_int 0 [0]))
>             (reg:DI 138)
>             (reg:DI 127))) "pr108803.c":12:42 490 {*cmovdi_insn}
>      (expr_list:REG_DEAD (reg:DI 138)
>         (expr_list:REG_DEAD (reg:DI 127)
>             (expr_list:REG_DEAD (reg:CC 66 cc)
>                 (nil)))))
> (insn 77 76 78 2 (set (reg:DI 159 [ b ])
>         (ior:DI (reg:DI 151 [ _6 ])
>             (reg:DI 126))) "pr108803.c":12:12 537 {iordi3}
>      (expr_list:REG_DEAD (reg:DI 126)
>         (expr_list:REG_DEAD (reg:DI 151 [ _6 ])
>             (nil))))
> (insn 78 77 80 2 (set (reg:DI 160 [ b+8 ])
>         (reg:DI 152 [ _6+8 ])) "pr108803.c":12:12 65 {*movdi_aarch64}
>      (expr_list:REG_DEAD (reg:DI 152 [ _6+8 ])
>         (nil)))

Both CC's are used twice, in if_then_else all times, a situation that
does not happen frequently at all, and that combine is apparently not
prepared for at all.  It is the same (hard!) register in all cases as
well.

> but as you can see, because cc reg has been REG_DEAD before on insn 57
> rather than on insn 53, nothing really removed REG_DEAD note from there
> and just adds it on insn 78 (note, besides this REG_DEAD issue the
> IL is otherwise still sane, the previous cc setter 71 and its previous
> uses 72 and 76 in between the move have been optimized away already in
> an earlier successful combination).
> And things go wild with the next successful combination:

Yup.


Segher
diff mbox series

Patch

--- gcc/optabs.cc.jj	2023-01-02 09:32:53.309838465 +0100
+++ gcc/optabs.cc	2023-02-16 20:44:21.862031535 +0100
@@ -507,7 +507,7 @@  expand_subword_shift (scalar_int_mode op
 		      rtx outof_input, rtx into_input, rtx op1,
 		      rtx outof_target, rtx into_target,
 		      int unsignedp, enum optab_methods methods,
-		      unsigned HOST_WIDE_INT shift_mask)
+		      unsigned HOST_WIDE_INT shift_mask, bool mask_count)
 {
   optab reverse_unsigned_shift, unsigned_shift;
   rtx tmp, carries;
@@ -535,7 +535,7 @@  expand_subword_shift (scalar_int_mode op
 	 are truncated to the mode size.  */
       carries = expand_binop (word_mode, reverse_unsigned_shift,
 			      outof_input, const1_rtx, 0, unsignedp, methods);
-      if (shift_mask == BITS_PER_WORD - 1)
+      if (shift_mask == BITS_PER_WORD - 1 || mask_count)
 	{
 	  tmp = immed_wide_int_const
 	    (wi::minus_one (GET_MODE_PRECISION (op1_mode)), op1_mode);
@@ -549,6 +549,15 @@  expand_subword_shift (scalar_int_mode op
 	  tmp = simplify_expand_binop (op1_mode, sub_optab, tmp, op1,
 				       0, true, methods);
 	}
+      if (mask_count)
+	{
+	  rtx tmp2 = immed_wide_int_const (wi::shwi (BITS_PER_WORD - 1,
+					   op1_mode), op1_mode);
+	  tmp = simplify_expand_binop (op1_mode, and_optab, tmp, tmp2, 0,
+				       true, methods);
+	  op1 = simplify_expand_binop (op1_mode, and_optab, op1, tmp2, 0,
+				       true, methods);
+	}
     }
   if (tmp == 0 || carries == 0)
     return false;
@@ -596,6 +605,15 @@  expand_doubleword_shift_condmove (scalar
 {
   rtx outof_superword, into_superword;
 
+  if (shift_mask < BITS_PER_WORD - 1)
+    {
+      rtx tmp = immed_wide_int_const (wi::shwi (BITS_PER_WORD - 1, op1_mode),
+				      op1_mode);
+      superword_op1
+	= simplify_expand_binop (op1_mode, and_optab, superword_op1, tmp,
+				 0, true, methods);
+    }
+
   /* Put the superword version of the output into OUTOF_SUPERWORD and
      INTO_SUPERWORD.  */
   outof_superword = outof_target != 0 ? gen_reg_rtx (word_mode) : 0;
@@ -621,7 +639,8 @@  expand_doubleword_shift_condmove (scalar
   if (!expand_subword_shift (op1_mode, binoptab,
 			     outof_input, into_input, subword_op1,
 			     outof_target, into_target,
-			     unsignedp, methods, shift_mask))
+			     unsignedp, methods, shift_mask,
+			     shift_mask < BITS_PER_WORD - 1))
     return false;
 
   /* Select between them.  Do the INTO half first because INTO_SUPERWORD
@@ -742,7 +761,7 @@  expand_doubleword_shift (scalar_int_mode
 	return expand_subword_shift (op1_mode, binoptab,
 				     outof_input, into_input, op1,
 				     outof_target, into_target,
-				     unsignedp, methods, shift_mask);
+				     unsignedp, methods, shift_mask, false);
     }
 
   /* Try using conditional moves to generate straight-line code.  */
@@ -781,7 +800,7 @@  expand_doubleword_shift (scalar_int_mode
   if (!expand_subword_shift (op1_mode, binoptab,
 			     outof_input, into_input, op1,
 			     outof_target, into_target,
-			     unsignedp, methods, shift_mask))
+			     unsignedp, methods, shift_mask, false))
     return false;
 
   emit_label (done_label);
--- gcc/testsuite/gcc.dg/pr108803.c.jj	2023-02-16 20:48:53.517032422 +0100
+++ gcc/testsuite/gcc.dg/pr108803.c	2023-02-16 20:48:45.971143498 +0100
@@ -0,0 +1,24 @@ 
+/* PR middle-end/108803 */
+/* { dg-do run { target int128 } } */
+/* { dg-options "-Og" } */
+
+unsigned i, j;
+
+__int128
+foo (__int128 a)
+{
+  a &= 9;
+  unsigned k = __builtin_add_overflow_p (i, j, a);
+  __int128 b = (63 + a) << k | ((63 + a) >> (63 & k));
+  __int128 c = a + b;
+  return c;
+}
+
+int
+main (void)
+{
+  __int128 x = foo (0);
+  if (x != 63)
+    __builtin_abort ();
+  return 0;
+}