
tree-ssa-threadbackward.c (profitable_jump_thread_path): Do not allow __builtin_constant_p.

Message ID 20200604003752.2545591-1-iii@linux.ibm.com
State New
Series tree-ssa-threadbackward.c (profitable_jump_thread_path): Do not allow __builtin_constant_p.

Commit Message

Ilya Leoshkevich June 4, 2020, 12:37 a.m. UTC
Bootstrapped and regtested on x86_64-redhat-linux, ppc64le-redhat-linux
and s390x-redhat-linux.


Linux Kernel (specifically, drivers/leds/trigger/ledtrig-cpu.c) build
with GCC 10 fails on s390 with "impossible constraint".

The problem is that jump threading makes __builtin_constant_p lie when
an expression in question appears to be a constant on a threading path.

Fix by disallowing __builtin_constant_p on threading paths.

gcc/ChangeLog:

2020-06-03  Ilya Leoshkevich  <iii@linux.ibm.com>

	* tree-ssa-threadbackward.c (thread_jumps::profitable_jump_thread_path):
	Do not allow __builtin_constant_p on a threading path.

gcc/testsuite/ChangeLog:

2020-06-03  Ilya Leoshkevich  <iii@linux.ibm.com>

	* gcc.target/s390/builtin-constant-p-threading.c: New test.
---
 .../s390/builtin-constant-p-threading.c       | 46 +++++++++++++++++++
 gcc/tree-ssa-threadbackward.c                 |  7 ++-
 2 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/builtin-constant-p-threading.c

Comments

Ilya Leoshkevich June 15, 2020, 11:27 a.m. UTC | #1
On Thu, 2020-06-04 at 02:37 +0200, Ilya Leoshkevich wrote:
> Bootstrapped and regtested on x86_64-redhat-linux, ppc64le-redhat-
> linux
> and s390x-redhat-linux.
> 
> 
> Linux Kernel (specifically, drivers/leds/trigger/ledtrig-cpu.c) build
> with GCC 10 fails on s390 with "impossible constraint".
> 
> The problem is that jump threading makes __builtin_constant_p lie
> when
> an expression in question appears to be a constant on a threading
> path.
> 
> Fix by disallowing __builtin_constant_p on threading paths.
> 
> gcc/ChangeLog:
> 
> 2020-06-03  Ilya Leoshkevich  <iii@linux.ibm.com>
> 
> 	* tree-ssa-threadbackward.c
> (thread_jumps::profitable_jump_thread_path):
> 	Do not allow __builtin_constant_p on a threading path.
> 
> gcc/testsuite/ChangeLog:
> 
> 2020-06-03  Ilya Leoshkevich  <iii@linux.ibm.com>
> 
> 	* gcc.target/s390/builtin-constant-p-threading.c: New test.
> ---
>  .../s390/builtin-constant-p-threading.c       | 46
> +++++++++++++++++++
>  gcc/tree-ssa-threadbackward.c                 |  7 ++-
>  2 files changed, 52 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/s390/builtin-constant-p-
> threading.c
> 
> diff --git a/gcc/testsuite/gcc.target/s390/builtin-constant-p-
> threading.c b/gcc/testsuite/gcc.target/s390/builtin-constant-p-
> threading.c
> new file mode 100644
> index 00000000000..5f0acdce0b0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/builtin-constant-p-threading.c
> @@ -0,0 +1,46 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=z196 -mzarch" } */
> +
> +typedef struct
> +{
> +  int counter;
> +} atomic_t;
> +
> +static inline __attribute__ ((__gnu_inline__)) int
> +__atomic_add (int val, int *ptr)
> +{
> +  int old;
> +  asm volatile("laa %[old],%[val],%[ptr]\n"
> +	       : [old] "=d" (old), [ptr] "+Q"(*ptr)
> +	       : [val] "d" (val)
> +	       : "cc", "memory");
> +  return old;
> +}
> +
> +static inline __attribute__ ((__gnu_inline__)) void
> +__atomic_add_const (int val, int *ptr)
> +{
> +  asm volatile("asi %[ptr],%[val]\n"
> +	       : [ptr] "+Q" (*ptr)
> +	       : [val] "i" (val)
> +	       : "cc", "memory");
> +}
> +
> +static inline __attribute__ ((__gnu_inline__)) void
> +atomic_add (int i, atomic_t *v)
> +{
> +  if (__builtin_constant_p (i) && (i > -129) && (i < 128))
> +    {
> +      __atomic_add_const (i, &v->counter);
> +      return;
> +    }
> +  __atomic_add (i, &v->counter);
> +}
> +
> +static atomic_t num_active_cpus = { (0) };
> +
> +void
> +ledtrig_cpu (_Bool is_active)
> +{
> +  atomic_add (is_active ? 1 : -1, &num_active_cpus);
> +}
> diff --git a/gcc/tree-ssa-threadbackward.c b/gcc/tree-ssa-
> threadbackward.c
> index 327628f1662..668932f6d85 100644
> --- a/gcc/tree-ssa-threadbackward.c
> +++ b/gcc/tree-ssa-threadbackward.c
> @@ -259,8 +259,13 @@ thread_jumps::profitable_jump_thread_path
> (basic_block bbi, tree name,
>  	       !gsi_end_p (gsi);
>  	       gsi_next_nondebug (&gsi))
>  	    {
> +	      /* Do not allow OpenACC loop markers and
> __builtin_constant_p on
> +		 a threading path.  The latter is disallowed, because
> an
> +		 expression being constant on a threading path does not
> mean it
> +		 can be considered constant in the entire function.  */
>  	      gimple *stmt = gsi_stmt (gsi);
> -	      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
> +	      if (gimple_call_internal_p (stmt, IFN_UNIQUE)
> +		  || gimple_call_builtin_p (stmt, BUILT_IN_CONSTANT_P))
>  		{
>  		  m_path.pop ();
>  		  return NULL;

Gentle ping.
Jeff Law June 26, 2020, 8:27 p.m. UTC | #2
On Thu, 2020-06-04 at 02:37 +0200, Ilya Leoshkevich via Gcc-patches wrote:
> Bootstrapped and regtested on x86_64-redhat-linux, ppc64le-redhat-linux
> and s390x-redhat-linux.
> 
> 
> Linux Kernel (specifically, drivers/leds/trigger/ledtrig-cpu.c) build
> with GCC 10 fails on s390 with "impossible constraint".
> 
> The problem is that jump threading makes __builtin_constant_p lie when
> an expression in question appears to be a constant on a threading path.
What do you mean by this?

What should happen is the path where the queried object is constant,
builtin_constant_p will return true.  On the other path where it is not,
builtin_constant_p will return false.

> 
> Fix by disallowing __builtin_constant_p on threading paths.
> 
> gcc/ChangeLog:
> 
> 2020-06-03  Ilya Leoshkevich  <iii@linux.ibm.com>
> 
> 	* tree-ssa-threadbackward.c (thread_jumps::profitable_jump_thread_path):
> 	Do not allow __builtin_constant_p on a threading path.
Sorry, this is wrong.

jeff


>
Ilya Leoshkevich June 26, 2020, 9:17 p.m. UTC | #3
On Fri, 2020-06-26 at 14:27 -0600, Jeff Law wrote:
> On Thu, 2020-06-04 at 02:37 +0200, Ilya Leoshkevich via Gcc-patches
> wrote:
> > Bootstrapped and regtested on x86_64-redhat-linux, ppc64le-redhat-
> > linux
> > and s390x-redhat-linux.
> > 
> > 
> > Linux Kernel (specifically, drivers/leds/trigger/ledtrig-cpu.c)
> > build
> > with GCC 10 fails on s390 with "impossible constraint".
> > 
> > The problem is that jump threading makes __builtin_constant_p lie
> > when
> > an expression in question appears to be a constant on a threading
> > path.
> What do you mean by this?
> 
> What should happen is the path where the queried object is constant,
> builtin_constant_p will return true.  On the other path where it is
> not,
> builtin_constant_p will return false.

But what if a path ends before we use the `builtin_constant_p` result?
In the testcase from the patch we have `i = is_active ? 1 : -1`, which 
produces two paths.  On each path `builtin_constant_p (i)` returns 
true, however, the asm statement which depends on this being true is
not on either path and comes only later.

Here are the relevant tree dumps (sorry, a bit verbose, I couldn't make
them smaller):

gcc/cc1 ../gcc/testsuite/gcc.target/s390/builtin-constant-p-threading.c 
-O2 -march=z196 -mzarch -fdump-tree-all-all

Before 106t.thread1:

;;   basic block 2
  if (is_active_2(D) != 0)
    goto <bb 3>; [50.00%]
  else
    goto <bb 4>; [50.00%]

;;   basic block 3
  # iftmp.0_6 = PHI <1(2)>
  _7 = __builtin_constant_pD.1098 (iftmp.0_6);
  if (_7 != 0)
    goto <bb 6>; [50.00%]
  else
    goto <bb 8>; [50.00%]

;;   basic block 4
  # iftmp.0_4 = PHI <-1(2)>
  _12 = __builtin_constant_pD.1098 (iftmp.0_4);
  if (_12 != 0)
    goto <bb 5>; [50.00%]
  else
    goto <bb 8>; [50.00%]

;;   basic block 5
  if (iftmp.0_4 >= -128)
    goto <bb 7>; [20.00%]
  else
    goto <bb 8>; [80.00%]

;;   basic block 6
  if (iftmp.0_6 <= 127)
    goto <bb 7>; [12.00%]
  else
    goto <bb 8>; [88.00%]

;;   basic block 7
  # iftmp.0_13 = PHI <iftmp.0_6(6), iftmp.0_4(5)>
  # .MEM_10 = VDEF <.MEM_3(D)>
  __asm__ __volatile__("asi %0,%1
" : "ptr" "=Q" MEM[(intD.6 *)&num_active_cpusD.2277] : "val" "i"
iftmp.0_13, "Q" MEM[(intD.6 *)&num_active_cpusD.2277] : "memory",
"cc");
  goto <bb 9>; [100.00%]
;;    succ:       9 [always]  count:91268056 (estimated locally)
(FALLTHRU,EXECUTABLE)

;;   basic block 8
  # iftmp.0_14 = PHI <iftmp.0_4(5), iftmp.0_4(4), iftmp.0_6(6),
iftmp.0_6(3)>
  # .MEM_11 = VDEF <.MEM_3(D)>
  __asm__ __volatile__("laa %0,%2,%1
" : "old" "=d" old_8, "ptr" "=Q" MEM[(intD.6 *)&num_active_cpusD.2277]
: "val" "d" iftmp.0_14, "Q" MEM[(intD.6 *)&num_active_cpusD.2277] :
"memory", "cc");

The `asi` instruction (bb 7) can be used only with constants, and that's 
what `__builtin_constant_p` ensures here.  For other values the `laa`
instruction (bb 8) must be used.

106t.thread1 says:

about to thread: path: 2 -> 4, 4 -> 5, 5 -> 6
about to thread: path: 3 -> 4, 4 -> 5, 5 -> 6, 6 -> 7

and as a result produces this:

;;   basic block 2
  if (is_active_2(D) != 0)
    goto <bb 3>; [50.00%]
  else
    goto <bb 4>; [50.00%]

;;   basic block 3

;;   basic block 4
  # iftmp.0_13 = PHI <-1(2), 1(3)>
  # .MEM_10 = VDEF <.MEM_3(D)>
  __asm__ __volatile__("asi %0,%1
" : "ptr" "=Q" MEM[(intD.6 *)&num_active_cpusD.2277] : "val" "i"
iftmp.0_13, "Q" MEM[(intD.6 *)&num_active_cpusD.2277] : "memory",
"cc");

This code won't compile - `iftmp.0_13` is not a constant.  My idea was
that the root cause is that `i` is constant on each threading path,
but is not constant anymore when they merge.  But maybe there is a
deeper issue?  What would be a better way to fix this?

> > Fix by disallowing __builtin_constant_p on threading paths.
> > 
> > gcc/ChangeLog:
> > 
> > 2020-06-03  Ilya Leoshkevich  <iii@linux.ibm.com>
> > 
> > 	* tree-ssa-threadbackward.c
> > (thread_jumps::profitable_jump_thread_path):
> > 	Do not allow __builtin_constant_p on a threading path.
> Sorry, this is wrong.
> 
> jeff
> 
>
Jeff Law June 26, 2020, 9:34 p.m. UTC | #4
On Fri, 2020-06-26 at 23:17 +0200, Ilya Leoshkevich wrote:
> On Fri, 2020-06-26 at 14:27 -0600, Jeff Law wrote:
> > On Thu, 2020-06-04 at 02:37 +0200, Ilya Leoshkevich via Gcc-patches
> > wrote:
> > > Bootstrapped and regtested on x86_64-redhat-linux, ppc64le-redhat-
> > > linux
> > > and s390x-redhat-linux.
> > > 
> > > 
> > > Linux Kernel (specifically, drivers/leds/trigger/ledtrig-cpu.c)
> > > build
> > > with GCC 10 fails on s390 with "impossible constraint".
> > > 
> > > The problem is that jump threading makes __builtin_constant_p lie
> > > when
> > > an expression in question appears to be a constant on a threading
> > > path.
> > What do you mean by this?
> > 
> > What should happen is the path where the queried object is constant,
> > builtin_constant_p will return true.  On the other path where it is
> > not,
> > builtin_constant_p will return false.
> 
> But what if a path ends before we use the `builtin_constant_p` result?
> In the testcase from the patch we have `i = is_active ? 1 : -1`, which 
> produces two paths.  On each path `builtin_constant_p (i)` returns 
> true, however, the asm statement which depends on this being true is
> not on either path and comes only later.
> 
> Here are the relevant tree dumps (sorry, a bit verbose, I couldn't make
> them smaller):
> 
> gcc/cc1 ../gcc/testsuite/gcc.target/s390/builtin-constant-p-threading.c 
> -O2 -march=z196 -mzarch -fdump-tree-all-all
> 
> Before 106t.thread1:
> 
> ;;   basic block 2
>   if (is_active_2(D) != 0)
>     goto <bb 3>; [50.00%]
>   else
>     goto <bb 4>; [50.00%]
> 
> ;;   basic block 3
>   # iftmp.0_6 = PHI <1(2)>
>   _7 = __builtin_constant_pD.1098 (iftmp.0_6);
>   if (_7 != 0)
>     goto <bb 6>; [50.00%]
>   else
>     goto <bb 8>; [50.00%]
> 
> ;;   basic block 4
>   # iftmp.0_4 = PHI <-1(2)>
>   _12 = __builtin_constant_pD.1098 (iftmp.0_4);
>   if (_12 != 0)
>     goto <bb 5>; [50.00%]
>   else
>     goto <bb 8>; [50.00%]
> 
> ;;   basic block 5
>   if (iftmp.0_4 >= -128)
>     goto <bb 7>; [20.00%]
>   else
>     goto <bb 8>; [80.00%]
> 
> ;;   basic block 6
>   if (iftmp.0_6 <= 127)
>     goto <bb 7>; [12.00%]
>   else
>     goto <bb 8>; [88.00%]
> 
> ;;   basic block 7
>   # iftmp.0_13 = PHI <iftmp.0_6(6), iftmp.0_4(5)>
>   # .MEM_10 = VDEF <.MEM_3(D)>
>   __asm__ __volatile__("asi %0,%1
> " : "ptr" "=Q" MEM[(intD.6 *)&num_active_cpusD.2277] : "val" "i"
> iftmp.0_13, "Q" MEM[(intD.6 *)&num_active_cpusD.2277] : "memory",
> "cc");
>   goto <bb 9>; [100.00%]
> ;;    succ:       9 [always]  count:91268056 (estimated locally)
> (FALLTHRU,EXECUTABLE)
This looks broken already.  iftmp.0_13 is not a constant prior to thread1 and
thus, IIUC is not suitable for use in this instruction.  Threading just
simplified things, but it's just as broken before threading as it is after
threading.


Jeff
>
Ilya Leoshkevich June 26, 2020, 9:54 p.m. UTC | #5
On Fri, 2020-06-26 at 15:34 -0600, Jeff Law wrote:
> On Fri, 2020-06-26 at 23:17 +0200, Ilya Leoshkevich wrote:
> > On Fri, 2020-06-26 at 14:27 -0600, Jeff Law wrote:
> > > On Thu, 2020-06-04 at 02:37 +0200, Ilya Leoshkevich via Gcc-
> > > patches
> > > wrote:
> > > > Bootstrapped and regtested on x86_64-redhat-linux, ppc64le-
> > > > redhat-
> > > > linux
> > > > and s390x-redhat-linux.
> > > > 
> > > > 
> > > > Linux Kernel (specifically, drivers/leds/trigger/ledtrig-cpu.c)
> > > > build
> > > > with GCC 10 fails on s390 with "impossible constraint".
> > > > 
> > > > The problem is that jump threading makes __builtin_constant_p
> > > > lie
> > > > when
> > > > an expression in question appears to be a constant on a
> > > > threading
> > > > path.
> > > What do you mean by this?
> > > 
> > > What should happen is the path where the queried object is
> > > constant,
> > > builtin_constant_p will return true.  On the other path where it
> > > is
> > > not,
> > > builtin_constant_p will return false.
> > 
> > But what if a path ends before we use the `builtin_constant_p`
> > result?
> > In the testcase from the patch we have `i = is_active ? 1 : -1`,
> > which 
> > produces two paths.  On each path `builtin_constant_p (i)` returns 
> > true, however, the asm statement which depends on this being true
> > is
> > not on either path and comes only later.
> > 
> > Here are the relevant tree dumps (sorry, a bit verbose, I couldn't
> > make
> > them smaller):
> > 
> > gcc/cc1 ../gcc/testsuite/gcc.target/s390/builtin-constant-p-
> > threading.c 
> > -O2 -march=z196 -mzarch -fdump-tree-all-all
> > 
> > Before 106t.thread1:
> > 
> > ;;   basic block 2
> >   if (is_active_2(D) != 0)
> >     goto <bb 3>; [50.00%]
> >   else
> >     goto <bb 4>; [50.00%]
> > 
> > ;;   basic block 3
> >   # iftmp.0_6 = PHI <1(2)>
> >   _7 = __builtin_constant_pD.1098 (iftmp.0_6);
> >   if (_7 != 0)
> >     goto <bb 6>; [50.00%]
> >   else
> >     goto <bb 8>; [50.00%]
> > 
> > ;;   basic block 4
> >   # iftmp.0_4 = PHI <-1(2)>
> >   _12 = __builtin_constant_pD.1098 (iftmp.0_4);
> >   if (_12 != 0)
> >     goto <bb 5>; [50.00%]
> >   else
> >     goto <bb 8>; [50.00%]
> > 
> > ;;   basic block 5
> >   if (iftmp.0_4 >= -128)
> >     goto <bb 7>; [20.00%]
> >   else
> >     goto <bb 8>; [80.00%]
> > 
> > ;;   basic block 6
> >   if (iftmp.0_6 <= 127)
> >     goto <bb 7>; [12.00%]
> >   else
> >     goto <bb 8>; [88.00%]
> > 
> > ;;   basic block 7
> >   # iftmp.0_13 = PHI <iftmp.0_6(6), iftmp.0_4(5)>
> >   # .MEM_10 = VDEF <.MEM_3(D)>
> >   __asm__ __volatile__("asi %0,%1
> > " : "ptr" "=Q" MEM[(intD.6 *)&num_active_cpusD.2277] : "val" "i"
> > iftmp.0_13, "Q" MEM[(intD.6 *)&num_active_cpusD.2277] : "memory",
> > "cc");
> >   goto <bb 9>; [100.00%]
> > ;;    succ:       9 [always]  count:91268056 (estimated locally)
> > (FALLTHRU,EXECUTABLE)
> This looks broken already.  iftmp.0_13 is not a constant prior to
> thread1 and
> thus, IIUC is not suitable for use in this instruction.  Threading
> just
> simplified things, but it's just as broken before threading as it is
> after
> threading.

How should this work ideally?  Suppose we have:

static inline void foo (int i)
{
  if (__builtin_constant_p (i))
    asm volatile ("bar" :: "i" (i));
  else
    asm volatile ("baz" :: "d" (i));
}

First of all, this is a supported use case, right?

Then, the way I see it, is that at least for a while there must exist
trees like the ones above, regardless of whether their asm arguments
match constraints.  But ultimately dead code elimination should get rid
of the wrong ones before they reach RTL.

E.g. in the example above, the non-inlined version should have
`!__builtin_constant_p (i)`, so `bar` should not survive until
RTL.  The same for inlined `foo (1)` version's `baz`.

Is 106t.thread1 already too late for such trees?  Should some earlier
pass have cleaned them up already?

> Jeff
Jeff Law June 26, 2020, 10:04 p.m. UTC | #6
On Fri, 2020-06-26 at 23:54 +0200, Ilya Leoshkevich wrote:
> 
> How should this work ideally?  Suppose we have:
> 
> static inline void foo (int i)
> {
>   if (__builtin_constant_p (i))
>     asm volatile ("bar" :: "i" (i));
>   else
>     asm volatile ("baz" :: "d" (i));
> }
> 
> First of all, this is a supported use case, right?
Yes, this is a supported case.

> 
> Then, the way I see it, is that at least for a while there must exist
> trees like the ones above, regardless of whether their asm arguments
> match constraints.  But ultimately dead code elimination should get rid
> of the wrong ones before they reach RTL.

> 
> E.g. in the example above, the non-inlined version should have
> `!__builtin_constant_p (i)`, so `bar` should not survive until
> RTL.  The same for inlined `foo (1)` version's `baz`.
In theory yes, but there are cases where paths converge (like you've shown) where
you may have evaluated to a constant on the paths, but it's not a constant at the
convergence point.  You have to be very careful using b_c_p like this and it's
been a regular source of kernel bugs.


I'd recommend looking at the .ssa dump and walk forward from there if the .ssa
dump looks correct.

jeff
Ilya Leoshkevich June 26, 2020, 11:03 p.m. UTC | #7
On Fri, 2020-06-26 at 16:04 -0600, Jeff Law wrote:
> On Fri, 2020-06-26 at 23:54 +0200, Ilya Leoshkevich wrote:
> > How should this work ideally?  Suppose we have:
> > 
> > static inline void foo (int i)
> > {
> >   if (__builtin_constant_p (i))
> >     asm volatile ("bar" :: "i" (i));
> >   else
> >     asm volatile ("baz" :: "d" (i));
> > }
> > 
> > First of all, this is a supported use case, right?
> Yes, this is a supported case.
> 
> > Then, the way I see it, is that at least for a while there must
> > exist
> > trees like the ones above, regardless of whether their asm
> > arguments
> > match constraints.  But ultimately dead code elimination should get
> > rid
> > of the wrong ones before they reach RTL.
> > E.g. in the example above, the non-inlined version should have
> > `!__builtin_constant_p (i)`, so `bar` should not survive until
> > RTL.  The same for inlined `foo (1)` version's `baz`.
> In theory yes, but there are cases where paths converge (like you've
> shown) where
> you may have evaluated to a constant on the paths, but it's not a
> constant at the
> convergence point.  You have to be very careful using b_c_p like this
> and it's
> been a regular source of kernel bugs.

Is there something specific that a compiler user should look out for?
For example, here is the kernel code, from which the test was derived:

static inline void atomic_add(int i, atomic_t *v)
{
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
        if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
                __atomic_add_const(i, &v->counter);
                return;
        }
#endif
        __atomic_add(i, &v->counter);
}
	
It looks very straightforward - can there still be something wrong
with its usage of b_c_p?

> I'd recommend looking at the .ssa dump and walk forward from there if
> the .ssa
> dump looks correct.
> 
> jeff

Well, 021t.ssa already has:

__attribute__((gnu_inline))
__atomic_add_const (intD.6 valD.2269, intD.6 * ptrD.2270)
{
  intD.6 val_3(D) = valD.2269;
  intD.6 * ptr_2(D) = ptrD.2270;
;;   basic block 2, loop depth 0, maybe hot
;;    prev block 0, next block 1, flags: (NEW)
;;    pred:       ENTRY (FALLTHRU)
  # .MEM_4 = VDEF <.MEM_1(D)>
  __asm__ __volatile__("asi %0,%1
" : "ptr" "=Q" *ptr_2(D) : "val" "i" val_3(D), "Q" *ptr_2(D) :
"memory", "cc");
  # VUSE <.MEM_4>
  return;
;;    succ:       EXIT

}

which is, strictly speaking, not correct, because val_3(D) and
valD.2269 are not constant.  But as far as I understand we are willing
to tolerate trees like this until a certain point.

What is this point supposed to be?  If I understood you right, 
106t.thread1 is already too late - why is it so?
Marc Glisse June 27, 2020, 11:38 a.m. UTC | #8
On Sat, 27 Jun 2020, Ilya Leoshkevich via Gcc-patches wrote:

> Is there something specific that a compiler user should look out for?
> For example, here is the kernel code, from which the test was derived:
>
> static inline void atomic_add(int i, atomic_t *v)
> {
> #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
>        if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
>                __atomic_add_const(i, &v->counter);
>                return;
>        }
> #endif
>        __atomic_add(i, &v->counter);
> }
>
> It looks very straightforward - can there still be something wrong
> with its usage of b_c_p?
>
>> I'd recommend looking at the .ssa dump and walk forward from there if 
>> the .ssa dump looks correct.
>
> Well, 021t.ssa already has:
>
> __attribute__((gnu_inline))
> __atomic_add_const (intD.6 valD.2269, intD.6 * ptrD.2270)
> {
>  intD.6 val_3(D) = valD.2269;
>  intD.6 * ptr_2(D) = ptrD.2270;
> ;;   basic block 2, loop depth 0, maybe hot
> ;;    prev block 0, next block 1, flags: (NEW)
> ;;    pred:       ENTRY (FALLTHRU)
>  # .MEM_4 = VDEF <.MEM_1(D)>
>  __asm__ __volatile__("asi %0,%1
> " : "ptr" "=Q" *ptr_2(D) : "val" "i" val_3(D), "Q" *ptr_2(D) :
> "memory", "cc");
>  # VUSE <.MEM_4>
>  return;
> ;;    succ:       EXIT
>
> }
>
> which is, strictly speaking, not correct, because val_3(D) and
> valD.2269 are not constant.  But as far as I understand we are willing
> to tolerate trees like this until a certain point.
>
> What is this point supposed to be?  If I understood you right,
> 106t.thread1 is already too late - why is it so?

Small remark: shouldn't __atomic_add_const be marked with the 
always_inline attribute, since it isn't usable when it isn't inlined?
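
For concreteness, a sketch of that suggestion applied to the helper from
the testcase (illustrative only; the attribute spelling below is an
assumption and is not part of the posted patch):

static inline __attribute__ ((__always_inline__, __gnu_inline__)) void
__atomic_add_const (int val, int *ptr)
{
  /* always_inline turns any failure to inline into a compile error,
     matching the fact that the "i" constraint below only works once
     VAL has been substituted with a constant.  */
  asm volatile ("asi %[ptr],%[val]\n"
	       : [ptr] "+Q" (*ptr)
	       : [val] "i" (val)
	       : "cc", "memory");
}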
Ilya Leoshkevich June 27, 2020, 1:16 p.m. UTC | #9
On Sat, 2020-06-27 at 13:38 +0200, Marc Glisse wrote:
> On Sat, 27 Jun 2020, Ilya Leoshkevich via Gcc-patches wrote:
> 
> > Is there something specific that a compiler user should look out
> > for?
> > For example, here is the kernel code, from which the test was
> > derived:
> > 
> > static inline void atomic_add(int i, atomic_t *v)
> > {
> > #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
> >        if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
> >                __atomic_add_const(i, &v->counter);
> >                return;
> >        }
> > #endif
> >        __atomic_add(i, &v->counter);
> > }
> > 
> > It looks very straightforward - can there still be something wrong
> > with its usage of b_c_p?
> > 
> > > I'd recommend looking at the .ssa dump and walk forward from
> > > there if 
> > > the .ssa dump looks correct.
> > 
> > Well, 021t.ssa already has:
> > 
> > __attribute__((gnu_inline))
> > __atomic_add_const (intD.6 valD.2269, intD.6 * ptrD.2270)
> > {
> >  intD.6 val_3(D) = valD.2269;
> >  intD.6 * ptr_2(D) = ptrD.2270;
> > ;;   basic block 2, loop depth 0, maybe hot
> > ;;    prev block 0, next block 1, flags: (NEW)
> > ;;    pred:       ENTRY (FALLTHRU)
> >  # .MEM_4 = VDEF <.MEM_1(D)>
> >  __asm__ __volatile__("asi %0,%1
> > " : "ptr" "=Q" *ptr_2(D) : "val" "i" val_3(D), "Q" *ptr_2(D) :
> > "memory", "cc");
> >  # VUSE <.MEM_4>
> >  return;
> > ;;    succ:       EXIT
> > 
> > }
> > 
> > which is, strictly speaking, not correct, because val_3(D) and
> > valD.2269 are not constant.  But as far as I understand we are
> > willing
> > to tolerate trees like this until a certain point.
> > 
> > What is this point supposed to be?  If I understood you right,
> > 106t.thread1 is already too late - why is it so?
> 
> Small remark: shouldn't __atomic_add_const be marked with the 
> always_inline attribute, since it isn't usable when it isn't inlined?

I agree, this would be a good improvement (from both readability and
correctness perspectives).

Still, I just tried it, and unfortunately it did not help with the
issue at hand, most likely because the function is inlined either way.
021t.ssa still contains:

__attribute__((always_inline, gnu_inline))
__atomic_add_const (intD.6 valD.2269, intD.6 * ptrD.2270)
{

and threading still eliminates its inlined version.
Marc Glisse June 27, 2020, 2:50 p.m. UTC | #10
On Fri, 26 Jun 2020, Jeff Law via Gcc-patches wrote:

> In theory yes, but there are cases where paths converge (like you've shown) where
> you may have evaluated to a constant on the paths, but it's not a constant at the
> convergence point.  You have to be very careful using b_c_p like this and it's
> been a regular source of kernel bugs.
>
>
> I'd recommend looking at the .ssa dump and walk forward from there if the .ssa
> dump looks correct.

Here is the last dump before thread1 (105t.mergephi2). I don't see 
anything incorrect in it.

ledtrig_cpu (_Bool is_active)
{
   int old;
   int iftmp.0_1;
   int _5;

   <bb 2> [local count: 1073741824]:
   if (is_active_2(D) != 0)
     goto <bb 4>; [50.00%]
   else
     goto <bb 3>; [50.00%]

   <bb 3> [local count: 536870913]:

   <bb 4> [local count: 1073741824]:
   # iftmp.0_1 = PHI <1(2), -1(3)>
   _5 = __builtin_constant_p (iftmp.0_1);
   if (_5 != 0)
     goto <bb 5>; [50.00%]
   else
     goto <bb 8>; [50.00%]

   <bb 5> [local count: 536870913]:
   if (iftmp.0_1 >= -128)
     goto <bb 6>; [50.00%]
   else
     goto <bb 8>; [50.00%]

   <bb 6> [local count: 268435456]:
   if (iftmp.0_1 <= 127)
     goto <bb 7>; [34.00%]
   else
     goto <bb 8>; [66.00%]

   <bb 7> [local count: 91268056]:
   __asm__ __volatile__("asi %0,%1
" : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_1, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
   goto <bb 9>; [100.00%]

   <bb 8> [local count: 982473769]:
   __asm__ __volatile__("laa %0,%2,%1
" : "old" "=d" old_8, "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "d" iftmp.0_1, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");

   <bb 9> [local count: 1073741824]:
   return;

}

There is a single _b_c_p, the immediate asm argument is exactly the 
argument of _b_c_p, and it is in the branch protected by _b_c_p.

Now the thread1 dump, for comparison

ledtrig_cpu (_Bool is_active)
{
   int old;
   int iftmp.0_4;
   int iftmp.0_6;
   int _7;
   int _12;
   int iftmp.0_13;
   int iftmp.0_14;

   <bb 2> [local count: 1073741824]:
   if (is_active_2(D) != 0)
     goto <bb 3>; [50.00%]
   else
     goto <bb 4>; [50.00%]

   <bb 3> [local count: 536870912]:
   # iftmp.0_6 = PHI <1(2)>
   _7 = __builtin_constant_p (iftmp.0_6);
   if (_7 != 0)
     goto <bb 6>; [50.00%]
   else
     goto <bb 8>; [50.00%]

   <bb 4> [local count: 536870912]:
   # iftmp.0_4 = PHI <-1(2)>
   _12 = __builtin_constant_p (iftmp.0_4);
   if (_12 != 0)
     goto <bb 5>; [50.00%]
   else
     goto <bb 8>; [50.00%]

   <bb 5> [local count: 268435456]:
   if (iftmp.0_4 >= -128)
     goto <bb 7>; [20.00%]
   else
     goto <bb 8>; [80.00%]

   <bb 6> [local count: 214748364]:
   if (iftmp.0_6 <= 127)
     goto <bb 7>; [12.00%]
   else
     goto <bb 8>; [88.00%]

   <bb 7> [local count: 91268056]:
   # iftmp.0_13 = PHI <iftmp.0_6(6), iftmp.0_4(5)>
   __asm__ __volatile__("asi %0,%1
" : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_13, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
   goto <bb 9>; [100.00%]

   <bb 8> [local count: 982473769]:
   # iftmp.0_14 = PHI <iftmp.0_4(5), iftmp.0_4(4), iftmp.0_6(6), iftmp.0_6(3)>
   __asm__ __volatile__("laa %0,%2,%1
" : "old" "=d" old_8, "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "d" iftmp.0_14, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");

   <bb 9> [local count: 1073741824]:
   return;

}

Thread1 decides to separate the paths is_active and !is_active 
(surprisingly, for one it optimizes out the comparison <= 127 and for the 
other the comparison >= -128, while it could optimize both in both cases). 
And it decides to converge after the comparisons, but before the asm.

What the pass did does seem to hurt. It looks like if we duplicate _b_c_p, 
we may need to duplicate far enough to include all the blocks dominated by 
_b_c_p==true (including the asm, here). Otherwise, any _b_c_p can be 
optimized to true, because for a boolean

b is the same as b ? true : false
__builtin_constant_p(b ? true : false) would be the same as b ? 
__builtin_constant_p(true) : __builtin_constant_p(false), i.e. true.

It is too bad we don't have any optimization pass using ranges between IPA 
and thread1, that would have gotten rid of the comparisons, and hence the 
temptation to thread. Adding always_inline on atomic_add (or flatten on 
the caller) does help: EVRP removes the comparisons.

Do you see a way forward without changing what thread1 does or declaring 
the testcase as unsupported?
Richard Biener June 29, 2020, 6:58 a.m. UTC | #11
On Sat, Jun 27, 2020 at 4:52 PM Marc Glisse <marc.glisse@inria.fr> wrote:
>
> On Fri, 26 Jun 2020, Jeff Law via Gcc-patches wrote:
>
> > In theory yes, but there are cases where paths converge (like you've shown) where
> > you may have evaluated to a constant on the paths, but it's not a constant at the
> > convergence point.  You have to be very careful using b_c_p like this and it's
> > been a regular source of kernel bugs.
> >
> >
> > I'd recommend looking at the .ssa dump and walk forward from there if the .ssa
> > dump looks correct.
>
> Here is the last dump before thread1 (105t.mergephi2). I don't see
> anything incorrect in it.
>
> ledtrig_cpu (_Bool is_active)
> {
>    int old;
>    int iftmp.0_1;
>    int _5;
>
>    <bb 2> [local count: 1073741824]:
>    if (is_active_2(D) != 0)
>      goto <bb 4>; [50.00%]
>    else
>      goto <bb 3>; [50.00%]
>
>    <bb 3> [local count: 536870913]:
>
>    <bb 4> [local count: 1073741824]:
>    # iftmp.0_1 = PHI <1(2), -1(3)>
>    _5 = __builtin_constant_p (iftmp.0_1);
>    if (_5 != 0)
>      goto <bb 5>; [50.00%]
>    else
>      goto <bb 8>; [50.00%]
>
>    <bb 5> [local count: 536870913]:
>    if (iftmp.0_1 >= -128)
>      goto <bb 6>; [50.00%]
>    else
>      goto <bb 8>; [50.00%]
>
>    <bb 6> [local count: 268435456]:
>    if (iftmp.0_1 <= 127)
>      goto <bb 7>; [34.00%]
>    else
>      goto <bb 8>; [66.00%]
>
>    <bb 7> [local count: 91268056]:
>    __asm__ __volatile__("asi %0,%1
> " : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_1, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
>    goto <bb 9>; [100.00%]
>
>    <bb 8> [local count: 982473769]:
>    __asm__ __volatile__("laa %0,%2,%1
> " : "old" "=d" old_8, "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "d" iftmp.0_1, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
>
>    <bb 9> [local count: 1073741824]:
>    return;
>
> }
>
> There is a single _b_c_p, the immediate asm argument is exactly the
> argument of _b_c_p, and it is in the branch protected by _b_c_p.
>
> Now the thread1 dump, for comparison
>
> ledtrig_cpu (_Bool is_active)
> {
>    int old;
>    int iftmp.0_4;
>    int iftmp.0_6;
>    int _7;
>    int _12;
>    int iftmp.0_13;
>    int iftmp.0_14;
>
>    <bb 2> [local count: 1073741824]:
>    if (is_active_2(D) != 0)
>      goto <bb 3>; [50.00%]
>    else
>      goto <bb 4>; [50.00%]
>
>    <bb 3> [local count: 536870912]:
>    # iftmp.0_6 = PHI <1(2)>
>    _7 = __builtin_constant_p (iftmp.0_6);
>    if (_7 != 0)
>      goto <bb 6>; [50.00%]
>    else
>      goto <bb 8>; [50.00%]
>
>    <bb 4> [local count: 536870912]:
>    # iftmp.0_4 = PHI <-1(2)>
>    _12 = __builtin_constant_p (iftmp.0_4);
>    if (_12 != 0)
>      goto <bb 5>; [50.00%]
>    else
>      goto <bb 8>; [50.00%]
>
>    <bb 5> [local count: 268435456]:
>    if (iftmp.0_4 >= -128)
>      goto <bb 7>; [20.00%]
>    else
>      goto <bb 8>; [80.00%]
>
>    <bb 6> [local count: 214748364]:
>    if (iftmp.0_6 <= 127)
>      goto <bb 7>; [12.00%]
>    else
>      goto <bb 8>; [88.00%]
>
>    <bb 7> [local count: 91268056]:
>    # iftmp.0_13 = PHI <iftmp.0_6(6), iftmp.0_4(5)>
>    __asm__ __volatile__("asi %0,%1
> " : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_13, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
>    goto <bb 9>; [100.00%]
>
>    <bb 8> [local count: 982473769]:
>    # iftmp.0_14 = PHI <iftmp.0_4(5), iftmp.0_4(4), iftmp.0_6(6), iftmp.0_6(3)>
>    __asm__ __volatile__("laa %0,%2,%1
> " : "old" "=d" old_8, "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "d" iftmp.0_14, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
>
>    <bb 9> [local count: 1073741824]:
>    return;
>
> }
>
> Thread1 decides to separate the paths is_active and !is_active
> (surprisingly, for one it optimizes out the comparison <= 127 and for the
> other the comparison >= -128, while it could optimize both in both cases).
> And it decides to converge after the comparisons, but before the asm.
>
> What the pass did does seem to hurt. It looks like if we duplicate _b_c_p,
> we may need to duplicate far enough to include all the blocks dominated by
> _b_c_p==true (including the asm, here). Otherwise, any _b_c_p can be
> optimized to true, because for a boolean
>
> b is the same as b ? true : false
> __builtin_constant_p(b ? true : false) would be the same as b ?
> __builtin_constant_p(true) : __builtin_constant_p(false), i.e. true.
>
> It is too bad we don't have any optimization pass using ranges between IPA
> and thread1, that would have gotten rid of the comparisons, and hence the
> temptation to thread. Adding always_inline on atomic_add (or flatten on
> the caller) does help: EVRP removes the comparisons.
>
> Do you see a way forward without changing what thread1 does or declaring
> the testcase as unsupported?

Most of the cases I've seen involve transforms that make _b_c_p constant
on one path and then introduce a new PHI merging two _b_c_p values
to be then tested in a not simplified condition.  I'm not sure how to fend
off jump threading (yeah, it's nearly always jump threading doing this...)
doing this but certainly the easiest way would be to simply disallow
[jump threading] from duplicating _b_c_p calls.

Or fold _b_c_p even earlier (though I definitely saw early backwards
jump threading mess up such a case).

Richard.

> --
> Marc Glisse
Jakub Jelinek June 29, 2020, 10:20 a.m. UTC | #12
On Mon, Jun 29, 2020 at 08:58:57AM +0200, Richard Biener via Gcc-patches wrote:
> Most of the cases I've seen involve transforms that make _b_c_p constant
> on one path and then introduce a new PHI merging two _b_c_p values
> to be then tested in a not simplified condition.  I'm not sure how to fend
> off jump threading (yeah, it's nearly always jump threading doing this...)
> doing this but certainly the easiest way would be to simply disallow
> [jump threading] from duplicating _b_c_p calls.
> 
> Or fold _b_c_p even earlier (though I definitely saw early backwards
> jump threading mess up such a case).

Yeah, perhaps disallow duplicating bcp calls in the threading before IPA
(for bcp to work well, we want to preserve it until inlining had a chance to
propagate constants in there) and after IPA allow that, but just fold them
into 0 during that on both paths.

	Jakub
Jeff Law July 1, 2020, 7:50 p.m. UTC | #13
On Sat, 2020-06-27 at 01:03 +0200, Ilya Leoshkevich wrote:
> On Fri, 2020-06-26 at 16:04 -0600, Jeff Law wrote:
> > On Fri, 2020-06-26 at 23:54 +0200, Ilya Leoshkevich wrote:
> > > How should this work ideally?  Suppose we have:
> > > 
> > > static inline void foo (int i)
> > > {
> > >   if (__builtin_constant_p (i))
> > >     asm volatile ("bar" :: "i" (i));
> > >   else
> > >     asm volatile ("baz" :: "d" (i));
> > > }
> > > 
> > > First of all, this is a supported use case, right?
> > Yes, this is a supported case.
> > 
> > > Then, the way I see it, is that at least for a while there must
> > > exist
> > > trees like the ones above, regardless of whether their asm
> > > arguments
> > > match constraints.  But ultimately dead code elimination should get
> > > rid
> > > of the wrong ones before they reach RTL.
> > > E.g. in the example above, the non-inlined version should have
> > > `!__builtin_constant_p (i)`, so `bar` should not survive until
> > > RTL.  The same for inlined `foo (1)` version's `baz`.
> > In theory yes, but there are cases where paths converge (like you've
> > shown) where
> > you may have evaluated to a constant on the paths, but it's not a
> > constant at the
> > convergence point.  You have to be very careful using b_c_p like this
> > and it's
> > been a regular source of kernel bugs.
> 
> Is there something specific that a compiler user should look out for?
> For example, here is the kernel code, from which the test was derived:
> 
> static inline void atomic_add(int i, atomic_t *v)
> {
> #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
>         if (__builtin_constant_p(i) && (i > -129) && (i < 128)) {
>                 __atomic_add_const(i, &v->counter);
>                 return;
>         }
> #endif
>         __atomic_add(i, &v->counter);
> }
> 	
> It looks very straightforward - can there still be something wrong
> with its usage of b_c_p?
> 
> > I'd recommend looking at the .ssa dump and walk forward from there if
> > the .ssa
> > dump looks correct.
> > 
> > jeff
> 
> Well, 021t.ssa already has:
> 
> __attribute__((gnu_inline))
> __atomic_add_const (intD.6 valD.2269, intD.6 * ptrD.2270)
> {
>   intD.6 val_3(D) = valD.2269;
>   intD.6 * ptr_2(D) = ptrD.2270;
> ;;   basic block 2, loop depth 0, maybe hot
> ;;    prev block 0, next block 1, flags: (NEW)
> ;;    pred:       ENTRY (FALLTHRU)
>   # .MEM_4 = VDEF <.MEM_1(D)>
>   __asm__ __volatile__("asi %0,%1
> " : "ptr" "=Q" *ptr_2(D) : "val" "i" val_3(D), "Q" *ptr_2(D) :
> "memory", "cc");
>   # VUSE <.MEM_4>
>   return;
> ;;    succ:       EXIT
> 
> }
> 
> which is, strictly speaking, not correct, because val_3(D) and
> valD.2269 are not constant.  But as far as I understand we are willing
> to tolerate trees like this until a certain point.
> 
> What is this point supposed to be?  If I understood you right, 
> 106t.thread1 is already too late - why is it so?
Well, you need to show more context to know if it's strictly OK.

Here's what I get with a cross in the .ssa dump:

;;   basic block 2, loop depth 0, maybe hot
;;    prev block 0, next block 3, flags: (NEW)
;;    pred:       ENTRY (FALLTHRU)
  _1 = __builtin_constant_p (i_5(D));
  if (_1 != 0)
    goto <bb 3>; [INV]
  else
    goto <bb 6>; [INV]
;;    succ:       3 (TRUE_VALUE)
;;                6 (FALSE_VALUE)

;;   basic block 3, loop depth 0, maybe hot
;;    prev block 2, next block 4, flags: (NEW)
;;    pred:       2 (TRUE_VALUE)
  if (i_5(D) >= -128)
    goto <bb 4>; [INV]
  else
    goto <bb 6>; [INV]
;;    succ:       4 (TRUE_VALUE)
;;                6 (FALSE_VALUE)

;;   basic block 4, loop depth 0, maybe hot
;;    prev block 3, next block 5, flags: (NEW)
;;    pred:       3 (TRUE_VALUE)
  if (i_5(D) <= 127)
    goto <bb 5>; [INV]
  else
    goto <bb 6>; [INV]
;;    succ:       5 (TRUE_VALUE)
;;                6 (FALSE_VALUE)

;;   basic block 5, loop depth 0, maybe hot
;;    prev block 4, next block 6, flags: (NEW)
;;    pred:       4 (TRUE_VALUE)
  _2 = &v_6(D)->counter;
  __atomic_add_const (i_5(D), _2);
  // predicted unlikely by early return (on trees) predictor.
  goto <bb 7>; [INV]
;;    succ:       7 (FALLTHRU)

And that's OK.  In particular we have the _b_c_p call on i_5 and the only path to
the call to __atomic_add_const is predicated on the result of that b_c_p call. 
Furthermore, we use i_5 in the call to atomic_add_const.

Contrast that to the .thread1 dump:


;;   basic block 3, loop depth 0
;;    pred:       2
  # iftmp.0_6 = PHI <1(2)>
  _7 = __builtin_constant_p (iftmp.0_6);
  if (_7 != 0)
    goto <bb 6>; [50.00%]
  else
    goto <bb 8>; [50.00%]
;;    succ:       6
;;                8

;;   basic block 4, loop depth 0
;;    pred:       2
  # iftmp.0_4 = PHI <-1(2)>
  _12 = __builtin_constant_p (iftmp.0_4);
  if (_12 != 0)
    goto <bb 5>; [50.00%]
  else
    goto <bb 8>; [50.00%]
;;    succ:       5
;;                8

;;   basic block 5, loop depth 0
;;    pred:       4
  if (iftmp.0_4 >= -128)
    goto <bb 7>; [20.00%]
  else
    goto <bb 8>; [80.00%]
;;    succ:       7
;;                8

;;   basic block 6, loop depth 0
;;    pred:       3
  if (iftmp.0_6 <= 127)
    goto <bb 7>; [12.00%]
  else
    goto <bb 8>; [88.00%]
;;    succ:       7
;;                8

;;   basic block 7, loop depth 0
;;    pred:       6
;;                5
  # iftmp.0_13 = PHI <iftmp.0_6(6), iftmp.0_4(5)>
  __asm__ __volatile__("asi %0,%1
" : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_13, "Q" MEM[(int
*)&num_active_cpus] : "memory", "cc");
  goto <bb 9>; [100.00%]

In the (now inlined) call to atomic_add_const we use iftmp.0_13 which is set from
a PHI node and we never test iftmp.0_13 via b_c_p.

I think that's the key difference.

Having said that, and having a cross compiler and a full set of dumps I can see
that things are fine before .thread1.

Here's the relevant parts of the .mergephi2 dump:

;;   basic block 4, loop depth 0
;;    pred:       2
;;                3
  # iftmp.0_1 = PHI <1(2), -1(3)>
  _5 = __builtin_constant_p (iftmp.0_1);
  if (_5 != 0)
    goto <bb 5>; [50.00%]
  else
    goto <bb 8>; [50.00%]
;;    succ:       5
;;                8

;;   basic block 5, loop depth 0
;;    pred:       4
  if (iftmp.0_1 >= -128)
    goto <bb 6>; [50.00%]
  else
    goto <bb 8>; [50.00%]
;;    succ:       6
;;                8

;;   basic block 6, loop depth 0
;;    pred:       5
  if (iftmp.0_1 <= 127)
    goto <bb 7>; [34.00%]
  else
    goto <bb 8>; [66.00%]
;;    succ:       7
;;                8

;;   basic block 7, loop depth 0
;;    pred:       6
  __asm__ __volatile__("asi %0,%1
" : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_1, "Q" MEM[(int
*)&num_active_cpus] : "memory", "cc");
  goto <bb 9>; [100.00%]
;;    succ:       9

So we use iftmp.0_1 in bb7, and the path to bb7 is predicated on b_c_p (iftmp.0_1), so
we're fine.

So, I agree that backwards jump threading mucked this up.  The question becomes
what can/should we do about it.


Disabling the transformation in the backwards jump threader like you've suggested
just papers over the problem.  I could easily see other passes making similar
transformations.  And there's this nugget in tree-ssa-dom.c:


         /* Resolve __builtin_constant_p.  If it hasn't been folded to
            integer_one_node by now, it's fairly certain that the value
            simply isn't constant.  */

Which hints that maybe the right thing to do is just resolve these calls earlier
before things start mucking around too much with the CFG.

The other alternative would be to reject transformations which register an
SSA_NAME for incremental updating when that SSA_NAME is used in a b_c_p call. 
Jump threading (forward and backward) are probably the worst offenders here, but
there could well be others.

We should probably seriously consider verifying that names registered for updates
aren't referenced by a b_c_p call.  I wouldn't be surprised if adding such a
check would turn up all kinds of violations.

Jeff
Jeff Law July 1, 2020, 8:02 p.m. UTC | #14
On Mon, 2020-06-29 at 08:58 +0200, Richard Biener wrote:
> On Sat, Jun 27, 2020 at 4:52 PM Marc Glisse <marc.glisse@inria.fr> wrote:
> > On Fri, 26 Jun 2020, Jeff Law via Gcc-patches wrote:
> > 
> > > In theory yes, but there are cases where paths converge (like you've shown) where
> > > you may have evaluated to a constant on the paths, but it's not a constant at the
> > > convergence point.  You have to be very careful using b_c_p like this and it's
> > > been a regular source of kernel bugs.
> > > 
> > > 
> > > I'd recommend looking at the .ssa dump and walk forward from there if the .ssa
> > > dump looks correct.
> > 
> > Here is the last dump before thread1 (105t.mergephi2). I don't see
> > anything incorrect in it.
> > 
> > ledtrig_cpu (_Bool is_active)
> > {
> >    int old;
> >    int iftmp.0_1;
> >    int _5;
> > 
> >    <bb 2> [local count: 1073741824]:
> >    if (is_active_2(D) != 0)
> >      goto <bb 4>; [50.00%]
> >    else
> >      goto <bb 3>; [50.00%]
> > 
> >    <bb 3> [local count: 536870913]:
> > 
> >    <bb 4> [local count: 1073741824]:
> >    # iftmp.0_1 = PHI <1(2), -1(3)>
> >    _5 = __builtin_constant_p (iftmp.0_1);
> >    if (_5 != 0)
> >      goto <bb 5>; [50.00%]
> >    else
> >      goto <bb 8>; [50.00%]
> > 
> >    <bb 5> [local count: 536870913]:
> >    if (iftmp.0_1 >= -128)
> >      goto <bb 6>; [50.00%]
> >    else
> >      goto <bb 8>; [50.00%]
> > 
> >    <bb 6> [local count: 268435456]:
> >    if (iftmp.0_1 <= 127)
> >      goto <bb 7>; [34.00%]
> >    else
> >      goto <bb 8>; [66.00%]
> > 
> >    <bb 7> [local count: 91268056]:
> >    __asm__ __volatile__("asi %0,%1
> > " : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_1, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
> >    goto <bb 9>; [100.00%]
> > 
> >    <bb 8> [local count: 982473769]:
> >    __asm__ __volatile__("laa %0,%2,%1
> > " : "old" "=d" old_8, "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "d" iftmp.0_1, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
> > 
> >    <bb 9> [local count: 1073741824]:
> >    return;
> > 
> > }
> > 
> > There is a single _b_c_p, the immediate asm argument is exactly the
> > argument of _b_c_p, and it is in the branch protected by _b_c_p.
> > 
> > Now the thread1 dump, for comparison
> > 
> > ledtrig_cpu (_Bool is_active)
> > {
> >    int old;
> >    int iftmp.0_4;
> >    int iftmp.0_6;
> >    int _7;
> >    int _12;
> >    int iftmp.0_13;
> >    int iftmp.0_14;
> > 
> >    <bb 2> [local count: 1073741824]:
> >    if (is_active_2(D) != 0)
> >      goto <bb 3>; [50.00%]
> >    else
> >      goto <bb 4>; [50.00%]
> > 
> >    <bb 3> [local count: 536870912]:
> >    # iftmp.0_6 = PHI <1(2)>
> >    _7 = __builtin_constant_p (iftmp.0_6);
> >    if (_7 != 0)
> >      goto <bb 6>; [50.00%]
> >    else
> >      goto <bb 8>; [50.00%]
> > 
> >    <bb 4> [local count: 536870912]:
> >    # iftmp.0_4 = PHI <-1(2)>
> >    _12 = __builtin_constant_p (iftmp.0_4);
> >    if (_12 != 0)
> >      goto <bb 5>; [50.00%]
> >    else
> >      goto <bb 8>; [50.00%]
> > 
> >    <bb 5> [local count: 268435456]:
> >    if (iftmp.0_4 >= -128)
> >      goto <bb 7>; [20.00%]
> >    else
> >      goto <bb 8>; [80.00%]
> > 
> >    <bb 6> [local count: 214748364]:
> >    if (iftmp.0_6 <= 127)
> >      goto <bb 7>; [12.00%]
> >    else
> >      goto <bb 8>; [88.00%]
> > 
> >    <bb 7> [local count: 91268056]:
> >    # iftmp.0_13 = PHI <iftmp.0_6(6), iftmp.0_4(5)>
> >    __asm__ __volatile__("asi %0,%1
> > " : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_13, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
> >    goto <bb 9>; [100.00%]
> > 
> >    <bb 8> [local count: 982473769]:
> >    # iftmp.0_14 = PHI <iftmp.0_4(5), iftmp.0_4(4), iftmp.0_6(6), iftmp.0_6(3)>
> >    __asm__ __volatile__("laa %0,%2,%1
> > " : "old" "=d" old_8, "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "d" iftmp.0_14, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
> > 
> >    <bb 9> [local count: 1073741824]:
> >    return;
> > 
> > }
> > 
> > Thread1 decides to separate the paths is_active and !is_active
> > (surprisingly, for one it optimizes out the comparison <= 127 and for the
> > other the comparison >= -128, while it could optimize both in both cases).
> > And it decides to converge after the comparisons, but before the asm.
> > 
> > What the pass did does seem to hurt. It looks like if we duplicate _b_c_p,
> > we may need to duplicate far enough to include all the blocks dominated by
> > _b_c_p==true (including the asm, here). Otherwise, any _b_c_p can be
> > optimized to true, because for a boolean
> > 
> > b is the same as b ? true : false
> > __builtin_constant_p(b ? true : false) would be the same as b ?
> > __builtin_constant_p(true) : __builtin_constant_p(false), i.e. true.
> > 
> > It is too bad we don't have any optimization pass using ranges between IPA
> > and thread1, that would have gotten rid of the comparisons, and hence the
> > temptation to thread. Adding always_inline on atomic_add (or flatten on
> > the caller) does help: EVRP removes the comparisons.
> > 
> > Do you see a way forward without changing what thread1 does or declaring
> > the testcase as unsupported?
> 
> Most of the cases I've seen involve transforms that make _b_c_p constant
> on one path and then introduce a new PHI merging two _b_c_p values
> to be then tested in a not simplified condition.  I'm not sure how to fend
> off jump threading (yeah, it's nearly always jump threading doing this...)
> doing this but certainly the easiest way would be to simply disallow
> [jump threading] from duplicating _b_c_p calls.
> 
> Or fold _b_c_p even earlier (though I definitely saw early backwards
> jump threading mess up such a case).
I'm not 100% sure it's duplication of the b_c_p call that's the problem.  Or
perhaps better stated, I think anything which results in an incremental update
with an SSA_NAME that's used in a b_c_p call is going to potentially be
problematical.  I suspect those are strongly correlated though.
Jeff


> 
> Richard.
> 
> > --
> > Marc Glisse
Richard Biener July 2, 2020, 8:29 a.m. UTC | #15
On Wed, Jul 1, 2020 at 10:02 PM Jeff Law <law@redhat.com> wrote:
>
> On Mon, 2020-06-29 at 08:58 +0200, Richard Biener wrote:
> > On Sat, Jun 27, 2020 at 4:52 PM Marc Glisse <marc.glisse@inria.fr> wrote:
> > > On Fri, 26 Jun 2020, Jeff Law via Gcc-patches wrote:
> > >
> > > > In theory yes, but there are cases where paths converge (like you've shown) where
> > > > you may have evaluated to a constant on the paths, but it's not a constant at the
> > > > convergence point.  You have to be very careful using b_c_p like this and it's
> > > > been a regular source of kernel bugs.
> > > >
> > > >
> > > > I'd recommend looking at the .ssa dump and walk forward from there if the .ssa
> > > > dump looks correct.
> > >
> > > Here is the last dump before thread1 (105t.mergephi2). I don't see
> > > anything incorrect in it.
> > >
> > > ledtrig_cpu (_Bool is_active)
> > > {
> > >    int old;
> > >    int iftmp.0_1;
> > >    int _5;
> > >
> > >    <bb 2> [local count: 1073741824]:
> > >    if (is_active_2(D) != 0)
> > >      goto <bb 4>; [50.00%]
> > >    else
> > >      goto <bb 3>; [50.00%]
> > >
> > >    <bb 3> [local count: 536870913]:
> > >
> > >    <bb 4> [local count: 1073741824]:
> > >    # iftmp.0_1 = PHI <1(2), -1(3)>
> > >    _5 = __builtin_constant_p (iftmp.0_1);
> > >    if (_5 != 0)
> > >      goto <bb 5>; [50.00%]
> > >    else
> > >      goto <bb 8>; [50.00%]
> > >
> > >    <bb 5> [local count: 536870913]:
> > >    if (iftmp.0_1 >= -128)
> > >      goto <bb 6>; [50.00%]
> > >    else
> > >      goto <bb 8>; [50.00%]
> > >
> > >    <bb 6> [local count: 268435456]:
> > >    if (iftmp.0_1 <= 127)
> > >      goto <bb 7>; [34.00%]
> > >    else
> > >      goto <bb 8>; [66.00%]
> > >
> > >    <bb 7> [local count: 91268056]:
> > >    __asm__ __volatile__("asi %0,%1
> > > " : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_1, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
> > >    goto <bb 9>; [100.00%]
> > >
> > >    <bb 8> [local count: 982473769]:
> > >    __asm__ __volatile__("laa %0,%2,%1
> > > " : "old" "=d" old_8, "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "d" iftmp.0_1, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
> > >
> > >    <bb 9> [local count: 1073741824]:
> > >    return;
> > >
> > > }
> > >
> > > There is a single _b_c_p, the immediate asm argument is exactly the
> > > argument of _b_c_p, and it is in the branch protected by _b_c_p.
> > >
> > > Now the thread1 dump, for comparison
> > >
> > > ledtrig_cpu (_Bool is_active)
> > > {
> > >    int old;
> > >    int iftmp.0_4;
> > >    int iftmp.0_6;
> > >    int _7;
> > >    int _12;
> > >    int iftmp.0_13;
> > >    int iftmp.0_14;
> > >
> > >    <bb 2> [local count: 1073741824]:
> > >    if (is_active_2(D) != 0)
> > >      goto <bb 3>; [50.00%]
> > >    else
> > >      goto <bb 4>; [50.00%]
> > >
> > >    <bb 3> [local count: 536870912]:
> > >    # iftmp.0_6 = PHI <1(2)>
> > >    _7 = __builtin_constant_p (iftmp.0_6);
> > >    if (_7 != 0)
> > >      goto <bb 6>; [50.00%]
> > >    else
> > >      goto <bb 8>; [50.00%]
> > >
> > >    <bb 4> [local count: 536870912]:
> > >    # iftmp.0_4 = PHI <-1(2)>
> > >    _12 = __builtin_constant_p (iftmp.0_4);
> > >    if (_12 != 0)
> > >      goto <bb 5>; [50.00%]
> > >    else
> > >      goto <bb 8>; [50.00%]
> > >
> > >    <bb 5> [local count: 268435456]:
> > >    if (iftmp.0_4 >= -128)
> > >      goto <bb 7>; [20.00%]
> > >    else
> > >      goto <bb 8>; [80.00%]
> > >
> > >    <bb 6> [local count: 214748364]:
> > >    if (iftmp.0_6 <= 127)
> > >      goto <bb 7>; [12.00%]
> > >    else
> > >      goto <bb 8>; [88.00%]
> > >
> > >    <bb 7> [local count: 91268056]:
> > >    # iftmp.0_13 = PHI <iftmp.0_6(6), iftmp.0_4(5)>
> > >    __asm__ __volatile__("asi %0,%1
> > > " : "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "i" iftmp.0_13, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
> > >    goto <bb 9>; [100.00%]
> > >
> > >    <bb 8> [local count: 982473769]:
> > >    # iftmp.0_14 = PHI <iftmp.0_4(5), iftmp.0_4(4), iftmp.0_6(6), iftmp.0_6(3)>
> > >    __asm__ __volatile__("laa %0,%2,%1
> > > " : "old" "=d" old_8, "ptr" "=Q" MEM[(int *)&num_active_cpus] : "val" "d" iftmp.0_14, "Q" MEM[(int *)&num_active_cpus] : "memory", "cc");
> > >
> > >    <bb 9> [local count: 1073741824]:
> > >    return;
> > >
> > > }
> > >
> > > Thread1 decides to separate the paths is_active and !is_active
> > > (surprisingly, for one it optimizes out the comparison <= 127 and for the
> > > other the comparison >= -128, while it could optimize both in both cases).
> > > And it decides to converge after the comparisons, but before the asm.
> > >
> > > What the pass did does seem to hurt. It looks like if we duplicate _b_c_p,
> > > we may need to duplicate far enough to include all the blocks dominated by
> > > _b_c_p==true (including the asm, here). Otherwise, any _b_c_p can be
> > > optimized to true, because for a boolean
> > >
> > > b is the same as b ? true : false
> > > __builtin_constant_p(b ? true : false) would be the same as b ?
> > > __builtin_constant_p(true) : __builtin_constant_p(false), i.e. true.
> > >
> > > It is too bad we don't have any optimization pass using ranges between IPA
> > > and thread1, that would have gotten rid of the comparisons, and hence the
> > > temptation to thread. Adding always_inline on atomic_add (or flatten on
> > > the caller) does help: EVRP removes the comparisons.
> > >
> > > Do you see a way forward without changing what thread1 does or declaring
> > > the testcase as unsupported?
> >
> > Most of the cases I've seen involve transforms that make _b_c_p constant
> > on one path and then introduce a new PHI merging two _b_c_p values
> > to be then tested in a not simplified condition.  I'm not sure how to fend
> > off jump threading (yeah, it's nearly always jump threading doing this...)
> > doing this but certainly the easiest way would be to simply disallow
> > [jump threading] from duplicating _b_c_p calls.
> >
> > Or fold _b_c_p even earlier (though I definitely saw early backwards
> > jump threading mess up such a case).
> I'm not 100% sure it's duplication of the b_c_p call that's the problem.  Or
> perhaps better stated, I think anything which results in an incremental update
> with an SSA_NAME that's used in a b_c_p call is going to potentially be
> problematical.  I suspect those are strong correlated though.

The issue is indeed not duplicating b_c_p in itself but that we stop threading
before reaching what is ultimately guarded in the original source.  Usually
the obvious cases are like the one in this thread:

  if (__bultin_constant_p (i) && i > -128 && i < 127)
    ...
  else
    ...

and if we isolate parts of the condition and then can resolve _b_c_p to one
we get inconsistencies.  So yes, resolving _b_c_p earlier - if only during
threading itself as in the patch - will likely hide the problem.  Because it's
not always as easy as above to see what the "whole" condition that
ultimately guards what was intended to be guarded was...

Thus the patch indeed looks appealing to me since we likely cannot
resolve _b_c_p during early optimizations in all cases.  In the end
for the example we lose optimization doing that since we should be
able to resolve this fully since both -1 and 1 are in the supported
range for atomic_add_const ...

One thing we might encourage people to do is re-formulate those
conditions as

 if (i > -128 && i < 127 && __builtin_constant_p (i))

which helps (but I've seen cases where doing that would be awkward).
Maybe

  if (__builtin_constant_p (__builtin_constant_p (i) && i > -128 && i < 127))

would be even better.
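
For illustration, a sketch of the helper from this thread's testcase
rewritten along those lines, keeping its original bounds (untested; whether
this is enough to keep the backwards threader from splitting the
__builtin_constant_p guard is an assumption, not something verified here):

static inline __attribute__ ((__gnu_inline__)) void
atomic_add (int i, atomic_t *v)
{
  /* Range checks first, __builtin_constant_p last, as suggested above.  */
  if ((i > -129) && (i < 128) && __builtin_constant_p (i))
    {
      __atomic_add_const (i, &v->counter);
      return;
    }
  __atomic_add (i, &v->counter);
}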

Richard.

> Jeff
>
>
> >
> > Richard.
> >
> > > --
> > > Marc Glisse
>

Patch

diff --git a/gcc/testsuite/gcc.target/s390/builtin-constant-p-threading.c b/gcc/testsuite/gcc.target/s390/builtin-constant-p-threading.c
new file mode 100644
index 00000000000..5f0acdce0b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/builtin-constant-p-threading.c
@@ -0,0 +1,46 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=z196 -mzarch" } */
+
+typedef struct
+{
+  int counter;
+} atomic_t;
+
+static inline __attribute__ ((__gnu_inline__)) int
+__atomic_add (int val, int *ptr)
+{
+  int old;
+  asm volatile("laa %[old],%[val],%[ptr]\n"
+	       : [old] "=d" (old), [ptr] "+Q"(*ptr)
+	       : [val] "d" (val)
+	       : "cc", "memory");
+  return old;
+}
+
+static inline __attribute__ ((__gnu_inline__)) void
+__atomic_add_const (int val, int *ptr)
+{
+  asm volatile("asi %[ptr],%[val]\n"
+	       : [ptr] "+Q" (*ptr)
+	       : [val] "i" (val)
+	       : "cc", "memory");
+}
+
+static inline __attribute__ ((__gnu_inline__)) void
+atomic_add (int i, atomic_t *v)
+{
+  if (__builtin_constant_p (i) && (i > -129) && (i < 128))
+    {
+      __atomic_add_const (i, &v->counter);
+      return;
+    }
+  __atomic_add (i, &v->counter);
+}
+
+static atomic_t num_active_cpus = { (0) };
+
+void
+ledtrig_cpu (_Bool is_active)
+{
+  atomic_add (is_active ? 1 : -1, &num_active_cpus);
+}
diff --git a/gcc/tree-ssa-threadbackward.c b/gcc/tree-ssa-threadbackward.c
index 327628f1662..668932f6d85 100644
--- a/gcc/tree-ssa-threadbackward.c
+++ b/gcc/tree-ssa-threadbackward.c
@@ -259,8 +259,13 @@  thread_jumps::profitable_jump_thread_path (basic_block bbi, tree name,
 	       !gsi_end_p (gsi);
 	       gsi_next_nondebug (&gsi))
 	    {
+	      /* Do not allow OpenACC loop markers and __builtin_constant_p on
+		 a threading path.  The latter is disallowed, because an
+		 expression being constant on a threading path does not mean it
+		 can be considered constant in the entire function.  */
 	      gimple *stmt = gsi_stmt (gsi);
-	      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
+	      if (gimple_call_internal_p (stmt, IFN_UNIQUE)
+		  || gimple_call_builtin_p (stmt, BUILT_IN_CONSTANT_P))
 		{
 		  m_path.pop ();
 		  return NULL;