diff mbox series

[PR,rtl/optimization/98694] Fix incorrect optimization by cprop_hardreg.

Message ID CAMZc-by1DCvkq-TwSssx6LDUKbTrBNC6VPCU-amzwQ5GPzousw@mail.gmail.com
State New
Headers show
Series [PR,rtl/optimization/98694] Fix incorrect optimization by cprop_hardreg. | expand

Commit Message

Hongtao Liu Jan. 18, 2021, 9:16 a.m. UTC
Hi:
  If SRC had been assigned a mode narrower than the copy, we can't link
DEST into the chain even they have same
hard_regno_nregs(i.e. HImode/SImode in i386 backend).

i.e
        kmovw   %k0, %edi
        vmovd   %edi, %xmm2
        vpshuflw        $0, %xmm2, %xmm0
        kmovw   %k0, %r8d
        kmovd   %k0, %r9d
...
-        movl %r9d, %r11d
+        vmovd %xmm2, %r11d

  Bootstrap and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

        PR rtl-optimization/98694
        * regcprop.c (copy_value): If SRC had been assigned a mode
        narrower than the copy, we can't link DEST into the chain even
        they have same hard_regno_nregs(i.e. HImode/SImode in i386
        backend).

gcc/testsuite/ChangeLog:

        PR rtl-optimization/98694
        * gcc.target/i386/pr98694.c: New test.

  ---
 gcc/regcprop.c                          |  3 +-
 gcc/testsuite/gcc.target/i386/pr98694.c | 38 +++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98694.c

Comments

Richard Sandiford Jan. 18, 2021, 10:18 a.m. UTC | #1
Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi:
>   If SRC had been assigned a mode narrower than the copy, we can't link
> DEST into the chain even they have same
> hard_regno_nregs(i.e. HImode/SImode in i386 backend).

In general, changes between modes within the same hard register are OK.
Could you explain in more detail what's going wrong?

Thanks,
Richard


>
> i.e
>         kmovw   %k0, %edi
>         vmovd   %edi, %xmm2
>         vpshuflw        $0, %xmm2, %xmm0
>         kmovw   %k0, %r8d
>         kmovd   %k0, %r9d
> ...
> -        movl %r9d, %r11d
> +        vmovd %xmm2, %r11d
>
>   Bootstrap and regtested on x86_64-linux-gnu{-m32,}.
>   Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR rtl-optimization/98694
>         * regcprop.c (copy_value): If SRC had been assigned a mode
>         narrower than the copy, we can't link DEST into the chain even
>         they have same hard_regno_nregs(i.e. HImode/SImode in i386
>         backend).
>
> gcc/testsuite/ChangeLog:
>
>         PR rtl-optimization/98694
>         * gcc.target/i386/pr98694.c: New test.
>
>   ---
>  gcc/regcprop.c                          |  3 +-
>  gcc/testsuite/gcc.target/i386/pr98694.c | 38 +++++++++++++++++++++++++
>  2 files changed, 40 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr98694.c
>
> diff --git a/gcc/regcprop.c b/gcc/regcprop.c
> index dd62cb36013..997516eca07 100644
> --- a/gcc/regcprop.c
> +++ b/gcc/regcprop.c
> @@ -355,7 +355,8 @@ copy_value (rtx dest, rtx src, struct value_data *vd)
>    /* If SRC had been assigned a mode narrower than the copy, we can't
>       link DEST into the chain, because not all of the pieces of the
>       copy came from oldest_regno.  */
> -  else if (sn > hard_regno_nregs (sr, vd->e[sr].mode))
> +  else if (sn > hard_regno_nregs (sr, vd->e[sr].mode)
> +          || partial_subreg_p (vd->e[sr].mode, GET_MODE (src)))
>      return;
>
>    /* Link DR at the end of the value chain used by SR.  */
> diff --git a/gcc/testsuite/gcc.target/i386/pr98694.c
> b/gcc/testsuite/gcc.target/i386/pr98694.c
> new file mode 100644
> index 00000000000..611f9e77627
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr98694.c
> @@ -0,0 +1,38 @@
> +/* PR rtl-optimization/98694 */
> +/* { dg-do run { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mavx512bw" } */
> +/* { dg-require-effective-target avx512bw } */
> +
> +#include<immintrin.h>
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef int v2si __attribute__ ((vector_size (8)));
> +v4hi b;
> +
> +__attribute__ ((noipa))
> +v2si
> +foo (__m512i src1, __m512i src2)
> +{
> +  __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2);
> +  short s = (short) m;
> +  int i = (int)m;
> +  b = __extension__ (v4hi) {s, s, s, s};
> +  return __extension__ (v2si) {i, i};
> +}
> +
> +int main ()
> +{
> +  __m512i src1 = _mm512_setzero_si512 ();
> +  __m512i src2 = _mm512_set_epi8 (0, 1, 0, 1, 0, 1, 0, 1,
> +                                 0, 1, 0, 1, 0, 1, 0, 1,
> +                                 0, 1, 0, 1, 0, 1, 0, 1,
> +                                 0, 1, 0, 1, 0, 1, 0, 1,
> +                                 0, 1, 0, 1, 0, 1, 0, 1,
> +                                 0, 1, 0, 1, 0, 1, 0, 1,
> +                                 0, 1, 0, 1, 0, 1, 0, 1,
> +                                 0, 1, 0, 1, 0, 1, 0, 1);
> +  __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2);
> +  v2si a = foo (src1, src2);
> +  if (a[0] != (int)m)
> +    __builtin_abort ();
> +  return 0;
> +}
> --
Hongtao Liu Jan. 18, 2021, 10:43 a.m. UTC | #2
On Mon, Jan 18, 2021 at 6:18 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > Hi:
> >   If SRC had been assigned a mode narrower than the copy, we can't link
> > DEST into the chain even they have same
> > hard_regno_nregs(i.e. HImode/SImode in i386 backend).
>
> In general, changes between modes within the same hard register are OK.
> Could you explain in more detail what's going wrong?
>

cprop hardreg change

(insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
        (reg:SI 37 r9 [orig:86 _11 ] [86])) "test.c":29:36 75 {*movsi_internal}
     (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
        (nil)))

to

(insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
        (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "test.c":29:36 75
{*movsi_internal}
     (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
        (nil)))

since (reg:SI 22 xmm2) and (reg:SI r9) are in the same value chain in
which the oldest regno is k0.

but with xmm2 defined as

kmovw %k0, %edi  # 69 [c=4 l=4] *movhi_internal/6----- kmovw move the
lower 16bits to %edi, and clear the upper 16 bits.
vmovd %edi, %xmm2 # 489 *movsi_internal  --- vmovd move 32bits from
%edi to %xmm2.

(insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
        (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
{*movhi_internal}
     (nil))

(insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
        (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
     (nil))
...
kmovd %k0, %r9d (movsi) ---- kmovd move 32bits from %k0 to %r9d.

for %edi, bit 16-31 is cleared by kmovw which means %r9d is not equal
to %xmm2 as a SImode value.

> Thanks,
> Richard
>
>
> >
> > i.e
> >         kmovw   %k0, %edi
> >         vmovd   %edi, %xmm2
> >         vpshuflw        $0, %xmm2, %xmm0
> >         kmovw   %k0, %r8d
> >         kmovd   %k0, %r9d
> > ...
> > -        movl %r9d, %r11d
> > +        vmovd %xmm2, %r11d
> >
> >   Bootstrap and regtested on x86_64-linux-gnu{-m32,}.
> >   Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         PR rtl-optimization/98694
> >         * regcprop.c (copy_value): If SRC had been assigned a mode
> >         narrower than the copy, we can't link DEST into the chain even
> >         they have same hard_regno_nregs(i.e. HImode/SImode in i386
> >         backend).
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR rtl-optimization/98694
> >         * gcc.target/i386/pr98694.c: New test.
> >
> >   ---
> >  gcc/regcprop.c                          |  3 +-
> >  gcc/testsuite/gcc.target/i386/pr98694.c | 38 +++++++++++++++++++++++++
> >  2 files changed, 40 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr98694.c
> >
> > diff --git a/gcc/regcprop.c b/gcc/regcprop.c
> > index dd62cb36013..997516eca07 100644
> > --- a/gcc/regcprop.c
> > +++ b/gcc/regcprop.c
> > @@ -355,7 +355,8 @@ copy_value (rtx dest, rtx src, struct value_data *vd)
> >    /* If SRC had been assigned a mode narrower than the copy, we can't
> >       link DEST into the chain, because not all of the pieces of the
> >       copy came from oldest_regno.  */
> > -  else if (sn > hard_regno_nregs (sr, vd->e[sr].mode))
> > +  else if (sn > hard_regno_nregs (sr, vd->e[sr].mode)
> > +          || partial_subreg_p (vd->e[sr].mode, GET_MODE (src)))
> >      return;
> >
> >    /* Link DR at the end of the value chain used by SR.  */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr98694.c
> > b/gcc/testsuite/gcc.target/i386/pr98694.c
> > new file mode 100644
> > index 00000000000..611f9e77627
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr98694.c
> > @@ -0,0 +1,38 @@
> > +/* PR rtl-optimization/98694 */
> > +/* { dg-do run { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mavx512bw" } */
> > +/* { dg-require-effective-target avx512bw } */
> > +
> > +#include<immintrin.h>
> > +typedef short v4hi __attribute__ ((vector_size (8)));
> > +typedef int v2si __attribute__ ((vector_size (8)));
> > +v4hi b;
> > +
> > +__attribute__ ((noipa))
> > +v2si
> > +foo (__m512i src1, __m512i src2)
> > +{
> > +  __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2);
> > +  short s = (short) m;
> > +  int i = (int)m;
> > +  b = __extension__ (v4hi) {s, s, s, s};
> > +  return __extension__ (v2si) {i, i};
> > +}
> > +
> > +int main ()
> > +{
> > +  __m512i src1 = _mm512_setzero_si512 ();
> > +  __m512i src2 = _mm512_set_epi8 (0, 1, 0, 1, 0, 1, 0, 1,
> > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > +                                 0, 1, 0, 1, 0, 1, 0, 1);
> > +  __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2);
> > +  v2si a = foo (src1, src2);
> > +  if (a[0] != (int)m)
> > +    __builtin_abort ();
> > +  return 0;
> > +}
> > --



--
BR,
Hongtao
Hongtao Liu Jan. 18, 2021, 10:51 a.m. UTC | #3
On Mon, Jan 18, 2021 at 6:43 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Jan 18, 2021 at 6:18 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > Hi:
> > >   If SRC had been assigned a mode narrower than the copy, we can't link
> > > DEST into the chain even they have same
> > > hard_regno_nregs(i.e. HImode/SImode in i386 backend).
> >
> > In general, changes between modes within the same hard register are OK.
> > Could you explain in more detail what's going wrong?

For simplicity, If the copy of narrow mode has the side effect of
clearing the upper bits of the same hard register, But this behavior
is not described in the insn pattern, shouldn't it be wrong to add
different modes to the same value chain.

> >
>
> cprop hardreg change
>
> (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
>         (reg:SI 37 r9 [orig:86 _11 ] [86])) "test.c":29:36 75 {*movsi_internal}
>      (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
>         (nil)))
>
> to
>
> (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
>         (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "test.c":29:36 75
> {*movsi_internal}
>      (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
>         (nil)))
>
> since (reg:SI 22 xmm2) and (reg:SI r9) are in the same value chain in
> which the oldest regno is k0.
>
> but with xmm2 defined as
>
> kmovw %k0, %edi  # 69 [c=4 l=4] *movhi_internal/6----- kmovw move the
> lower 16bits to %edi, and clear the upper 16 bits.
> vmovd %edi, %xmm2 # 489 *movsi_internal  --- vmovd move 32bits from
> %edi to %xmm2.
>
> (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
>         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
> {*movhi_internal}
>      (nil))
>
> (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
>         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
>      (nil))
> ...
> kmovd %k0, %r9d (movsi) ---- kmovd move 32bits from %k0 to %r9d.
>
> for %edi, bit 16-31 is cleared by kmovw which means %r9d is not equal
> to %xmm2 as a SImode value.
>
> > Thanks,
> > Richard
> >
> >
> > >
> > > i.e
> > >         kmovw   %k0, %edi
> > >         vmovd   %edi, %xmm2
> > >         vpshuflw        $0, %xmm2, %xmm0
> > >         kmovw   %k0, %r8d
> > >         kmovd   %k0, %r9d
> > > ...
> > > -        movl %r9d, %r11d
> > > +        vmovd %xmm2, %r11d
> > >
> > >   Bootstrap and regtested on x86_64-linux-gnu{-m32,}.
> > >   Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR rtl-optimization/98694
> > >         * regcprop.c (copy_value): If SRC had been assigned a mode
> > >         narrower than the copy, we can't link DEST into the chain even
> > >         they have same hard_regno_nregs(i.e. HImode/SImode in i386
> > >         backend).
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR rtl-optimization/98694
> > >         * gcc.target/i386/pr98694.c: New test.
> > >
> > >   ---
> > >  gcc/regcprop.c                          |  3 +-
> > >  gcc/testsuite/gcc.target/i386/pr98694.c | 38 +++++++++++++++++++++++++
> > >  2 files changed, 40 insertions(+), 1 deletion(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr98694.c
> > >
> > > diff --git a/gcc/regcprop.c b/gcc/regcprop.c
> > > index dd62cb36013..997516eca07 100644
> > > --- a/gcc/regcprop.c
> > > +++ b/gcc/regcprop.c
> > > @@ -355,7 +355,8 @@ copy_value (rtx dest, rtx src, struct value_data *vd)
> > >    /* If SRC had been assigned a mode narrower than the copy, we can't
> > >       link DEST into the chain, because not all of the pieces of the
> > >       copy came from oldest_regno.  */
> > > -  else if (sn > hard_regno_nregs (sr, vd->e[sr].mode))
> > > +  else if (sn > hard_regno_nregs (sr, vd->e[sr].mode)
> > > +          || partial_subreg_p (vd->e[sr].mode, GET_MODE (src)))
> > >      return;
> > >
> > >    /* Link DR at the end of the value chain used by SR.  */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr98694.c
> > > b/gcc/testsuite/gcc.target/i386/pr98694.c
> > > new file mode 100644
> > > index 00000000000..611f9e77627
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr98694.c
> > > @@ -0,0 +1,38 @@
> > > +/* PR rtl-optimization/98694 */
> > > +/* { dg-do run { target { ! ia32 } } } */
> > > +/* { dg-options "-O2 -mavx512bw" } */
> > > +/* { dg-require-effective-target avx512bw } */
> > > +
> > > +#include<immintrin.h>
> > > +typedef short v4hi __attribute__ ((vector_size (8)));
> > > +typedef int v2si __attribute__ ((vector_size (8)));
> > > +v4hi b;
> > > +
> > > +__attribute__ ((noipa))
> > > +v2si
> > > +foo (__m512i src1, __m512i src2)
> > > +{
> > > +  __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2);
> > > +  short s = (short) m;
> > > +  int i = (int)m;
> > > +  b = __extension__ (v4hi) {s, s, s, s};
> > > +  return __extension__ (v2si) {i, i};
> > > +}
> > > +
> > > +int main ()
> > > +{
> > > +  __m512i src1 = _mm512_setzero_si512 ();
> > > +  __m512i src2 = _mm512_set_epi8 (0, 1, 0, 1, 0, 1, 0, 1,
> > > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > > +                                 0, 1, 0, 1, 0, 1, 0, 1,
> > > +                                 0, 1, 0, 1, 0, 1, 0, 1);
> > > +  __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2);
> > > +  v2si a = foo (src1, src2);
> > > +  if (a[0] != (int)m)
> > > +    __builtin_abort ();
> > > +  return 0;
> > > +}
> > > --
>
>
>
> --
> BR,
> Hongtao
Richard Sandiford Jan. 18, 2021, 11:10 a.m. UTC | #4
Hongtao Liu <crazylht@gmail.com> writes:
> On Mon, Jan 18, 2021 at 6:18 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> > Hi:
>> >   If SRC had been assigned a mode narrower than the copy, we can't link
>> > DEST into the chain even they have same
>> > hard_regno_nregs(i.e. HImode/SImode in i386 backend).
>>
>> In general, changes between modes within the same hard register are OK.
>> Could you explain in more detail what's going wrong?
>>
>
> cprop hardreg change
>
> (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
>         (reg:SI 37 r9 [orig:86 _11 ] [86])) "test.c":29:36 75 {*movsi_internal}
>      (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
>         (nil)))
>
> to
>
> (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
>         (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "test.c":29:36 75
> {*movsi_internal}
>      (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
>         (nil)))
>
> since (reg:SI 22 xmm2) and (reg:SI r9) are in the same value chain in
> which the oldest regno is k0.
>
> but with xmm2 defined as
>
> kmovw %k0, %edi  # 69 [c=4 l=4] *movhi_internal/6----- kmovw move the
> lower 16bits to %edi, and clear the upper 16 bits.
> vmovd %edi, %xmm2 # 489 *movsi_internal  --- vmovd move 32bits from
> %edi to %xmm2.
>
> (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
>         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
> {*movhi_internal}
>      (nil))
>
> (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
>         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
>      (nil))

The sequence is OK in itself, but insn 489 can't make any assumptions
about what's in the upper 16 bits of %edi.  In other words, as far as
RTL semantics are concerned, insn 489 only leaves bits 0-15 of %xmm2
with defined values; the other bits are undefined.

If the target wants all 32 bits of %edi to be carried over to insn 489
then it needs to make insn 69 an SImode set instead of a HImode set.

So what cprop is doing is OK: it's changing the values of undefined
bits but not changing the definition of defined bits (from an RTL
point of view).

Thanks,
Richard
Hongtao Liu Jan. 19, 2021, 12:59 a.m. UTC | #5
On Mon, Jan 18, 2021 at 7:10 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Hongtao Liu <crazylht@gmail.com> writes:
> > On Mon, Jan 18, 2021 at 6:18 PM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >> > Hi:
> >> >   If SRC had been assigned a mode narrower than the copy, we can't link
> >> > DEST into the chain even they have same
> >> > hard_regno_nregs(i.e. HImode/SImode in i386 backend).
> >>
> >> In general, changes between modes within the same hard register are OK.
> >> Could you explain in more detail what's going wrong?
> >>
> >
> > cprop hardreg change
> >
> > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
> >         (reg:SI 37 r9 [orig:86 _11 ] [86])) "test.c":29:36 75 {*movsi_internal}
> >      (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
> >         (nil)))
> >
> > to
> >
> > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
> >         (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "test.c":29:36 75
> > {*movsi_internal}
> >      (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
> >         (nil)))
> >
> > since (reg:SI 22 xmm2) and (reg:SI r9) are in the same value chain in
> > which the oldest regno is k0.
> >
> > but with xmm2 defined as
> >
> > kmovw %k0, %edi  # 69 [c=4 l=4] *movhi_internal/6----- kmovw move the
> > lower 16bits to %edi, and clear the upper 16 bits.
> > vmovd %edi, %xmm2 # 489 *movsi_internal  --- vmovd move 32bits from
> > %edi to %xmm2.
> >
> > (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
> > {*movhi_internal}
> >      (nil))
> >
> > (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
> >      (nil))
>
> The sequence is OK in itself, but insn 489 can't make any assumptions
> about what's in the upper 16 bits of %edi.  In other words, as far as
> RTL semantics are concerned, insn 489 only leaves bits 0-15 of %xmm2
> with defined values; the other bits are undefined.
>
> If the target wants all 32 bits of %edi to be carried over to insn 489
> then it needs to make insn 69 an SImode set instead of a HImode set.
>

actually only the lower 16bits are needed, the original insn is like

.294.r.ira
(insn 69 68 70 13 (set (reg:HI 96 [ _52 ])
        (subreg:HI (reg:DI 82 [ var_6.0_1 ]) 0)) "test.c":21:23 76
{*movhi_internal}
     (nil))
(insn 78 75 82 13 (set (reg:V4HI 140 [ _283 ])
        (vec_duplicate:V4HI (truncate:HI (subreg:SI (reg:HI 96 [ _52
]) 0)))) 1412 {*vec_dupv4hi}
     (nil))

.295r.reload
(insn 69 68 70 13 (set (reg:HI 5 di [orig:96 _52 ] [96])
        (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
{*movhi_internal}
     (nil))
(insn 489 75 78 13 (set (reg:SI 22 xmm2 [297])
        (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
     (nil))
(insn 78 489 490 13 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
        (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297]))))
1412 {*vec_dupv4hi}
     (nil))

and insn 489 is created by lra/reload which seems ok for the sequence,
but problemistic with considering the logic of hardreg_cprop.

> So what cprop is doing is OK: it's changing the values of undefined
> bits but not changing the definition of defined bits (from an RTL
> point of view).
>
> Thanks,
> Richard
Richard Sandiford Jan. 19, 2021, 12:38 p.m. UTC | #6
Hongtao Liu <crazylht@gmail.com> writes:
> On Mon, Jan 18, 2021 at 7:10 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Hongtao Liu <crazylht@gmail.com> writes:
>> > On Mon, Jan 18, 2021 at 6:18 PM Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> Hongtao Liu via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> >> > Hi:
>> >> >   If SRC had been assigned a mode narrower than the copy, we can't link
>> >> > DEST into the chain even they have same
>> >> > hard_regno_nregs(i.e. HImode/SImode in i386 backend).
>> >>
>> >> In general, changes between modes within the same hard register are OK.
>> >> Could you explain in more detail what's going wrong?
>> >>
>> >
>> > cprop hardreg change
>> >
>> > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
>> >         (reg:SI 37 r9 [orig:86 _11 ] [86])) "test.c":29:36 75 {*movsi_internal}
>> >      (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
>> >         (nil)))
>> >
>> > to
>> >
>> > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
>> >         (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "test.c":29:36 75
>> > {*movsi_internal}
>> >      (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
>> >         (nil)))
>> >
>> > since (reg:SI 22 xmm2) and (reg:SI r9) are in the same value chain in
>> > which the oldest regno is k0.
>> >
>> > but with xmm2 defined as
>> >
>> > kmovw %k0, %edi  # 69 [c=4 l=4] *movhi_internal/6----- kmovw move the
>> > lower 16bits to %edi, and clear the upper 16 bits.
>> > vmovd %edi, %xmm2 # 489 *movsi_internal  --- vmovd move 32bits from
>> > %edi to %xmm2.
>> >
>> > (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
>> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
>> > {*movhi_internal}
>> >      (nil))
>> >
>> > (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
>> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
>> >      (nil))
>>
>> The sequence is OK in itself, but insn 489 can't make any assumptions
>> about what's in the upper 16 bits of %edi.  In other words, as far as
>> RTL semantics are concerned, insn 489 only leaves bits 0-15 of %xmm2
>> with defined values; the other bits are undefined.
>>
>> If the target wants all 32 bits of %edi to be carried over to insn 489
>> then it needs to make insn 69 an SImode set instead of a HImode set.
>>
>
> actually only the lower 16bits are needed, the original insn is like
>
> .294.r.ira
> (insn 69 68 70 13 (set (reg:HI 96 [ _52 ])
>         (subreg:HI (reg:DI 82 [ var_6.0_1 ]) 0)) "test.c":21:23 76
> {*movhi_internal}
>      (nil))
> (insn 78 75 82 13 (set (reg:V4HI 140 [ _283 ])
>         (vec_duplicate:V4HI (truncate:HI (subreg:SI (reg:HI 96 [ _52
> ]) 0)))) 1412 {*vec_dupv4hi}
>      (nil))
>
> .295r.reload
> (insn 69 68 70 13 (set (reg:HI 5 di [orig:96 _52 ] [96])
>         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
> {*movhi_internal}
>      (nil))
> (insn 489 75 78 13 (set (reg:SI 22 xmm2 [297])
>         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
>      (nil))
> (insn 78 489 490 13 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
>         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297]))))
> 1412 {*vec_dupv4hi}
>      (nil))
>
> and insn 489 is created by lra/reload which seems ok for the sequence,
> but problemistic with considering the logic of hardreg_cprop.

It looks OK even with the regcprop behaviour though:

- insn 69 defines only the low 16 bits of di,
- insn 489 defines only the low 16 bits of xmm2, but copies bits 16-31
  too (with unknown contents)
- insn 78 uses only the low 16 bits of xmm2 (the unknown contents
  introduced by insn 489 are truncated away)

So where do bits 16-31 become significant?  What goes wrong if they're
not zero?

Thanks,
Richard
Jakub Jelinek Jan. 19, 2021, 2:45 p.m. UTC | #7
On Tue, Jan 19, 2021 at 12:38:47PM +0000, Richard Sandiford via Gcc-patches wrote:
> > actually only the lower 16bits are needed, the original insn is like
> >
> > .294.r.ira
> > (insn 69 68 70 13 (set (reg:HI 96 [ _52 ])
> >         (subreg:HI (reg:DI 82 [ var_6.0_1 ]) 0)) "test.c":21:23 76
> > {*movhi_internal}
> >      (nil))
> > (insn 78 75 82 13 (set (reg:V4HI 140 [ _283 ])
> >         (vec_duplicate:V4HI (truncate:HI (subreg:SI (reg:HI 96 [ _52
> > ]) 0)))) 1412 {*vec_dupv4hi}
> >      (nil))
> >
> > .295r.reload
> > (insn 69 68 70 13 (set (reg:HI 5 di [orig:96 _52 ] [96])
> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
> > {*movhi_internal}
> >      (nil))
> > (insn 489 75 78 13 (set (reg:SI 22 xmm2 [297])
> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
> >      (nil))
> > (insn 78 489 490 13 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
> >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297]))))
> > 1412 {*vec_dupv4hi}
> >      (nil))
> >
> > and insn 489 is created by lra/reload which seems ok for the sequence,
> > but problemistic with considering the logic of hardreg_cprop.
> 
> It looks OK even with the regcprop behaviour though:
> 
> - insn 69 defines only the low 16 bits of di,
> - insn 489 defines only the low 16 bits of xmm2, but copies bits 16-31
>   too (with unknown contents)
> - insn 78 uses only the low 16 bits of xmm2 (the unknown contents
>   introduced by insn 489 are truncated away)
> 
> So where do bits 16-31 become significant?  What goes wrong if they're
> not zero?

The k0 register is initialized I believe with
(insn 20 2 21 2 (set (reg:DI 68 k0 [orig:82 var_6.0_1 ] [82])
        (mem/c:DI (symbol_ref:DI ("var_6") [flags 0x40]  <var_decl 0x7f7babeaaf30 var_6>) [3 var_6+0 S8 A64])) "pr98694.C":21:10 74 {*movdi_internal}
     (nil))
and so it contains all 64-bits, and then the code sometimes uses all the
bits, sometimes just the low 16-bits and sometimes low 32-bits of that
value.
(insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
        (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":27:23 76 {*movhi_internal}
     (nil))
(insn 74 73 75 12 (set (reg:SI 36 r8 [orig:149 _52 ] [149])
        (zero_extend:SI (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82]))) 144 {*zero_extendhisi2}
     (nil))
(insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
        (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
     (nil))
(insn 78 489 490 12 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
        (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297])))) 1412 {*vec_dupv4hi}
     (expr_list:REG_DEAD (reg:SI 22 xmm2 [297])
        (nil)))
are examples when it uses only the low 16 bits from that, and
(insn 487 72 73 12 (set (reg:SI 1 dx [148])
        (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
     (nil))

(insn 85 84 491 13 (set (reg:SI 37 r9 [orig:86 _11 ] [86])
        (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":28:14 75 {*movsi_internal}
     (nil))

(insn 491 85 88 13 (set (reg:SI 3 bx [299])
        (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
     (nil))
(insn 88 491 89 13 (set (reg:CCNO 17 flags)
        (compare:CCNO (reg:SI 3 bx [299])
            (const_int 0 [0]))) 7 {*cmpsi_ccno_1}
     (expr_list:REG_DEAD (reg:SI 3 bx [299])
        (nil)))

(insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
        (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
     (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
        (nil)))
are examples where it uses low 32-bits from k0.
So the
 (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
-        (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
-     (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
+        (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
+     (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
         (nil)))
cprop_hardreg change indeed looks bogus, while xmm2 has SImode, it holds
only the low 16-bits of the value and has the upper bits undefined, while r9
it is replacing had all of the low 32-bits well defined.

	Jakub
Richard Sandiford Jan. 19, 2021, 4:10 p.m. UTC | #8
Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> On Tue, Jan 19, 2021 at 12:38:47PM +0000, Richard Sandiford via Gcc-patches wrote:
>> > actually only the lower 16bits are needed, the original insn is like
>> >
>> > .294.r.ira
>> > (insn 69 68 70 13 (set (reg:HI 96 [ _52 ])
>> >         (subreg:HI (reg:DI 82 [ var_6.0_1 ]) 0)) "test.c":21:23 76
>> > {*movhi_internal}
>> >      (nil))
>> > (insn 78 75 82 13 (set (reg:V4HI 140 [ _283 ])
>> >         (vec_duplicate:V4HI (truncate:HI (subreg:SI (reg:HI 96 [ _52
>> > ]) 0)))) 1412 {*vec_dupv4hi}
>> >      (nil))
>> >
>> > .295r.reload
>> > (insn 69 68 70 13 (set (reg:HI 5 di [orig:96 _52 ] [96])
>> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
>> > {*movhi_internal}
>> >      (nil))
>> > (insn 489 75 78 13 (set (reg:SI 22 xmm2 [297])
>> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
>> >      (nil))
>> > (insn 78 489 490 13 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
>> >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297]))))
>> > 1412 {*vec_dupv4hi}
>> >      (nil))
>> >
>> > and insn 489 is created by lra/reload which seems ok for the sequence,
>> > but problemistic with considering the logic of hardreg_cprop.
>> 
>> It looks OK even with the regcprop behaviour though:
>> 
>> - insn 69 defines only the low 16 bits of di,
>> - insn 489 defines only the low 16 bits of xmm2, but copies bits 16-31
>>   too (with unknown contents)
>> - insn 78 uses only the low 16 bits of xmm2 (the unknown contents
>>   introduced by insn 489 are truncated away)
>> 
>> So where do bits 16-31 become significant?  What goes wrong if they're
>> not zero?
>
> The k0 register is initialized I believe with
> (insn 20 2 21 2 (set (reg:DI 68 k0 [orig:82 var_6.0_1 ] [82])
>         (mem/c:DI (symbol_ref:DI ("var_6") [flags 0x40]  <var_decl 0x7f7babeaaf30 var_6>) [3 var_6+0 S8 A64])) "pr98694.C":21:10 74 {*movdi_internal}
>      (nil))
> and so it contains all 64-bits, and then the code sometimes uses all the
> bits, sometimes just the low 16-bits and sometimes low 32-bits of that
> value.
> (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
>         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":27:23 76 {*movhi_internal}
>      (nil))
> (insn 74 73 75 12 (set (reg:SI 36 r8 [orig:149 _52 ] [149])
>         (zero_extend:SI (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82]))) 144 {*zero_extendhisi2}
>      (nil))
> (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
>         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
>      (nil))
> (insn 78 489 490 12 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
>         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297])))) 1412 {*vec_dupv4hi}
>      (expr_list:REG_DEAD (reg:SI 22 xmm2 [297])
>         (nil)))
> are examples when it uses only the low 16 bits from that, and
> (insn 487 72 73 12 (set (reg:SI 1 dx [148])
>         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
>      (nil))
>
> (insn 85 84 491 13 (set (reg:SI 37 r9 [orig:86 _11 ] [86])
>         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":28:14 75 {*movsi_internal}
>      (nil))
>
> (insn 491 85 88 13 (set (reg:SI 3 bx [299])
>         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
>      (nil))
> (insn 88 491 89 13 (set (reg:CCNO 17 flags)
>         (compare:CCNO (reg:SI 3 bx [299])
>             (const_int 0 [0]))) 7 {*cmpsi_ccno_1}
>      (expr_list:REG_DEAD (reg:SI 3 bx [299])
>         (nil)))
>
> (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
>         (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
>      (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
>         (nil)))
> are examples where it uses low 32-bits from k0.
> So the
>  (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
> -        (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> -     (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
> +        (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> +     (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
>          (nil)))
> cprop_hardreg change indeed looks bogus, while xmm2 has SImode, it holds
> only the low 16-bits of the value and has the upper bits undefined, while r9
> it is replacing had all of the low 32-bits well defined.

Ah, ok, thanks for the extra context.

So AIUI the problem when recording xmm2<-di isn't just:

 [A] partial_subreg_p (vd->e[sr].mode, GET_MODE (src))

but also that:

 [B] partial_subreg_p (vd->e[sr].mode, vd->e[vd->e[sr].oldest_regno].mode)

For example, all registers in this sequence can be part of the same chain:

    (set (reg:HI R1) (reg:HI R0))
    (set (reg:SI R2) (reg:SI R1)) // [A]
    (set (reg:DI R3) (reg:DI R2)) // [A]
    (set (reg:SI R4) (reg:SI R[0-3]))
    (set (reg:HI R5) (reg:HI R[0-4]))

But:

    (set (reg:SI R1) (reg:SI R0))
    (set (reg:HI R2) (reg:HI R1))
    (set (reg:SI R3) (reg:SI R2)) // [A] && [B]

is problematic because it dips below the precision of the oldest regno
and then increases again.

When this happens, I guess we have two choices:

(1) what the patch does: treat R3 as the start of a new chain.
(2) pretend that the copy occured in vd->e[sr].mode instead
    (i.e. copy vd->e[sr].mode to vd->e[dr].mode)

I guess (2) would need to be subject to REG_CAN_CHANGE_MODE_P.
Maybe the optimisation provided by (2) compared to (1) isn't common
enough to be worth the complication.

I think we should test [B] as well as [A] though.  The pass is set
up to do some quite elaborate mode changes and I think rejecting
[A] on its own would make some of the other code redundant.
It also feels like it should be a seperate “if” or “else if”,
with its own comment.

Thanks,
Richard
Hongtao Liu Jan. 20, 2021, 4:35 a.m. UTC | #9
On Wed, Jan 20, 2021 at 12:10 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > On Tue, Jan 19, 2021 at 12:38:47PM +0000, Richard Sandiford via Gcc-patches wrote:
> >> > actually only the lower 16bits are needed, the original insn is like
> >> >
> >> > .294.r.ira
> >> > (insn 69 68 70 13 (set (reg:HI 96 [ _52 ])
> >> >         (subreg:HI (reg:DI 82 [ var_6.0_1 ]) 0)) "test.c":21:23 76
> >> > {*movhi_internal}
> >> >      (nil))
> >> > (insn 78 75 82 13 (set (reg:V4HI 140 [ _283 ])
> >> >         (vec_duplicate:V4HI (truncate:HI (subreg:SI (reg:HI 96 [ _52
> >> > ]) 0)))) 1412 {*vec_dupv4hi}
> >> >      (nil))
> >> >
> >> > .295r.reload
> >> > (insn 69 68 70 13 (set (reg:HI 5 di [orig:96 _52 ] [96])
> >> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
> >> > {*movhi_internal}
> >> >      (nil))
> >> > (insn 489 75 78 13 (set (reg:SI 22 xmm2 [297])
> >> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
> >> >      (nil))
> >> > (insn 78 489 490 13 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
> >> >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297]))))
> >> > 1412 {*vec_dupv4hi}
> >> >      (nil))
> >> >
> >> > and insn 489 is created by lra/reload which seems ok for the sequence,
> >> > but problemistic with considering the logic of hardreg_cprop.
> >>
> >> It looks OK even with the regcprop behaviour though:
> >>
> >> - insn 69 defines only the low 16 bits of di,
> >> - insn 489 defines only the low 16 bits of xmm2, but copies bits 16-31
> >>   too (with unknown contents)
> >> - insn 78 uses only the low 16 bits of xmm2 (the unknown contents
> >>   introduced by insn 489 are truncated away)
> >>
> >> So where do bits 16-31 become significant?  What goes wrong if they're
> >> not zero?
> >
> > The k0 register is initialized I believe with
> > (insn 20 2 21 2 (set (reg:DI 68 k0 [orig:82 var_6.0_1 ] [82])
> >         (mem/c:DI (symbol_ref:DI ("var_6") [flags 0x40]  <var_decl 0x7f7babeaaf30 var_6>) [3 var_6+0 S8 A64])) "pr98694.C":21:10 74 {*movdi_internal}
> >      (nil))
> > and so it contains all 64-bits, and then the code sometimes uses all the
> > bits, sometimes just the low 16-bits and sometimes low 32-bits of that
> > value.
> > (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":27:23 76 {*movhi_internal}
> >      (nil))
> > (insn 74 73 75 12 (set (reg:SI 36 r8 [orig:149 _52 ] [149])
> >         (zero_extend:SI (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82]))) 144 {*zero_extendhisi2}
> >      (nil))
> > (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
> >      (nil))
> > (insn 78 489 490 12 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
> >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297])))) 1412 {*vec_dupv4hi}
> >      (expr_list:REG_DEAD (reg:SI 22 xmm2 [297])
> >         (nil)))
> > are examples when it uses only the low 16 bits from that, and
> > (insn 487 72 73 12 (set (reg:SI 1 dx [148])
> >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
> >      (nil))
> >
> > (insn 85 84 491 13 (set (reg:SI 37 r9 [orig:86 _11 ] [86])
> >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":28:14 75 {*movsi_internal}
> >      (nil))
> >
> > (insn 491 85 88 13 (set (reg:SI 3 bx [299])
> >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
> >      (nil))
> > (insn 88 491 89 13 (set (reg:CCNO 17 flags)
> >         (compare:CCNO (reg:SI 3 bx [299])
> >             (const_int 0 [0]))) 7 {*cmpsi_ccno_1}
> >      (expr_list:REG_DEAD (reg:SI 3 bx [299])
> >         (nil)))
> >
> > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
> >         (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> >      (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
> >         (nil)))
> > are examples where it uses low 32-bits from k0.
> > So the
> >  (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
> > -        (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> > -     (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
> > +        (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> > +     (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
> >          (nil)))
> > cprop_hardreg change indeed looks bogus, while xmm2 has SImode, it holds
> > only the low 16-bits of the value and has the upper bits undefined, while r9
> > it is replacing had all of the low 32-bits well defined.
>
> Ah, ok, thanks for the extra context.
>
> So AIUI the problem when recording xmm2<-di isn't just:
>
>  [A] partial_subreg_p (vd->e[sr].mode, GET_MODE (src))
>
> but also that:
>
>  [B] partial_subreg_p (vd->e[sr].mode, vd->e[vd->e[sr].oldest_regno].mode)
>
> For example, all registers in this sequence can be part of the same chain:
>
>     (set (reg:HI R1) (reg:HI R0))
>     (set (reg:SI R2) (reg:SI R1)) // [A]
>     (set (reg:DI R3) (reg:DI R2)) // [A]
>     (set (reg:SI R4) (reg:SI R[0-3]))
>     (set (reg:HI R5) (reg:HI R[0-4]))
>
> But:
>
>     (set (reg:SI R1) (reg:SI R0))
>     (set (reg:HI R2) (reg:HI R1))
>     (set (reg:SI R3) (reg:SI R2)) // [A] && [B]
>
> is problematic because it dips below the precision of the oldest regno
> and then increases again.
>
> When this happens, I guess we have two choices:
>
> (1) what the patch does: treat R3 as the start of a new chain.
> (2) pretend that the copy occured in vd->e[sr].mode instead
>     (i.e. copy vd->e[sr].mode to vd->e[dr].mode)
>
> I guess (2) would need to be subject to REG_CAN_CHANGE_MODE_P.
> Maybe the optimisation provided by (2) compared to (1) isn't common
> enough to be worth the complication.
>
> I think we should test [B] as well as [A] though.  The pass is set
> up to do some quite elaborate mode changes and I think rejecting
> [A] on its own would make some of the other code redundant.
> It also feels like it should be a seperate “if” or “else if”,
> with its own comment.
>
Update patch.
> Thanks,
> Richard
Hongtao Liu Jan. 20, 2021, 4:40 a.m. UTC | #10
On Wed, Jan 20, 2021 at 12:35 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, Jan 20, 2021 at 12:10 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > On Tue, Jan 19, 2021 at 12:38:47PM +0000, Richard Sandiford via Gcc-patches wrote:
> > >> > actually only the lower 16bits are needed, the original insn is like
> > >> >
> > >> > .294.r.ira
> > >> > (insn 69 68 70 13 (set (reg:HI 96 [ _52 ])
> > >> >         (subreg:HI (reg:DI 82 [ var_6.0_1 ]) 0)) "test.c":21:23 76
> > >> > {*movhi_internal}
> > >> >      (nil))
> > >> > (insn 78 75 82 13 (set (reg:V4HI 140 [ _283 ])
> > >> >         (vec_duplicate:V4HI (truncate:HI (subreg:SI (reg:HI 96 [ _52
> > >> > ]) 0)))) 1412 {*vec_dupv4hi}
> > >> >      (nil))
> > >> >
> > >> > .295r.reload
> > >> > (insn 69 68 70 13 (set (reg:HI 5 di [orig:96 _52 ] [96])
> > >> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
> > >> > {*movhi_internal}
> > >> >      (nil))
> > >> > (insn 489 75 78 13 (set (reg:SI 22 xmm2 [297])
> > >> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
> > >> >      (nil))
> > >> > (insn 78 489 490 13 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
> > >> >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297]))))
> > >> > 1412 {*vec_dupv4hi}
> > >> >      (nil))
> > >> >
> > >> > and insn 489 is created by lra/reload which seems ok for the sequence,
> > >> > but problemistic with considering the logic of hardreg_cprop.
> > >>
> > >> It looks OK even with the regcprop behaviour though:
> > >>
> > >> - insn 69 defines only the low 16 bits of di,
> > >> - insn 489 defines only the low 16 bits of xmm2, but copies bits 16-31
> > >>   too (with unknown contents)
> > >> - insn 78 uses only the low 16 bits of xmm2 (the unknown contents
> > >>   introduced by insn 489 are truncated away)
> > >>
> > >> So where do bits 16-31 become significant?  What goes wrong if they're
> > >> not zero?
> > >
> > > The k0 register is initialized I believe with
> > > (insn 20 2 21 2 (set (reg:DI 68 k0 [orig:82 var_6.0_1 ] [82])
> > >         (mem/c:DI (symbol_ref:DI ("var_6") [flags 0x40]  <var_decl 0x7f7babeaaf30 var_6>) [3 var_6+0 S8 A64])) "pr98694.C":21:10 74 {*movdi_internal}
> > >      (nil))
> > > and so it contains all 64-bits, and then the code sometimes uses all the
> > > bits, sometimes just the low 16-bits and sometimes low 32-bits of that
> > > value.
> > > (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
> > >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":27:23 76 {*movhi_internal}
> > >      (nil))
> > > (insn 74 73 75 12 (set (reg:SI 36 r8 [orig:149 _52 ] [149])
> > >         (zero_extend:SI (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82]))) 144 {*zero_extendhisi2}
> > >      (nil))
> > > (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
> > >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
> > >      (nil))
> > > (insn 78 489 490 12 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
> > >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297])))) 1412 {*vec_dupv4hi}
> > >      (expr_list:REG_DEAD (reg:SI 22 xmm2 [297])
> > >         (nil)))
> > > are examples when it uses only the low 16 bits from that, and
> > > (insn 487 72 73 12 (set (reg:SI 1 dx [148])
> > >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
> > >      (nil))
> > >
> > > (insn 85 84 491 13 (set (reg:SI 37 r9 [orig:86 _11 ] [86])
> > >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":28:14 75 {*movsi_internal}
> > >      (nil))
> > >
> > > (insn 491 85 88 13 (set (reg:SI 3 bx [299])
> > >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
> > >      (nil))
> > > (insn 88 491 89 13 (set (reg:CCNO 17 flags)
> > >         (compare:CCNO (reg:SI 3 bx [299])
> > >             (const_int 0 [0]))) 7 {*cmpsi_ccno_1}
> > >      (expr_list:REG_DEAD (reg:SI 3 bx [299])
> > >         (nil)))
> > >
> > > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
> > >         (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> > >      (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
> > >         (nil)))
> > > are examples where it uses low 32-bits from k0.
> > > So the
> > >  (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
> > > -        (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> > > -     (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
> > > +        (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> > > +     (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
> > >          (nil)))
> > > cprop_hardreg change indeed looks bogus, while xmm2 has SImode, it holds
> > > only the low 16-bits of the value and has the upper bits undefined, while r9
> > > it is replacing had all of the low 32-bits well defined.
> >
> > Ah, ok, thanks for the extra context.
> >
> > So AIUI the problem when recording xmm2<-di isn't just:
> >
> >  [A] partial_subreg_p (vd->e[sr].mode, GET_MODE (src))
> >
> > but also that:
> >
> >  [B] partial_subreg_p (vd->e[sr].mode, vd->e[vd->e[sr].oldest_regno].mode)
> >
> > For example, all registers in this sequence can be part of the same chain:
> >
> >     (set (reg:HI R1) (reg:HI R0))
> >     (set (reg:SI R2) (reg:SI R1)) // [A]
> >     (set (reg:DI R3) (reg:DI R2)) // [A]
> >     (set (reg:SI R4) (reg:SI R[0-3]))
> >     (set (reg:HI R5) (reg:HI R[0-4]))
> >
> > But:
> >
> >     (set (reg:SI R1) (reg:SI R0))
> >     (set (reg:HI R2) (reg:HI R1))
> >     (set (reg:SI R3) (reg:SI R2)) // [A] && [B]
> >
> > is problematic because it dips below the precision of the oldest regno
> > and then increases again.
> >
> > When this happens, I guess we have two choices:
> >
> > (1) what the patch does: treat R3 as the start of a new chain.
> > (2) pretend that the copy occured in vd->e[sr].mode instead
> >     (i.e. copy vd->e[sr].mode to vd->e[dr].mode)
> >
> > I guess (2) would need to be subject to REG_CAN_CHANGE_MODE_P.
> > Maybe the optimisation provided by (2) compared to (1) isn't common
> > enough to be worth the complication.
> >
> > I think we should test [B] as well as [A] though.  The pass is set
> > up to do some quite elaborate mode changes and I think rejecting
> > [A] on its own would make some of the other code redundant.
> > It also feels like it should be a seperate “if” or “else if”,
> > with its own comment.
> >
> Update patch.

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.

> > Thanks,
> > Richard
>
>
>
> --
> BR,
> Hongtao
H.J. Lu Jan. 20, 2021, 12:56 p.m. UTC | #11
On Tue, Jan 19, 2021 at 8:32 PM Hongtao Liu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Wed, Jan 20, 2021 at 12:10 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > On Tue, Jan 19, 2021 at 12:38:47PM +0000, Richard Sandiford via Gcc-patches wrote:
> > >> > actually only the lower 16bits are needed, the original insn is like
> > >> >
> > >> > .294.r.ira
> > >> > (insn 69 68 70 13 (set (reg:HI 96 [ _52 ])
> > >> >         (subreg:HI (reg:DI 82 [ var_6.0_1 ]) 0)) "test.c":21:23 76
> > >> > {*movhi_internal}
> > >> >      (nil))
> > >> > (insn 78 75 82 13 (set (reg:V4HI 140 [ _283 ])
> > >> >         (vec_duplicate:V4HI (truncate:HI (subreg:SI (reg:HI 96 [ _52
> > >> > ]) 0)))) 1412 {*vec_dupv4hi}
> > >> >      (nil))
> > >> >
> > >> > .295r.reload
> > >> > (insn 69 68 70 13 (set (reg:HI 5 di [orig:96 _52 ] [96])
> > >> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
> > >> > {*movhi_internal}
> > >> >      (nil))
> > >> > (insn 489 75 78 13 (set (reg:SI 22 xmm2 [297])
> > >> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
> > >> >      (nil))
> > >> > (insn 78 489 490 13 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
> > >> >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297]))))
> > >> > 1412 {*vec_dupv4hi}
> > >> >      (nil))
> > >> >
> > >> > and insn 489 is created by lra/reload which seems ok for the sequence,
> > >> > but problemistic with considering the logic of hardreg_cprop.
> > >>
> > >> It looks OK even with the regcprop behaviour though:
> > >>
> > >> - insn 69 defines only the low 16 bits of di,
> > >> - insn 489 defines only the low 16 bits of xmm2, but copies bits 16-31
> > >>   too (with unknown contents)
> > >> - insn 78 uses only the low 16 bits of xmm2 (the unknown contents
> > >>   introduced by insn 489 are truncated away)
> > >>
> > >> So where do bits 16-31 become significant?  What goes wrong if they're
> > >> not zero?
> > >
> > > The k0 register is initialized I believe with
> > > (insn 20 2 21 2 (set (reg:DI 68 k0 [orig:82 var_6.0_1 ] [82])
> > >         (mem/c:DI (symbol_ref:DI ("var_6") [flags 0x40]  <var_decl 0x7f7babeaaf30 var_6>) [3 var_6+0 S8 A64])) "pr98694.C":21:10 74 {*movdi_internal}
> > >      (nil))
> > > and so it contains all 64-bits, and then the code sometimes uses all the
> > > bits, sometimes just the low 16-bits and sometimes low 32-bits of that
> > > value.
> > > (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
> > >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":27:23 76 {*movhi_internal}
> > >      (nil))
> > > (insn 74 73 75 12 (set (reg:SI 36 r8 [orig:149 _52 ] [149])
> > >         (zero_extend:SI (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82]))) 144 {*zero_extendhisi2}
> > >      (nil))
> > > (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
> > >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
> > >      (nil))
> > > (insn 78 489 490 12 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
> > >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297])))) 1412 {*vec_dupv4hi}
> > >      (expr_list:REG_DEAD (reg:SI 22 xmm2 [297])
> > >         (nil)))
> > > are examples when it uses only the low 16 bits from that, and
> > > (insn 487 72 73 12 (set (reg:SI 1 dx [148])
> > >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
> > >      (nil))
> > >
> > > (insn 85 84 491 13 (set (reg:SI 37 r9 [orig:86 _11 ] [86])
> > >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":28:14 75 {*movsi_internal}
> > >      (nil))
> > >
> > > (insn 491 85 88 13 (set (reg:SI 3 bx [299])
> > >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
> > >      (nil))
> > > (insn 88 491 89 13 (set (reg:CCNO 17 flags)
> > >         (compare:CCNO (reg:SI 3 bx [299])
> > >             (const_int 0 [0]))) 7 {*cmpsi_ccno_1}
> > >      (expr_list:REG_DEAD (reg:SI 3 bx [299])
> > >         (nil)))
> > >
> > > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
> > >         (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> > >      (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
> > >         (nil)))
> > > are examples where it uses low 32-bits from k0.
> > > So the
> > >  (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
> > > -        (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> > > -     (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
> > > +        (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
> > > +     (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
> > >          (nil)))
> > > cprop_hardreg change indeed looks bogus, while xmm2 has SImode, it holds
> > > only the low 16-bits of the value and has the upper bits undefined, while r9
> > > it is replacing had all of the low 32-bits well defined.
> >
> > Ah, ok, thanks for the extra context.
> >
> > So AIUI the problem when recording xmm2<-di isn't just:
> >
> >  [A] partial_subreg_p (vd->e[sr].mode, GET_MODE (src))
> >
> > but also that:
> >
> >  [B] partial_subreg_p (vd->e[sr].mode, vd->e[vd->e[sr].oldest_regno].mode)
> >
> > For example, all registers in this sequence can be part of the same chain:
> >
> >     (set (reg:HI R1) (reg:HI R0))
> >     (set (reg:SI R2) (reg:SI R1)) // [A]
> >     (set (reg:DI R3) (reg:DI R2)) // [A]
> >     (set (reg:SI R4) (reg:SI R[0-3]))
> >     (set (reg:HI R5) (reg:HI R[0-4]))
> >
> > But:
> >
> >     (set (reg:SI R1) (reg:SI R0))
> >     (set (reg:HI R2) (reg:HI R1))
> >     (set (reg:SI R3) (reg:SI R2)) // [A] && [B]
> >
> > is problematic because it dips below the precision of the oldest regno
> > and then increases again.
> >
> > When this happens, I guess we have two choices:
> >
> > (1) what the patch does: treat R3 as the start of a new chain.
> > (2) pretend that the copy occured in vd->e[sr].mode instead
> >     (i.e. copy vd->e[sr].mode to vd->e[dr].mode)
> >
> > I guess (2) would need to be subject to REG_CAN_CHANGE_MODE_P.
> > Maybe the optimisation provided by (2) compared to (1) isn't common
> > enough to be worth the complication.
> >
> > I think we should test [B] as well as [A] though.  The pass is set
> > up to do some quite elaborate mode changes and I think rejecting
> > [A] on its own would make some of the other code redundant.
> > It also feels like it should be a seperate “if” or “else if”,
> > with its own comment.
> >
> Update patch.
> > Thanks,
> > Richard

+int main ()
+{

Please add __builtin_cpu_supports ("avx512bw") check.

+  __m512i src1 = _mm512_setzero_si512 ();
+  __m512i src2 = _mm512_set_epi8 (0, 1, 0, 1, 0, 1, 0, 1,
+   0, 1, 0, 1, 0, 1, 0, 1,
+   0, 1, 0, 1, 0, 1, 0, 1,
+   0, 1, 0, 1, 0, 1, 0, 1,
+   0, 1, 0, 1, 0, 1, 0, 1,
+   0, 1, 0, 1, 0, 1, 0, 1,
+   0, 1, 0, 1, 0, 1, 0, 1,
+   0, 1, 0, 1, 0, 1, 0, 1);
+  __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2);
+  v2si a = foo (src1, src2);
+  if (a[0] != (int)m)
+    __builtin_abort ();
+  return 0;
+}
Richard Sandiford Jan. 20, 2021, 2:14 p.m. UTC | #12
Hongtao Liu <crazylht@gmail.com> writes:
> On Wed, Jan 20, 2021 at 12:10 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> > On Tue, Jan 19, 2021 at 12:38:47PM +0000, Richard Sandiford via Gcc-patches wrote:
>> >> > actually only the lower 16bits are needed, the original insn is like
>> >> >
>> >> > .294.r.ira
>> >> > (insn 69 68 70 13 (set (reg:HI 96 [ _52 ])
>> >> >         (subreg:HI (reg:DI 82 [ var_6.0_1 ]) 0)) "test.c":21:23 76
>> >> > {*movhi_internal}
>> >> >      (nil))
>> >> > (insn 78 75 82 13 (set (reg:V4HI 140 [ _283 ])
>> >> >         (vec_duplicate:V4HI (truncate:HI (subreg:SI (reg:HI 96 [ _52
>> >> > ]) 0)))) 1412 {*vec_dupv4hi}
>> >> >      (nil))
>> >> >
>> >> > .295r.reload
>> >> > (insn 69 68 70 13 (set (reg:HI 5 di [orig:96 _52 ] [96])
>> >> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "test.c":21:23 76
>> >> > {*movhi_internal}
>> >> >      (nil))
>> >> > (insn 489 75 78 13 (set (reg:SI 22 xmm2 [297])
>> >> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
>> >> >      (nil))
>> >> > (insn 78 489 490 13 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
>> >> >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297]))))
>> >> > 1412 {*vec_dupv4hi}
>> >> >      (nil))
>> >> >
>> >> > and insn 489 is created by lra/reload which seems ok for the sequence,
>> >> > but problemistic with considering the logic of hardreg_cprop.
>> >>
>> >> It looks OK even with the regcprop behaviour though:
>> >>
>> >> - insn 69 defines only the low 16 bits of di,
>> >> - insn 489 defines only the low 16 bits of xmm2, but copies bits 16-31
>> >>   too (with unknown contents)
>> >> - insn 78 uses only the low 16 bits of xmm2 (the unknown contents
>> >>   introduced by insn 489 are truncated away)
>> >>
>> >> So where do bits 16-31 become significant?  What goes wrong if they're
>> >> not zero?
>> >
>> > The k0 register is initialized I believe with
>> > (insn 20 2 21 2 (set (reg:DI 68 k0 [orig:82 var_6.0_1 ] [82])
>> >         (mem/c:DI (symbol_ref:DI ("var_6") [flags 0x40]  <var_decl 0x7f7babeaaf30 var_6>) [3 var_6+0 S8 A64])) "pr98694.C":21:10 74 {*movdi_internal}
>> >      (nil))
>> > and so it contains all 64-bits, and then the code sometimes uses all the
>> > bits, sometimes just the low 16-bits and sometimes low 32-bits of that
>> > value.
>> > (insn 69 68 70 12 (set (reg:HI 5 di [orig:96 _52 ] [96])
>> >         (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":27:23 76 {*movhi_internal}
>> >      (nil))
>> > (insn 74 73 75 12 (set (reg:SI 36 r8 [orig:149 _52 ] [149])
>> >         (zero_extend:SI (reg:HI 68 k0 [orig:82 var_6.0_1 ] [82]))) 144 {*zero_extendhisi2}
>> >      (nil))
>> > (insn 489 75 78 12 (set (reg:SI 22 xmm2 [297])
>> >         (reg:SI 5 di [orig:96 _52 ] [96])) 75 {*movsi_internal}
>> >      (nil))
>> > (insn 78 489 490 12 (set (reg:V4HI 20 xmm0 [orig:140 _283 ] [140])
>> >         (vec_duplicate:V4HI (truncate:HI (reg:SI 22 xmm2 [297])))) 1412 {*vec_dupv4hi}
>> >      (expr_list:REG_DEAD (reg:SI 22 xmm2 [297])
>> >         (nil)))
>> > are examples when it uses only the low 16 bits from that, and
>> > (insn 487 72 73 12 (set (reg:SI 1 dx [148])
>> >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
>> >      (nil))
>> >
>> > (insn 85 84 491 13 (set (reg:SI 37 r9 [orig:86 _11 ] [86])
>> >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) "pr98694.C":28:14 75 {*movsi_internal}
>> >      (nil))
>> >
>> > (insn 491 85 88 13 (set (reg:SI 3 bx [299])
>> >         (reg:SI 68 k0 [orig:82 var_6.0_1 ] [82])) 75 {*movsi_internal}
>> >      (nil))
>> > (insn 88 491 89 13 (set (reg:CCNO 17 flags)
>> >         (compare:CCNO (reg:SI 3 bx [299])
>> >             (const_int 0 [0]))) 7 {*cmpsi_ccno_1}
>> >      (expr_list:REG_DEAD (reg:SI 3 bx [299])
>> >         (nil)))
>> >
>> > (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
>> >         (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
>> >      (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
>> >         (nil)))
>> > are examples where it uses low 32-bits from k0.
>> > So the
>> >  (insn 457 499 460 33 (set (reg:SI 39 r11 [orig:86 _11 ] [86])
>> > -        (reg:SI 37 r9 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
>> > -     (expr_list:REG_DEAD (reg:SI 37 r9 [orig:86 _11 ] [86])
>> > +        (reg:SI 22 xmm2 [orig:86 _11 ] [86])) "pr98694.C":35:36 75 {*movsi_internal}
>> > +     (expr_list:REG_DEAD (reg:SI 22 xmm2 [orig:86 _11 ] [86])
>> >          (nil)))
>> > cprop_hardreg change indeed looks bogus, while xmm2 has SImode, it holds
>> > only the low 16-bits of the value and has the upper bits undefined, while r9
>> > it is replacing had all of the low 32-bits well defined.
>>
>> Ah, ok, thanks for the extra context.
>>
>> So AIUI the problem when recording xmm2<-di isn't just:
>>
>>  [A] partial_subreg_p (vd->e[sr].mode, GET_MODE (src))
>>
>> but also that:
>>
>>  [B] partial_subreg_p (vd->e[sr].mode, vd->e[vd->e[sr].oldest_regno].mode)
>>
>> For example, all registers in this sequence can be part of the same chain:
>>
>>     (set (reg:HI R1) (reg:HI R0))
>>     (set (reg:SI R2) (reg:SI R1)) // [A]
>>     (set (reg:DI R3) (reg:DI R2)) // [A]
>>     (set (reg:SI R4) (reg:SI R[0-3]))
>>     (set (reg:HI R5) (reg:HI R[0-4]))
>>
>> But:
>>
>>     (set (reg:SI R1) (reg:SI R0))
>>     (set (reg:HI R2) (reg:HI R1))
>>     (set (reg:SI R3) (reg:SI R2)) // [A] && [B]
>>
>> is problematic because it dips below the precision of the oldest regno
>> and then increases again.
>>
>> When this happens, I guess we have two choices:
>>
>> (1) what the patch does: treat R3 as the start of a new chain.
>> (2) pretend that the copy occured in vd->e[sr].mode instead
>>     (i.e. copy vd->e[sr].mode to vd->e[dr].mode)
>>
>> I guess (2) would need to be subject to REG_CAN_CHANGE_MODE_P.
>> Maybe the optimisation provided by (2) compared to (1) isn't common
>> enough to be worth the complication.
>>
>> I think we should test [B] as well as [A] though.  The pass is set
>> up to do some quite elaborate mode changes and I think rejecting
>> [A] on its own would make some of the other code redundant.
>> It also feels like it should be a seperate “if” or “else if”,
>> with its own comment.
>>
> Update patch.
>> Thanks,
>> Richard
>
>
>
> -- 
> BR,
> Hongtao
>
> From a52b3c8a90a0bf6cbda8ce86d99c82c6182863a7 Mon Sep 17 00:00:00 2001
> From: liuhongt <hongtao.liu@intel.com>
> Date: Mon, 18 Jan 2021 16:55:32 +0800
> Subject: [PATCH] [PR rtl/optimization/98694] Fix incorrect optimization by
>  cprop_hardreg.
>
> If SRC had been assigned a mode narrower than the copy, we can't link
> DEST into the chain even they have same
> hard_regno_nregs(i.e. HImode/SImode in i386 backend).

This is a bit out of date now.  Maybe just say “can't always link”
instead of just “can't link”.

> i.e
>         kmovw   %k0, %edi
>         vmovd   %edi, %xmm2
> 	vpshuflw        $0, %xmm2, %xmm0
>         kmovw   %k0, %r8d
>         kmovd   %k0, %r9d
> ...
> -	 movl %r9d, %r11d
> +	 vmovd %xmm2, %r11d
>
> gcc/ChangeLog:
>
> 	PR rtl-optimization/98694
> 	* regcprop.c (copy_value): If SRC had been assigned a mode
> 	narrower than the copy, we can't link DEST into the chain even
> 	they have same hard_regno_nregs(i.e. HImode/SImode in i386
> 	backend).
>
> gcc/testsuite/ChangeLog:
>
> 	PR rtl-optimization/98694
> 	* gcc.target/i386/pr98694.c: New test.
> ---
>  gcc/regcprop.c                          | 33 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr98694.c | 38 +++++++++++++++++++++++++
>  2 files changed, 71 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr98694.c
>
> diff --git a/gcc/regcprop.c b/gcc/regcprop.c
> index dd62cb36013..908298beaea 100644
> --- a/gcc/regcprop.c
> +++ b/gcc/regcprop.c
> @@ -358,6 +358,39 @@ copy_value (rtx dest, rtx src, struct value_data *vd)
>    else if (sn > hard_regno_nregs (sr, vd->e[sr].mode))
>      return;
>  
> +  /* If SRC had been assigned a mode narrower than the copy, Although
> +     they have same hard_regno_nregs, it's not safe to link DEST into the
> +     chain. .i.e.

How about:

  It is not safe to link DEST into the chain if SRC was defined in some
  narrower mode M and if M is also narrower than the mode of the first
  register in the chain.  For example:

> +     (set (reg:DI r1) (reg:DI r0))
> +     (set (reg:HI r2) (reg:HI r1))
> +     (set (reg:SI r3) (reg:SI r2)) //Should be a new chain start at r3
> +     (set (reg:SI r4) (reg:SI r1))
> +     (set (reg:SI r5) (reg:SI r4))
> +     the upper part of r3 is undefined, if adding it to the chain, it may be
> +     prop to r5 which has defined upper bits, .i.e. pr98694.

And for this:

     (set (reg:DI r1) (reg:DI r0))
     (set (reg:HI r2) (reg:HI r1))
     (set (reg:SI r3) (reg:SI r2)) // Should be a new chain starting at r3
     (set (reg:SI r4) (reg:SI r1))
     (set (reg:SI r5) (reg:SI r4))

  the upper part of r3 is undefined.  If we added it to the chain,
  it may be used to replace r5, which has defined upper bits.
  See PR98694 for details.

> +
> +     [A] partial_subreg_p (vd->e[sr].mode, GET_MODE (src))
> +     [B] partial_subreg_p (vd->e[sr].mode, vd->e[vd->e[sr].oldest_regno].mode)
> +     Condition B is added to to catch optimization opportunities of
> +
> +     (set (reg:HI R1) (reg:HI R0))
> +     (set (reg:SI R2) (reg:SI R1)) // [A]
> +     (set (reg:DI R3) (reg:DI R2)) // [A]
> +     (set (reg:SI R4) (reg:SI R[0-3]))
> +     (set (reg:HI R5) (reg:HI R[0-4]))
> +

Maybe add here:

  in which all registers have only 16 defined bits.

> +     but problematic for
> +
> +     (set (reg:SI R1) (reg:SI R0))
> +     (set (reg:HI R2) (reg:HI R1))
> +     (set (reg:SI R3) (reg:SI R2)) // [A] && [B]
> +
> +     to be fixed????   */

I think we should drop this part.  Your example above covers it in
more detail.

OK with those changes and the one that HJ asked for.

Thanks,
Richard
Hongtao Liu Jan. 21, 2021, 5:25 a.m. UTC | #13
> OK with those changes and the one that HJ asked for.
>

This is the patch I'm checking in, thanks for the review.

> Thanks,
> Richard
diff mbox series

Patch

diff --git a/gcc/regcprop.c b/gcc/regcprop.c
index dd62cb36013..997516eca07 100644
--- a/gcc/regcprop.c
+++ b/gcc/regcprop.c
@@ -355,7 +355,8 @@  copy_value (rtx dest, rtx src, struct value_data *vd)
   /* If SRC had been assigned a mode narrower than the copy, we can't
      link DEST into the chain, because not all of the pieces of the
      copy came from oldest_regno.  */
-  else if (sn > hard_regno_nregs (sr, vd->e[sr].mode))
+  else if (sn > hard_regno_nregs (sr, vd->e[sr].mode)
+          || partial_subreg_p (vd->e[sr].mode, GET_MODE (src)))
     return;

   /* Link DR at the end of the value chain used by SR.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr98694.c
b/gcc/testsuite/gcc.target/i386/pr98694.c
new file mode 100644
index 00000000000..611f9e77627
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98694.c
@@ -0,0 +1,38 @@ 
+/* PR rtl-optimization/98694 */
+/* { dg-do run { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-require-effective-target avx512bw } */
+
+#include<immintrin.h>
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef int v2si __attribute__ ((vector_size (8)));
+v4hi b;
+
+__attribute__ ((noipa))
+v2si
+foo (__m512i src1, __m512i src2)
+{
+  __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2);
+  short s = (short) m;
+  int i = (int)m;
+  b = __extension__ (v4hi) {s, s, s, s};
+  return __extension__ (v2si) {i, i};
+}
+
+int main ()
+{
+  __m512i src1 = _mm512_setzero_si512 ();
+  __m512i src2 = _mm512_set_epi8 (0, 1, 0, 1, 0, 1, 0, 1,
+                                 0, 1, 0, 1, 0, 1, 0, 1,
+                                 0, 1, 0, 1, 0, 1, 0, 1,
+                                 0, 1, 0, 1, 0, 1, 0, 1,
+                                 0, 1, 0, 1, 0, 1, 0, 1,
+                                 0, 1, 0, 1, 0, 1, 0, 1,
+                                 0, 1, 0, 1, 0, 1, 0, 1,
+                                 0, 1, 0, 1, 0, 1, 0, 1);
+  __mmask64 m = _mm512_cmpeq_epu8_mask (src1, src2);
+  v2si a = foo (src1, src2);
+  if (a[0] != (int)m)
+    __builtin_abort ();
+  return 0;
+}