diff mbox series

[PR94442,AArch64] Redundant ldp/stp instructions emitted at -O3

Message ID 014c7f5ef7874db4ae98470c298b1f9b@huawei.com
State New
Headers show
Series [PR94442,AArch64] Redundant ldp/stp instructions emitted at -O3 | expand

Commit Message

xiezhiheng July 2, 2020, 1:22 p.m. UTC
Hi,

This is a fix for pr94442.
I modify get_inner_reference to handle the case for MEM[ptr, off].
I extract the "off" and add it to the recorded offset, then I build a
MEM[ptr, 0] and return it later.


I add an argument "include_memref_p" to control whether to go into MEM_REF,
because without it will cause the test case "Warray-bounds-46.c" to fail in regression.

It because function set_base_and_offset in gimple-ssa-warn-restrict.c
  base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,
                              &mode, &sign, &reverse, &vol);
  ...
  ...
  if (TREE_CODE (base) == MEM_REF)
    {
      tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND (base, 1));
      extend_offset_range (memrefoff);
      base = TREE_OPERAND (base, 0);

      if (refoff != HOST_WIDE_INT_MIN
          && TREE_CODE (expr) == COMPONENT_REF)
        {
          /* Bump up the offset of the referenced subobject to reflect
             the offset to the enclosing object.  For example, so that
             in
               struct S { char a, b[3]; } s[2];
               strcpy (s[1].b, "1234");
             REFOFF is set to s[1].b - (char*)s.  */
          offset_int off = tree_to_shwi (memrefoff);
          refoff += off;
        }

      if (!integer_zerop (memrefoff))       <=================
        /* A non-zero offset into an array of struct with flexible array
           members implies that the array is empty because there is no
           way to initialize such a member when it belongs to an array.
           This must be some sort of a bug.  */
        refsize = 0;
    }

needs MEM_REF offset to judge whether refsize should be set to zero.
But I fold the offset into bitpos and the offset will always be zero.

Suggestion?

Comments

Richard Biener July 2, 2020, 2:45 p.m. UTC | #1
On Thu, Jul 2, 2020 at 3:22 PM xiezhiheng <xiezhiheng@huawei.com> wrote:
>
> Hi,
>
> This is a fix for pr94442.
> I modify get_inner_reference to handle the case for MEM[ptr, off].
> I extract the "off" and add it to the recorded offset, then I build a
> MEM[ptr, 0] and return it later.
>
> diff --git a/gcc/expr.c b/gcc/expr.c
> index 3c68b0d754c..8cc18449a0c 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -7362,7 +7362,8 @@ tree
>  get_inner_reference (tree exp, poly_int64_pod *pbitsize,
>                      poly_int64_pod *pbitpos, tree *poffset,
>                      machine_mode *pmode, int *punsignedp,
> -                    int *preversep, int *pvolatilep)
> +                    int *preversep, int *pvolatilep,
> +                    bool include_memref_p)
>  {
>    tree size_tree = 0;
>    machine_mode mode = VOIDmode;
> @@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod *pbitsize,
>                 }
>               exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
>             }
> +         else if (include_memref_p
> +                  && TREE_CODE (TREE_OPERAND (exp, 0)) == SSA_NAME)
> +           {
> +             tree off = TREE_OPERAND (exp, 1);
> +             if (!integer_zerop (off))
> +               {
> +                 poly_offset_int boff = mem_ref_offset (exp);
> +                 boff <<= LOG2_BITS_PER_UNIT;
> +                 bit_offset += boff;
> +
> +                 exp = build2 (MEM_REF, TREE_TYPE (exp),
> +                               TREE_OPERAND (exp, 0),
> +                               build_int_cst (TREE_TYPE (off), 0));
> +               }
> +           }
>           goto done;
>
>         default:
> @@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode,
>         int reversep, volatilep = 0, must_force_mem;
>         tree tem
>           = get_inner_reference (exp, &bitsize, &bitpos, &offset, &mode1,
> -                                &unsignedp, &reversep, &volatilep);
> +                                &unsignedp, &reversep, &volatilep, true);
>         rtx orig_op0, memloc;
>         bool clear_mem_expr = false;
>
> diff --git a/gcc/tree.h b/gcc/tree.h
> index a74872f5f3e..7df0d15f7f9 100644
> --- a/gcc/tree.h
> +++ b/gcc/tree.h
> @@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p (const_tree, HOST_WIDE_INT, const_tree);
>     look for the ultimate containing object, which is returned and specify
>     the access position and size.  */
>  extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod *,
> -                                tree *, machine_mode *, int *, int *, int *);
> +                                tree *, machine_mode *, int *, int *, int *,
> +                                bool = false);
>
>  extern tree build_personality_function (const char *);
>
>
> I add an argument "include_memref_p" to control whether to go into MEM_REF,
> because without it will cause the test case "Warray-bounds-46.c" to fail in regression.
>
> It because function set_base_and_offset in gimple-ssa-warn-restrict.c
>   base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,
>                               &mode, &sign, &reverse, &vol);
>   ...
>   ...
>   if (TREE_CODE (base) == MEM_REF)
>     {
>       tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND (base, 1));
>       extend_offset_range (memrefoff);
>       base = TREE_OPERAND (base, 0);
>
>       if (refoff != HOST_WIDE_INT_MIN
>           && TREE_CODE (expr) == COMPONENT_REF)
>         {
>           /* Bump up the offset of the referenced subobject to reflect
>              the offset to the enclosing object.  For example, so that
>              in
>                struct S { char a, b[3]; } s[2];
>                strcpy (s[1].b, "1234");
>              REFOFF is set to s[1].b - (char*)s.  */
>           offset_int off = tree_to_shwi (memrefoff);
>           refoff += off;
>         }
>
>       if (!integer_zerop (memrefoff))       <=================
>         /* A non-zero offset into an array of struct with flexible array
>            members implies that the array is empty because there is no
>            way to initialize such a member when it belongs to an array.
>            This must be some sort of a bug.  */
>         refsize = 0;
>     }
>
> needs MEM_REF offset to judge whether refsize should be set to zero.
> But I fold the offset into bitpos and the offset will always be zero.
>
> Suggestion?

The thing you want to fix is not get_inner_reference but the aarch64 backend
to not make __builtin_aarch64_sqaddv16qi clobber global memory.  That way
CSE can happen on GIMPLE which can handle the difference in the IL just
fine.

Richard.
xiezhiheng July 6, 2020, 9:10 a.m. UTC | #2
> -----Original Message-----
> From: Richard Biener [mailto:richard.guenther@gmail.com]
> Sent: Thursday, July 2, 2020 10:46 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> On Thu, Jul 2, 2020 at 3:22 PM xiezhiheng <xiezhiheng@huawei.com> wrote:
> >
> > Hi,
> >
> > This is a fix for pr94442.
> > I modify get_inner_reference to handle the case for MEM[ptr, off].
> > I extract the "off" and add it to the recorded offset, then I build a
> > MEM[ptr, 0] and return it later.
> >
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index 3c68b0d754c..8cc18449a0c 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -7362,7 +7362,8 @@ tree
> >  get_inner_reference (tree exp, poly_int64_pod *pbitsize,
> >                      poly_int64_pod *pbitpos, tree *poffset,
> >                      machine_mode *pmode, int *punsignedp,
> > -                    int *preversep, int *pvolatilep)
> > +                    int *preversep, int *pvolatilep,
> > +                    bool include_memref_p)
> >  {
> >    tree size_tree = 0;
> >    machine_mode mode = VOIDmode;
> > @@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod
> *pbitsize,
> >                 }
> >               exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
> >             }
> > +         else if (include_memref_p
> > +                  && TREE_CODE (TREE_OPERAND (exp, 0)) ==
> SSA_NAME)
> > +           {
> > +             tree off = TREE_OPERAND (exp, 1);
> > +             if (!integer_zerop (off))
> > +               {
> > +                 poly_offset_int boff = mem_ref_offset (exp);
> > +                 boff <<= LOG2_BITS_PER_UNIT;
> > +                 bit_offset += boff;
> > +
> > +                 exp = build2 (MEM_REF, TREE_TYPE (exp),
> > +                               TREE_OPERAND (exp, 0),
> > +                               build_int_cst (TREE_TYPE (off), 0));
> > +               }
> > +           }
> >           goto done;
> >
> >         default:
> > @@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target,
> machine_mode tmode,
> >         int reversep, volatilep = 0, must_force_mem;
> >         tree tem
> >           = get_inner_reference (exp, &bitsize, &bitpos, &offset,
> &mode1,
> > -                                &unsignedp, &reversep, &volatilep);
> > +                                &unsignedp, &reversep, &volatilep,
> true);
> >         rtx orig_op0, memloc;
> >         bool clear_mem_expr = false;
> >
> > diff --git a/gcc/tree.h b/gcc/tree.h
> > index a74872f5f3e..7df0d15f7f9 100644
> > --- a/gcc/tree.h
> > +++ b/gcc/tree.h
> > @@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p
> (const_tree, HOST_WIDE_INT, const_tree);
> >     look for the ultimate containing object, which is returned and specify
> >     the access position and size.  */
> >  extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod
> *,
> > -                                tree *, machine_mode *, int *, int *,
> int *);
> > +                                tree *, machine_mode *, int *, int *,
> int *,
> > +                                bool = false);
> >
> >  extern tree build_personality_function (const char *);
> >
> >
> > I add an argument "include_memref_p" to control whether to go into
> MEM_REF,
> > because without it will cause the test case "Warray-bounds-46.c" to fail in
> regression.
> >
> > It because function set_base_and_offset in gimple-ssa-warn-restrict.c
> >   base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,
> >                               &mode, &sign, &reverse, &vol);
> >   ...
> >   ...
> >   if (TREE_CODE (base) == MEM_REF)
> >     {
> >       tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND
> (base, 1));
> >       extend_offset_range (memrefoff);
> >       base = TREE_OPERAND (base, 0);
> >
> >       if (refoff != HOST_WIDE_INT_MIN
> >           && TREE_CODE (expr) == COMPONENT_REF)
> >         {
> >           /* Bump up the offset of the referenced subobject to reflect
> >              the offset to the enclosing object.  For example, so that
> >              in
> >                struct S { char a, b[3]; } s[2];
> >                strcpy (s[1].b, "1234");
> >              REFOFF is set to s[1].b - (char*)s.  */
> >           offset_int off = tree_to_shwi (memrefoff);
> >           refoff += off;
> >         }
> >
> >       if (!integer_zerop (memrefoff))       <=================
> >         /* A non-zero offset into an array of struct with flexible array
> >            members implies that the array is empty because there is no
> >            way to initialize such a member when it belongs to an array.
> >            This must be some sort of a bug.  */
> >         refsize = 0;
> >     }
> >
> > needs MEM_REF offset to judge whether refsize should be set to zero.
> > But I fold the offset into bitpos and the offset will always be zero.
> >
> > Suggestion?
> 
> The thing you want to fix is not get_inner_reference but the aarch64 backend
> to not make __builtin_aarch64_sqaddv16qi clobber global memory.  That
> way
> CSE can happen on GIMPLE which can handle the difference in the IL just
> fine.
> 
> Richard.

Yes, __builtin_aarch64_sqaddv16qi is not set any attributes to describe that
it would not clobber global memory.  But I find it strange that when building
SIMD built-in FUNCTION_DECLs they are not set any attributes in the backend.

void
aarch64_init_simd_builtins (void)
{
...
      ftype = build_function_type (return_type, args);

      gcc_assert (ftype != NULL);

      if (print_type_signature_p)
        snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s_%s",
                  d->name, type_signature);
      else
        snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s",
                  d->name);

      fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode);
      aarch64_builtin_decls[fcode] = fndecl;
...
}
static tree
aarch64_general_add_builtin (const char *name, tree type, unsigned int code)
{
  code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL;
  return add_builtin_function (name, type, code, BUILT_IN_MD,
                               NULL, NULL_TREE);
}

The loop in aarch64_init_simd_builtins creates FUNCTION_DECL node for each
build-in function and put the node in array.  But it does not set any attributes.
And I did not find interface for each build-in function to control the attributes.

Did I miss anything?
Richard Sandiford July 6, 2020, 9:31 a.m. UTC | #3
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Biener [mailto:richard.guenther@gmail.com]
>> Sent: Thursday, July 2, 2020 10:46 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> On Thu, Jul 2, 2020 at 3:22 PM xiezhiheng <xiezhiheng@huawei.com> wrote:
>> >
>> > Hi,
>> >
>> > This is a fix for pr94442.
>> > I modify get_inner_reference to handle the case for MEM[ptr, off].
>> > I extract the "off" and add it to the recorded offset, then I build a
>> > MEM[ptr, 0] and return it later.
>> >
>> > diff --git a/gcc/expr.c b/gcc/expr.c
>> > index 3c68b0d754c..8cc18449a0c 100644
>> > --- a/gcc/expr.c
>> > +++ b/gcc/expr.c
>> > @@ -7362,7 +7362,8 @@ tree
>> >  get_inner_reference (tree exp, poly_int64_pod *pbitsize,
>> >                      poly_int64_pod *pbitpos, tree *poffset,
>> >                      machine_mode *pmode, int *punsignedp,
>> > -                    int *preversep, int *pvolatilep)
>> > +                    int *preversep, int *pvolatilep,
>> > +                    bool include_memref_p)
>> >  {
>> >    tree size_tree = 0;
>> >    machine_mode mode = VOIDmode;
>> > @@ -7509,6 +7510,21 @@ get_inner_reference (tree exp, poly_int64_pod
>> *pbitsize,
>> >                 }
>> >               exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
>> >             }
>> > +         else if (include_memref_p
>> > +                  && TREE_CODE (TREE_OPERAND (exp, 0)) ==
>> SSA_NAME)
>> > +           {
>> > +             tree off = TREE_OPERAND (exp, 1);
>> > +             if (!integer_zerop (off))
>> > +               {
>> > +                 poly_offset_int boff = mem_ref_offset (exp);
>> > +                 boff <<= LOG2_BITS_PER_UNIT;
>> > +                 bit_offset += boff;
>> > +
>> > +                 exp = build2 (MEM_REF, TREE_TYPE (exp),
>> > +                               TREE_OPERAND (exp, 0),
>> > +                               build_int_cst (TREE_TYPE (off), 0));
>> > +               }
>> > +           }
>> >           goto done;
>> >
>> >         default:
>> > @@ -10786,7 +10802,7 @@ expand_expr_real_1 (tree exp, rtx target,
>> machine_mode tmode,
>> >         int reversep, volatilep = 0, must_force_mem;
>> >         tree tem
>> >           = get_inner_reference (exp, &bitsize, &bitpos, &offset,
>> &mode1,
>> > -                                &unsignedp, &reversep, &volatilep);
>> > +                                &unsignedp, &reversep, &volatilep,
>> true);
>> >         rtx orig_op0, memloc;
>> >         bool clear_mem_expr = false;
>> >
>> > diff --git a/gcc/tree.h b/gcc/tree.h
>> > index a74872f5f3e..7df0d15f7f9 100644
>> > --- a/gcc/tree.h
>> > +++ b/gcc/tree.h
>> > @@ -6139,7 +6139,8 @@ extern bool complete_ctor_at_level_p
>> (const_tree, HOST_WIDE_INT, const_tree);
>> >     look for the ultimate containing object, which is returned and specify
>> >     the access position and size.  */
>> >  extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod
>> *,
>> > -                                tree *, machine_mode *, int *, int *,
>> int *);
>> > +                                tree *, machine_mode *, int *, int *,
>> int *,
>> > +                                bool = false);
>> >
>> >  extern tree build_personality_function (const char *);
>> >
>> >
>> > I add an argument "include_memref_p" to control whether to go into
>> MEM_REF,
>> > because without it will cause the test case "Warray-bounds-46.c" to fail in
>> regression.
>> >
>> > It because function set_base_and_offset in gimple-ssa-warn-restrict.c
>> >   base = get_inner_reference (expr, &bitsize, &bitpos, &var_off,
>> >                               &mode, &sign, &reverse, &vol);
>> >   ...
>> >   ...
>> >   if (TREE_CODE (base) == MEM_REF)
>> >     {
>> >       tree memrefoff = fold_convert (ptrdiff_type_node, TREE_OPERAND
>> (base, 1));
>> >       extend_offset_range (memrefoff);
>> >       base = TREE_OPERAND (base, 0);
>> >
>> >       if (refoff != HOST_WIDE_INT_MIN
>> >           && TREE_CODE (expr) == COMPONENT_REF)
>> >         {
>> >           /* Bump up the offset of the referenced subobject to reflect
>> >              the offset to the enclosing object.  For example, so that
>> >              in
>> >                struct S { char a, b[3]; } s[2];
>> >                strcpy (s[1].b, "1234");
>> >              REFOFF is set to s[1].b - (char*)s.  */
>> >           offset_int off = tree_to_shwi (memrefoff);
>> >           refoff += off;
>> >         }
>> >
>> >       if (!integer_zerop (memrefoff))       <=================
>> >         /* A non-zero offset into an array of struct with flexible array
>> >            members implies that the array is empty because there is no
>> >            way to initialize such a member when it belongs to an array.
>> >            This must be some sort of a bug.  */
>> >         refsize = 0;
>> >     }
>> >
>> > needs MEM_REF offset to judge whether refsize should be set to zero.
>> > But I fold the offset into bitpos and the offset will always be zero.
>> >
>> > Suggestion?
>> 
>> The thing you want to fix is not get_inner_reference but the aarch64 backend
>> to not make __builtin_aarch64_sqaddv16qi clobber global memory.  That
>> way
>> CSE can happen on GIMPLE which can handle the difference in the IL just
>> fine.
>> 
>> Richard.
>
> Yes, __builtin_aarch64_sqaddv16qi is not set any attributes to describe that
> it would not clobber global memory.  But I find it strange that when building
> SIMD built-in FUNCTION_DECLs they are not set any attributes in the backend.
>
> void
> aarch64_init_simd_builtins (void)
> {
> ...
>       ftype = build_function_type (return_type, args);
>
>       gcc_assert (ftype != NULL);
>
>       if (print_type_signature_p)
>         snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s_%s",
>                   d->name, type_signature);
>       else
>         snprintf (namebuf, sizeof (namebuf), "__builtin_aarch64_%s",
>                   d->name);
>
>       fndecl = aarch64_general_add_builtin (namebuf, ftype, fcode);
>       aarch64_builtin_decls[fcode] = fndecl;
> ...
> }
> static tree
> aarch64_general_add_builtin (const char *name, tree type, unsigned int code)
> {
>   code = (code << AARCH64_BUILTIN_SHIFT) | AARCH64_BUILTIN_GENERAL;
>   return add_builtin_function (name, type, code, BUILT_IN_MD,
>                                NULL, NULL_TREE);
> }
>
> The loop in aarch64_init_simd_builtins creates FUNCTION_DECL node for each
> build-in function and put the node in array.  But it does not set any attributes.
> And I did not find interface for each build-in function to control the attributes.
>
> Did I miss anything?

No, this is unfortunately a known bug.  See:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964

(Although the PR is recent, it's been a known bug for longer.)

As you say, the difficulty is that the correct attributes depend on what
the built-in function does.  Most integer arithmetic is “const”, but things
get more complicated for floating-point arithmetic.

The SVE intrinsics use a three stage process:

- each function is classified into one of several groups
- each group has a set of flags that describe what functions in the
  group can do
- these flags get converted into attributes based on the current
  command-line options

I guess we should have something similar for the arm_neon.h built-ins.

If you're willing to help fix this, that'd be great.  I think a first
step would be to agree a design.

Thanks,
Richard
xiezhiheng July 7, 2020, 12:49 p.m. UTC | #4
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Monday, July 6, 2020 5:31 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> No, this is unfortunately a known bug.  See:
> 
>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
> 
> (Although the PR is recent, it's been a known bug for longer.)
> 
> As you say, the difficulty is that the correct attributes depend on what
> the built-in function does.  Most integer arithmetic is “const”, but things
> get more complicated for floating-point arithmetic.
> 
> The SVE intrinsics use a three stage process:
> 
> - each function is classified into one of several groups
> - each group has a set of flags that describe what functions in the
>   group can do
> - these flags get converted into attributes based on the current
>   command-line options
> 
> I guess we should have something similar for the arm_neon.h built-ins.
> 
> If you're willing to help fix this, that'd be great.  I think a first
> step would be to agree a design.
> 
> Thanks,
> Richard

I'd like to have a try.  I have checked the steps in SVE intrinsics.
It defines a base class "function_base" and derives different classes
to describe several intrinsics for each.  And each class may
have its own unique flags described in virtual function "call_properties".
The specific attributes will be converted from these flags in
"get_attributes" later.

I find that there are more than 100 classes in total and if I only
need to classify them into different groups by attributes, maybe
we does not need so many classes?

The difficult thing I think is how to classify neon intrinsics into
different groups.  I'm going to follow up the way in SVE intrinsics
first now.

Xie Zhiheng
Richard Sandiford July 7, 2020, 2:07 p.m. UTC | #5
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Monday, July 6, 2020 5:31 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> No, this is unfortunately a known bug.  See:
>> 
>>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
>> 
>> (Although the PR is recent, it's been a known bug for longer.)
>> 
>> As you say, the difficulty is that the correct attributes depend on what
>> the built-in function does.  Most integer arithmetic is “const”, but things
>> get more complicated for floating-point arithmetic.
>> 
>> The SVE intrinsics use a three stage process:
>> 
>> - each function is classified into one of several groups
>> - each group has a set of flags that describe what functions in the
>>   group can do
>> - these flags get converted into attributes based on the current
>>   command-line options
>> 
>> I guess we should have something similar for the arm_neon.h built-ins.
>> 
>> If you're willing to help fix this, that'd be great.  I think a first
>> step would be to agree a design.
>> 
>> Thanks,
>> Richard
>
> I'd like to have a try.

Great!

> I have checked the steps in SVE intrinsics.
> It defines a base class "function_base" and derives different classes
> to describe several intrinsics for each.  And each class may
> have its own unique flags described in virtual function "call_properties".
> The specific attributes will be converted from these flags in
> "get_attributes" later.
>
> I find that there are more than 100 classes in total and if I only
> need to classify them into different groups by attributes, maybe
> we does not need so many classes?

Yeah, I agree.

Long term, there might be value in defining arm_neon.h in a similar
way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
a special compiler pragma.  But that's going to be a lot of work.

I think it's possible to make incremental improvements to the current
arm_neon.h implementation without that work being thrown away if we ever
did switch to a pragma in future.  And the incremental approach seems
more practical.

> The difficult thing I think is how to classify neon intrinsics into
> different groups.  I'm going to follow up the way in SVE intrinsics
> first now.

For now I'd suggest just giving a name to each combination of flags
that the intrinsics need, rather than splitting instructions in a
more fine-grained way.  (It's not at all obvious from the final state
of the SVE code, but even there, the idea was to have as few groups as
possible.  I.e. the groups were supposedly only split where necessary.
As you say, there still ended up being a lot of groups in the end…)

It'd be easier to review if the work was split up into smaller steps.
E.g. maybe one way would be this, with each number being a single
patch:

(1) (a) Add a flags field to the built-in function definitions
        that for now is always zero.
    (b) Pick a name N to describe the most conservative set of flags.
    (c) Make every built-in function definition use N.

(2) (a) Pick one type of function that cannot yet be described properly.
    (b) Pick a name N for that type of function.
    (c) Add whichever new flags are needed.
    (d) Add the appropriate attributes when the flags are set,
        possibly based on command-line options.
    (e) Make (exactly) one built-in function definition use N.

(3) (a) Pick some functions that all need the same attributes and
        that can already be described properly
    (b) Update all of their built-in function definitions accordingly,
        as a single change.

So after (1), filling out the table is an iterative process of (2) and
(3), in any order that's convenient (although it might help to order the
(2) patches so that each one adds as few flags as possible).  Each patch
would then be fairly small and self-contained.

That's just a suggestion though.  Please let me know if you have
any other suggestions.

I guess there are two obvious ways of adding the flags field:

- add a new parameter to every built-in function macro, e.g.
  BUILTIN_VSDQ_I and VAR1.

- wrap the definitions in a new macro, e.g.
  MY_NEW_GROUP (BUILTIN_VSDQ_I (BINOP, sqshl, 0))

I don't really have a preference, and I guess all other things being
equal, the first one wins by being more obvious than the second.
Just thought I'd mention the second way in case anyone preferred it.

Thanks,
Richard
xiezhiheng July 15, 2020, 8:49 a.m. UTC | #6
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, July 7, 2020 10:08 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Monday, July 6, 2020 5:31 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> No, this is unfortunately a known bug.  See:
> >>
> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
> >>
> >> (Although the PR is recent, it's been a known bug for longer.)
> >>
> >> As you say, the difficulty is that the correct attributes depend on what
> >> the built-in function does.  Most integer arithmetic is “const”, but
> things
> >> get more complicated for floating-point arithmetic.
> >>
> >> The SVE intrinsics use a three stage process:
> >>
> >> - each function is classified into one of several groups
> >> - each group has a set of flags that describe what functions in the
> >>   group can do
> >> - these flags get converted into attributes based on the current
> >>   command-line options
> >>
> >> I guess we should have something similar for the arm_neon.h built-ins.
> >>
> >> If you're willing to help fix this, that'd be great.  I think a first
> >> step would be to agree a design.
> >>
> >> Thanks,
> >> Richard
> >
> > I'd like to have a try.
> 
> Great!
> 
> > I have checked the steps in SVE intrinsics.
> > It defines a base class "function_base" and derives different classes
> > to describe several intrinsics for each.  And each class may
> > have its own unique flags described in virtual function "call_properties".
> > The specific attributes will be converted from these flags in
> > "get_attributes" later.
> >
> > I find that there are more than 100 classes in total and if I only
> > need to classify them into different groups by attributes, maybe
> > we does not need so many classes?
> 
> Yeah, I agree.
> 
> Long term, there might be value in defining arm_neon.h in a similar
> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
> a special compiler pragma.  But that's going to be a lot of work.
> 
> I think it's possible to make incremental improvements to the current
> arm_neon.h implementation without that work being thrown away if we
> ever
> did switch to a pragma in future.  And the incremental approach seems
> more practical.
> 
> > The difficult thing I think is how to classify neon intrinsics into
> > different groups.  I'm going to follow up the way in SVE intrinsics
> > first now.
> 
> For now I'd suggest just giving a name to each combination of flags
> that the intrinsics need, rather than splitting instructions in a
> more fine-grained way.  (It's not at all obvious from the final state
> of the SVE code, but even there, the idea was to have as few groups as
> possible.  I.e. the groups were supposedly only split where necessary.
> As you say, there still ended up being a lot of groups in the end…)
> 
> It'd be easier to review if the work was split up into smaller steps.
> E.g. maybe one way would be this, with each number being a single
> patch:
> 
> (1) (a) Add a flags field to the built-in function definitions
>         that for now is always zero.
>     (b) Pick a name N to describe the most conservative set of flags.
>     (c) Make every built-in function definition use N.
> 

I have finished the first part.

(a) I add a new parameter called FLAG to every built-in function macro.

(b) I define some flags in aarch64-builtins.c
FLAG_NONE for no needed flags
FLAG_READ_FPCR for functions will read FPCR register
FLAG_RAISE_FP_EXCEPTIONS for functions will raise fp exceptions
FLAG_READ_MEMORY for functions will read global memory
FLAG_PREFETCH_MEMORY for functions will prefetch data to memory
FLAG_WRITE_MEMORY for functions will write global memory

FLAG_FP is used for floating-point arithmetic
FLAG_ALL is all flags above

(c) I add a field in struct aarch64_simd_builtin_datum to record flags
for each built-in function.  But the default flags I set for built-in functions
are FLAG_ALL because by default the built-in functions might do anything.

And bootstrap and regression are tested ok on aarch64 Linux platform.

Any suggestions?

Thanks,
Xie Zhiheng

> (2) (a) Pick one type of function that cannot yet be described properly.
>     (b) Pick a name N for that type of function.
>     (c) Add whichever new flags are needed.
>     (d) Add the appropriate attributes when the flags are set,
>         possibly based on command-line options.
>     (e) Make (exactly) one built-in function definition use N.
> 
> (3) (a) Pick some functions that all need the same attributes and
>         that can already be described properly
>     (b) Update all of their built-in function definitions accordingly,
>         as a single change.
> 
> So after (1), filling out the table is an iterative process of (2) and
> (3), in any order that's convenient (although it might help to order the
> (2) patches so that each one adds as few flags as possible).  Each patch
> would then be fairly small and self-contained.
> 
> That's just a suggestion though.  Please let me know if you have
> any other suggestions.
> 
> I guess there are two obvious ways of adding the flags field:
> 
> - add a new parameter to every built-in function macro, e.g.
>   BUILTIN_VSDQ_I and VAR1.
> 
> - wrap the definitions in a new macro, e.g.
>   MY_NEW_GROUP (BUILTIN_VSDQ_I (BINOP, sqshl, 0))
> 
> I don't really have a preference, and I guess all other things being
> equal, the first one wins by being more obvious than the second.
> Just thought I'd mention the second way in case anyone preferred it.
> 
> Thanks,
> Richard
Richard Sandiford July 16, 2020, 12:41 p.m. UTC | #7
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Tuesday, July 7, 2020 10:08 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Monday, July 6, 2020 5:31 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> No, this is unfortunately a known bug.  See:
>> >>
>> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
>> >>
>> >> (Although the PR is recent, it's been a known bug for longer.)
>> >>
>> >> As you say, the difficulty is that the correct attributes depend on what
>> >> the built-in function does.  Most integer arithmetic is “const”, but
>> things
>> >> get more complicated for floating-point arithmetic.
>> >>
>> >> The SVE intrinsics use a three stage process:
>> >>
>> >> - each function is classified into one of several groups
>> >> - each group has a set of flags that describe what functions in the
>> >>   group can do
>> >> - these flags get converted into attributes based on the current
>> >>   command-line options
>> >>
>> >> I guess we should have something similar for the arm_neon.h built-ins.
>> >>
>> >> If you're willing to help fix this, that'd be great.  I think a first
>> >> step would be to agree a design.
>> >>
>> >> Thanks,
>> >> Richard
>> >
>> > I'd like to have a try.
>> 
>> Great!
>> 
>> > I have checked the steps in SVE intrinsics.
>> > It defines a base class "function_base" and derives different classes
>> > to describe several intrinsics for each.  And each class may
>> > have its own unique flags described in virtual function "call_properties".
>> > The specific attributes will be converted from these flags in
>> > "get_attributes" later.
>> >
>> > I find that there are more than 100 classes in total and if I only
>> > need to classify them into different groups by attributes, maybe
>> > we does not need so many classes?
>> 
>> Yeah, I agree.
>> 
>> Long term, there might be value in defining arm_neon.h in a similar
>> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
>> a special compiler pragma.  But that's going to be a lot of work.
>> 
>> I think it's possible to make incremental improvements to the current
>> arm_neon.h implementation without that work being thrown away if we
>> ever
>> did switch to a pragma in future.  And the incremental approach seems
>> more practical.
>> 
>> > The difficult thing I think is how to classify neon intrinsics into
>> > different groups.  I'm going to follow up the way in SVE intrinsics
>> > first now.
>> 
>> For now I'd suggest just giving a name to each combination of flags
>> that the intrinsics need, rather than splitting instructions in a
>> more fine-grained way.  (It's not at all obvious from the final state
>> of the SVE code, but even there, the idea was to have as few groups as
>> possible.  I.e. the groups were supposedly only split where necessary.
>> As you say, there still ended up being a lot of groups in the end…)
>> 
>> It'd be easier to review if the work was split up into smaller steps.
>> E.g. maybe one way would be this, with each number being a single
>> patch:
>> 
>> (1) (a) Add a flags field to the built-in function definitions
>>         that for now is always zero.
>>     (b) Pick a name N to describe the most conservative set of flags.
>>     (c) Make every built-in function definition use N.
>> 
>
> I have finished the first part.
>
> (a) I add a new parameter called FLAG to every built-in function macro.
>
> (b) I define some flags in aarch64-builtins.c
> FLAG_NONE for no needed flags
> FLAG_READ_FPCR for functions will read FPCR register
> FLAG_RAISE_FP_EXCEPTIONS for functions will raise fp exceptions
> FLAG_READ_MEMORY for functions will read global memory
> FLAG_PREFETCH_MEMORY for functions will prefetch data to memory
> FLAG_WRITE_MEMORY for functions will write global memory
>
> FLAG_FP is used for floating-point arithmetic
> FLAG_ALL is all flags above
>
> (c) I add a field in struct aarch64_simd_builtin_datum to record flags
> for each built-in function.  But the default flags I set for built-in functions
> are FLAG_ALL because by default the built-in functions might do anything.
>
> And bootstrap and regression are tested ok on aarch64 Linux platform.

This looks great.

The patch is OK for trunk, but could you send a changelog too,
so that I can include it in the commit message?

Thanks,
Richard
xiezhiheng July 16, 2020, 2:05 p.m. UTC | #8
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Thursday, July 16, 2020 8:42 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Tuesday, July 7, 2020 10:08 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> >> -----Original Message-----
> >> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> >> Sent: Monday, July 6, 2020 5:31 PM
> >> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> >> gcc-patches@gcc.gnu.org
> >> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp
> instructions
> >> >> emitted at -O3
> >> >>
> >> >> No, this is unfortunately a known bug.  See:
> >> >>
> >> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
> >> >>
> >> >> (Although the PR is recent, it's been a known bug for longer.)
> >> >>
> >> >> As you say, the difficulty is that the correct attributes depend on what
> >> >> the built-in function does.  Most integer arithmetic is “const”, but
> >> things
> >> >> get more complicated for floating-point arithmetic.
> >> >>
> >> >> The SVE intrinsics use a three stage process:
> >> >>
> >> >> - each function is classified into one of several groups
> >> >> - each group has a set of flags that describe what functions in the
> >> >>   group can do
> >> >> - these flags get converted into attributes based on the current
> >> >>   command-line options
> >> >>
> >> >> I guess we should have something similar for the arm_neon.h built-ins.
> >> >>
> >> >> If you're willing to help fix this, that'd be great.  I think a first
> >> >> step would be to agree a design.
> >> >>
> >> >> Thanks,
> >> >> Richard
> >> >
> >> > I'd like to have a try.
> >>
> >> Great!
> >>
> >> > I have checked the steps in SVE intrinsics.
> >> > It defines a base class "function_base" and derives different classes
> >> > to describe several intrinsics for each.  And each class may
> >> > have its own unique flags described in virtual function "call_properties".
> >> > The specific attributes will be converted from these flags in
> >> > "get_attributes" later.
> >> >
> >> > I find that there are more than 100 classes in total and if I only
> >> > need to classify them into different groups by attributes, maybe
> >> > we does not need so many classes?
> >>
> >> Yeah, I agree.
> >>
> >> Long term, there might be value in defining arm_neon.h in a similar
> >> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
> >> a special compiler pragma.  But that's going to be a lot of work.
> >>
> >> I think it's possible to make incremental improvements to the current
> >> arm_neon.h implementation without that work being thrown away if we
> >> ever
> >> did switch to a pragma in future.  And the incremental approach seems
> >> more practical.
> >>
> >> > The difficult thing I think is how to classify neon intrinsics into
> >> > different groups.  I'm going to follow up the way in SVE intrinsics
> >> > first now.
> >>
> >> For now I'd suggest just giving a name to each combination of flags
> >> that the intrinsics need, rather than splitting instructions in a
> >> more fine-grained way.  (It's not at all obvious from the final state
> >> of the SVE code, but even there, the idea was to have as few groups as
> >> possible.  I.e. the groups were supposedly only split where necessary.
> >> As you say, there still ended up being a lot of groups in the end…)
> >>
> >> It'd be easier to review if the work was split up into smaller steps.
> >> E.g. maybe one way would be this, with each number being a single
> >> patch:
> >>
> >> (1) (a) Add a flags field to the built-in function definitions
> >>         that for now is always zero.
> >>     (b) Pick a name N to describe the most conservative set of flags.
> >>     (c) Make every built-in function definition use N.
> >>
> >
> > I have finished the first part.
> >
> > (a) I add a new parameter called FLAG to every built-in function macro.
> >
> > (b) I define some flags in aarch64-builtins.c
> > FLAG_NONE for no needed flags
> > FLAG_READ_FPCR for functions will read FPCR register
> > FLAG_RAISE_FP_EXCEPTIONS for functions will raise fp exceptions
> > FLAG_READ_MEMORY for functions will read global memory
> > FLAG_PREFETCH_MEMORY for functions will prefetch data to memory
> > FLAG_WRITE_MEMORY for functions will write global memory
> >
> > FLAG_FP is used for floating-point arithmetic
> > FLAG_ALL is all flags above
> >
> > (c) I add a field in struct aarch64_simd_builtin_datum to record flags
> > for each built-in function.  But the default flags I set for built-in functions
> > are FLAG_ALL because by default the built-in functions might do anything.
> >
> > And bootstrap and regression are tested ok on aarch64 Linux platform.
> 
> This looks great.
> 
> The patch is OK for trunk, but could you send a changelog too,
> so that I can include it in the commit message?
> 
> Thanks,
> Richard

OK, and I add the git commit msg in patch.

Thanks,
XieZhiheng

+2020-07-16  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	PR tree-optimization/94442
+	* config/aarch64/aarch64-builtins.c (enum aarch64_type_qualifiers):
+	Add new field flags.
+	(VAR1): Add new field FLAG in macro.
+	(VAR2): Likewise.
+	(VAR3): Likewise.
+	(VAR4): Likewise.
+	(VAR5): Likewise.
+	(VAR6): Likewise.
+	(VAR7): Likewise.
+	(VAR8): Likewise.
+	(VAR9): Likewise.
+	(VAR10): Likewise.
+	(VAR11): Likewise.
+	(VAR12): Likewise.
+	(VAR13): Likewise.
+	(VAR14): Likewise.
+	(VAR15): Likewise.
+	(VAR16): Likewise.
+	(aarch64_general_fold_builtin): Likewise.
+	(aarch64_general_gimple_fold_builtin): Likewise.
+	* config/aarch64/aarch64-simd-builtins.def: Add default flag for
+	each built-in function.
+	* config/aarch64/geniterators.sh: Add new field in BUILTIN macro.
+
Richard Sandiford July 17, 2020, 9:03 a.m. UTC | #9
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Thursday, July 16, 2020 8:42 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Tuesday, July 7, 2020 10:08 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> >> -----Original Message-----
>> >> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> >> Sent: Monday, July 6, 2020 5:31 PM
>> >> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> >> gcc-patches@gcc.gnu.org
>> >> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp
>> instructions
>> >> >> emitted at -O3
>> >> >>
>> >> >> No, this is unfortunately a known bug.  See:
>> >> >>
>> >> >>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95964
>> >> >>
>> >> >> (Although the PR is recent, it's been a known bug for longer.)
>> >> >>
>> >> >> As you say, the difficulty is that the correct attributes depend on what
>> >> >> the built-in function does.  Most integer arithmetic is “const”, but
>> >> things
>> >> >> get more complicated for floating-point arithmetic.
>> >> >>
>> >> >> The SVE intrinsics use a three stage process:
>> >> >>
>> >> >> - each function is classified into one of several groups
>> >> >> - each group has a set of flags that describe what functions in the
>> >> >>   group can do
>> >> >> - these flags get converted into attributes based on the current
>> >> >>   command-line options
>> >> >>
>> >> >> I guess we should have something similar for the arm_neon.h built-ins.
>> >> >>
>> >> >> If you're willing to help fix this, that'd be great.  I think a first
>> >> >> step would be to agree a design.
>> >> >>
>> >> >> Thanks,
>> >> >> Richard
>> >> >
>> >> > I'd like to have a try.
>> >>
>> >> Great!
>> >>
>> >> > I have checked the steps in SVE intrinsics.
>> >> > It defines a base class "function_base" and derives different classes
>> >> > to describe several intrinsics for each.  And each class may
>> >> > have its own unique flags described in virtual function "call_properties".
>> >> > The specific attributes will be converted from these flags in
>> >> > "get_attributes" later.
>> >> >
>> >> > I find that there are more than 100 classes in total and if I only
>> >> > need to classify them into different groups by attributes, maybe
>> >> > we does not need so many classes?
>> >>
>> >> Yeah, I agree.
>> >>
>> >> Long term, there might be value in defining arm_neon.h in a similar
>> >> way to arm_sve.h: i.e. have arm_neon.h defer most of the work to
>> >> a special compiler pragma.  But that's going to be a lot of work.
>> >>
>> >> I think it's possible to make incremental improvements to the current
>> >> arm_neon.h implementation without that work being thrown away if we
>> >> ever
>> >> did switch to a pragma in future.  And the incremental approach seems
>> >> more practical.
>> >>
>> >> > The difficult thing I think is how to classify neon intrinsics into
>> >> > different groups.  I'm going to follow up the way in SVE intrinsics
>> >> > first now.
>> >>
>> >> For now I'd suggest just giving a name to each combination of flags
>> >> that the intrinsics need, rather than splitting instructions in a
>> >> more fine-grained way.  (It's not at all obvious from the final state
>> >> of the SVE code, but even there, the idea was to have as few groups as
>> >> possible.  I.e. the groups were supposedly only split where necessary.
>> >> As you say, there still ended up being a lot of groups in the end…)
>> >>
>> >> It'd be easier to review if the work was split up into smaller steps.
>> >> E.g. maybe one way would be this, with each number being a single
>> >> patch:
>> >>
>> >> (1) (a) Add a flags field to the built-in function definitions
>> >>         that for now is always zero.
>> >>     (b) Pick a name N to describe the most conservative set of flags.
>> >>     (c) Make every built-in function definition use N.
>> >>
>> >
>> > I have finished the first part.
>> >
>> > (a) I add a new parameter called FLAG to every built-in function macro.
>> >
>> > (b) I define some flags in aarch64-builtins.c
>> > FLAG_NONE for no needed flags
>> > FLAG_READ_FPCR for functions will read FPCR register
>> > FLAG_RAISE_FP_EXCEPTIONS for functions will raise fp exceptions
>> > FLAG_READ_MEMORY for functions will read global memory
>> > FLAG_PREFETCH_MEMORY for functions will prefetch data to memory
>> > FLAG_WRITE_MEMORY for functions will write global memory
>> >
>> > FLAG_FP is used for floating-point arithmetic
>> > FLAG_ALL is all flags above
>> >
>> > (c) I add a field in struct aarch64_simd_builtin_datum to record flags
>> > for each built-in function.  But the default flags I set for built-in functions
>> > are FLAG_ALL because by default the built-in functions might do anything.
>> >
>> > And bootstrap and regression are tested ok on aarch64 Linux platform.
>> 
>> This looks great.
>> 
>> The patch is OK for trunk, but could you send a changelog too,
>> so that I can include it in the commit message?
>> 
>> Thanks,
>> Richard
>
> OK, and I add the git commit msg in patch.

Thanks, pushed to master.

Richard
xiezhiheng July 30, 2020, 2:43 a.m. UTC | #10
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Friday, July 17, 2020 5:04 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
>

Cut...

> 
> Thanks, pushed to master.
> 
> Richard

And I have finished the second part.

In function aarch64_general_add_builtin, I add an argument ATTRS to
pass attributes for each built-in function.

And some new functions are added:
aarch64_call_properties: return flags for each built-in function based
on command-line options.  When the built-in function handles
floating-points, add FLAG_FP flag.

aarch64_modifies_global_state_p: True if the function would modify
global states.

aarch64_reads_global_state_p: True if the function would read
global states.

aarch64_could_trap_p: True if the function would raise a signal.

aarch64_add_attribute: Add attributes in ATTRS.

aarch64_get_attributes: return attributes for each built-in functons
based on flags and command-line options.

In function aarch64_init_simd_builtins, attributes are get by flags
and pass them to function aarch64_general_add_builtin.


Bootstrap is tested OK on aarch64 Linux platform, but regression
FAIL one test case ---- pr93423.f90.
However, I found that this test case would fail randomly in trunk.
  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423
  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041
Some PRs have tracked it.  After my patch, this test case would
always fail.  I guess the syntax errors in fortran crash some structures
result in illegal memory access but I can't find what exactly it is.
But I think my patch should have no influence on it.

Have some further suggestions?

Thanks,
Xiezhiheng



diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 871b97c8543..8882ec1d59a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,15 @@
+2020-07-30  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_general_add_builtin):
+	Add new argument ATTRS.
+	(aarch64_call_properties): New function.
+	(aarch64_modifies_global_state_p): Likewise.
+	(aarch64_reads_global_state_p): Likewise.
+	(aarch64_could_trap_p): Likewise.
+	(aarch64_add_attribute): Likewise.
+	(aarch64_get_attributes): Likewise.
+	(aarch64_init_simd_builtins): Add attributes for each built-in function.
+
Richard Sandiford July 31, 2020, 9:02 a.m. UTC | #11
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Friday, July 17, 2020 5:04 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>>
>
> Cut...
>
>> 
>> Thanks, pushed to master.
>> 
>> Richard
>
> And I have finished the second part.
>
> In function aarch64_general_add_builtin, I add an argument ATTRS to
> pass attributes for each built-in function.
>
> And some new functions are added:
> aarch64_call_properties: return flags for each built-in function based
> on command-line options.  When the built-in function handles
> floating-points, add FLAG_FP flag.
>
> aarch64_modifies_global_state_p: True if the function would modify
> global states.
>
> aarch64_reads_global_state_p: True if the function would read
> global states.
>
> aarch64_could_trap_p: True if the function would raise a signal.
>
> aarch64_add_attribute: Add attributes in ATTRS.
>
> aarch64_get_attributes: return attributes for each built-in functons
> based on flags and command-line options.
>
> In function aarch64_init_simd_builtins, attributes are get by flags
> and pass them to function aarch64_general_add_builtin.
>
>
> Bootstrap is tested OK on aarch64 Linux platform, but regression
> FAIL one test case ---- pr93423.f90.
> However, I found that this test case would fail randomly in trunk.
>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423
>   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041
> Some PRs have tracked it.  After my patch, this test case would
> always fail.  I guess the syntax errors in fortran crash some structures
> result in illegal memory access but I can't find what exactly it is.
> But I think my patch should have no influence on it.

Yeah, I agree.  And FWIW, I didn't see this in my testing.

I've pushed the patch with one trivial change: to remove the “and”
before “CODE” in:

>  /* Wrapper around add_builtin_function.  NAME is the name of the built-in
>     function, TYPE is the function type, and CODE is the function subcode
> -   (relative to AARCH64_BUILTIN_GENERAL).  */
> +   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function
> +   attributes.  */

BTW, one thing to be careful of in future is that not all FP intrinsics
raise FP exceptions.  So while:

> +  switch (d->mode)
> +    {
> +    /* Floating-point.  */
> +    case E_BFmode:
> +    case E_V4BFmode:
> +    case E_V8BFmode:
> +    case E_HFmode:
> +    case E_V4HFmode:
> +    case E_V8HFmode:
> +    case E_SFmode:
> +    case E_V2SFmode:
> +    case E_V4SFmode:
> +    case E_DFmode:
> +    case E_V1DFmode:
> +    case E_V2DFmode:
> +      flags |= FLAG_FP;
> +      break;
> +
> +    default:
> +      break;
> +    }

is a good, conservatively-correct default, we might need an additional
flag to suppress it for certain intrinsics.

I've just realised that the code above could have used FLOAT_MODE_P,
but I didn't think of that before pushing the patch :-)

Thanks,
Richard
xiezhiheng Aug. 3, 2020, 2:21 a.m. UTC | #12
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Friday, July 31, 2020 5:03 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Friday, July 17, 2020 5:04 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >
> > Cut...
> >
> >>
> >> Thanks, pushed to master.
> >>
> >> Richard
> >
> > And I have finished the second part.
> >
> > In function aarch64_general_add_builtin, I add an argument ATTRS to
> > pass attributes for each built-in function.
> >
> > And some new functions are added:
> > aarch64_call_properties: return flags for each built-in function based
> > on command-line options.  When the built-in function handles
> > floating-points, add FLAG_FP flag.
> >
> > aarch64_modifies_global_state_p: True if the function would modify
> > global states.
> >
> > aarch64_reads_global_state_p: True if the function would read
> > global states.
> >
> > aarch64_could_trap_p: True if the function would raise a signal.
> >
> > aarch64_add_attribute: Add attributes in ATTRS.
> >
> > aarch64_get_attributes: return attributes for each built-in functons
> > based on flags and command-line options.
> >
> > In function aarch64_init_simd_builtins, attributes are get by flags
> > and pass them to function aarch64_general_add_builtin.
> >
> >
> > Bootstrap is tested OK on aarch64 Linux platform, but regression
> > FAIL one test case ---- pr93423.f90.
> > However, I found that this test case would fail randomly in trunk.
> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423
> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041
> > Some PRs have tracked it.  After my patch, this test case would
> > always fail.  I guess the syntax errors in fortran crash some structures
> > result in illegal memory access but I can't find what exactly it is.
> > But I think my patch should have no influence on it.
> 
> Yeah, I agree.  And FWIW, I didn't see this in my testing.
> 
> I've pushed the patch with one trivial change: to remove the “and”
> before “CODE” in:
> 
> >  /* Wrapper around add_builtin_function.  NAME is the name of the
> built-in
> >     function, TYPE is the function type, and CODE is the function subcode
> > -   (relative to AARCH64_BUILTIN_GENERAL).  */
> > +   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function
> > +   attributes.  */
> 
> BTW, one thing to be careful of in future is that not all FP intrinsics
> raise FP exceptions.  So while:
> 
> > +  switch (d->mode)
> > +    {
> > +    /* Floating-point.  */
> > +    case E_BFmode:
> > +    case E_V4BFmode:
> > +    case E_V8BFmode:
> > +    case E_HFmode:
> > +    case E_V4HFmode:
> > +    case E_V8HFmode:
> > +    case E_SFmode:
> > +    case E_V2SFmode:
> > +    case E_V4SFmode:
> > +    case E_DFmode:
> > +    case E_V1DFmode:
> > +    case E_V2DFmode:
> > +      flags |= FLAG_FP;
> > +      break;
> > +
> > +    default:
> > +      break;
> > +    }
> 
> is a good, conservatively-correct default, we might need an additional
> flag to suppress it for certain intrinsics.
> 

I agree.

> I've just realised that the code above could have used FLOAT_MODE_P,
> but I didn't think of that before pushing the patch :-)
> 

Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P
macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.

Bootstrap and regression are tested ok on aarch64 Linux platform.

Thanks,
Xiezhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 83e41ff737e..a848b1f64f1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-08-03  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_call_properties):
+	Use FLOAT_MODE_P macro instead of enumerating all floating-point
+	modes and add global flag FLAG_SUPPRESS_FP_EXCEPTIONS.
+

> Thanks,
> Richard
Richard Sandiford Aug. 3, 2020, 1:55 p.m. UTC | #13
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Friday, July 31, 2020 5:03 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Friday, July 17, 2020 5:04 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >
>> > Cut...
>> >
>> >>
>> >> Thanks, pushed to master.
>> >>
>> >> Richard
>> >
>> > And I have finished the second part.
>> >
>> > In function aarch64_general_add_builtin, I add an argument ATTRS to
>> > pass attributes for each built-in function.
>> >
>> > And some new functions are added:
>> > aarch64_call_properties: return flags for each built-in function based
>> > on command-line options.  When the built-in function handles
>> > floating-points, add FLAG_FP flag.
>> >
>> > aarch64_modifies_global_state_p: True if the function would modify
>> > global states.
>> >
>> > aarch64_reads_global_state_p: True if the function would read
>> > global states.
>> >
>> > aarch64_could_trap_p: True if the function would raise a signal.
>> >
>> > aarch64_add_attribute: Add attributes in ATTRS.
>> >
>> > aarch64_get_attributes: return attributes for each built-in functons
>> > based on flags and command-line options.
>> >
>> > In function aarch64_init_simd_builtins, attributes are get by flags
>> > and pass them to function aarch64_general_add_builtin.
>> >
>> >
>> > Bootstrap is tested OK on aarch64 Linux platform, but regression
>> > FAIL one test case ---- pr93423.f90.
>> > However, I found that this test case would fail randomly in trunk.
>> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93423
>> >   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96041
>> > Some PRs have tracked it.  After my patch, this test case would
>> > always fail.  I guess the syntax errors in fortran crash some structures
>> > result in illegal memory access but I can't find what exactly it is.
>> > But I think my patch should have no influence on it.
>> 
>> Yeah, I agree.  And FWIW, I didn't see this in my testing.
>> 
>> I've pushed the patch with one trivial change: to remove the “and”
>> before “CODE” in:
>> 
>> >  /* Wrapper around add_builtin_function.  NAME is the name of the
>> built-in
>> >     function, TYPE is the function type, and CODE is the function subcode
>> > -   (relative to AARCH64_BUILTIN_GENERAL).  */
>> > +   (relative to AARCH64_BUILTIN_GENERAL), and ATTRS is the function
>> > +   attributes.  */
>> 
>> BTW, one thing to be careful of in future is that not all FP intrinsics
>> raise FP exceptions.  So while:
>> 
>> > +  switch (d->mode)
>> > +    {
>> > +    /* Floating-point.  */
>> > +    case E_BFmode:
>> > +    case E_V4BFmode:
>> > +    case E_V8BFmode:
>> > +    case E_HFmode:
>> > +    case E_V4HFmode:
>> > +    case E_V8HFmode:
>> > +    case E_SFmode:
>> > +    case E_V2SFmode:
>> > +    case E_V4SFmode:
>> > +    case E_DFmode:
>> > +    case E_V1DFmode:
>> > +    case E_V2DFmode:
>> > +      flags |= FLAG_FP;
>> > +      break;
>> > +
>> > +    default:
>> > +      break;
>> > +    }
>> 
>> is a good, conservatively-correct default, we might need an additional
>> flag to suppress it for certain intrinsics.
>> 
>
> I agree.
>
>> I've just realised that the code above could have used FLOAT_MODE_P,
>> but I didn't think of that before pushing the patch :-)
>> 
>
> Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P
> macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
> FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.

The same thing is true for reading FPCR as well, so I think the flag
should suppress the FLOAT_MODE_P check, instead of fixing up the flags
afterwards.

I'm struggling to think of a good name though.  How about adding
FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on FLAG_AUTO_FP
being set?

We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already
includes FLAG_FP.  Including it in FLAG_ALL wouldn't do no any harm
though.

Thanks,
Richard
xiezhiheng Aug. 4, 2020, 8:01 a.m. UTC | #14
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Monday, August 3, 2020 9:55 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 

Cut...

> >
> > Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P
> > macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
> > FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.
> 
> The same thing is true for reading FPCR as well, so I think the flag
> should suppress the FLOAT_MODE_P check, instead of fixing up the flags
> afterwards.
> 
> I'm struggling to think of a good name though.  How about adding
> FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on
> FLAG_AUTO_FP
> being set?
> 
> We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already
> includes FLAG_FP.  Including it in FLAG_ALL wouldn't do no any harm
> though.

I could not think of a better name either.  So I choose to use FLAG_AUTO_FP
to control the check of FLOAT_MODE_P finally.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
XieZhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b834a2c473a..f4a44704926 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-08-04  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_call_properties):
+	Use FLOAT_MODE_P macro instead of enumerating all floating-point
+	modes and add global flag FLAG_AUTO_FP.
+
Richard Sandiford Aug. 4, 2020, 4:25 p.m. UTC | #15
xiezhiheng <xiezhiheng@huawei.com> writes:
>> > Sorry, I should have used it.  And I prepare a patch to use FLOAT_MODE_P
>> > macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
>> > FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.
>> 
>> The same thing is true for reading FPCR as well, so I think the flag
>> should suppress the FLOAT_MODE_P check, instead of fixing up the flags
>> afterwards.
>> 
>> I'm struggling to think of a good name though.  How about adding
>> FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on
>> FLAG_AUTO_FP
>> being set?
>> 
>> We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already
>> includes FLAG_FP.  Including it in FLAG_ALL wouldn't do no any harm
>> though.
>
> I could not think of a better name either.  So I choose to use FLAG_AUTO_FP
> to control the check of FLOAT_MODE_P finally.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, pushed to master.

Richard
xiezhiheng Aug. 17, 2020, 8:05 a.m. UTC | #16
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Wednesday, August 5, 2020 12:26 AM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> > Sorry, I should have used it.  And I prepare a patch to use
> FLOAT_MODE_P
> >> > macro and add a flag FLAG_SUPPRESS_FP_EXCEPTIONS to suppress
> >> > FLAG_RAISE_FP_EXCEPTIONS for certain intrinsics in future.
> >>
> >> The same thing is true for reading FPCR as well, so I think the flag
> >> should suppress the FLOAT_MODE_P check, instead of fixing up the flags
> >> afterwards.
> >>
> >> I'm struggling to think of a good name though.  How about adding
> >> FLAG_AUTO_FP and making the FLOAT_MODE_P check dependent on
> >> FLAG_AUTO_FP
> >> being set?
> >>
> >> We could leave FLAG_AUTO_FP out of FLAG_ALL, since FLAG_ALL already
> >> includes FLAG_FP.  Including it in FLAG_ALL wouldn't do no any harm
> >> though.
> >
> > I could not think of a better name either.  So I choose to use
> FLAG_AUTO_FP
> > to control the check of FLOAT_MODE_P finally.
> >
> > Bootstrapped and tested on aarch64 Linux platform.
> 
> Thanks, pushed to master.
> 
> Richard

I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a try,
including all the add/sub arithmetic intrinsics.

Something like faddp intrinsic which only handles floating-point operations,
both FP and NONE flags are suitable for it because FLAG_FP will be added
later if the intrinsic handles floating-point operations.  And I prefer FP since
it would be more clear.

But for qadd intrinsics, they would modify FPSR register which is a scenario
I missed before.  And I consider to add an additional flag FLAG_WRITE_FPSR
to represent it.

Bootstrapped and tested on aarch64 Linux platform.

Have any suggestions?

Thanks,
XieZhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9cf1f9733e7..cde50c54d9e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2020-08-17  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c (aarch64_modifies_global_state_p):
+	Add flag FLAG_WRITE_FPSR to control attribtues.
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAGS
+	for intrinsic functions.
+
Richard Sandiford Aug. 19, 2020, 10:06 a.m. UTC | #17
xiezhiheng <xiezhiheng@huawei.com> writes:
> I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a try,
> including all the add/sub arithmetic intrinsics.
>
> Something like faddp intrinsic which only handles floating-point operations,
> both FP and NONE flags are suitable for it because FLAG_FP will be added
> later if the intrinsic handles floating-point operations.  And I prefer FP since
> it would be more clear.

Sounds good to me.

> But for qadd intrinsics, they would modify FPSR register which is a scenario
> I missed before.  And I consider to add an additional flag FLAG_WRITE_FPSR
> to represent it.

I don't think we make any attempt to guarantee that the Q flag is
meaningful after saturating intrinsics.  To do that, we'd need to model
the modification of the flag in the .md patterns too.

So my preference would be to leave this out and just use NONE for the
saturating forms too.

Thanks,
Richard
xiezhiheng Aug. 20, 2020, 8:24 a.m. UTC | #18
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Wednesday, August 19, 2020 6:06 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> > I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a try,
> > including all the add/sub arithmetic intrinsics.
> >
> > Something like faddp intrinsic which only handles floating-point operations,
> > both FP and NONE flags are suitable for it because FLAG_FP will be added
> > later if the intrinsic handles floating-point operations.  And I prefer FP
> since
> > it would be more clear.
> 
> Sounds good to me.
> 
> > But for qadd intrinsics, they would modify FPSR register which is a scenario
> > I missed before.  And I consider to add an additional flag
> FLAG_WRITE_FPSR
> > to represent it.
> 
> I don't think we make any attempt to guarantee that the Q flag is
> meaningful after saturating intrinsics.  To do that, we'd need to model
> the modification of the flag in the .md patterns too.
> 
> So my preference would be to leave this out and just use NONE for the
> saturating forms too.

The problem is that the test case in the attachment has different results under -O0 and -O2.

In gimple phase statement:
  _9 = __builtin_aarch64_uqaddv2si_uuu (op0_4, op1_6);
would be treated as dead code if we set NONE flag for saturating intrinsics.
Adding FLAG_WRITE_FPSR would help fix this problem.

Even when we set FLAG_WRITE_FPSR, the uqadd insn: 
  (insn 11 10 12 2 (set (reg:V2SI 97)
        (us_plus:V2SI (reg:V2SI 98)
            (reg:V2SI 99))) {aarch64_uqaddv2si}
     (nil))
could also be eliminated in RTL phase because this insn will be treated as dead insn.
So I think we might also need to modify saturating instruction patterns adding the side effect of set the FPSR register.

So if we could use NONE flag for saturating intrinsics, the description of function attributes and patterns are both incorrect. 
I think I can propose another patch to fix the patterns if you agree? 

Thanks,
Xie Zhiheng
#include <arm_neon.h>
#include <stdlib.h>

typedef union {
  struct {
    int _xxx:24;
    unsigned int FZ:1;
    unsigned int DN:1;
    unsigned int AHP:1;
    unsigned int QC:1;
    int V:1;
    int C:1;
    int Z:1;
    int N:1;
  } b;
  unsigned int word;
} _ARM_FPSCR;

static volatile int __read_neon_cumulative_sat (void) {
    _ARM_FPSCR _afpscr_for_qc;
    asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
    return _afpscr_for_qc.b.QC;
}

int main()
{
  uint32x2_t op0, op1, res;

  op0 = vdup_n_u32 ((uint32_t)0xfffffff0);
  op1 = vdup_n_u32 ((uint32_t)0x20);

  _ARM_FPSCR _afpscr_for_qc;
  asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
  _afpscr_for_qc.b.QC = (0);
  asm volatile ("msr fpsr,%0" :  : "r" (_afpscr_for_qc));

  res = vqadd_u32 (op0, op1);
  if (__read_neon_cumulative_sat () != 1)
    abort ();

  return 0;
}
Richard Sandiford Aug. 20, 2020, 8:55 a.m. UTC | #19
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Wednesday, August 19, 2020 6:06 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> > I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a try,
>> > including all the add/sub arithmetic intrinsics.
>> >
>> > Something like faddp intrinsic which only handles floating-point operations,
>> > both FP and NONE flags are suitable for it because FLAG_FP will be added
>> > later if the intrinsic handles floating-point operations.  And I prefer FP
>> since
>> > it would be more clear.
>> 
>> Sounds good to me.
>> 
>> > But for qadd intrinsics, they would modify FPSR register which is a scenario
>> > I missed before.  And I consider to add an additional flag
>> FLAG_WRITE_FPSR
>> > to represent it.
>> 
>> I don't think we make any attempt to guarantee that the Q flag is
>> meaningful after saturating intrinsics.  To do that, we'd need to model
>> the modification of the flag in the .md patterns too.
>> 
>> So my preference would be to leave this out and just use NONE for the
>> saturating forms too.
>
> The problem is that the test case in the attachment has different results under -O0 and -O2.

Right.  But my point was that I don't think that use case is supported.
If you want to use saturating instructions and read the Q flag afterwards,
the saturating instructions need to be inline asm too.

> In gimple phase statement:
>   _9 = __builtin_aarch64_uqaddv2si_uuu (op0_4, op1_6);
> would be treated as dead code if we set NONE flag for saturating intrinsics.
> Adding FLAG_WRITE_FPSR would help fix this problem.
>
> Even when we set FLAG_WRITE_FPSR, the uqadd insn: 
>   (insn 11 10 12 2 (set (reg:V2SI 97)
>         (us_plus:V2SI (reg:V2SI 98)
>             (reg:V2SI 99))) {aarch64_uqaddv2si}
>      (nil))
> could also be eliminated in RTL phase because this insn will be treated as dead insn.
> So I think we might also need to modify saturating instruction patterns adding the side effect of set the FPSR register.

The problem is that FPSR is global state and we don't in general
know who might read it.  So if we modelled the modification of the FPSR,
we'd never be able to fold away saturating arithmetic that does actually
saturate at compile time, because we'd never know whether the program
wanted the effect on the Q flag result to be visible (perhaps to another
function that the compiler can't see).  We'd also be unable to remove
results that really are dead.

So I think this is one of those situations in which we can't keep all
constituents happy.  Catering for people who want to read the Q flag
would make things worse for those who want saturating arithmetic to be
optimised as aggressively as possible.  And the same holds in reverse.

Thanks,
Richard

>
> So if we could use NONE flag for saturating intrinsics, the description of function attributes and patterns are both incorrect. 
> I think I can propose another patch to fix the patterns if you agree? 
>
> Thanks,
> Xie Zhiheng
>
> #include <arm_neon.h>
> #include <stdlib.h>
>
> typedef union {
>   struct {
>     int _xxx:24;
>     unsigned int FZ:1;
>     unsigned int DN:1;
>     unsigned int AHP:1;
>     unsigned int QC:1;
>     int V:1;
>     int C:1;
>     int Z:1;
>     int N:1;
>   } b;
>   unsigned int word;
> } _ARM_FPSCR;
>
> static volatile int __read_neon_cumulative_sat (void) {
>     _ARM_FPSCR _afpscr_for_qc;
>     asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
>     return _afpscr_for_qc.b.QC;
> }
>
> int main()
> {
>   uint32x2_t op0, op1, res;
>
>   op0 = vdup_n_u32 ((uint32_t)0xfffffff0);
>   op1 = vdup_n_u32 ((uint32_t)0x20);
>
>   _ARM_FPSCR _afpscr_for_qc;
>   asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc));
>   _afpscr_for_qc.b.QC = (0);
>   asm volatile ("msr fpsr,%0" :  : "r" (_afpscr_for_qc));
>
>   res = vqadd_u32 (op0, op1);
>   if (__read_neon_cumulative_sat () != 1)
>     abort ();
>
>   return 0;
> }
xiezhiheng Aug. 20, 2020, 12:16 p.m. UTC | #20
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Thursday, August 20, 2020 4:55 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Wednesday, August 19, 2020 6:06 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> > I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a
> try,
> >> > including all the add/sub arithmetic intrinsics.
> >> >
> >> > Something like faddp intrinsic which only handles floating-point
> operations,
> >> > both FP and NONE flags are suitable for it because FLAG_FP will be
> added
> >> > later if the intrinsic handles floating-point operations.  And I prefer FP
> >> since
> >> > it would be more clear.
> >>
> >> Sounds good to me.
> >>
> >> > But for qadd intrinsics, they would modify FPSR register which is a
> scenario
> >> > I missed before.  And I consider to add an additional flag
> >> FLAG_WRITE_FPSR
> >> > to represent it.
> >>
> >> I don't think we make any attempt to guarantee that the Q flag is
> >> meaningful after saturating intrinsics.  To do that, we'd need to model
> >> the modification of the flag in the .md patterns too.
> >>
> >> So my preference would be to leave this out and just use NONE for the
> >> saturating forms too.
> >
> > The problem is that the test case in the attachment has different results
> under -O0 and -O2.
> 
> Right.  But my point was that I don't think that use case is supported.
> If you want to use saturating instructions and read the Q flag afterwards,
> the saturating instructions need to be inline asm too.
> 
> > In gimple phase statement:
> >   _9 = __builtin_aarch64_uqaddv2si_uuu (op0_4, op1_6);
> > would be treated as dead code if we set NONE flag for saturating intrinsics.
> > Adding FLAG_WRITE_FPSR would help fix this problem.
> >
> > Even when we set FLAG_WRITE_FPSR, the uqadd insn:
> >   (insn 11 10 12 2 (set (reg:V2SI 97)
> >         (us_plus:V2SI (reg:V2SI 98)
> >             (reg:V2SI 99))) {aarch64_uqaddv2si}
> >      (nil))
> > could also be eliminated in RTL phase because this insn will be treated as
> dead insn.
> > So I think we might also need to modify saturating instruction patterns
> adding the side effect of set the FPSR register.
> 
> The problem is that FPSR is global state and we don't in general
> know who might read it.  So if we modelled the modification of the FPSR,
> we'd never be able to fold away saturating arithmetic that does actually
> saturate at compile time, because we'd never know whether the program
> wanted the effect on the Q flag result to be visible (perhaps to another
> function that the compiler can't see).  We'd also be unable to remove
> results that really are dead.
> 
> So I think this is one of those situations in which we can't keep all
> constituents happy.  Catering for people who want to read the Q flag
> would make things worse for those who want saturating arithmetic to be
> optimised as aggressively as possible.  And the same holds in reverse.

I agree.  The test case is extracted from gcc.target/aarch64/advsimd-intrinsics/vqadd.c
If we set NONE flag for saturating intrinsics, it would fail in regression because some qadd
intrinsics would be treated as dead code and be eliminated.
  Running target unix
  Running ./gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp ...
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O0  (test for excess errors)
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O0  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O1  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O1  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O3 -g  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O3 -g  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Os  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Os  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Og -g  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Og -g  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none  execution test
  PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects  (test for excess errors)
  FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects  execution test

So maybe this test case should only be tested at -O0 level?

Thanks,
Xie Zhiheng
Richard Sandiford Aug. 21, 2020, 9:02 a.m. UTC | #21
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Thursday, August 20, 2020 4:55 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Wednesday, August 19, 2020 6:06 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> > I add FLAGS for part of intrinsics in aarch64-simd-builtins.def first for a
>> try,
>> >> > including all the add/sub arithmetic intrinsics.
>> >> >
>> >> > Something like faddp intrinsic which only handles floating-point
>> operations,
>> >> > both FP and NONE flags are suitable for it because FLAG_FP will be
>> added
>> >> > later if the intrinsic handles floating-point operations.  And I prefer FP
>> >> since
>> >> > it would be more clear.
>> >>
>> >> Sounds good to me.
>> >>
>> >> > But for qadd intrinsics, they would modify FPSR register which is a
>> scenario
>> >> > I missed before.  And I consider to add an additional flag
>> >> FLAG_WRITE_FPSR
>> >> > to represent it.
>> >>
>> >> I don't think we make any attempt to guarantee that the Q flag is
>> >> meaningful after saturating intrinsics.  To do that, we'd need to model
>> >> the modification of the flag in the .md patterns too.
>> >>
>> >> So my preference would be to leave this out and just use NONE for the
>> >> saturating forms too.
>> >
>> > The problem is that the test case in the attachment has different results
>> under -O0 and -O2.
>> 
>> Right.  But my point was that I don't think that use case is supported.
>> If you want to use saturating instructions and read the Q flag afterwards,
>> the saturating instructions need to be inline asm too.
>> 
>> > In gimple phase statement:
>> >   _9 = __builtin_aarch64_uqaddv2si_uuu (op0_4, op1_6);
>> > would be treated as dead code if we set NONE flag for saturating intrinsics.
>> > Adding FLAG_WRITE_FPSR would help fix this problem.
>> >
>> > Even when we set FLAG_WRITE_FPSR, the uqadd insn:
>> >   (insn 11 10 12 2 (set (reg:V2SI 97)
>> >         (us_plus:V2SI (reg:V2SI 98)
>> >             (reg:V2SI 99))) {aarch64_uqaddv2si}
>> >      (nil))
>> > could also be eliminated in RTL phase because this insn will be treated as
>> dead insn.
>> > So I think we might also need to modify saturating instruction patterns
>> adding the side effect of set the FPSR register.
>> 
>> The problem is that FPSR is global state and we don't in general
>> know who might read it.  So if we modelled the modification of the FPSR,
>> we'd never be able to fold away saturating arithmetic that does actually
>> saturate at compile time, because we'd never know whether the program
>> wanted the effect on the Q flag result to be visible (perhaps to another
>> function that the compiler can't see).  We'd also be unable to remove
>> results that really are dead.
>> 
>> So I think this is one of those situations in which we can't keep all
>> constituents happy.  Catering for people who want to read the Q flag
>> would make things worse for those who want saturating arithmetic to be
>> optimised as aggressively as possible.  And the same holds in reverse.
>
> I agree.  The test case is extracted from gcc.target/aarch64/advsimd-intrinsics/vqadd.c
> If we set NONE flag for saturating intrinsics, it would fail in regression because some qadd
> intrinsics would be treated as dead code and be eliminated.
>   Running target unix
>   Running ./gcc.target/aarch64/advsimd-intrinsics/advsimd-intrinsics.exp ...
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O0  (test for excess errors)
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O0  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O1  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O1  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O3 -g  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O3 -g  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Os  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Os  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Og -g  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -Og -g  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fno-use-linker-plugin -flto-partition=none  execution test
>   PASS: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects  (test for excess errors)
>   FAIL: gcc.target/aarch64/advsimd-intrinsics/vqadd.c   -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects  execution test

Ah, OK.

> So maybe this test case should only be tested at -O0 level?

Looks like the saturating intrinsics might need a bit more thought.
Would you mind submitting the patch with just the other parts?
Those were uncontroversial and it would be a shame to hold them
up over this.

Thanks,
Richard
xiezhiheng Aug. 25, 2020, 3:14 a.m. UTC | #22
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Friday, August 21, 2020 5:02 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3

Cut...
 
> Looks like the saturating intrinsics might need a bit more thought.
> Would you mind submitting the patch with just the other parts?
> Those were uncontroversial and it would be a shame to hold them
> up over this.

Okay, I reorganized the existing patch and finished the first half of the intrinsics
except saturating intrinsics and load intrinsics.

Bootstrapped and tested on aarch64 Linux platform.

For load intrinsics, I have one problem when I set FLAG_READ_MEMORY for them,
some test cases like
gcc.target/aarch64/advsimd-intrinsics/vld2_lane_p8_indices_1.c
  #include <arm_neon.h>

  /* { dg-do compile } */
  /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */

  poly8x8x2_t
  f_vld2_lane_p8 (poly8_t * p, poly8x8x2_t v)
  {
    poly8x8x2_t res;
    /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
    res = vld2_lane_p8 (p, v, 8);
    /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
    res = vld2_lane_p8 (p, v, -1);
    return res;
  }
would fail in regression.  Because the first statement
  res = vld2_lane_p8 (p, v, 8);
would be eliminated as dead code in gimple phase but the error message is
generated in expand pass.  So I am going to replace the second statement
  res = vld2_lane_p8 (p, v, -1);
with
  res = vld2_lane_p8 (p, res, -1);
or do you have any other suggestions?

And for test case gcc.target/aarch64/arg-type-diagnostics-1.c, I return the result
to prevent the statement
  result = vrsra_n_s32 (arg1, arg2, a);
from being eliminated by treated as dead code.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 7a71b4367d4..217344d7d1f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-25  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAGS
+	for intrinsic functions.
+

diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index b9562e67883..e10bcc9b28a 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-25  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* gcc.target/aarch64/arg-type-diagnostics-1.c: Return result
+	to prevent statement from being eliminated.
+
Richard Sandiford Aug. 25, 2020, 11:07 a.m. UTC | #23
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Friday, August 21, 2020 5:02 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>
> Cut...
>  
>> Looks like the saturating intrinsics might need a bit more thought.
>> Would you mind submitting the patch with just the other parts?
>> Those were uncontroversial and it would be a shame to hold them
>> up over this.
>
> Okay, I reorganized the existing patch and finished the first half of the intrinsics
> except saturating intrinsics and load intrinsics.
>
> Bootstrapped and tested on aarch64 Linux platform.

I know this'll be frustrating, sorry, but could you post the
2020-08-17 patch without the saturation changes?  It's going to be
easier to track and review if each patch deals with similar intrinsics.
The non-saturating part of the 2020-08-17 patch was good because it was
dealing purely with arithmetic operations.  Loads should really be a
separate change.

BTW, for something like this, it's OK to test and submit several patches
at once, so separating the patches doesn't need to mean longer test cycles.
It's just that for review purposes, it's easier if one patch does one thing.

> For load intrinsics, I have one problem when I set FLAG_READ_MEMORY for them,
> some test cases like
> gcc.target/aarch64/advsimd-intrinsics/vld2_lane_p8_indices_1.c
>   #include <arm_neon.h>
>
>   /* { dg-do compile } */
>   /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
>
>   poly8x8x2_t
>   f_vld2_lane_p8 (poly8_t * p, poly8x8x2_t v)
>   {
>     poly8x8x2_t res;
>     /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
>     res = vld2_lane_p8 (p, v, 8);
>     /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
>     res = vld2_lane_p8 (p, v, -1);
>     return res;
>   }
> would fail in regression.  Because the first statement
>   res = vld2_lane_p8 (p, v, 8);
> would be eliminated as dead code in gimple phase but the error message is
> generated in expand pass.  So I am going to replace the second statement
>   res = vld2_lane_p8 (p, v, -1);
> with
>   res = vld2_lane_p8 (p, res, -1);
> or do you have any other suggestions?

The test is valid as-is, so it would be better not to change it.

I guess this means that we should leave the _lane loads and stores until
we implement the range checks in a different way.  This is somewhat
related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95969 ,
although your example shows that the “dummy const function” approach
might not work.

So to start with, could you just patch the non-lane loads?

> And for test case gcc.target/aarch64/arg-type-diagnostics-1.c, I return the result
> to prevent the statement
>   result = vrsra_n_s32 (arg1, arg2, a);
> from being eliminated by treated as dead code.

Hmm.  Here too I think the test is valid as-is.  I think we need
to ensure that the range check still happens even if the call is
dead code (similar to PR95969 above).

So I guess here too, it might be better to leave the _n forms to
a separate patch.

That doesn't mean we shouldn't fix the _lane and _n cases (or the
previous saturating cases).  It's just that each time we find a group
of functions that's awkward for some reason, it'd be better to deal
with those functions separately.

Thanks,
Richard
xiezhiheng Aug. 26, 2020, 1:39 a.m. UTC | #24
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, August 25, 2020 7:08 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Friday, August 21, 2020 5:02 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >
> > Cut...
> >
> >> Looks like the saturating intrinsics might need a bit more thought.
> >> Would you mind submitting the patch with just the other parts?
> >> Those were uncontroversial and it would be a shame to hold them
> >> up over this.
> >
> > Okay, I reorganized the existing patch and finished the first half of the
> intrinsics
> > except saturating intrinsics and load intrinsics.
> >
> > Bootstrapped and tested on aarch64 Linux platform.
> 
> I know this'll be frustrating, sorry, but could you post the
> 2020-08-17 patch without the saturation changes?  It's going to be
> easier to track and review if each patch deals with similar intrinsics.
> The non-saturating part of the 2020-08-17 patch was good because it was
> dealing purely with arithmetic operations.  Loads should really be a
> separate change.
> 
> BTW, for something like this, it's OK to test and submit several patches
> at once, so separating the patches doesn't need to mean longer test cycles.
> It's just that for review purposes, it's easier if one patch does one thing.
> 

That's true.  And I finished the patch to add FLAG for add/sub arithmetic
intrinsics except saturating intrinsics.  Later I will try to separate the rest
into several subsets to fix.

Bootstrapped and tested on aarch64 Linux platform.


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 7a71b4367d4..a93712ae0a5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-26  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for add/sub arithmetic intrinsics.
+

> > For load intrinsics, I have one problem when I set FLAG_READ_MEMORY
> for them,
> > some test cases like
> > gcc.target/aarch64/advsimd-intrinsics/vld2_lane_p8_indices_1.c
> >   #include <arm_neon.h>
> >
> >   /* { dg-do compile } */
> >   /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> >
> >   poly8x8x2_t
> >   f_vld2_lane_p8 (poly8_t * p, poly8x8x2_t v)
> >   {
> >     poly8x8x2_t res;
> >     /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
> >     res = vld2_lane_p8 (p, v, 8);
> >     /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
> >     res = vld2_lane_p8 (p, v, -1);
> >     return res;
> >   }
> > would fail in regression.  Because the first statement
> >   res = vld2_lane_p8 (p, v, 8);
> > would be eliminated as dead code in gimple phase but the error message is
> > generated in expand pass.  So I am going to replace the second statement
> >   res = vld2_lane_p8 (p, v, -1);
> > with
> >   res = vld2_lane_p8 (p, res, -1);
> > or do you have any other suggestions?
> 
> The test is valid as-is, so it would be better not to change it.
> 
> I guess this means that we should leave the _lane loads and stores until
> we implement the range checks in a different way.  This is somewhat
> related to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95969 ,
> although your example shows that the “dummy const function” approach
> might not work.
> 
> So to start with, could you just patch the non-lane loads?

Okay.

> 
> > And for test case gcc.target/aarch64/arg-type-diagnostics-1.c, I return the
> result
> > to prevent the statement
> >   result = vrsra_n_s32 (arg1, arg2, a);
> > from being eliminated by treated as dead code.
> 
> Hmm.  Here too I think the test is valid as-is.  I think we need
> to ensure that the range check still happens even if the call is
> dead code (similar to PR95969 above).

I agree.  That would be more reasonable.

> 
> So I guess here too, it might be better to leave the _n forms to
> a separate patch.
> 
> That doesn't mean we shouldn't fix the _lane and _n cases (or the
> previous saturating cases).  It's just that each time we find a group
> of functions that's awkward for some reason, it'd be better to deal
> with those functions separately.
> 
> Thanks,
> Richard
Richard Sandiford Aug. 26, 2020, 10:14 a.m. UTC | #25
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Tuesday, August 25, 2020 7:08 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Friday, August 21, 2020 5:02 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >
>> > Cut...
>> >
>> >> Looks like the saturating intrinsics might need a bit more thought.
>> >> Would you mind submitting the patch with just the other parts?
>> >> Those were uncontroversial and it would be a shame to hold them
>> >> up over this.
>> >
>> > Okay, I reorganized the existing patch and finished the first half of the
>> intrinsics
>> > except saturating intrinsics and load intrinsics.
>> >
>> > Bootstrapped and tested on aarch64 Linux platform.
>> 
>> I know this'll be frustrating, sorry, but could you post the
>> 2020-08-17 patch without the saturation changes?  It's going to be
>> easier to track and review if each patch deals with similar intrinsics.
>> The non-saturating part of the 2020-08-17 patch was good because it was
>> dealing purely with arithmetic operations.  Loads should really be a
>> separate change.
>> 
>> BTW, for something like this, it's OK to test and submit several patches
>> at once, so separating the patches doesn't need to mean longer test cycles.
>> It's just that for review purposes, it's easier if one patch does one thing.
>> 
>
> That's true.  And I finished the patch to add FLAG for add/sub arithmetic
> intrinsics except saturating intrinsics.  Later I will try to separate the rest
> into several subsets to fix.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, looks great.  Pushed to master.

Richard
xiezhiheng Aug. 27, 2020, 2:50 a.m. UTC | #26
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Wednesday, August 26, 2020 6:14 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 

Cut...

> 
> Thanks, looks great.  Pushed to master.
> 
> Richard

I made two separate patches for these two groups for review purposes.

Note: Patch for min/max intrinsics should be applied before the patch for rounding intrinsics

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index f6605eae08c..939aae71ecd 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-27  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for min/max intrinsics.
+


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index f6605eae08c..b0d3ec6cf19 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2020-08-27  Zhiheng Xie  <xiezhiheng@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for rounding intrinsics.
+
Richard Sandiford Aug. 27, 2020, 8:08 a.m. UTC | #27
xiezhiheng <xiezhiheng@huawei.com> writes:
> I made two separate patches for these two groups for review purposes.
>
> Note: Patch for min/max intrinsics should be applied before the patch for rounding intrinsics
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, LGTM.  Pushed to master.

Richard
xiezhiheng Oct. 9, 2020, 9:32 a.m. UTC | #28
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Thursday, August 27, 2020 4:08 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> > I made two separate patches for these two groups for review purposes.
> >
> > Note: Patch for min/max intrinsics should be applied before the patch for
> rounding intrinsics
> >
> > Bootstrapped and tested on aarch64 Linux platform.
> 
> Thanks, LGTM.  Pushed to master.
> 
> Richard

I made the patch for multiply and multiply accumulator intrinsics.

Note that bfmmlaq intrinsic is special because this instruction ignores the FPCR and does not update the FPSR exception status.
  https://developer.arm.com/docs/ddi0596/h/simd-and-floating-point-instructions-alphabetic-order/bfmmla-bfloat16-floating-point-matrix-multiply-accumulate-into-2x2-matrix
So I set it to the AUTO_FP flag.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 75b62b590e2..8ca9746189a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-09  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for mul/mla/mls intrinsics.
+
Richard Sandiford Oct. 13, 2020, 8:07 a.m. UTC | #29
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Thursday, August 27, 2020 4:08 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> > I made two separate patches for these two groups for review purposes.
>> >
>> > Note: Patch for min/max intrinsics should be applied before the patch for
>> rounding intrinsics
>> >
>> > Bootstrapped and tested on aarch64 Linux platform.
>> 
>> Thanks, LGTM.  Pushed to master.
>> 
>> Richard
>
> I made the patch for multiply and multiply accumulator intrinsics.
>
> Note that bfmmlaq intrinsic is special because this instruction ignores the FPCR and does not update the FPSR exception status.
>   https://developer.arm.com/docs/ddi0596/h/simd-and-floating-point-instructions-alphabetic-order/bfmmla-bfloat16-floating-point-matrix-multiply-accumulate-into-2x2-matrix
> So I set it to the AUTO_FP flag.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, LGTM.  Pushed to trunk.

Richard
xiezhiheng Oct. 19, 2020, 9:21 a.m. UTC | #30
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, October 13, 2020 4:08 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 

Cut ...

> 
> Thanks, LGTM.  Pushed to trunk.
> 

I made two separate patches for these two groups, get/set register intrinsics and store intrinsics.

Note: It does not matter which patch is applied first.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d1ce634eb2b..8828cc5929d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-19  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for get/set reg intrinsics.
+

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index d1ce634eb2b..bab5c1faf3c 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-19  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for store intrinsics.
+
Richard Sandiford Oct. 20, 2020, 4:53 p.m. UTC | #31
xiezhiheng <xiezhiheng@huawei.com> writes:
> I made two separate patches for these two groups, get/set register intrinsics and store intrinsics.
>
> Note: It does not matter which patch is applied first.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks.  I pushed the get/set patch.  For the store patch, I think
we should have:

const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY | FLAG_AUTO_FP;

since the FP forms don't (for example) read the FPCR.

Thanks,
Richard
xiezhiheng Oct. 22, 2020, 9:16 a.m. UTC | #32
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Wednesday, October 21, 2020 12:54 AM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> > I made two separate patches for these two groups, get/set register
> intrinsics and store intrinsics.
> >
> > Note: It does not matter which patch is applied first.
> >
> > Bootstrapped and tested on aarch64 Linux platform.
> 
> Thanks.  I pushed the get/set patch.  For the store patch, I think
> we should have:
> 
> const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY | FLAG_AUTO_FP;
> 
> since the FP forms don't (for example) read the FPCR.
> 

That's true.  I added FLAG_STORE for the store intrinsics and made the patch for them.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 59fa1ad4d5d..26edaa309c8 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2020-10-22  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-builtins.c: Add FLAG STORE.
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for store intrinsics.
+
Richard Sandiford Oct. 26, 2020, 1:03 p.m. UTC | #33
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Wednesday, October 21, 2020 12:54 AM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> > I made two separate patches for these two groups, get/set register
>> intrinsics and store intrinsics.
>> >
>> > Note: It does not matter which patch is applied first.
>> >
>> > Bootstrapped and tested on aarch64 Linux platform.
>> 
>> Thanks.  I pushed the get/set patch.  For the store patch, I think
>> we should have:
>> 
>> const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY | FLAG_AUTO_FP;
>> 
>> since the FP forms don't (for example) read the FPCR.
>> 
>
> That's true.  I added FLAG_STORE for the store intrinsics and made the patch for them.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, pushed to trunk.

Sorry for the delayed response.

Richard

>
> Thanks,
> Xie Zhiheng
>
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 59fa1ad4d5d..26edaa309c8 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,10 @@
> +2020-10-22  Zhiheng Xie  <xiezhiheng@huawei.com>
> +	    Nannan Zheng  <zhengnannan@huawei.com>
> +
> +	* config/aarch64/aarch64-builtins.c: Add FLAG STORE.
> +	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
> +	for store intrinsics.
> +
xiezhiheng Oct. 30, 2020, 6:41 a.m. UTC | #34
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Monday, October 26, 2020 9:03 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
>
> Thanks, pushed to trunk.
>

Thanks, and I made the patch for float conversion intrinsics.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 58ed7b12850..af910066ba0 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-30  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for conversion intrinsics.
+
Richard Sandiford Oct. 30, 2020, 10:23 a.m. UTC | #35
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Monday, October 26, 2020 9:03 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>>
>> Thanks, pushed to trunk.
>>
>
> Thanks, and I made the patch for float conversion intrinsics.

LGTM, thanks.  Pushed.

Richard
xiezhiheng Nov. 3, 2020, 11:59 a.m. UTC | #36
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Friday, October 30, 2020 6:24 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Monday, October 26, 2020 9:03 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>;
> gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> Thanks, pushed to trunk.
> >>
> >
> > Thanks, and I made the patch for float conversion intrinsics.
> 
> LGTM, thanks.  Pushed.
> 

Thanks.  And I made two separate patches for these two groups, compare intrinsics
and encryption algorithm (AES/SHA/SM3/SM4) intrinsics.

Note: It does not matter which patch is applied first.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng



diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9f743ecc89a..ba5e3dc7c55 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-03  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for compare intrinsics.
+

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 9f743ecc89a..d6b943fc0df 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-03  Zhiheng Xie  <xiezhiheng@huawei.com>
+	    Nannan Zheng  <zhengnannan@huawei.com>
+
+	* config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+	for AES/SHA/SM3/SM4 intrinsics.
+
Richard Sandiford Nov. 3, 2020, 1:57 p.m. UTC | #37
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Friday, October 30, 2020 6:24 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Monday, October 26, 2020 9:03 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: Richard Biener <richard.guenther@gmail.com>;
>> gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> Thanks, pushed to trunk.
>> >>
>> >
>> > Thanks, and I made the patch for float conversion intrinsics.
>> 
>> LGTM, thanks.  Pushed.
>> 
>
> Thanks.  And I made two separate patches for these two groups, compare intrinsics
> and encryption algorithm (AES/SHA/SM3/SM4) intrinsics.
>
> Note: It does not matter which patch is applied first.
>
> Bootstrapped and tested on aarch64 Linux platform.

Thanks, I pushed both patches to trunk.

Richard
xiezhiheng Nov. 9, 2020, 3:27 a.m. UTC | #38
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, November 3, 2020 9:57 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> Thanks, I pushed both patches to trunk.
> 

Thanks.  And I made two separate patches for these two groups, tbl/tbx intrinsics and
the rest of the arithmetic operation intrinsics.

Note: It does not matter which patch is applied first.

Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index eab26b5f3a9..4f81c86fc76 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-09  Zhiheng Xie  <xiezhiheng@huawei.com>
+           Nannan Zheng  <zhengnannan@huawei.com>
+
+       * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+       for tbl/tbx intrinsics.
+

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index eab26b5f3a9..193fbe4cf7d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-09  Zhiheng Xie  <xiezhiheng@huawei.com>
+           Nannan Zheng  <zhengnannan@huawei.com>
+
+       * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+       for arithmetic operation intrinsics.
+
Richard Sandiford Nov. 10, 2020, 11:53 a.m. UTC | #39
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Tuesday, November 3, 2020 9:57 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> Thanks, I pushed both patches to trunk.
>> 
>
> Thanks.  And I made two separate patches for these two groups, tbl/tbx intrinsics and
> the rest of the arithmetic operation intrinsics.
>
> Note: It does not matter which patch is applied first.

I pushed the TBL/TBX one, but on the other patch:

> @@ -297,7 +297,7 @@
>    BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
>  
>    /* Implemented by aarch64_reduc_plus_<mode>.  */
> -  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
> +  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, FP)

This is defined for integer and FP modes, so I think it should be
NONE instead of FP.  We'll automatically add FLAGS_FP based on the
mode where necessary.

Otherwise it looks good, thanks.

Richard
xiezhiheng Nov. 11, 2020, 7:59 a.m. UTC | #40
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> Sent: Tuesday, November 10, 2020 7:54 PM
> To: xiezhiheng <xiezhiheng@huawei.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> emitted at -O3
> 
> xiezhiheng <xiezhiheng@huawei.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
> >> Sent: Tuesday, November 3, 2020 9:57 PM
> >> To: xiezhiheng <xiezhiheng@huawei.com>
> >> Cc: gcc-patches@gcc.gnu.org
> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
> >> emitted at -O3
> >>
> >> Thanks, I pushed both patches to trunk.
> >>
> >
> > Thanks.  And I made two separate patches for these two groups, tbl/tbx
> intrinsics and
> > the rest of the arithmetic operation intrinsics.
> >
> > Note: It does not matter which patch is applied first.
> 
> I pushed the TBL/TBX one, but on the other patch:
> 
> > @@ -297,7 +297,7 @@
> >    BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
> >
> >    /* Implemented by aarch64_reduc_plus_<mode>.  */
> > -  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
> > +  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, FP)
> 
> This is defined for integer and FP modes, so I think it should be
> NONE instead of FP.  We'll automatically add FLAGS_FP based on the
> mode where necessary.
> 

Sorry, and I have revised a new patch.
Bootstrapped and tested on aarch64 Linux platform.

Thanks,
Xie Zhiheng


diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 75092451216..d6a49d65214 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-11-11  Zhiheng Xie  <xiezhiheng@huawei.com>
+           Nannan Zheng  <zhengnannan@huawei.com>
+
+       * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
+       for arithmetic operation intrinsics.
+
Richard Sandiford Nov. 11, 2020, 10:59 a.m. UTC | #41
xiezhiheng <xiezhiheng@huawei.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> Sent: Tuesday, November 10, 2020 7:54 PM
>> To: xiezhiheng <xiezhiheng@huawei.com>
>> Cc: gcc-patches@gcc.gnu.org
>> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> emitted at -O3
>> 
>> xiezhiheng <xiezhiheng@huawei.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford [mailto:richard.sandiford@arm.com]
>> >> Sent: Tuesday, November 3, 2020 9:57 PM
>> >> To: xiezhiheng <xiezhiheng@huawei.com>
>> >> Cc: gcc-patches@gcc.gnu.org
>> >> Subject: Re: [PATCH PR94442] [AArch64] Redundant ldp/stp instructions
>> >> emitted at -O3
>> >>
>> >> Thanks, I pushed both patches to trunk.
>> >>
>> >
>> > Thanks.  And I made two separate patches for these two groups, tbl/tbx
>> intrinsics and
>> > the rest of the arithmetic operation intrinsics.
>> >
>> > Note: It does not matter which patch is applied first.
>> 
>> I pushed the TBL/TBX one, but on the other patch:
>> 
>> > @@ -297,7 +297,7 @@
>> >    BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0, ALL)
>> >
>> >    /* Implemented by aarch64_reduc_plus_<mode>.  */
>> > -  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, ALL)
>> > +  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10, FP)
>> 
>> This is defined for integer and FP modes, so I think it should be
>> NONE instead of FP.  We'll automatically add FLAGS_FP based on the
>> mode where necessary.
>> 
>
> Sorry, and I have revised a new patch.
> Bootstrapped and tested on aarch64 Linux platform.

LGTM, thanks.  Pushed to trunk.

Richard

> Thanks,
> Xie Zhiheng
>
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 75092451216..d6a49d65214 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,9 @@
> +2020-11-11  Zhiheng Xie  <xiezhiheng@huawei.com>
> +           Nannan Zheng  <zhengnannan@huawei.com>
> +
> +       * config/aarch64/aarch64-simd-builtins.def: Add proper FLAG
> +       for arithmetic operation intrinsics.
> +
diff mbox series

Patch

diff --git a/gcc/expr.c b/gcc/expr.c
index 3c68b0d754c..8cc18449a0c 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -7362,7 +7362,8 @@  tree
 get_inner_reference (tree exp, poly_int64_pod *pbitsize,
 		     poly_int64_pod *pbitpos, tree *poffset,
 		     machine_mode *pmode, int *punsignedp,
-		     int *preversep, int *pvolatilep)
+		     int *preversep, int *pvolatilep,
+		     bool include_memref_p)
 {
   tree size_tree = 0;
   machine_mode mode = VOIDmode;
@@ -7509,6 +7510,21 @@  get_inner_reference (tree exp, poly_int64_pod *pbitsize,
 		}
 	      exp = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
 	    }
+	  else if (include_memref_p
+		   && TREE_CODE (TREE_OPERAND (exp, 0)) == SSA_NAME)
+	    {
+	      tree off = TREE_OPERAND (exp, 1);
+	      if (!integer_zerop (off))
+		{
+		  poly_offset_int boff = mem_ref_offset (exp);
+		  boff <<= LOG2_BITS_PER_UNIT;
+		  bit_offset += boff;
+
+		  exp = build2 (MEM_REF, TREE_TYPE (exp),
+				TREE_OPERAND (exp, 0),
+				build_int_cst (TREE_TYPE (off), 0));
+		}
+	    }
 	  goto done;
 
 	default:
@@ -10786,7 +10802,7 @@  expand_expr_real_1 (tree exp, rtx target, machine_mode tmode,
 	int reversep, volatilep = 0, must_force_mem;
 	tree tem
 	  = get_inner_reference (exp, &bitsize, &bitpos, &offset, &mode1,
-				 &unsignedp, &reversep, &volatilep);
+				 &unsignedp, &reversep, &volatilep, true);
 	rtx orig_op0, memloc;
 	bool clear_mem_expr = false;
 
diff --git a/gcc/tree.h b/gcc/tree.h
index a74872f5f3e..7df0d15f7f9 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -6139,7 +6139,8 @@  extern bool complete_ctor_at_level_p (const_tree, HOST_WIDE_INT, const_tree);
    look for the ultimate containing object, which is returned and specify
    the access position and size.  */
 extern tree get_inner_reference (tree, poly_int64_pod *, poly_int64_pod *,
-				 tree *, machine_mode *, int *, int *, int *);
+				 tree *, machine_mode *, int *, int *, int *,
+				 bool = false);
 
 extern tree build_personality_function (const char *);