
[2/2] AArch64: Add better costing for vector constants and operations

Message ID patch-14774-tamar@arm.com
State New
Series [1/2] middle-end Teach CSE to be able to do vector extracts.

Commit Message

Tamar Christina Aug. 31, 2021, 1:30 p.m. UTC
Hi All,

This patch adds extended costing to cost the creation and manipulation of
constants.  The default values provided are based on architectural
expectations and each cost model can be individually tweaked as needed.

The changes in this patch cover:

* Construction of PARALLEL or CONST_VECTOR:
  Adds better costing for vectors of constants, based on the constant being
  created and the instruction that can be used to create it, e.g. a movi is
  cheaper than a literal load (see the sketch after this list).
* Construction of a vector through a vec_dup.
* Extraction of part of a vector using a vec_select.  In this part we had to
  make some opportunistic assumptions.  In particular we had to model extracting
  the high half of a register as being "free" in order to make fusion with
  NEON high-part instructions possible.  In the event that there is no <insn>2
  variant for the instruction, the select would still be cheaper than the load.
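
As a rough illustration of the first point, here is a plain NEON-intrinsics
sketch (not part of the patch or its testsuite): the two constants below
should not be costed the same, since the first fits a single MOVI while the
second generally becomes a load from the literal pool.

#include <arm_neon.h>

/* All lanes equal to 3: representable as one MOVI, so cheap.  */
uint32x4_t
cheap_constant (void)
{
  return vdupq_n_u32 (3);
}

/* Arbitrary lane values: typically an LDR from the constant pool.  */
uint32x4_t
expensive_constant (void)
{
  static const uint32_t lanes[4] = { 0x12345678, 0x9abcdef0,
                                     0x0fedcba9, 0x87654321 };
  return vld1q_u32 (lanes);
}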

Unfortunately, on AArch64 you need -O3 for this to kick in when using
intrinsics, until we fix vld1/2/3 to be gimple instead of RTL intrinsics.

This should also fix the stack allocations.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/arm/aarch-common-protos.h (struct vector_cost_table): Add
	movi, dup and extract costing fields.
	* config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
	thunderx_extra_costs, thunderx2t99_extra_costs,
	thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use
	them.
	* config/arm/aarch-cost-tables.h (generic_extra_costs,
	cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
	exynosm1_extra_costs, xgene1_extra_costs): Likewise.
	* config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
	* config/aarch64/aarch64.c (aarch64_simd_make_constant): Expose.
	(aarch64_rtx_costs): Add extra costs.
	(aarch64_simd_dup_constant): Support check only mode.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-cse-codegen.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644


--

Comments

Richard Sandiford Aug. 31, 2021, 3:13 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> @@ -13936,8 +13937,65 @@ cost_plus:
>  			     mode, MULT, 1, speed);
>            return true;
>          }
> +	break;
> +    case PARALLEL:
> +      /* Fall through */

Which code paths lead to getting a PARALLEL here?

> +    case CONST_VECTOR:
> +	{
> +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> +	  /* Not a valid const vector.  */
> +	  if (!gen_insn)
> +	    break;
>  
> -      /* Fall through.  */
> +	  switch (GET_CODE (gen_insn))
> +	  {
> +	  case CONST_VECTOR:
> +	    /* Load using MOVI/MVNI.  */
> +	    if (aarch64_simd_valid_immediate (x, NULL))
> +	      *cost += extra_cost->vect.movi;
> +	    else /* Load using constant pool.  */
> +	      *cost += extra_cost->ldst.load;
> +	    break;
> +	  /* Load using a DUP.  */
> +	  case VEC_DUPLICATE:
> +	    *cost += extra_cost->vect.dup;
> +	    break;

Does this trigger in practice?  The new check==true path (rightly) stops
the duplicated element from being forced into a register, but then
I would have expected:

rtx
gen_vec_duplicate (machine_mode mode, rtx x)
{
  if (valid_for_const_vector_p (mode, x))
    return gen_const_vec_duplicate (mode, x);
  return gen_rtx_VEC_DUPLICATE (mode, x);
}

to generate the original CONST_VECTOR again.

> +	  default:
> +	    *cost += extra_cost->ldst.load;
> +	    break;
> +	  }
> +	  return true;
> +	}
> +    case VEC_CONCAT:
> +	/* depending on the operation, either DUP or INS.
> +	   For now, keep default costing.  */
> +	break;
> +    case VEC_DUPLICATE:
> +	*cost += extra_cost->vect.dup;
> +	return true;
> +    case VEC_SELECT:
> +	{
> +	  /* cost subreg of 0 as free, otherwise as DUP */
> +	  rtx op1 = XEXP (x, 1);
> +	  int nelts;
> +	  if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
> +	      || (BYTES_BIG_ENDIAN
> +		  && GET_MODE_NUNITS (mode).is_constant(&nelts)
> +		  && INTVAL (op1) == nelts - 1))
> +	    ;
> +	  else if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> +	    ;
> +	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> +	  /* Selecting the high part is not technically free, but we lack
> +	     enough information to decide that here.  For instance selecting
> +	     the high-part of a vec_dup *is* free or to feed into any _high
> +	     instruction.   Both of which we can't really tell.  That said
> +	     have a better chance to optimize an dup vs multiple constants.  */
> +	    ;

Not sure about this.  We already try to detect the latter case
(_high instructions) via aarch64_strip_extend_vec_half.  We might
be missing some cases, but that still feels like the right way
to go IMO.

Selecting the high part of a vec_dup should get folded into
another vec_dup.

The lowpart bits look OK, but which paths call this function
without first simplifying the select to a subreg?  The subreg
is now the canonical form (thanks to r12-2288).

Thanks,
Richard
Tamar Christina Aug. 31, 2021, 3:47 p.m. UTC | #2
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > @@ -13936,8 +13937,65 @@ cost_plus:
> >  			     mode, MULT, 1, speed);
> >            return true;
> >          }
> > +	break;
> > +    case PARALLEL:
> > +      /* Fall through */
> 
> Which code paths lead to getting a PARALLEL here?

Hi,

Thanks for the review!

I added it for completeness because CSE treats a parallel and CONST_VECTOR as
equivalent when each entry in the parallel defines a constant.

> 
> > +    case CONST_VECTOR:
> > +	{
> > +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> > +	  /* Not a valid const vector.  */
> > +	  if (!gen_insn)
> > +	    break;
> >
> > -      /* Fall through.  */
> > +	  switch (GET_CODE (gen_insn))
> > +	  {
> > +	  case CONST_VECTOR:
> > +	    /* Load using MOVI/MVNI.  */
> > +	    if (aarch64_simd_valid_immediate (x, NULL))
> > +	      *cost += extra_cost->vect.movi;
> > +	    else /* Load using constant pool.  */
> > +	      *cost += extra_cost->ldst.load;
> > +	    break;
> > +	  /* Load using a DUP.  */
> > +	  case VEC_DUPLICATE:
> > +	    *cost += extra_cost->vect.dup;
> > +	    break;
> 
> Does this trigger in practice?  The new check==true path (rightly) stops the
> duplicated element from being forced into a register, but then I would have
> expected:
> 
> rtx
> gen_vec_duplicate (machine_mode mode, rtx x) {
>   if (valid_for_const_vector_p (mode, x))
>     return gen_const_vec_duplicate (mode, x);
>   return gen_rtx_VEC_DUPLICATE (mode, x); }
> 
> to generate the original CONST_VECTOR again.

Yes, but CSE is trying to see whether using a DUP is cheaper than another instruction.
Normal code won't hit this, but CSE is just costing all the different ways one can semantically
construct a vector; which RTL actually comes out of it depends on how it's folded, as you say.

> 
> > +	  default:
> > +	    *cost += extra_cost->ldst.load;
> > +	    break;
> > +	  }
> > +	  return true;
> > +	}
> > +    case VEC_CONCAT:
> > +	/* depending on the operation, either DUP or INS.
> > +	   For now, keep default costing.  */
> > +	break;
> > +    case VEC_DUPLICATE:
> > +	*cost += extra_cost->vect.dup;
> > +	return true;
> > +    case VEC_SELECT:
> > +	{
> > +	  /* cost subreg of 0 as free, otherwise as DUP */
> > +	  rtx op1 = XEXP (x, 1);
> > +	  int nelts;
> > +	  if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
> > +	      || (BYTES_BIG_ENDIAN
> > +		  && GET_MODE_NUNITS (mode).is_constant(&nelts)
> > +		  && INTVAL (op1) == nelts - 1))
> > +	    ;
> > +	  else if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> > +	    ;
> > +	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> > +	  /* Selecting the high part is not technically free, but we lack
> > +	     enough information to decide that here.  For instance selecting
> > +	     the high-part of a vec_dup *is* free or to feed into any _high
> > +	     instruction.   Both of which we can't really tell.  That said
> > +	     have a better chance to optimize an dup vs multiple constants.  */
> > +	    ;
> 
> Not sure about this.  We already try to detect the latter case (_high
> instructions) via aarch64_strip_extend_vec_half.  We might be missing some
> cases, but that still feels like the right way to go IMO.

That's a different problem from what I understand.  What this is trying to say is that
if you have a vector [x y a b] and you need the vector [x y], you can use the top part
of the original vector for this.

This is an approximation, because something that can be created with a movi is probably
cheaper to keep distinct if it's not going to be paired with a _high operation (since you will have a dup then).

The problem is that the front end has already split the two vectors into [x y a b] and [x y].
There's nothing else that tries to consolidate them back up if both survive.

As a consequence of this, the testcase test0 is not handled optimally.  It would instead create
2 vectors, both movi 0x3, just one being 64 bits and one being 128 bits.

So cse will not consolidate the vectors unless the cost of selecting it is cheaper than the movi,
and because movis are so cheap, the only cost that worked was 0.  But increasing the cost
of movis requires the costs of everything to be increased (including loads).

I preferred to 0 out the cost, because the worst that can happen is a dup instead of a movi,
and at best a dup instead of a load from a pool (if the constant is complicated).
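
To make that concrete, here is a hypothetical reconstruction in plain
intrinsics (not the actual test0): the same constant is wanted both as a
64-bit and as a 128-bit vector, and without a cheap high/low select CSE keeps
two separate movis.

#include <arm_neon.h>

uint32x4_t
f (uint32x4_t a, uint32x2_t b, uint32x2_t *out)
{
  uint32x4_t c4 = vdupq_n_u32 (3);  /* 128-bit movi 0x3.  */
  uint32x2_t c2 = vdup_n_u32 (3);   /* 64-bit movi 0x3; could instead reuse
                                       one half of c4 if the select is costed
                                       cheaply enough.  */
  *out = vadd_u32 (b, c2);
  return vaddq_u32 (a, c4);
}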

> 
> Selecting the high part of a vec_dup should get folded into another vec_dup.
> 
> The lowpart bits look OK, but which paths call this function without first
> simplifying the select to a subreg?  The subreg is now the canonical form
> (thanks to r12-2288).

The simplification will happen during folding in cse or in combine.  This costing happens before the folding,
when CSE is trying to decide whether to undo the front end's lowering of constants.

To do so it models the constants and the semantic operation required to extract them.  E.g. to get
2 out of [0 2 4 5] it would need a VEC_SELECT of lane 1.  And I don't treat the first element/bottom part
specially here; costing-wise they would be the same.
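
As a hedged example of the kind of situation being modelled (hypothetical,
not from the testsuite): the value 2 is live as a scalar and is also lane 1
of a vector constant, so CSE weighs a lane extract against keeping both
constants.

#include <arm_neon.h>

uint32_t
g (uint32x4_t a, uint32x4_t *out)
{
  static const uint32_t lanes[4] = { 0, 2, 4, 5 };
  uint32x4_t c = vld1q_u32 (lanes);  /* vector constant [0 2 4 5].  */
  *out = vaddq_u32 (a, c);
  return 2;                          /* scalar 2: could be extracted from
                                        lane 1 of c instead of being a
                                        separate immediate move.  */
}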

Regards,
Tamar
> 
> Thanks,
> Richard
Richard Sandiford Aug. 31, 2021, 4:07 p.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
>> 
>> Tamar Christina <tamar.christina@arm.com> writes:
>> > @@ -13936,8 +13937,65 @@ cost_plus:
>> >  			     mode, MULT, 1, speed);
>> >            return true;
>> >          }
>> > +	break;
>> > +    case PARALLEL:
>> > +      /* Fall through */
>> 
>> Which code paths lead to getting a PARALLEL here?
>
> Hi,
>
> Thanks for the review!
>
> I added it for completeness because CSE treats a parallel and CONST_VECTOR as
> equivalent when they each entry in the parallel defines a constant.

Could you test whether it ever triggers in practice though?
The code would be much simpler without it.

>> > +    case CONST_VECTOR:
>> > +	{
>> > +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
>> > +	  /* Not a valid const vector.  */
>> > +	  if (!gen_insn)
>> > +	    break;
>> >
>> > -      /* Fall through.  */
>> > +	  switch (GET_CODE (gen_insn))
>> > +	  {
>> > +	  case CONST_VECTOR:
>> > +	    /* Load using MOVI/MVNI.  */
>> > +	    if (aarch64_simd_valid_immediate (x, NULL))
>> > +	      *cost += extra_cost->vect.movi;
>> > +	    else /* Load using constant pool.  */
>> > +	      *cost += extra_cost->ldst.load;
>> > +	    break;
>> > +	  /* Load using a DUP.  */
>> > +	  case VEC_DUPLICATE:
>> > +	    *cost += extra_cost->vect.dup;
>> > +	    break;
>> 
>> Does this trigger in practice?  The new check==true path (rightly) stops the
>> duplicated element from being forced into a register, but then I would have
>> expected:
>> 
>> rtx
>> gen_vec_duplicate (machine_mode mode, rtx x) {
>>   if (valid_for_const_vector_p (mode, x))
>>     return gen_const_vec_duplicate (mode, x);
>>   return gen_rtx_VEC_DUPLICATE (mode, x); }
>> 
>> to generate the original CONST_VECTOR again.
>
> Yes, but CSE is trying to see whether using a DUP is cheaper than another instruction.
> Normal code won't hit this but CSE is just costing all the different ways one can semantically
> construct a vector, which RTL actually comes out of it depends on how it's folded as you say.

But what I mean is, you call:

	  rtx gen_insn = aarch64_simd_make_constant (x, true);
	  /* Not a valid const vector.  */
	  if (!gen_insn)
	    break;

where aarch64_simd_make_constant does:

  if (CONST_VECTOR_P (vals))
    const_vec = vals;
  else if (GET_CODE (vals) == PARALLEL)
    {
      /* A CONST_VECTOR must contain only CONST_INTs and
	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
	 Only store valid constants in a CONST_VECTOR.  */
      int n_elts = XVECLEN (vals, 0);
      for (i = 0; i < n_elts; ++i)
	{
	  rtx x = XVECEXP (vals, 0, i);
	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
	    n_const++;
	}
      if (n_const == n_elts)
	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
    }
  else
    gcc_unreachable ();

  if (const_vec != NULL_RTX
      && aarch64_simd_valid_immediate (const_vec, NULL))
    /* Load using MOVI/MVNI.  */
    return const_vec;
  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
    /* Loaded using DUP.  */
    return const_dup;

and aarch64_simd_dup_constant does:

  machine_mode mode = GET_MODE (vals);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  rtx x;

  if (!const_vec_duplicate_p (vals, &x))
    return NULL_RTX;

  /* We can load this constant by using DUP and a constant in a
     single ARM register.  This will be cheaper than a vector
     load.  */
  if (!check)
    x = copy_to_mode_reg (inner_mode, x);
  return gen_vec_duplicate (mode, x);

For the "check" case, "x" will be a constant, and so gen_vec_duplicate
will call gen_const_vec_duplicate, which will return a CONST_VECTOR.
It didn't seem to be possible for gen_insn to be a VEC_DUPLICATE.

This would be much simpler if we could call aarch64_simd_valid_immediate
and aarch64_simd_dup_constant directly from the rtx cost code, hence the
question about whether the PARALLEL stuff was really needed in practice.

>> > +	  default:
>> > +	    *cost += extra_cost->ldst.load;
>> > +	    break;
>> > +	  }
>> > +	  return true;
>> > +	}
>> > +    case VEC_CONCAT:
>> > +	/* depending on the operation, either DUP or INS.
>> > +	   For now, keep default costing.  */
>> > +	break;
>> > +    case VEC_DUPLICATE:
>> > +	*cost += extra_cost->vect.dup;
>> > +	return true;
>> > +    case VEC_SELECT:
>> > +	{
>> > +	  /* cost subreg of 0 as free, otherwise as DUP */
>> > +	  rtx op1 = XEXP (x, 1);
>> > +	  int nelts;
>> > +	  if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
>> > +	      || (BYTES_BIG_ENDIAN
>> > +		  && GET_MODE_NUNITS (mode).is_constant(&nelts)
>> > +		  && INTVAL (op1) == nelts - 1))
>> > +	    ;
>> > +	  else if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
>> > +	    ;
>> > +	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
>> > +	  /* Selecting the high part is not technically free, but we lack
>> > +	     enough information to decide that here.  For instance selecting
>> > +	     the high-part of a vec_dup *is* free or to feed into any _high
>> > +	     instruction.   Both of which we can't really tell.  That said
>> > +	     have a better chance to optimize an dup vs multiple constants.  */
>> > +	    ;
>> 
>> Not sure about this.  We already try to detect the latter case (_high
>> instructions) via aarch64_strip_extend_vec_half.  We might be missing some
>> cases, but that still feels like the right way to go IMO.
>
> That's a different problem from what I understand.  What this is trying to say is that
> If you have a vector [x y a b] and you need vector [x y] that you can use the top part
> of the original vector for this.
>
> This is an approximation, because something that can be created with a movi is probably
> Cheaper to keep distinct if it's not going to be paired with a _high operation (since you will have a dup then).
>
> The problem is that the front end has already spit the two Vectors into [x y a b] and [x y].
> There's nothing else that tries to consolidate them back up if both survive.
>
> As a consequence of this, the testcase test0 is not handled optimally.  It would instead create
> 2 vectors, both of movi 0x3, just one being 64-bits and one being 128-bits.
>
> So if the cost of selecting it is cheaper than the movi, cse will not consolidate the vectors,
> and because movi's are so cheap, the only cost that worked was 0.  But increasing the costs
> of movi's requires the costs of everything to be increased (including loads).
>
> I preferred to 0 out the cost, because the worst that can happen is an dup instead of a movi,
> And at best a dup instead of a load from a pool (if the constant is complicated).

Hmm, will need to look at this more tomorrow.

>> Selecting the high part of a vec_dup should get folded into another vec_dup.
>> 
>> The lowpart bits look OK, but which paths call this function without first
>> simplifying the select to a subreg?  The subreg is now the canonical form
>> (thanks to r12-2288).
>
> The simplification will happen during folding in cse or in combine.  This costing happens before the folding,
> When CSE is trying to decide whether to undo the front end's lowering of constants.
>
> To do so it models the constants and the semantic operation required to extract them. E.g. to get
> 2 out of [0 2 4 5] it would need a VEC_SELECT of 1. And I don't treat the first element/bottom part special
> Here.  Costing wise they would be the same.

But which code path creates the VEC_SELECT?  We don't need any context
to know that the VEC_SELECT is non-canonical.  It's obvious from the
operands of the VEC_SELECT in isolation.

I'd just rather tackle this at source than try to get the cost code
to handle non-canonical rtl.

Thanks,
Richard
Tamar Christina Aug. 31, 2021, 4:45 p.m. UTC | #4
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >>
> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> > @@ -13936,8 +13937,65 @@ cost_plus:
> >> >  			     mode, MULT, 1, speed);
> >> >            return true;
> >> >          }
> >> > +	break;
> >> > +    case PARALLEL:
> >> > +      /* Fall through */
> >>
> >> Which code paths lead to getting a PARALLEL here?
> >
> > Hi,
> >
> > Thanks for the review!
> >
> > I added it for completeness because CSE treats a parallel and
> > CONST_VECTOR as equivalent when they each entry in the parallel defines
> a constant.
> 
> Could you test whether it ever triggers in practice though?
> The code would be much simpler without it.

Will check 😊

> 
> >> > +    case CONST_VECTOR:
> >> > +	{
> >> > +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> >> > +	  /* Not a valid const vector.  */
> >> > +	  if (!gen_insn)
> >> > +	    break;
> >> >
> >> > -      /* Fall through.  */
> >> > +	  switch (GET_CODE (gen_insn))
> >> > +	  {
> >> > +	  case CONST_VECTOR:
> >> > +	    /* Load using MOVI/MVNI.  */
> >> > +	    if (aarch64_simd_valid_immediate (x, NULL))
> >> > +	      *cost += extra_cost->vect.movi;
> >> > +	    else /* Load using constant pool.  */
> >> > +	      *cost += extra_cost->ldst.load;
> >> > +	    break;
> >> > +	  /* Load using a DUP.  */
> >> > +	  case VEC_DUPLICATE:
> >> > +	    *cost += extra_cost->vect.dup;
> >> > +	    break;
> >>
> >> Does this trigger in practice?  The new check==true path (rightly)
> >> stops the duplicated element from being forced into a register, but
> >> then I would have
> >> expected:
> >>
> >> rtx
> >> gen_vec_duplicate (machine_mode mode, rtx x) {
> >>   if (valid_for_const_vector_p (mode, x))
> >>     return gen_const_vec_duplicate (mode, x);
> >>   return gen_rtx_VEC_DUPLICATE (mode, x); }
> >>
> >> to generate the original CONST_VECTOR again.
> >
> > Yes, but CSE is trying to see whether using a DUP is cheaper than another
> instruction.
> > Normal code won't hit this but CSE is just costing all the different
> > ways one can semantically construct a vector, which RTL actually comes out
> of it depends on how it's folded as you say.
> 
> But what I mean is, you call:
> 
> 	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> 	  /* Not a valid const vector.  */
> 	  if (!gen_insn)
> 	    break;
> 
> where aarch64_simd_make_constant does:
> 
>   if (CONST_VECTOR_P (vals))
>     const_vec = vals;
>   else if (GET_CODE (vals) == PARALLEL)
>     {
>       /* A CONST_VECTOR must contain only CONST_INTs and
> 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
> 	 Only store valid constants in a CONST_VECTOR.  */
>       int n_elts = XVECLEN (vals, 0);
>       for (i = 0; i < n_elts; ++i)
> 	{
> 	  rtx x = XVECEXP (vals, 0, i);
> 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
> 	    n_const++;
> 	}
>       if (n_const == n_elts)
> 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
>     }
>   else
>     gcc_unreachable ();
> 
>   if (const_vec != NULL_RTX
>       && aarch64_simd_valid_immediate (const_vec, NULL))
>     /* Load using MOVI/MVNI.  */
>     return const_vec;
>   else if ((const_dup = aarch64_simd_dup_constant (vals, check)) !=
> NULL_RTX)
>     /* Loaded using DUP.  */
>     return const_dup;
> 
> and aarch64_simd_dup_constant does:
> 
>   machine_mode mode = GET_MODE (vals);
>   machine_mode inner_mode = GET_MODE_INNER (mode);
>   rtx x;
> 
>   if (!const_vec_duplicate_p (vals, &x))
>     return NULL_RTX;
> 
>   /* We can load this constant by using DUP and a constant in a
>      single ARM register.  This will be cheaper than a vector
>      load.  */
>   if (!check)
>     x = copy_to_mode_reg (inner_mode, x);
>   return gen_vec_duplicate (mode, x);
> 
> For the "check" case, "x" will be a constant, and so gen_vec_duplicate will call
> gen_const_vec_duplicate, which will return a CONST_VECTOR.
> It didn't seem to be possible for gen_insn to be a VEC_DUPLICATE.
>

Yes, but CSE can ask the cost of a VEC_DUPLICATE directly on a register without going through gen_const_vec_duplicate,
which is intended, as the gen_ functions can have side effects (e.g. creating new pseudos etc.).

If, say, it sees a constant x and a vector [x x x x], it wants to know what the cost of keeping
x and materializing [x x x x] vs. doing a duplicate of x into [x x x x] is.

In this case, since both the constant and the vector are needed, you won't get a constant there but a register, so you'll actually see a
vec_dup. If CSE pushes in the constant that would defeat the point 😊. Right now it's CSE that's pushing constants of vec_dup into vec_constants.

My change is making it explicitly ask for the cost of doing this instead of assuming it is always cheaper, because for a large majority of
cases it's not actually cheaper and it is highly dependent on the target's ability to create said constant.

So this hook will see both versions, the dup of the register and the vec_constant, while CSE is trying to decide which one to keep.
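
For example (a sketch with a made-up function, not from the patch): the scalar
is needed in a general register anyway, so the hook gets asked both about the
dup of that register and about materializing the equivalent vector constant.

#include <arm_neon.h>

uint32x4_t
h (uint32_t *p)
{
  uint32_t x = 0x12345678;   /* needed as a scalar ...  */
  p[0] = x;
  return vdupq_n_u32 (x);    /* ... and as a vector: a dup of the scalar
                                register vs. a separate literal-pool load
                                of { x, x, x, x }.  */
}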

> This would be much simpler if we could call aarch64_simd_valid_immediate
> and aarch64_simd_dup_constant directly from the rtx cost code,

Agreed... I tried to separate them before, but the logic was annoying to split and I thought not worth the effort, so instead I just
changed it to have a checking only mode.

> hence the
> question about whether the PARALLEL stuff was really needed in practice.
> 
> >> > +	  default:
> >> > +	    *cost += extra_cost->ldst.load;
> >> > +	    break;
> >> > +	  }
> >> > +	  return true;
> >> > +	}
> >> > +    case VEC_CONCAT:
> >> > +	/* depending on the operation, either DUP or INS.
> >> > +	   For now, keep default costing.  */
> >> > +	break;
> >> > +    case VEC_DUPLICATE:
> >> > +	*cost += extra_cost->vect.dup;
> >> > +	return true;
> >> > +    case VEC_SELECT:
> >> > +	{
> >> > +	  /* cost subreg of 0 as free, otherwise as DUP */
> >> > +	  rtx op1 = XEXP (x, 1);
> >> > +	  int nelts;
> >> > +	  if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
> >> > +	      || (BYTES_BIG_ENDIAN
> >> > +		  && GET_MODE_NUNITS (mode).is_constant(&nelts)
> >> > +		  && INTVAL (op1) == nelts - 1))
> >> > +	    ;
> >> > +	  else if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> >> > +	    ;
> >> > +	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> >> > +	  /* Selecting the high part is not technically free, but we lack
> >> > +	     enough information to decide that here.  For instance selecting
> >> > +	     the high-part of a vec_dup *is* free or to feed into any _high
> >> > +	     instruction.   Both of which we can't really tell.  That said
> >> > +	     have a better chance to optimize an dup vs multiple constants.  */
> >> > +	    ;
> >>
> >> Not sure about this.  We already try to detect the latter case (_high
> >> instructions) via aarch64_strip_extend_vec_half.  We might be missing
> >> some cases, but that still feels like the right way to go IMO.
> >
> > That's a different problem from what I understand.  What this is
> > trying to say is that If you have a vector [x y a b] and you need
> > vector [x y] that you can use the top part of the original vector for this.
> >
> > This is an approximation, because something that can be created with a
> > movi is probably Cheaper to keep distinct if it's not going to be paired with a
> _high operation (since you will have a dup then).
> >
> > The problem is that the front end has already spit the two Vectors into [x y
> a b] and [x y].
> > There's nothing else that tries to consolidate them back up if both survive.
> >
> > As a consequence of this, the testcase test0 is not handled optimally.
> > It would instead create
> > 2 vectors, both of movi 0x3, just one being 64-bits and one being 128-bits.
> >
> > So if the cost of selecting it is cheaper than the movi, cse will not
> > consolidate the vectors, and because movi's are so cheap, the only
> > cost that worked was 0.  But increasing the costs of movi's requires the
> costs of everything to be increased (including loads).
> >
> > I preferred to 0 out the cost, because the worst that can happen is an
> > dup instead of a movi, And at best a dup instead of a load from a pool (if
> the constant is complicated).
> 
> Hmm, will need to look at this more tomorrow.
> 
> >> Selecting the high part of a vec_dup should get folded into another
> vec_dup.
> >>
> >> The lowpart bits look OK, but which paths call this function without
> >> first simplifying the select to a subreg?  The subreg is now the
> >> canonical form (thanks to r12-2288).
> >
> > The simplification will happen during folding in cse or in combine.
> > This costing happens before the folding, When CSE is trying to decide
> whether to undo the front end's lowering of constants.
> >
> > To do so it models the constants and the semantic operation required
> > to extract them. E.g. to get
> > 2 out of [0 2 4 5] it would need a VEC_SELECT of 1. And I don't treat
> > the first element/bottom part special Here.  Costing wise they would be
> the same.
> 
> But which code path creates the VEC_SELECT?  We don't need any context to
> know that the VEC_SELECT is non-canonical.  It's obvious from the operands
> of the VEC_SELECT in isolation.

The non-canonical RTL is never generated.  I assume we're talking about the 0 case here,
since subregs can't select arbitrary elements (as I asked before).

For the 0 case it's only temporarily modelled as such to keep the CSE alternative costing simple.
Currently it's just a for loop for i = 0 to vec_elems.

When it comes time to generate the actual insn, fold_rtx is called, which will fold the VEC_SELECT
into a subreg.

So it's never emitted into the instruction stream in its non-canonical form.

> 
> I'd just rather tackle this at source than try to get the cost code to handle
> non-canonical rtl.

If that's what is preferred I can change the CSE patch to generate a subreg for the 0 case.  I'm not sure I agree with it,
as CSE is just trying to ask "what is the cost of selecting element 0 in this vector".  And as I mentioned before,
it never emits the instruction unfolded.  This representation seems to me a more logical representation for costing.

It's unfortunate, however, that there's only one costing callback; as far as CSE is concerned the representation/form
doesn't matter, it's just looking at the high-level operation.

Or is the concern here that most targets will have costing for subreg 0 but not VEC_SELECT?  In which case, without
actually handling the costs of the other operations, the CSE changes won't do anything for those targets anyway.  And it would
be odd for a target to cost VEC_SELECT of lanes 1 to <N> but not cost lane 0 too.

Regards,
Tamar

> 
> Thanks,
> Richard
Richard Sandiford Aug. 31, 2021, 6:37 p.m. UTC | #5
Tamar Christina <Tamar.Christina@arm.com> writes:
>> 
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >>
>> >> Tamar Christina <tamar.christina@arm.com> writes:
>> >> > @@ -13936,8 +13937,65 @@ cost_plus:
>> >> >  			     mode, MULT, 1, speed);
>> >> >            return true;
>> >> >          }
>> >> > +	break;
>> >> > +    case PARALLEL:
>> >> > +      /* Fall through */
>> >>
>> >> Which code paths lead to getting a PARALLEL here?
>> >
>> > Hi,
>> >
>> > Thanks for the review!
>> >
>> > I added it for completeness because CSE treats a parallel and
>> > CONST_VECTOR as equivalent when they each entry in the parallel defines
>> a constant.
>> 
>> Could you test whether it ever triggers in practice though?
>> The code would be much simpler without it.
>
> Will check 😊
>
>> 
>> >> > +    case CONST_VECTOR:
>> >> > +	{
>> >> > +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
>> >> > +	  /* Not a valid const vector.  */
>> >> > +	  if (!gen_insn)
>> >> > +	    break;
>> >> >
>> >> > -      /* Fall through.  */
>> >> > +	  switch (GET_CODE (gen_insn))
>> >> > +	  {
>> >> > +	  case CONST_VECTOR:
>> >> > +	    /* Load using MOVI/MVNI.  */
>> >> > +	    if (aarch64_simd_valid_immediate (x, NULL))
>> >> > +	      *cost += extra_cost->vect.movi;
>> >> > +	    else /* Load using constant pool.  */
>> >> > +	      *cost += extra_cost->ldst.load;
>> >> > +	    break;
>> >> > +	  /* Load using a DUP.  */
>> >> > +	  case VEC_DUPLICATE:
>> >> > +	    *cost += extra_cost->vect.dup;
>> >> > +	    break;
>> >>
>> >> Does this trigger in practice?  The new check==true path (rightly)
>> >> stops the duplicated element from being forced into a register, but
>> >> then I would have
>> >> expected:
>> >>
>> >> rtx
>> >> gen_vec_duplicate (machine_mode mode, rtx x) {
>> >>   if (valid_for_const_vector_p (mode, x))
>> >>     return gen_const_vec_duplicate (mode, x);
>> >>   return gen_rtx_VEC_DUPLICATE (mode, x); }
>> >>
>> >> to generate the original CONST_VECTOR again.
>> >
>> > Yes, but CSE is trying to see whether using a DUP is cheaper than another
>> instruction.
>> > Normal code won't hit this but CSE is just costing all the different
>> > ways one can semantically construct a vector, which RTL actually comes out
>> of it depends on how it's folded as you say.
>> 
>> But what I mean is, you call:
>> 
>> 	  rtx gen_insn = aarch64_simd_make_constant (x, true);
>> 	  /* Not a valid const vector.  */
>> 	  if (!gen_insn)
>> 	    break;
>> 
>> where aarch64_simd_make_constant does:
>> 
>>   if (CONST_VECTOR_P (vals))
>>     const_vec = vals;
>>   else if (GET_CODE (vals) == PARALLEL)
>>     {
>>       /* A CONST_VECTOR must contain only CONST_INTs and
>> 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
>> 	 Only store valid constants in a CONST_VECTOR.  */
>>       int n_elts = XVECLEN (vals, 0);
>>       for (i = 0; i < n_elts; ++i)
>> 	{
>> 	  rtx x = XVECEXP (vals, 0, i);
>> 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
>> 	    n_const++;
>> 	}
>>       if (n_const == n_elts)
>> 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
>>     }
>>   else
>>     gcc_unreachable ();
>> 
>>   if (const_vec != NULL_RTX
>>       && aarch64_simd_valid_immediate (const_vec, NULL))
>>     /* Load using MOVI/MVNI.  */
>>     return const_vec;
>>   else if ((const_dup = aarch64_simd_dup_constant (vals, check)) !=
>> NULL_RTX)
>>     /* Loaded using DUP.  */
>>     return const_dup;
>> 
>> and aarch64_simd_dup_constant does:
>> 
>>   machine_mode mode = GET_MODE (vals);
>>   machine_mode inner_mode = GET_MODE_INNER (mode);
>>   rtx x;
>> 
>>   if (!const_vec_duplicate_p (vals, &x))
>>     return NULL_RTX;
>> 
>>   /* We can load this constant by using DUP and a constant in a
>>      single ARM register.  This will be cheaper than a vector
>>      load.  */
>>   if (!check)
>>     x = copy_to_mode_reg (inner_mode, x);
>>   return gen_vec_duplicate (mode, x);
>> 
>> For the "check" case, "x" will be a constant, and so gen_vec_duplicate will call
>> gen_const_vec_duplicate, which will return a CONST_VECTOR.
>> It didn't seem to be possible for gen_insn to be a VEC_DUPLICATE.
>>
>
> Yes, but CSE can ask the cost of a VEC_DUPLICATE directly on a register without going through gen_const_vec_duplicate
> which is intended as the gen_ functions can have side effects (e.g. creating new psuedos etc)
>
> If say it sees a constant x and a vector [x x x x] it wants to know what the cost keeping
> x and materializing [x x x x] vs doing a duplicate of x into [x x x x] is.
>
> In this case since both the constant and the vectors are needed you won't get a constant there but a register so you'll actually see a
> vec_dup. If CSE pushes in the constant that would defeat the point 😊. Right now it's CSE that's pushing constants of vec_dup into vec_constants.
>
> My change is making it explicitly ask for the cost of doing this instead of assuming it always cheaper because for a large majority of
> cases it's not actually cheaper and is highly dependent on the targets ability to create said constant.
>
> So this hook will see both versions, the dup of the register and the vec_constant while CSE is trying to decide which one to keep.

But the code I quoted above is from:

+	break;
+    case PARALLEL:
+      /* Fall through */
+    case CONST_VECTOR:
+	{
+	  rtx gen_insn = aarch64_simd_make_constant (x, true);
+	  /* Not a valid const vector.  */
+	  if (!gen_insn)
+	    break;
 
-      /* Fall through.  */
+	  switch (GET_CODE (gen_insn))
+	  {
+	  case CONST_VECTOR:
+	    /* Load using MOVI/MVNI.  */
+	    if (aarch64_simd_valid_immediate (x, NULL))
+	      *cost += extra_cost->vect.movi;
+	    else /* Load using constant pool.  */
+	      *cost += extra_cost->ldst.load;
+	    break;
+	  /* Load using a DUP.  */
+	  case VEC_DUPLICATE:
+	    *cost += extra_cost->vect.dup;
+	    break;
+	  default:
+	    *cost += extra_cost->ldst.load;
+	    break;
+	  }
+	  return true;
+	}

Here, CSE is passing in a PARALLEL or a CONST_VECTOR.  That rtx
then gets passed to aarch64_simd_make_constant.  We then switch
based on the result of aarch64_simd_make_constant, with a case
statement for VEC_DUPLICATE.  So the code is handling a case in
which aarch64_simd_make_constant converts a PARALLEL or a
CONST_VECTOR (passed by CSE) into a VEC_DUPLICATE.  For the reasons
above, that doesn't seem to be possible.  aarch64_simd_make_constant
would return duplicated constants as a CONST_VECTOR rather than
a VEC_DUPLICATE.

It sounds like you're talking about the separate top-level
VEC_DUPLICATE case, which is obviously OK/needed.

Maybe it would be better to turn it around and say: do you have
a case in which the nested VEC_DUPLICATE case above is reached?

>> This would be much simpler if we could call aarch64_simd_valid_immediate
>> and aarch64_simd_dup_constant directly from the rtx cost code,

BTW, I meant const_vec_duplicate_p here. sorry.

> Agreed... I tried to separate them before, but the logic was annoying to split and I thought not worth the effort, so instead I just
> changed it to have a checking only mode.
>
>> hence the
>> question about whether the PARALLEL stuff was really needed in practice.
>> 
>> >> > +	  default:
>> >> > +	    *cost += extra_cost->ldst.load;
>> >> > +	    break;
>> >> > +	  }
>> >> > +	  return true;
>> >> > +	}
>> >> > +    case VEC_CONCAT:
>> >> > +	/* depending on the operation, either DUP or INS.
>> >> > +	   For now, keep default costing.  */
>> >> > +	break;
>> >> > +    case VEC_DUPLICATE:
>> >> > +	*cost += extra_cost->vect.dup;
>> >> > +	return true;
>> >> > +    case VEC_SELECT:
>> >> > +	{
>> >> > +	  /* cost subreg of 0 as free, otherwise as DUP */
>> >> > +	  rtx op1 = XEXP (x, 1);
>> >> > +	  int nelts;
>> >> > +	  if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
>> >> > +	      || (BYTES_BIG_ENDIAN
>> >> > +		  && GET_MODE_NUNITS (mode).is_constant(&nelts)
>> >> > +		  && INTVAL (op1) == nelts - 1))
>> >> > +	    ;
>> >> > +	  else if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
>> >> > +	    ;
>> >> > +	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
>> >> > +	  /* Selecting the high part is not technically free, but we lack
>> >> > +	     enough information to decide that here.  For instance selecting
>> >> > +	     the high-part of a vec_dup *is* free or to feed into any _high
>> >> > +	     instruction.   Both of which we can't really tell.  That said
>> >> > +	     have a better chance to optimize an dup vs multiple constants.  */
>> >> > +	    ;
>> >>
>> >> Not sure about this.  We already try to detect the latter case (_high
>> >> instructions) via aarch64_strip_extend_vec_half.  We might be missing
>> >> some cases, but that still feels like the right way to go IMO.
>> >
>> > That's a different problem from what I understand.  What this is
>> > trying to say is that If you have a vector [x y a b] and you need
>> > vector [x y] that you can use the top part of the original vector for this.
>> >
>> > This is an approximation, because something that can be created with a
>> > movi is probably Cheaper to keep distinct if it's not going to be paired with a
>> _high operation (since you will have a dup then).
>> >
>> > The problem is that the front end has already spit the two Vectors into [x y
>> a b] and [x y].
>> > There's nothing else that tries to consolidate them back up if both survive.
>> >
>> > As a consequence of this, the testcase test0 is not handled optimally.
>> > It would instead create
>> > 2 vectors, both of movi 0x3, just one being 64-bits and one being 128-bits.
>> >
>> > So if the cost of selecting it is cheaper than the movi, cse will not
>> > consolidate the vectors, and because movi's are so cheap, the only
>> > cost that worked was 0.  But increasing the costs of movi's requires the
>> costs of everything to be increased (including loads).
>> >
>> > I preferred to 0 out the cost, because the worst that can happen is an
>> > dup instead of a movi, And at best a dup instead of a load from a pool (if
>> the constant is complicated).
>> 
>> Hmm, will need to look at this more tomorrow.
>> 
>> >> Selecting the high part of a vec_dup should get folded into another
>> vec_dup.
>> >>
>> >> The lowpart bits look OK, but which paths call this function without
>> >> first simplifying the select to a subreg?  The subreg is now the
>> >> canonical form (thanks to r12-2288).
>> >
>> > The simplification will happen during folding in cse or in combine.
>> > This costing happens before the folding, When CSE is trying to decide
>> whether to undo the front end's lowering of constants.
>> >
>> > To do so it models the constants and the semantic operation required
>> > to extract them. E.g. to get
>> > 2 out of [0 2 4 5] it would need a VEC_SELECT of 1. And I don't treat
>> > the first element/bottom part special Here.  Costing wise they would be
>> the same.
>> 
>> But which code path creates the VEC_SELECT?  We don't need any context to
>> know that the VEC_SELECT is non-canonical.  It's obvious from the operands
>> of the VEC_SELECT in isolation.
>
> The non-cannonical RTL is never generated. I assume we're talking about the 0 case here
> Since subregs can't select arbitrary elements (as I asked before).
>
> For the 0 case it's only temporarily modelled as such as such to keep the CSE alternative costing simple.
> Currently it's just a for loop for I = 0 to vec_elems.

Ah, sorry, I see now that you're talking about the 1/2 patch.
I looked at this one first :-)

> When it comes time to generate the actual insn fold_rtx is called which will fold the VEC_SELECT
> Into a subreg.
>
> So it's never emitted into the instruction stream in its non canonical form.
>
>> 
>> I'd just rather tackle this at source than try to get the cost code to handle
>> non-canonical rtl.
>
> If that's what is preferred I can change the CSE patch to generate a subreg for the 0 case, I'm not sure I agree with it
> as CSE is just trying to ask "what Is the cost of selecting the element 0 in this vector".  And as I mentioned before
> it never emits the instruction unfolded.  This representation seems to a more logical representation for costing to me.

I think it's better to cost what we intend to generate.  Otherwise each
target needs to handle both forms: โ€œCSE asks about this, but actually
intends to generate that insteadโ€.

> It's however unfortunate that there's only one costing callback, as far as CSE is concerned the representation/form
> doesn't matter, it's just looking at the high level operation.
>
> Or is the concern here that most targets will have costing for subreg 0 but not VEC_SELECT? In which case without
> Actually handling the costs of the other operations the CSE changes won't do anything for targets anyway.  And it would
> be odd for a target to cost VEC_SELECT from 1 to <N> instead of just costing 0 too.

Well, even for the motivating target (aarch64), we had to make changes
to treat index 0 as especially cheap.  That's likely to be necessary on
other targets too, if they want to take advantage of this.  The for
loop exists because the index matters.

I'm still a bit sceptical about treating the high-part cost as lower.
ISTM that the subreg cases are the ones that are truly "free" and any
others should have a normal cost.  So if CSE handled the subreg case
itself (to model how the rtx would actually be generated) then aarch64
code would have to do less work.  I imagine that will be true for other
targets as well.
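
A small sketch of that distinction in plain intrinsics (little-endian assumed,
not from the patch):

#include <arm_neon.h>

/* Low half: the D register is the low 64 bits of the Q register,
   so this is a subreg and needs no instruction.  */
uint32x2_t
low_half (uint32x4_t v)
{
  return vget_low_u32 (v);
}

/* High half: needs a real instruction (e.g. DUP or EXT), unless it can be
   fused into a *_high instruction such as UADDL2.  */
uint32x2_t
high_half (uint32x4_t v)
{
  return vget_high_u32 (v);
}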

Thanks,
Richard
Tamar Christina Sept. 8, 2021, 12:58 p.m. UTC | #6
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >>
> >> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >>
> >> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> >> > @@ -13936,8 +13937,65 @@ cost_plus:
> >> >> >  			     mode, MULT, 1, speed);
> >> >> >            return true;
> >> >> >          }
> >> >> > +	break;
> >> >> > +    case PARALLEL:
> >> >> > +      /* Fall through */
> >> >>
> >> >> Which code paths lead to getting a PARALLEL here?
> >> >
> >> > Hi,
> >> >
> >> > Thanks for the review!
> >> >
> >> > I added it for completeness because CSE treats a parallel and
> >> > CONST_VECTOR as equivalent when they each entry in the parallel
> >> > defines
> >> a constant.
> >>
> >> Could you test whether it ever triggers in practice though?
> >> The code would be much simpler without it.
> >
> > Will check 😊

Looks like for AArch64 there's no real way for this to happen so I've removed this case.


> >
> >>
> >> >> > +    case CONST_VECTOR:
> >> >> > +	{
> >> >> > +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> >> >> > +	  /* Not a valid const vector.  */
> >> >> > +	  if (!gen_insn)
> >> >> > +	    break;
> >> >> >
> >> >> > -      /* Fall through.  */
> >> >> > +	  switch (GET_CODE (gen_insn))
> >> >> > +	  {
> >> >> > +	  case CONST_VECTOR:
> >> >> > +	    /* Load using MOVI/MVNI.  */
> >> >> > +	    if (aarch64_simd_valid_immediate (x, NULL))
> >> >> > +	      *cost += extra_cost->vect.movi;
> >> >> > +	    else /* Load using constant pool.  */
> >> >> > +	      *cost += extra_cost->ldst.load;
> >> >> > +	    break;
> >> >> > +	  /* Load using a DUP.  */
> >> >> > +	  case VEC_DUPLICATE:
> >> >> > +	    *cost += extra_cost->vect.dup;
> >> >> > +	    break;
> >> >>
> >> >> Does this trigger in practice?  The new check==true path (rightly)
> >> >> stops the duplicated element from being forced into a register,
> >> >> but then I would have
> >> >> expected:
> >> >>
> >> >> rtx
> >> >> gen_vec_duplicate (machine_mode mode, rtx x) {
> >> >>   if (valid_for_const_vector_p (mode, x))
> >> >>     return gen_const_vec_duplicate (mode, x);
> >> >>   return gen_rtx_VEC_DUPLICATE (mode, x); }
> >> >>
> >> >> to generate the original CONST_VECTOR again.
> >> >
> >> > Yes, but CSE is trying to see whether using a DUP is cheaper than
> >> > another
> >> instruction.
> >> > Normal code won't hit this but CSE is just costing all the
> >> > different ways one can semantically construct a vector, which RTL
> >> > actually comes out
> >> of it depends on how it's folded as you say.
> >>
> >> But what I mean is, you call:
> >>
> >> 	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> >> 	  /* Not a valid const vector.  */
> >> 	  if (!gen_insn)
> >> 	    break;
> >>
> >> where aarch64_simd_make_constant does:
> >>
> >>   if (CONST_VECTOR_P (vals))
> >>     const_vec = vals;
> >>   else if (GET_CODE (vals) == PARALLEL)
> >>     {
> >>       /* A CONST_VECTOR must contain only CONST_INTs and
> >> 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
> >> 	 Only store valid constants in a CONST_VECTOR.  */
> >>       int n_elts = XVECLEN (vals, 0);
> >>       for (i = 0; i < n_elts; ++i)
> >> 	{
> >> 	  rtx x = XVECEXP (vals, 0, i);
> >> 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
> >> 	    n_const++;
> >> 	}
> >>       if (n_const == n_elts)
> >> 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
> >>     }
> >>   else
> >>     gcc_unreachable ();
> >>
> >>   if (const_vec != NULL_RTX
> >>       && aarch64_simd_valid_immediate (const_vec, NULL))
> >>     /* Load using MOVI/MVNI.  */
> >>     return const_vec;
> >>   else if ((const_dup = aarch64_simd_dup_constant (vals, check)) !=
> >> NULL_RTX)
> >>     /* Loaded using DUP.  */
> >>     return const_dup;
> >>
> >> and aarch64_simd_dup_constant does:
> >>
> >>   machine_mode mode = GET_MODE (vals);
> >>   machine_mode inner_mode = GET_MODE_INNER (mode);
> >>   rtx x;
> >>
> >>   if (!const_vec_duplicate_p (vals, &x))
> >>     return NULL_RTX;
> >>
> >>   /* We can load this constant by using DUP and a constant in a
> >>      single ARM register.  This will be cheaper than a vector
> >>      load.  */
> >>   if (!check)
> >>     x = copy_to_mode_reg (inner_mode, x);
> >>   return gen_vec_duplicate (mode, x);
> >>
> >> For the "check" case, "x" will be a constant, and so
> >> gen_vec_duplicate will call gen_const_vec_duplicate, which will return a
> CONST_VECTOR.
> >> It didn't seem to be possible for gen_insn to be a VEC_DUPLICATE.
> >>
> >
> > Yes, but CSE can ask the cost of a VEC_DUPLICATE directly on a
> > register without going through gen_const_vec_duplicate which is
> > intended as the gen_ functions can have side effects (e.g. creating
> > new psuedos etc)
> >
> > If say it sees a constant x and a vector [x x x x] it wants to know
> > what the cost keeping x and materializing [x x x x] vs doing a duplicate of x
> into [x x x x] is.
> >
> > In this case since both the constant and the vectors are needed you
> > won't get a constant there but a register so you'll actually see a vec_dup. If
> CSE pushes in the constant that would defeat the point 😊. Right now it's
> CSE that's pushing constants of vec_dup into vec_constants.
> >
> > My change is making it explicitly ask for the cost of doing this
> > instead of assuming it always cheaper because for a large majority of cases
> it's not actually cheaper and is highly dependent on the targets ability to
> create said constant.
> >
> > So this hook will see both versions, the dup of the register and the
> vec_constant while CSE is trying to decide which one to keep.
> 
> But the code I quoted above is from:
> 
> +	break;
> +    case PARALLEL:
> +      /* Fall through */
> +    case CONST_VECTOR:
> +	{
> +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> +	  /* Not a valid const vector.  */
> +	  if (!gen_insn)
> +	    break;
> 
> -      /* Fall through.  */
> +	  switch (GET_CODE (gen_insn))
> +	  {
> +	  case CONST_VECTOR:
> +	    /* Load using MOVI/MVNI.  */
> +	    if (aarch64_simd_valid_immediate (x, NULL))
> +	      *cost += extra_cost->vect.movi;
> +	    else /* Load using constant pool.  */
> +	      *cost += extra_cost->ldst.load;
> +	    break;
> +	  /* Load using a DUP.  */
> +	  case VEC_DUPLICATE:
> +	    *cost += extra_cost->vect.dup;
> +	    break;
> +	  default:
> +	    *cost += extra_cost->ldst.load;
> +	    break;
> +	  }
> +	  return true;
> +	}
> 
> Here, CSE is passing in a PARALLEL or a CONST_VECTOR.  That rtx then gets
> passed to aarch64_simd_make_constant.  We then switch based on the
> result of aarch64_simd_make_constant, with a case statement for
> VEC_DUPLICATE.  So the code is handling a case in which
> aarch64_simd_make_constant converts a PARALLEL or a CONST_VECTOR
> (passed by CSE) into a VEC_DUPLICATE.  For the reasons above, that doesn't
> seem to be possible.  aarch64_simd_make_constant would return duplicated
> constants as a CONST_VECTOR rather than a VEC_DUPLICATE.
> 
> It sounds like you're talking about the separate top-level VEC_DUPLICATE
> case, which is obviously OK/needed.

Yes, my apologies, I had completely misunderstood the quoted context ☹.
This case can indeed not happen.  I've placed an assert there in case someone
changes that function in the future.

> 
> Maybe it would be better to turn it around and say: do you have a case in
> which the nested VEC_DUPLICATE case above is reached?
> 
> >> This would be much simpler if we could call
> >> aarch64_simd_valid_immediate and aarch64_simd_dup_constant directly
> >> from the rtx cost code,
> 
> BTW, I meant const_vec_duplicate_p here. sorry.
> 
> > Agreed... I tried to separate them before, but the logic was annoying
> > to split and I thought not worth the effort, so instead I just changed it to
> have a checking only mode.
> >
> >> hence the
> >> question about whether the PARALLEL stuff was really needed in practice.
> >>
> >> >> > +	  default:
> >> >> > +	    *cost += extra_cost->ldst.load;
> >> >> > +	    break;
> >> >> > +	  }
> >> >> > +	  return true;
> >> >> > +	}
> >> >> > +    case VEC_CONCAT:
> >> >> > +	/* depending on the operation, either DUP or INS.
> >> >> > +	   For now, keep default costing.  */
> >> >> > +	break;
> >> >> > +    case VEC_DUPLICATE:
> >> >> > +	*cost += extra_cost->vect.dup;
> >> >> > +	return true;
> >> >> > +    case VEC_SELECT:
> >> >> > +	{
> >> >> > +	  /* cost subreg of 0 as free, otherwise as DUP */
> >> >> > +	  rtx op1 = XEXP (x, 1);
> >> >> > +	  int nelts;
> >> >> > +	  if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
> >> >> > +	      || (BYTES_BIG_ENDIAN
> >> >> > +		  && GET_MODE_NUNITS (mode).is_constant(&nelts)
> >> >> > +		  && INTVAL (op1) == nelts - 1))
> >> >> > +	    ;
> >> >> > +	  else if (vec_series_lowpart_p (mode, GET_MODE (op1),
> op1))
> >> >> > +	    ;
> >> >> > +	  else if (vec_series_highpart_p (mode, GET_MODE (op1),
> op1))
> >> >> > +	  /* Selecting the high part is not technically free, but we lack
> >> >> > +	     enough information to decide that here.  For instance
> selecting
> >> >> > +	     the high-part of a vec_dup *is* free or to feed into any
> _high
> >> >> > +	     instruction.   Both of which we can't really tell.  That said
> >> >> > +	     have a better chance to optimize an dup vs multiple
> constants.  */
> >> >> > +	    ;
> >> >>
> >> >> Not sure about this.  We already try to detect the latter case
> >> >> (_high
> >> >> instructions) via aarch64_strip_extend_vec_half.  We might be
> >> >> missing some cases, but that still feels like the right way to go IMO.
> >> >
> >> > That's a different problem from what I understand.  What this is
> >> > trying to say is that If you have a vector [x y a b] and you need
> >> > vector [x y] that you can use the top part of the original vector for this.
> >> >
> >> > This is an approximation, because something that can be created
> >> > with a movi is probably Cheaper to keep distinct if it's not going
> >> > to be paired with a
> >> _high operation (since you will have a dup then).
> >> >
> >> > The problem is that the front end has already spit the two Vectors
> >> > into [x y
> >> a b] and [x y].
> >> > There's nothing else that tries to consolidate them back up if both
> survive.
> >> >
> >> > As a consequence of this, the testcase test0 is not handled optimally.
> >> > It would instead create
> >> > 2 vectors, both of movi 0x3, just one being 64-bits and one being 128-
> bits.
> >> >
> >> > So if the cost of selecting it is cheaper than the movi, cse will
> >> > not consolidate the vectors, and because movi's are so cheap, the
> >> > only cost that worked was 0.  But increasing the costs of movi's
> >> > requires the
> >> costs of everything to be increased (including loads).
> >> >
> >> > I preferred to 0 out the cost, because the worst that can happen is
> >> > an dup instead of a movi, And at best a dup instead of a load from
> >> > a pool (if
> >> the constant is complicated).
> >>
> >> Hmm, will need to look at this more tomorrow.
> >>
> >> >> Selecting the high part of a vec_dup should get folded into
> >> >> another
> >> vec_dup.
> >> >>
> >> >> The lowpart bits look OK, but which paths call this function
> >> >> without first simplifying the select to a subreg?  The subreg is
> >> >> now the canonical form (thanks to r12-2288).
> >> >
> >> > The simplification will happen during folding in cse or in combine.
> >> > This costing happens before the folding, When CSE is trying to
> >> > decide
> >> whether to undo the front end's lowering of constants.
> >> >
> >> > To do so it models the constants and the semantic operation
> >> > required to extract them. E.g. to get
> >> > 2 out of [0 2 4 5] it would need a VEC_SELECT of 1. And I don't
> >> > treat the first element/bottom part special Here.  Costing wise
> >> > they would be
> >> the same.
> >>
> >> But which code path creates the VEC_SELECT?  We don't need any
> >> context to know that the VEC_SELECT is non-canonical.  It's obvious
> >> from the operands of the VEC_SELECT in isolation.
> >
> > The non-cannonical RTL is never generated. I assume we're talking
> > about the 0 case here Since subregs can't select arbitrary elements (as I
> asked before).
> >
> > For the 0 case it's only temporarily modelled as such as such to keep the
> CSE alternative costing simple.
> > Currently it's just a for loop for I = 0 to vec_elems.
> 
> Ah, sorry, I see now that you're talking about the 1/2 patch.
> I looked at this one first :-)
> 
> > When it comes time to generate the actual insn fold_rtx is called
> > which will fold the VEC_SELECT Into a subreg.
> >
> > So it's never emitted into the instruction stream in its non canonical form.
> >
> >>
> >> I'd just rather tackle this at source than try to get the cost code
> >> to handle non-canonical rtl.
> >
> > If that's what is preferred I can change the CSE patch to generate a
> > subreg for the 0 case, I'm not sure I agree with it as CSE is just
> > trying to ask "what Is the cost of selecting the element 0 in this vector".
> And as I mentioned before it never emits the instruction unfolded.  This
> representation seems to a more logical representation for costing to me.
> 
> I think it's better to cost what we intend to generate.  Otherwise each target
> needs to handle both forms: "CSE asks about this, but actually intends to
> generate that instead".
> 
> > It's however unfortunate that there's only one costing callback, as
> > far as CSE is concerned the representation/form doesn't matter, it's just
> looking at the high level operation.
> >
> > Or is the concern here that most targets will have costing for subreg
> > 0 but not VEC_SELECT? In which case without Actually handling the
> > costs of the other operations the CSE changes won't do anything for targets
> anyway.  And it would be odd for a target to cost VEC_SELECT from 1 to <N>
> instead of just costing 0 too.
> 
> Well, even for the motivating target (aarch64), we had to make changes to
> treat index 0 as especially cheap.  That's likely to be necessary on other
> targets too, if they want to take advantage of this.  The for loop exists
> because the index matters.
> 
> I'm still a bit sceptical about treating the high-part cost as lower.
> ISTM that the subreg cases are the ones that are truly "free" and any others
> should have a normal cost.  So if CSE handled the subreg case itself (to model
> how the rtx would actually be generated) then aarch64 code would have to
> do less work.  I imagine that will be true for other targets as well.

I guess the main problem is that CSE lacks context because it's not until after
combine that the high part becomes truly "free" when pushed into a high operation.
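
For example (just an illustrative sketch, the function is made up):

#include <arm_neon.h>

uint16x8_t
high_op (uint8x16_t a, uint8x16_t b)
{
  /* Each vget_high_u8 only becomes "free" once combine fuses the pair
     into a single umull2; before combine, CSE can't know that.  */
  return vmull_u8 (vget_high_u8 (a), vget_high_u8 (b));
}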

For CSE I don't think there's any real point in rematerializing the constant twice unless
it's needed on a different part of the register file.  So in the int (v2si) case, if the low part is
needed on the SIMD side but the high part on the genreg side, it's most likely cheaper to create
the integer on the integer side using mov/movk than to do a transfer from SIMD to genreg.
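
A rough sketch of the kind of split-use case I mean (the function and constants are invented):

#include <arm_neon.h>

uint32_t
split_use (uint32x2_t a)
{
  /* The V2SI constant feeds a SIMD multiply, while its lane 1 is also
     wanted as a plain integer on the genreg side.  */
  uint32x2_t cst = { 0x11223344, 0x55667788 };
  uint32x2_t prod = vmul_u32 (a, cst);
  uint32_t scalar = vget_lane_u32 (cst, 1);
  return vget_lane_u32 (prod, 0) + scalar;
}

In a case like this, building 0x55667788 with mov/movk on the integer side is likely cheaper than a lane transfer out of the SIMD register.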

But I believe this to be an outlier; in all other cases, having a DUP, which may get removed later,
should be better than rematerializing.

I think the question is whether this could be problematic for other passes that use the cost model.
The only other pass that could use this is combine, I think, which runs after cse1.

But say cse1 didn't handle it: normally, when costing a high operation, we cost the register usage
as free anyway, so nothing changes.

To the best of my reasoning I think it's safe/beneficial.  But I can run benchmarks on some of our
intrinsics-heavy code if that would help.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/arm/aarch-common-protos.h (struct vector_cost_table): Add
	movi, dup and extract costing fields.
	* config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
	thunderx_extra_costs, thunderx2t99_extra_costs,
	thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use
	them.
	* config/arm/aarch-cost-tables.h (generic_extra_costs,
	cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
	exynosm1_extra_costs, xgene1_extra_costs): Likewise
	* config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
	* config/aarch64/aarch64.c (aarch64_simd_make_constant): Expose.
	(aarch64_rtx_costs): Add extra costs.
	(aarch64_simd_dup_constant): Support check only mode.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-cse-codegen.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* mult.  */
+    COSTS_N_INSNS (4),	/* mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 48eddf64e05afe3788abfa05141f6544a9323ea1..371990fbe2cfb72d22f22ed582bb7ebdebb3edc0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
-	  (match_operand:<VEL> 1 "register_operand" "w")))]
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 26d59ba1e13758eed47598c101fd214788637be4..483f1079f3d3967bfd16047c2b8447078c37313c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -303,6 +303,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static rtx aarch64_simd_make_constant (rtx, bool);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -12703,7 +12704,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -14004,8 +14005,57 @@ cost_plus:
 			     mode, MULT, 1, speed);
           return true;
         }
+	break;
+    case CONST_VECTOR:
+	{
+	  rtx gen_insn = aarch64_simd_make_constant (x, true);
+	  /* Not a valid const vector.  */
+	  if (!gen_insn)
+	    break;
 
-      /* Fall through.  */
+	  switch (GET_CODE (gen_insn))
+	  {
+	  case CONST_VECTOR:
+	    /* Load using MOVI/MVNI.  */
+	    if (aarch64_simd_valid_immediate (x, NULL))
+	      *cost += extra_cost->vect.movi;
+	    else /* Load using constant pool.  */
+	      *cost += extra_cost->ldst.load;
+	    break;
+	  /* Load using a DUP.  */
+	  case VEC_DUPLICATE:
+	    gcc_unreachable ();
+	    break;
+	  default:
+	    *cost += extra_cost->ldst.load;
+	    break;
+	  }
+	  return true;
+	}
+    case VEC_CONCAT:
+	/* depending on the operation, either DUP or INS.
+	   For now, keep default costing.  */
+	break;
+    case VEC_DUPLICATE:
+	*cost += extra_cost->vect.dup;
+	return true;
+    case VEC_SELECT:
+	{
+	  /* cost subreg of 0 as free, otherwise as DUP */
+	  rtx op1 = XEXP (x, 1);
+	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+	    ;
+	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+	  /* Selecting the high part is not technically free, but we lack
+	     enough information to decide that here.  For instance, selecting
+	     the high part of a vec_dup *is* free, as is feeding it into any
+	     _high instruction, but we cannot tell either of those here.  That
+	     said, this gives a better chance to optimize a dup vs multiple constants.  */
+	    ;
+	  else
+	    *cost += extra_cost->vect.extract;
+	  return true;
+	}
     default:
       break;
     }
@@ -20634,9 +20684,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
 
 /* If VALS is a vector constant that can be loaded into a register
    using DUP, generate instructions to do so and return an RTX to
-   assign to the register.  Otherwise return NULL_RTX.  */
+   assign to the register.  Otherwise return NULL_RTX.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_dup_constant (rtx vals)
+aarch64_simd_dup_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -20648,7 +20701,8 @@ aarch64_simd_dup_constant (rtx vals)
   /* We can load this constant by using DUP and a constant in a
      single ARM register.  This will be cheaper than a vector
      load.  */
-  x = copy_to_mode_reg (inner_mode, x);
+  if (!check)
+    x = copy_to_mode_reg (inner_mode, x);
   return gen_vec_duplicate (mode, x);
 }
 
@@ -20656,9 +20710,12 @@ aarch64_simd_dup_constant (rtx vals)
 /* Generate code to load VALS, which is a PARALLEL containing only
    constants (for vec_init) or CONST_VECTOR, efficiently into a
    register.  Returns an RTX to copy into the register, or NULL_RTX
-   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_make_constant (rtx vals)
+aarch64_simd_make_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   rtx const_dup;
@@ -20690,7 +20747,7 @@ aarch64_simd_make_constant (rtx vals)
       && aarch64_simd_valid_immediate (const_vec, NULL))
     /* Load using MOVI/MVNI.  */
     return const_vec;
-  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
+  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
     /* Loaded using DUP.  */
     return const_dup;
   else if (const_vec != NULL_RTX)
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;
+  const int dup;
+  const int extract;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..36e468aacfadd7701c6a7cd432bee81472111a16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,127 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test0:
+**	movi	v2.16b, 0x3
+**	ldr	q0, \[x0\]
+**	uxtl	v1.8h, v0.8b
+**	uxtl2	v0.8h, v0.16b
+**	ldr	q3, \[x1\]
+**	umlal	v1.8h, v3.8b, v2.8b
+**	umlal2	v0.8h, v3.16b, v2.16b
+**	addhn	v0.8b, v1.8h, v0.8h
+**	str	d0, \[x2\]
+**	ret
+*/
+
+void test0 (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0)
+{
+  uint8x16_t three_u8 = vdupq_n_u8(3);
+  uint8x16_t x = vld1q_u8(inptr0);
+  uint8x16_t y = vld1q_u8(inptr1);
+  uint16x8_t x_l = vmovl_u8(vget_low_u8(x));
+  uint16x8_t x_h = vmovl_u8(vget_high_u8(x));
+  uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), vget_low_u8(three_u8));
+  uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), vget_high_u8(three_u8));
+  vst1_u8(outptr0, vaddhn_u16(z_l, z_h));
+}
+
+/*
+**test1:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC0
+**	ldr	q1, \[x2, #:lo12:.LC0\]
+**	add	v0.2d, v1.2d, v0.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+  uint64_t res = a | arr[0];
+  uint64x2_t val = vld1q_u64 (arr);
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**	adrp	x2, .LC1
+**	ldr	q1, \[x2, #:lo12:.LC1\]
+**	add	v0.2d, v0.2d, v1.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+  uint64_t arr = vgetq_lane_u64 (val, 0);
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test3:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC2
+**	ldr	q1, \[x2, #:lo12:.LC2\]
+**	add	v0.4s, v1.4s, v0.4s
+**	str	q0, \[x1\]
+**	fmov	w1, s1
+**	orr	w0, w0, w1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+  uint32_t res = a | arr[0];
+  uint32x4_t val = vld1q_u32 (arr);
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**	ushr	v0.16b, v0.16b, 7
+**	mov	x0, 16512
+**	movk	x0, 0x1020, lsl 16
+**	movk	x0, 0x408, lsl 32
+**	movk	x0, 0x102, lsl 48
+**	fmov	d1, x0
+**	pmull	v2.1q, v0.1d, v1.1d
+**	dup	v1.2d, v1.d\[0\]
+**	pmull2	v0.1q, v0.2d, v1.2d
+**	trn2	v2.8b, v2.8b, v0.8b
+**	umov	w0, v2.h\[3\]
+**	ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+
Richard Sandiford Oct. 23, 2021, 10:39 a.m. UTC | #7
Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> I'm still a bit sceptical about treating the high-part cost as lower.
>> ISTM that the subreg cases are the ones that are truly "free" and any others
>> should have a normal cost.  So if CSE handled the subreg case itself (to model
>> how the rtx would actually be generated) then aarch64 code would have to
>> do less work.  I imagine that will be true for other targets as well.
>
> I guess the main problem is that CSE lacks context because it's not until after
> combine that the high part becomes truly "free" when pushed into a high operation.

Yeah.  And the aarch64 code is just being asked to cost the operation
it's given, which could for example come from an existing
aarch64_simd_mov_from_<mode>high.  I think we should try to ensure that
an aarch64_simd_mov_from_<mode>high followed by some arithmetic on the
result is more expensive than the fused operation (when fusing is
possible).

An analogy might be: if the cost code is given:

  (add (reg X) (reg Y))

then, at some later point, the (reg X) might be replaced with a
multiplication, in which case we'd have a MADD operation and the
addition is effectively free.  Something similar would happen if
(reg X) became a shift by a small amount on newer cores, although
I guess then you could argue either that the cost of the add
disappears or that the cost of the shift disappears.
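
A source-level version of that analogy (an invented example, just for concreteness) would be:

long
fused (long a, long b, long c)
{
  long t = a * b;   /* the (mult ...) that later replaces (reg X) */
  return t + c;     /* combined into a single MADD, so the add is effectively free */
}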

But we shouldn't count ADD as free on the basis that it could be
combined with a multiplication or shift in future.  We have to cost
what we're given.  I think the same thing applies to the high part.

Here we're trying to prevent cse1 from replacing a DUP (lane) with
a MOVI by saying that the DUP is strictly cheaper than the MOVI.
I don't think that's really true though, and the cost tables in the
patch say that DUP is more expensive (rather than less expensive)
than MOVI.

Also, if I've understood correctly, it looks like we'd be relying
on the vget_high of a constant remaining unfolded until RTL cse1.
I think it's likely in future that we'd try to fold vget_high
at the gimple level instead, since that could expose more
optimisations of a different kind.  The gimple optimisers would
then fold vget_high(constant) in a similar way to cse1 does now.

So perhaps we should continue to allow the vget_high(constant)
to be folded in cse1 and come up with some way of coping with
the folded form.

Thanks,
Richard
Tamar Christina Oct. 23, 2021, 2:34 p.m. UTC | #8
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Saturday, October 23, 2021 11:40 AM
> To: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants
> and operations
> 
> Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >> I'm still a bit sceptical about treating the high-part cost as lower.
> >> ISTM that the subreg cases are the ones that are truly โ€œfreeโ€ and any
> >> others should have a normal cost.  So if CSE handled the subreg case
> >> itself (to model how the rtx would actually be generated) then
> >> aarch64 code would have to do less work.  I imagine that will be true for
> other targets as well.
> >
> > I guess the main problem is that CSE lacks context because it's not
> > until after combine that the high part becomes truly "free" when pushed
> into a high operation.
> 
> Yeah.  And the aarch64 code is just being asked to cost the operation it's
> given, which could for example come from an existing
> aarch64_simd_mov_from_<mode>high.  I think we should try to ensure that
> a aarch64_simd_mov_from_<mode>high followed by some arithmetic on
> the result is more expensive than the fused operation (when fusing is
> possible).
> 
> An analogy might be: if the cost code is given:
> 
>   (add (reg X) (reg Y))
> 
> then, at some later point, the (reg X) might be replaced with a multiplication,
> in which case we'd have a MADD operation and the addition is effectively
> free.  Something similar would happen if (reg X) became a shift by a small
> amount on newer cores, although I guess then you could argue either that
> the cost of the add disappears or that the cost of the shift disappears.
> 
> But we shouldn't count ADD as free on the basis that it could be combined
> with a multiplication or shift in future.  We have to cost what we're given.  I
> think the same thing applies to the high part.
> 
> Here we're trying to prevent cse1 from replacing a DUP (lane) with a MOVI
> by saying that the DUP is strictly cheaper than the MOVI.
> I don't think that's really true though, and the cost tables in the patch say that
> DUP is more expensive (rather than less expensive) than MOVI.

No, we're not.  The front end has already pushed the constant into each operation that needs it,
which is the entire problem.

MOVI as I mentioned before is the one case where this is a toss up.  But there are far
more constants that cannot be created with a movi.  A simple example is

#include <arm_neon.h>

int8x16_t square(int8x16_t full, int8x8_t small) {
    int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15};
    int8x8_t low = vget_high_s8 (cst);
    int8x8_t res1 = vmul_s8 (small, low);
    return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, res1));
}

Where in Gimple we get

  <bb 2> [local count: 1073741824]:
  _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 });
  _4 = _2 * small_3(D);
  _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
  _7 = __builtin_aarch64_combinev8qi (_4, _4);
  _8 = _6 + _7;
  return _8;

Regardless of what happens to __builtin_aarch64_get_highv16qi, nothing will recreate the relationship
with cst; whether __builtin_aarch64_get_highv16qi is lowered or not, constant prop will still push in constants.

This codegen results in us rematerializing the constant twice.

square:
        adrp    x0, .LC0
        ldr     d2, [x0, #:lo12:.LC0]
        adrp    x0, .LC1
        ldr     q3, [x0, #:lo12:.LC1]
        mul     v1.8b, v2.8b, v1.8b
        dup     d2, v1.d[0]
        ins     v2.d[1], v1.d[0]
        mla     v2.16b, v0.16b, v3.16b
        mov     v0.16b, v2.16b
        ret
.LC1:
        .byte   0
        .byte   1
        .byte   2
        .byte   3
        .byte   4
        .byte   5
        .byte   6
        .byte   7
        .byte   8
        .byte   9
        .byte   10
        .byte   11
        .byte   12
        .byte   13
        .byte   15
        .byte   0

Regardless of whether it's pushed into a high operation or not, this codegen is still far more expensive.

> 
> Also, if I've understood correctly, it looks like we'd be relying on the
> vget_high of a constant remaining unfolded until RTL cse1.
> I think it's likely in future that we'd try to fold vget_high at the gimple level
> instead, since that could expose more optimisations of a different kind.  The
> gimple optimisers would then fold vget_high(constant) in a similar way to
> cse1 does now.
> 
> So perhaps we should continue to allow the vget_high(constant) to be
> folded in cse1 and come up with some way of coping with the folded form.

CSE1 doesn't fold it, because for CSE the cost is too high to do so, which is what this costing was attempting to fix.
CSE simply does not touch it.  It leaves it as

(insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
        (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
                (parallel:V16QI [
                        (const_int 0 [0])
                        (const_int 1 [0x1])
                        (const_int 2 [0x2])
                        (const_int 3 [0x3])
                        (const_int 4 [0x4])
                        (const_int 5 [0x5])
                        (const_int 6 [0x6])
                        (const_int 7 [0x7])
                    ]))
            (reg:V8QI 93 [ _4 ]))) "":6506:10 1908 {aarch64_simd_move_hi_quad_v16qi}
     (nil))
(insn 12 11 13 2 (set (reg:V16QI 102)
        (const_vector:V16QI [
                (const_int 0 [0])
                (const_int 1 [0x1])
                (const_int 2 [0x2])
                (const_int 3 [0x3])
                (const_int 4 [0x4])
                (const_int 5 [0x5])
                (const_int 6 [0x6])
                (const_int 7 [0x7])
                (const_int 8 [0x8])
                (const_int 9 [0x9])
                (const_int 10 [0xa])
                (const_int 11 [0xb])
                (const_int 12 [0xc])
                (const_int 13 [0xd])
                (const_int 15 [0xf])
                (const_int 0 [0])
            ])) "":1466:14 1166 {*aarch64_simd_movv16qi}
     (nil))

And I don't see any way to fix this without having gimple not push constants in, which would lead to worse regressions.
I can change the patch to cost the high part as a dup, which at least fixes this codegen and has you rematerialize the movi.  If that's
not acceptable I can drop the costing for the high part entirely; it's not the main thing I am fixing.
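
Concretely (a sketch only, reusing the VEC_SELECT case already in the patch), that alternative would just change the high-part arm to something like:

    case VEC_SELECT:
      {
        rtx op1 = XEXP (x, 1);
        if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
          ;  /* Subreg of element 0, free.  */
        else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
          /* Cost the high part as a DUP instead of as free.  */
          *cost += extra_cost->vect.dup;
        else
          *cost += extra_cost->vect.extract;
        return true;
      }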

Tamar

> 
> Thanks,
> Richard
Richard Sandiford Oct. 25, 2021, 9:54 a.m. UTC | #9
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Saturday, October 23, 2021 11:40 AM
>> To: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>
>> Cc: Tamar Christina <Tamar.Christina@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>
>> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants
>> and operations
>> 
>> Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> >> I'm still a bit sceptical about treating the high-part cost as lower.
>> >> ISTM that the subreg cases are the ones that are truly "free" and any
>> >> others should have a normal cost.  So if CSE handled the subreg case
>> >> itself (to model how the rtx would actually be generated) then
>> >> aarch64 code would have to do less work.  I imagine that will be true for
>> other targets as well.
>> >
>> > I guess the main problem is that CSE lacks context because it's not
>> > until after combine that the high part becomes truly "free" when pushed
>> into a high operation.
>> 
>> Yeah.  And the aarch64 code is just being asked to cost the operation it's
>> given, which could for example come from an existing
>> aarch64_simd_mov_from_<mode>high.  I think we should try to ensure that
>> a aarch64_simd_mov_from_<mode>high followed by some arithmetic on
>> the result is more expensive than the fused operation (when fusing is
>> possible).
>> 
>> An analogy might be: if the cost code is given:
>> 
>>   (add (reg X) (reg Y))
>> 
>> then, at some later point, the (reg X) might be replaced with a multiplication,
>> in which case we'd have a MADD operation and the addition is effectively
>> free.  Something similar would happen if (reg X) became a shift by a small
>> amount on newer cores, although I guess then you could argue either that
>> the cost of the add disappears or that the cost of the shift disappears.
>> 
>> But we shouldn't count ADD as free on the basis that it could be combined
>> with a multiplication or shift in future.  We have to cost what we're given.  I
>> think the same thing applies to the high part.
>> 
>> Here we're trying to prevent cse1 from replacing a DUP (lane) with a MOVI
>> by saying that the DUP is strictly cheaper than the MOVI.
>> I don't think that's really true though, and the cost tables in the patch say that
>> DUP is more expensive (rather than less expensive) than MOVI.
>
> No we're not. The front end has already pushed the constant into each operation that needs it
> which is the entire problem.

I think we're talking about different things here.  I'll come to the
gimple stuff below, but I was talking purely about the effect on the
RTL optimisers.  What I meant above is that, in the cse1 dumps,
the patch leads to changes like:

 (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
-        (const_vector:V8QI [
+        (vec_select:V8QI (reg:V16QI 116)
+            (parallel:V16QI [
+                    (const_int 8 [0x8])
+                    (const_int 9 [0x9])
+                    (const_int 10 [0xa])
+                    (const_int 11 [0xb])
+                    (const_int 12 [0xc])
+                    (const_int 13 [0xd])
+                    (const_int 14 [0xe])
+                    (const_int 15 [0xf])
+                ]))) "include/arm_neon.h":6477:22 1394 {aarch64_simd_mov_from_v16qihigh}
+     (expr_list:REG_EQUAL (const_vector:V8QI [
                 (const_int 3 [0x3]) repeated x8
-            ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
-     (expr_list:REG_DEAD (reg:V16QI 117)
-        (nil)))
+            ])
+        (expr_list:REG_DEAD (reg:V16QI 117)
+            (nil))))

The pre-cse1 code is:

(insn 19 18 20 2 (set (reg:V16QI 117)
        (const_vector:V16QI [
                (const_int 3 [0x3]) repeated x16
            ])) "include/arm_neon.h":6477:22 1166 {*aarch64_simd_movv16qi}
     (nil))
(insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
        (vec_select:V8QI (reg:V16QI 117)
            (parallel:V16QI [
                    (const_int 8 [0x8])
                    (const_int 9 [0x9])
                    (const_int 10 [0xa])
                    (const_int 11 [0xb])
                    (const_int 12 [0xc])
                    (const_int 13 [0xd])
                    (const_int 14 [0xe])
                    (const_int 15 [0xf])
                ]))) "include/arm_neon.h":6477:22 1394 {aarch64_simd_mov_from_v16qihigh}
     (nil))

That is, before the patch, we folded insn 19 into insn 20 to get:

(insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
        (const_vector:V8QI [
                (const_int 3 [0x3]) repeated x8
            ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
     (expr_list:REG_DEAD (reg:V16QI 117)
        (nil)))

After the patch we reject that because:

  (set (reg:V8QI X) (const_vector:V8QI [3]))

is costed as a MOVI (cost 4) and the original aarch64_simd_mov_from_v16qihigh
is costed as zero.  In other words, the patch makes the DUP (lane) in the
โ€œmov highโ€ strictly cheaper than a constant move (MOVI).

Preventing this fold seems like a key part of being able to match the
*l2 forms in the testcase, since otherwise the "mov high" disappears and
isn't available for combining later.

> MOVI as I mentioned before is the one case where this is a toss up.  But there are far
> more constants that cannot be created with a movi.  A simple example is
>
> #include <arm_neon.h>
>
> int8x16_t square(int8x16_t full, int8x8_t small) {
>     int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15};
>     int8x8_t low = vget_high_s8 (cst);
>     int8x8_t res1 = vmul_s8 (small, low);
>     return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, res1));
> }
>
> Where in Gimple we get
>
>   <bb 2> [local count: 1073741824]:
>   _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 });
>   _4 = _2 * small_3(D);
>   _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
>   _7 = __builtin_aarch64_combinev8qi (_4, _4);
>   _8 = _6 + _7;
>   return _8;
>
> Regardless of what happens to __builtin_aarch64_get_highv16qi nothing will recreate the relationship
> with cst, whether __builtin_aarch64_get_highv16qi is lowered or not, constant prop will still push in constants.

Yeah, constants are (by design) free in gimple.  But that's OK in itself,
because RTL optimisers have the job of removing any duplicates that end
up requiring separate moves.  I think we both agree on that.

E.g. for:

#include <arm_neon.h>

void foo(int8x16_t *x) {
  x[0] = vaddq_s8 (x[0], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
  x[1] = vaddq_s8 (x[1], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
}

the final gimple is:

  <bb 2> [local count: 1073741824]:
  _1 = *x_4(D);
  _5 = _1 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  *x_4(D) = _5;
  _2 = MEM[(int8x16_t *)x_4(D) + 16B];
  _7 = _2 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  MEM[(int8x16_t *)x_4(D) + 16B] = _7;
  return;

but cse1 removes the duplicated constant even before the patch.

> This codegen results in us rematerializing the constant twice.
>
> square:
>         adrp    x0, .LC0
>         ldr     d2, [x0, #:lo12:.LC0]
>         adrp    x0, .LC1
>         ldr     q3, [x0, #:lo12:.LC1]
>         mul     v1.8b, v2.8b, v1.8b
>         dup     d2, v1.d[0]
>         ins     v2.d[1], v1.d[0]
>         mla     v2.16b, v0.16b, v3.16b
>         mov     v0.16b, v2.16b
>         ret
> .LC1:
>         .byte   0
>         .byte   1
>         .byte   2
>         .byte   3
>         .byte   4
>         .byte   5
>         .byte   6
>         .byte   7
>         .byte   8
>         .byte   9
>         .byte   10
>         .byte   11
>         .byte   12
>         .byte   13
>         .byte   15
>         .byte   0
>
> Regardless whether it's pushed into a high operation or not this codegen it's still far more expensive to do this codegen.

The problem is that here, the patch is preventing CSE from first folding
the RTL equivalent of:

  _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 });

to the RTL equivalent of:

  { 8, 9, 10, 11, 12, 13, 15, 0 }

Preventing the fold keeps two instances of the original (RTL) constant and
so prompts CSE to remove the duplicate.

CSE does do the fold without the patch.  Before cse1 we have:

(insn 7 4 8 2 (set (reg:V16QI 99)
        (const_vector:V16QI [
                (const_int 0 [0])
                (const_int 1 [0x1])
                (const_int 2 [0x2])
                (const_int 3 [0x3])
                (const_int 4 [0x4])
                (const_int 5 [0x5])
                (const_int 6 [0x6])
                (const_int 7 [0x7])
                (const_int 8 [0x8])
                (const_int 9 [0x9])
                (const_int 10 [0xa])
                (const_int 11 [0xb])
                (const_int 12 [0xc])
                (const_int 13 [0xd])
                (const_int 15 [0xf])
                (const_int 0 [0])
            ])) "include/arm_neon.h":6449:11 1166 {*aarch64_simd_movv16qi}
     (nil))
(insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ])
        (vec_select:V8QI (reg:V16QI 99)
            (parallel:V16QI [
                    (const_int 8 [0x8])
                    (const_int 9 [0x9])
                    (const_int 10 [0xa])
                    (const_int 11 [0xb])
                    (const_int 12 [0xc])
                    (const_int 13 [0xd])
                    (const_int 14 [0xe])
                    (const_int 15 [0xf])
                ]))) "include/arm_neon.h":6449:11 1394 {aarch64_simd_mov_from_v16qihigh}
     (nil))

then unpatched cse1 converts insn 8 to:

(insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ])
        (const_vector:V8QI [
                (const_int 8 [0x8])
                (const_int 9 [0x9])
                (const_int 10 [0xa])
                (const_int 11 [0xb])
                (const_int 12 [0xc])
                (const_int 13 [0xd])
                (const_int 15 [0xf])
                (const_int 0 [0])
            ])) "include/arm_neon.h":6449:11 1160 {*aarch64_simd_movv8qi}
     (expr_list:REG_DEAD (reg:V16QI 99)
        (nil)))

so that there are no longer any duplicate constants (as far as the
RTL code is concerned).  Instead we have one 16-byte constant and
one 8-byte constant.

The patch prevents the fold on insn 8 by making the "mov high"
strictly cheaper than the constant move, so we keep the "mov high"
and its 16-byte input.  Keeping the "mov high" means that we do
have a duplicate constant for CSE to remove.

What I meant...

>> Also, if I've understood correctly, it looks like we'd be relying on the
>> vget_high of a constant remaining unfolded until RTL cse1.
>> I think it's likely in future that we'd try to fold vget_high at the gimple level
>> instead, since that could expose more optimisations of a different kind.  The
>> gimple optimisers would then fold vget_high(constant) in a similar way to
>> cse1 does now.
>> 
>> So perhaps we should continue to allow the vget_high(constant) to be
>> folded in cse1 and come up with some way of coping with the folded form.

...here was that, in future, the gimple optimisers might be able to
fold the vget_high themselves.  For your example, we'd then have:

  _4 = { 8, 9, 10, 11, 12, 13, 15, 0 } * small_3(D);
  _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
  _7 = __builtin_aarch64_combinev8qi (_4, _4);
  _8 = _6 + _7;
  return _8;

In this situation, we'd need to recreate the relationship between
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 } and
{ 8, 9, 10, 11, 12, 13, 15, 0 }.  We can't ensure that the
relationship is never lost.

The same thing would be true for vget_low.  So a constant like:

  cst = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 }
  ... vget_low* (cst) ...;
  ... vget_high* (cst) ...;

could be folded to two smaller constants:

  ... { 0, 1, 2, 3, 4, 5, 6, 7 } ...;
  ... { 8, 9, 10, 11, 12, 13, 15, 0 } ...;

We might then need to recreate the combined form, rather than relying
on the combined form already existing.
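
For example (an invented case, just to illustrate):

#include <arm_neon.h>

int8x16_t
both_halves (int8x8_t a, int8x8_t b)
{
  int8x16_t cst = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
  /* Both halves of the same constant are taken and multiplied.  */
  int8x8_t lo = vmul_s8 (a, vget_low_s8 (cst));
  int8x8_t hi = vmul_s8 (b, vget_high_s8 (cst));
  return vcombine_s8 (lo, hi);
}

Once the gimple optimisers fold both vget_*s, only the two 8-byte constants remain, so sharing a single register would mean recreating the combined 16-byte constant ourselves.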

> CSE1 doesn't fold it, because for CSE the cost is too high to do so. Which is what this costing was attempting to fix.
> CSE simply does not touch it. It leaves it as
>
> (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
>         (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
>                 (parallel:V16QI [
>                         (const_int 0 [0])
>                         (const_int 1 [0x1])
>                         (const_int 2 [0x2])
>                         (const_int 3 [0x3])
>                         (const_int 4 [0x4])
>                         (const_int 5 [0x5])
>                         (const_int 6 [0x6])
>                         (const_int 7 [0x7])
>                     ]))
>             (reg:V8QI 93 [ _4 ]))) "":6506:10 1908 {aarch64_simd_move_hi_quad_v16qi}
>      (nil))
> (insn 12 11 13 2 (set (reg:V16QI 102)
>         (const_vector:V16QI [
>                 (const_int 0 [0])
>                 (const_int 1 [0x1])
>                 (const_int 2 [0x2])
>                 (const_int 3 [0x3])
>                 (const_int 4 [0x4])
>                 (const_int 5 [0x5])
>                 (const_int 6 [0x6])
>                 (const_int 7 [0x7])
>                 (const_int 8 [0x8])
>                 (const_int 9 [0x9])
>                 (const_int 10 [0xa])
>                 (const_int 11 [0xb])
>                 (const_int 12 [0xc])
>                 (const_int 13 [0xd])
>                 (const_int 15 [0xf])
>                 (const_int 0 [0])
>             ])) "":1466:14 1166 {*aarch64_simd_movv16qi}
>      (nil))

I don't think that's true for the unpatched compiler.  Are you sure
this isn't the "pre-CSE" part of the dump?  CSE is confusing (to me)
in that it prints each function twice, once in unoptimised form and
later in optimised form.

> And I don't see any way to fix this without having Gimple not push constants in, which would lead to worse regressions.
> I can change the patch to cost the high as a dup which fixes this codegen at least and has you rematerialize movi.   If that's
> not acceptable I can drop costing for High entirely then, it's not the main thing I am fixing.

Costing the high as a dup leaves us in the same situation as before the
patch: the folded V8QI constant is cheaper than the unfolded mov high.

Thanks,
Richard
Tamar Christina Oct. 25, 2021, 11:49 a.m. UTC | #10
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, October 25, 2021 10:54 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>; Richard
> Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus
> Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants
> and operations
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Saturday, October 23, 2021 11:40 AM
> >> To: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>
> >> Cc: Tamar Christina <Tamar.Christina@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>
> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector
> >> constants and operations
> >>
> >> Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >> >> I'm still a bit sceptical about treating the high-part cost as lower.
> >> >> ISTM that the subreg cases are the ones that are truly โ€œfreeโ€ and
> >> >> any others should have a normal cost.  So if CSE handled the
> >> >> subreg case itself (to model how the rtx would actually be
> >> >> generated) then
> >> >> aarch64 code would have to do less work.  I imagine that will be
> >> >> true for
> >> other targets as well.
> >> >
> >> > I guess the main problem is that CSE lacks context because it's not
> >> > until after combine that the high part becomes truly "free" when
> >> > pushed
> >> into a high operation.
> >>
> >> Yeah.  And the aarch64 code is just being asked to cost the operation
> >> it's given, which could for example come from an existing
> >> aarch64_simd_mov_from_<mode>high.  I think we should try to ensure
> >> that a aarch64_simd_mov_from_<mode>high followed by some
> arithmetic
> >> on the result is more expensive than the fused operation (when fusing
> >> is possible).
> >>
> >> An analogy might be: if the cost code is given:
> >>
> >>   (add (reg X) (reg Y))
> >>
> >> then, at some later point, the (reg X) might be replaced with a
> >> multiplication, in which case we'd have a MADD operation and the
> >> addition is effectively free.  Something similar would happen if (reg
> >> X) became a shift by a small amount on newer cores, although I guess
> >> then you could argue either that the cost of the add disappears or that
> the cost of the shift disappears.
> >>
> >> But we shouldn't count ADD as free on the basis that it could be
> >> combined with a multiplication or shift in future.  We have to cost
> >> what we're given.  I think the same thing applies to the high part.
> >>
> >> Here we're trying to prevent cse1 from replacing a DUP (lane) with a
> >> MOVI by saying that the DUP is strictly cheaper than the MOVI.
> >> I don't think that's really true though, and the cost tables in the
> >> patch say that DUP is more expensive (rather than less expensive) than
> MOVI.
> >
> > No we're not. The front end has already pushed the constant into each
> > operation that needs it which is the entire problem.
> 
> I think we're talking about different things here.  I'll come to the gimple stuff
> below, but I was talking purely about the effect on the RTL optimisers.  What
> I meant above is that, in the cse1 dumps, the patch leads to changes like:
> 
>  (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
> -        (const_vector:V8QI [
> +        (vec_select:V8QI (reg:V16QI 116)
> +            (parallel:V16QI [
> +                    (const_int 8 [0x8])
> +                    (const_int 9 [0x9])
> +                    (const_int 10 [0xa])
> +                    (const_int 11 [0xb])
> +                    (const_int 12 [0xc])
> +                    (const_int 13 [0xd])
> +                    (const_int 14 [0xe])
> +                    (const_int 15 [0xf])
> +                ]))) "include/arm_neon.h":6477:22 1394
> {aarch64_simd_mov_from_v16qihigh}
> +     (expr_list:REG_EQUAL (const_vector:V8QI [
>                  (const_int 3 [0x3]) repeated x8
> -            ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
> -     (expr_list:REG_DEAD (reg:V16QI 117)
> -        (nil)))
> +            ])
> +        (expr_list:REG_DEAD (reg:V16QI 117)
> +            (nil))))
> 
> The pre-cse1 code is:
> 
> (insn 19 18 20 2 (set (reg:V16QI 117)
>         (const_vector:V16QI [
>                 (const_int 3 [0x3]) repeated x16
>             ])) "include/arm_neon.h":6477:22 1166 {*aarch64_simd_movv16qi}
>      (nil))
> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
>         (vec_select:V8QI (reg:V16QI 117)
>             (parallel:V16QI [
>                     (const_int 8 [0x8])
>                     (const_int 9 [0x9])
>                     (const_int 10 [0xa])
>                     (const_int 11 [0xb])
>                     (const_int 12 [0xc])
>                     (const_int 13 [0xd])
>                     (const_int 14 [0xe])
>                     (const_int 15 [0xf])
>                 ]))) "include/arm_neon.h":6477:22 1394
> {aarch64_simd_mov_from_v16qihigh}
>      (nil))
> 
> That is, before the patch, we folded insn 19 into insn 20 to get:
> 
> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
>         (const_vector:V8QI [
>                 (const_int 3 [0x3]) repeated x8
>             ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
>      (expr_list:REG_DEAD (reg:V16QI 117)
>         (nil)))
> 
> After the patch we reject that because:
> 
>   (set (reg:V8QI X) (const_vector:V8QI [3]))
> 
> is costed as a MOVI (cost 4) and the original
> aarch64_simd_mov_from_v16qihigh is costed as zero.  In other words, the
> patch makes the DUP (lane) in the "mov high" strictly cheaper than a
> constant move (MOVI).

Yes, this was done intentionally because, as we talked about a month ago, there's
no real way to cost this correctly.  The use of `X` there determines whether it's cheaper
to use the movi over the dup.  The MOVI not only prevents re-use of the value, it also
prevents combining into high operations.  Neither of those can really be told at this point,
given how CSE and costing are currently set up.

This is an unmodified compiler created from last night's trunk https://godbolt.org/z/1saTP4xWs

While yes, it did fold the movi into the set, reg 19 wasn't dead, so you now materialize the constant 3 times:

test0:
        ldr     q0, [x0]
        movi    v3.8b, 0x3  <<<< first
        ldr     q2, [x1]
        movi    v5.16b, 0x3 <<< second
        uxtl    v1.8h, v0.8b
        dup     d4, v2.d[1] <<< third
        uxtl2   v0.8h, v0.16b
        umlal   v1.8h, v2.8b, v5.8b
        umlal   v0.8h, v4.8b, v3.8b
        addhn   v0.8b, v1.8h, v0.8h
        str     d0, [x2]
        ret

whilst my patch generates:

test0:
	movi	v2.16b, 0x3 <<< once
	ldr	q0, \[x0\]
	uxtl	v1.8h, v0.8b
	uxtl2	v0.8h, v0.16b
	ldr	q3, \[x1\]
	umlal	v1.8h, v3.8b, v2.8b
	umlal2	v0.8h, v3.16b, v2.16b
	addhn	v0.8b, v1.8h, v0.8h
	str	d0, \[x2\]
	ret

Yes, it's not perfect, and yes, you can end up with a dup instead of two movi's, but my argument is that it's still a step forward,
as the perfect solution doesn't seem to be possible at all with the way things are currently set up.

> 
> Preventing this fold seems like a key part of being able to match the
> *l2 forms in the testcase, since otherwise the "mov high" disappears and isn't
> available for combining later.

Yes, and by preventing the folding, combine should in principle be able to fold it back if it wasn't pushed into another
instruction; but combine does not attempt to touch constants and selects on their own.  If it did, this "regression" would be fixed.

I'm not really sure what we're arguing about.  I did think about all three possible cases when making this:

https://godbolt.org/z/hjWhWq1v1

Of the three cases the compiler currently only generates something good for test2.  Both test1 and test0 are deficient.
The patch doesn't change test2, significantly improves test0, and whether test1 is a regression is likely uArch-specific.

On Arm Cortex CPUs it is not a regression as a DUP on a SIMD scalar has the same throughput and latencies as a MOVI
according to the Arm Performance Software Optimization guides.

So to me this looks like an improvement overall.  And this is where we likely disagree?

> 
> > MOVI as I mentioned before is the one case where this is a toss up.
> > But there are far more constants that cannot be created with a movi.
> > A simple example is
> >
> > #include <arm_neon.h>
> >
> > int8x16_t square(int8x16_t full, int8x8_t small) {
> >     int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15};
> >     int8x8_t low = vget_high_s8 (cst);
> >     int8x8_t res1 = vmul_s8 (small, low);
> >     return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, res1));
> > }
> >
> > Where in Gimple we get
> >
> >   <bb 2> [local count: 1073741824]:
> >   _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
> 13, 15, 0 });
> >   _4 = _2 * small_3(D);
> >   _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
> >   _7 = __builtin_aarch64_combinev8qi (_4, _4);
> >   _8 = _6 + _7;
> >   return _8;
> >
> > Regardless of what happens to __builtin_aarch64_get_highv16qi nothing
> > will recreate the relationship with cst, whether
> __builtin_aarch64_get_highv16qi is lowered or not, constant prop will still
> push in constants.
> 
> Yeah, constants are (by design) free in gimple.  But that's OK in itself,
> because RTL optimisers have the job of removing any duplicates that end up
> requiring separate moves.  I think we both agree on that.
> 
> E.g. for:
> 
> #include <arm_neon.h>
> 
> void foo(int8x16_t *x) {
>   x[0] = vaddq_s8 (x[0], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
>   x[1] = vaddq_s8 (x[1], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
> }
> 
> the final gimple is:
> 
>   <bb 2> [local count: 1073741824]:
>   _1 = *x_4(D);
>   _5 = _1 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
>   *x_4(D) = _5;
>   _2 = MEM[(int8x16_t *)x_4(D) + 16B];
>   _7 = _2 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
>   MEM[(int8x16_t *)x_4(D) + 16B] = _7;
>   return;
> 
> but cse1 removes the duplicated constant even before the patch.

It doesn't for me, again an unmodified compiler:

https://godbolt.org/z/qnvf7496h 

and CSE1 has as the final codegen:

(insn 7 4 8 2 (set (reg:V16QI 99)
        (const_vector:V16QI [
                (const_int 0 [0])
                (const_int 1 [0x1])
                (const_int 2 [0x2])
                (const_int 3 [0x3])
                (const_int 4 [0x4])
                (const_int 5 [0x5])
                (const_int 6 [0x6])
                (const_int 7 [0x7])
                (const_int 8 [0x8])
                (const_int 9 [0x9])
                (const_int 10 [0xa])
                (const_int 11 [0xb])
                (const_int 12 [0xc])
                (const_int 13 [0xd])
                (const_int 15 [0xf])
                (const_int 0 [0])
            ]))

(insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ])
        (const_vector:V8QI [
                (const_int 8 [0x8])
                (const_int 9 [0x9])
                (const_int 10 [0xa])
                (const_int 11 [0xb])
                (const_int 12 [0xc])
                (const_int 13 [0xd])
                (const_int 15 [0xf])
                (const_int 0 [0])
            ]))

(insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
        (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
                (parallel:V16QI [
                        (const_int 0 [0])
                        (const_int 1 [0x1])
                        (const_int 2 [0x2])
                        (const_int 3 [0x3])
                        (const_int 4 [0x4])
                        (const_int 5 [0x5])
                        (const_int 6 [0x6])
                        (const_int 7 [0x7])
                    ]))
            (reg:V8QI 93 [ _4 ])))

So again the same constant is represented twice, which is reflected in the codegen.

> 
> > This codegen results in us rematerializing the constant twice.
> >
> > square:
> >         adrp    x0, .LC0
> >         ldr     d2, [x0, #:lo12:.LC0]
> >         adrp    x0, .LC1
> >         ldr     q3, [x0, #:lo12:.LC1]
> >         mul     v1.8b, v2.8b, v1.8b
> >         dup     d2, v1.d[0]
> >         ins     v2.d[1], v1.d[0]
> >         mla     v2.16b, v0.16b, v3.16b
> >         mov     v0.16b, v2.16b
> >         ret
> > .LC1:
> >         .byte   0
> >         .byte   1
> >         .byte   2
> >         .byte   3
> >         .byte   4
> >         .byte   5
> >         .byte   6
> >         .byte   7
> >         .byte   8
> >         .byte   9
> >         .byte   10
> >         .byte   11
> >         .byte   12
> >         .byte   13
> >         .byte   15
> >         .byte   0
> >
> > Regardless of whether it's pushed into a high operation or not, this
> > codegen is still far more expensive.
> 
> The problem is that here, the patch is preventing CSE from first folding the
> RTL equivalent of:
> 
>   _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
> 13, 15, 0 });
> 
> to the RTL equivalent of:
> 
>   { 8, 9, 10, 11, 12, 13, 15, 0 }
> 
> Preventing the fold keeps two instances of the original (RTL) constant and so
> prompts CSE to remove the duplicate.
> 
> CSE does do the fold without the patch.  Before cse1 we have:
> 
> (insn 7 4 8 2 (set (reg:V16QI 99)
>         (const_vector:V16QI [
>                 (const_int 0 [0])
>                 (const_int 1 [0x1])
>                 (const_int 2 [0x2])
>                 (const_int 3 [0x3])
>                 (const_int 4 [0x4])
>                 (const_int 5 [0x5])
>                 (const_int 6 [0x6])
>                 (const_int 7 [0x7])
>                 (const_int 8 [0x8])
>                 (const_int 9 [0x9])
>                 (const_int 10 [0xa])
>                 (const_int 11 [0xb])
>                 (const_int 12 [0xc])
>                 (const_int 13 [0xd])
>                 (const_int 15 [0xf])
>                 (const_int 0 [0])
>             ])) "include/arm_neon.h":6449:11 1166 {*aarch64_simd_movv16qi}
>      (nil))
> (insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ])
>         (vec_select:V8QI (reg:V16QI 99)
>             (parallel:V16QI [
>                     (const_int 8 [0x8])
>                     (const_int 9 [0x9])
>                     (const_int 10 [0xa])
>                     (const_int 11 [0xb])
>                     (const_int 12 [0xc])
>                     (const_int 13 [0xd])
>                     (const_int 14 [0xe])
>                     (const_int 15 [0xf])
>                 ]))) "include/arm_neon.h":6449:11 1394
> {aarch64_simd_mov_from_v16qihigh}
>      (nil))
> 
> then unpatched cse1 converts insn 8 to:
> 
> (insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ])
>         (const_vector:V8QI [
>                 (const_int 8 [0x8])
>                 (const_int 9 [0x9])
>                 (const_int 10 [0xa])
>                 (const_int 11 [0xb])
>                 (const_int 12 [0xc])
>                 (const_int 13 [0xd])
>                 (const_int 15 [0xf])
>                 (const_int 0 [0])
>             ])) "include/arm_neon.h":6449:11 1160 {*aarch64_simd_movv8qi}
>      (expr_list:REG_DEAD (reg:V16QI 99)
>         (nil)))
> 

But again, the constant is not single use, so CSE keeps multiple copies live.  My whole
argument is that CSE should not perform any folding before combine, because it
simply does not have enough information to do the right thing.

It's still being folded, just not by CSE, which runs too early.  CE2 still folds it into

(insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ])
        (const_vector:V8QI [
                (const_int 8 [0x8])
                (const_int 9 [0x9])
                (const_int 10 [0xa])
                (const_int 11 [0xb])
                (const_int 12 [0xc])
                (const_int 13 [0xd])
                (const_int 15 [0xf])
                (const_int 0 [0])
            ]))

But at least until then the RTL optimizers knew about the relationship.

My patch changes the generated code.  Instead of

square:
        adrp    x0, .LC0
        ldr     d2, [x0, #:lo12:.LC0]
        adrp    x0, .LC1
        ldr     q3, [x0, #:lo12:.LC1]
        mul     v1.8b, v2.8b, v1.8b
        dup     d2, v1.d[0]
        ins     v2.d[1], v1.d[0]
        mla     v2.16b, v0.16b, v3.16b
        mov     v0.16b, v2.16b
        ret
single:
        adrp    x0, .LC0
        ldr     d2, [x0, #:lo12:.LC0]
        mul     v1.8b, v2.8b, v1.8b
        dup     d2, v1.d[0]
        ins     v2.d[1], v1.d[0]
        add     v0.16b, v2.16b, v0.16b
        ret

it gives

square:
        adrp    x0, .LC0
        ldr     q3, [x0, #:lo12:.LC0]
        dup     d2, v3.d[1]
        mul     v1.8b, v2.8b, v1.8b
        dup     d2, v1.d[0]
        ins     v2.d[1], v1.d[0]
        mla     v2.16b, v0.16b, v3.16b
        mov     v0.16b, v2.16b
        ret
single:
        adrp    x0, .LC1
        ldr     d2, [x0, #:lo12:.LC1]
        mul     v1.8b, v2.8b, v1.8b
        dup     d2, v1.d[0]
        ins     v2.d[1], v1.d[0]
        add     v0.16b, v2.16b, v0.16b
        ret

which now just selects the high part instead of doing a full address computation + load.
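
In intrinsics terms the only change is where the 8-byte half comes from; a rough
sketch of the two forms (illustrative only, not code from the patch):

#include <arm_neon.h>

/* Old codegen: the 8-byte constant is its own literal-pool load
   (the adrp/ldr d2 pair above).  */
int8x8_t high_before (void)
{
  int8x8_t half = { 8, 9, 10, 11, 12, 13, 15, 0 };
  return half;
}

/* New codegen: the half is extracted from the 16-byte constant that is
   already live for the mla (the dup d2, v3.d[1] above).  */
int8x8_t high_after (int8x16_t cst)
{
  return vget_high_s8 (cst);
}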

> so that there are no longer any duplicate constants (as far as the RTL code is
> concerned).  Instead we have one 16-byte constant and one 8-byte constant.
> 
> The patch prevents the fold on insn 8 by making the “mov high”
> strictly cheaper than the constant move, so we keep the “mov high”
> and its 16-byte input.  Keeping the “mov high” means that we do have a
> duplicate constant for CSE to remove.
> 
> What I meant…
> 
> >> Also, if I've understood correctly, it looks like we'd be relying on
> >> the vget_high of a constant remaining unfolded until RTL cse1.
> >> I think it's likely in future that we'd try to fold vget_high at the
> >> gimple level instead, since that could expose more optimisations of a
> >> different kind.  The gimple optimisers would then fold
> >> vget_high(constant) in a similar way to
> >> cse1 does now.
> >>
> >> So perhaps we should continue to allow the vget_high(constant) to be
> >> folded in cse1 and come up with some way of coping with the folded
> form.
> 
> …here was that, in future, the gimple optimisers might be able to fold the
> vget_high themselves.  For your example, we'd then have:
> 
>   _4 = { 8, 9, 10, 11, 12, 13, 15, 0 } * small_3(D);
>   _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
>   _7 = __builtin_aarch64_combinev8qi (_4, _4);
>   _8 = _6 + _7;
>   return _8;
> 
> In this situation, we'd need to recreate the relationship between { 0, 1, 2, 3, 4,
> 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 } and { 8, 9, 10, 11, 12, 13, 15, 0 }.  We can't
> ensure that the relationship is never lost.
> 
> The same thing would be true for vget_low.  So a constant like:
> 
>   cst = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 }
>   … vget_low* (cst) ..;
>   … vget_high* (cst) …;
> 
> could be folded to two smaller constants:
> 
>   … { 0, 1, 2, 3, 4, 5, 6, 7 } …;
>   … { 8, 9, 10, 11, 12, 13, 15, 0 } …;
> 
> We might then need to recreate the combined form, rather than relying on
> the combined form already existing.

Yes, but this is what confuses me. My patch changes it so that CSE1, which is run
relatively early, is able to find the relationship between the two constants.

CSE1 shouldn't do any folding; it doesn't have enough information to do so.
By doing the folding, CSE makes combine less effective.

> 
> > CSE1 doesn't fold it, because for CSE the cost is too high to do so. Which is
> what this costing was attempting to fix.
> > CSE simply does not touch it. It leaves it as
> >
> > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
> >         (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
> >                 (parallel:V16QI [
> >                         (const_int 0 [0])
> >                         (const_int 1 [0x1])
> >                         (const_int 2 [0x2])
> >                         (const_int 3 [0x3])
> >                         (const_int 4 [0x4])
> >                         (const_int 5 [0x5])
> >                         (const_int 6 [0x6])
> >                         (const_int 7 [0x7])
> >                     ]))
> >             (reg:V8QI 93 [ _4 ]))) "":6506:10 1908
> {aarch64_simd_move_hi_quad_v16qi}
> >      (nil))
> > (insn 12 11 13 2 (set (reg:V16QI 102)
> >         (const_vector:V16QI [
> >                 (const_int 0 [0])
> >                 (const_int 1 [0x1])
> >                 (const_int 2 [0x2])
> >                 (const_int 3 [0x3])
> >                 (const_int 4 [0x4])
> >                 (const_int 5 [0x5])
> >                 (const_int 6 [0x6])
> >                 (const_int 7 [0x7])
> >                 (const_int 8 [0x8])
> >                 (const_int 9 [0x9])
> >                 (const_int 10 [0xa])
> >                 (const_int 11 [0xb])
> >                 (const_int 12 [0xc])
> >                 (const_int 13 [0xd])
> >                 (const_int 15 [0xf])
> >                 (const_int 0 [0])
> >             ])) "":1466:14 1166 {*aarch64_simd_movv16qi}
> >      (nil))
> 
> I don't think that's true for the unpatched compiler.  Are you sure this isn't
> the “pre-CSE” part of the dump?  CSE is confusing (to me) in that it prints
> each function twice, once in unoptimised form and later in optimised form.
> 

Yes I'm sure, see all the compiler explorer links above.

> > And I don't see any way to fix this without having Gimple not push
> constants in, which would lead to worse regressions.
> > I can change the patch to cost the high as a dup which fixes this codegen at
> least and has you rematerialize movi.   If that's
> > not acceptable I can drop costing for High entirely then, it's not the main
> thing I am fixing.
> 
> Costing the high as a dup leaves us in the same situation as before the
> patch: the folded V8QI constant is cheaper than the unfolded mov high.

Yes, and the dup will reflect that. The argument that it's not the right cost no
longer holds any water in that case.  In particular, I still maintain that it is too
early to do any constant folding in CSE1 for AArch64.

Whether it's folded or not doesn't make any difference to combine, which will
fold when combinations are possible with the folded version.

So I have yet to see any actual regression.

But again, if I can't get this one fixed, I'm happy to remove the costing for high-part
select completely and just move on.  My aim here was to fix element extracts, and it
would have been nice to fix the other obviously bad codegen as well.

Tamar
> 
> Thanks,
> Richard
Richard Sandiford Oct. 25, 2021, 2:32 p.m. UTC | #11
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Monday, October 25, 2021 10:54 AM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>; Richard
>> Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus
>> Shawcroft <Marcus.Shawcroft@arm.com>
>> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants
>> and operations
>> 
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford <richard.sandiford@arm.com>
>> >> Sent: Saturday, October 23, 2021 11:40 AM
>> >> To: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>
>> >> Cc: Tamar Christina <Tamar.Christina@arm.com>; Richard Earnshaw
>> >> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus Shawcroft
>> >> <Marcus.Shawcroft@arm.com>
>> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector
>> >> constants and operations
>> >>
>> >> Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> >> >> I'm still a bit sceptical about treating the high-part cost as lower.
>> >> >> ISTM that the subreg cases are the ones that are truly “free” and
>> >> >> any others should have a normal cost.  So if CSE handled the
>> >> >> subreg case itself (to model how the rtx would actually be
>> >> >> generated) then
>> >> >> aarch64 code would have to do less work.  I imagine that will be
>> >> >> true for
>> >> other targets as well.
>> >> >
>> >> > I guess the main problem is that CSE lacks context because it's not
>> >> > until after combine that the high part becomes truly "free" when
>> >> > pushed
>> >> into a high operation.
>> >>
>> >> Yeah.  And the aarch64 code is just being asked to cost the operation
>> >> it's given, which could for example come from an existing
>> >> aarch64_simd_mov_from_<mode>high.  I think we should try to ensure
>> >> that a aarch64_simd_mov_from_<mode>high followed by some
>> arithmetic
>> >> on the result is more expensive than the fused operation (when fusing
>> >> is possible).
>> >>
>> >> An analogy might be: if the cost code is given:
>> >>
>> >>   (add (reg X) (reg Y))
>> >>
>> >> then, at some later point, the (reg X) might be replaced with a
>> >> multiplication, in which case we'd have a MADD operation and the
>> >> addition is effectively free.  Something similar would happen if (reg
>> >> X) became a shift by a small amount on newer cores, although I guess
>> >> then you could argue either that the cost of the add disappears or that
>> the cost of the shift disappears.
>> >>
>> >> But we shouldn't count ADD as free on the basis that it could be
>> >> combined with a multiplication or shift in future.  We have to cost
>> >> what we're given.  I think the same thing applies to the high part.
>> >>
>> >> Here we're trying to prevent cse1 from replacing a DUP (lane) with a
>> >> MOVI by saying that the DUP is strictly cheaper than the MOVI.
>> >> I don't think that's really true though, and the cost tables in the
>> >> patch say that DUP is more expensive (rather than less expensive) than
>> MOVI.
>> >
>> > No we're not. The front end has already pushed the constant into each
>> > operation that needs it which is the entire problem.
>> 
>> I think we're talking about different things here.  I'll come to the gimple stuff
>> below, but I was talking purely about the effect on the RTL optimisers.  What
>> I meant above is that, in the cse1 dumps, the patch leads to changes like:
>> 
>>  (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
>> -        (const_vector:V8QI [
>> +        (vec_select:V8QI (reg:V16QI 116)
>> +            (parallel:V16QI [
>> +                    (const_int 8 [0x8])
>> +                    (const_int 9 [0x9])
>> +                    (const_int 10 [0xa])
>> +                    (const_int 11 [0xb])
>> +                    (const_int 12 [0xc])
>> +                    (const_int 13 [0xd])
>> +                    (const_int 14 [0xe])
>> +                    (const_int 15 [0xf])
>> +                ]))) "include/arm_neon.h":6477:22 1394
>> {aarch64_simd_mov_from_v16qihigh}
>> +     (expr_list:REG_EQUAL (const_vector:V8QI [
>>                  (const_int 3 [0x3]) repeated x8
>> -            ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
>> -     (expr_list:REG_DEAD (reg:V16QI 117)
>> -        (nil)))
>> +            ])
>> +        (expr_list:REG_DEAD (reg:V16QI 117)
>> +            (nil))))
>> 
>> The pre-cse1 code is:
>> 
>> (insn 19 18 20 2 (set (reg:V16QI 117)
>>         (const_vector:V16QI [
>>                 (const_int 3 [0x3]) repeated x16
>>             ])) "include/arm_neon.h":6477:22 1166 {*aarch64_simd_movv16qi}
>>      (nil))
>> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
>>         (vec_select:V8QI (reg:V16QI 117)
>>             (parallel:V16QI [
>>                     (const_int 8 [0x8])
>>                     (const_int 9 [0x9])
>>                     (const_int 10 [0xa])
>>                     (const_int 11 [0xb])
>>                     (const_int 12 [0xc])
>>                     (const_int 13 [0xd])
>>                     (const_int 14 [0xe])
>>                     (const_int 15 [0xf])
>>                 ]))) "include/arm_neon.h":6477:22 1394
>> {aarch64_simd_mov_from_v16qihigh}
>>      (nil))
>> 
>> That is, before the patch, we folded insn 19 into insn 20 to get:
>> 
>> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
>>         (const_vector:V8QI [
>>                 (const_int 3 [0x3]) repeated x8
>>             ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
>>      (expr_list:REG_DEAD (reg:V16QI 117)
>>         (nil)))
>> 
>> After the patch we reject that because:
>> 
>>   (set (reg:V8QI X) (const_vector:V8QI [3]))
>> 
>> is costed as a MOVI (cost 4) and the original
>> aarch64_simd_mov_from_v16qihigh is costed as zero.  In other words, the
>> patch makes the DUP (lane) in the โ€œmov highโ€ strictly cheaper than a
>> constant move (MOVI).
>
> Yes, this was done intentionally because, as we talked about a month ago, there's
> no real way to cost this correctly. The use of `X` there determines whether it's cheaper
> to use the movi over the dup.  The MOVI not only prevents re-use of the value, it also
> prevents combining into high operations.  None of that can currently be determined with
> the way CSE and costing are done.
>
> This is an unmodified compiler created from last night's trunk https://godbolt.org/z/1saTP4xWs
>
> While yes, it did fold the movi into the set, reg 19 wasn't dead, so you end up materializing the constant 3 times
>
> test0:
>         ldr     q0, [x0]
>         movi    v3.8b, 0x3  <<<< first
>         ldr     q2, [x1]
>         movi    v5.16b, 0x3 <<< second
>         uxtl    v1.8h, v0.8b
>         dup     d4, v2.d[1] <<< third
>         uxtl2   v0.8h, v0.16b
>         umlal   v1.8h, v2.8b, v5.8b
>         umlal   v0.8h, v4.8b, v3.8b
>         addhn   v0.8b, v1.8h, v0.8h
>         str     d0, [x2]
>         ret
>
> whilst my patch generates
>
> test0:
> 	movi	v2.16b, 0x3 <<< once
> 	ldr	q0, \[x0\]
> 	uxtl	v1.8h, v0.8b
> 	uxtl2	v0.8h, v0.16b
> 	ldr	q3, \[x1\]
> 	umlal	v1.8h, v3.8b, v2.8b
> 	umlal2	v0.8h, v3.16b, v2.16b
> 	addhn	v0.8b, v1.8h, v0.8h
> 	str	d0, \[x2\]
> 	ret
>
> Yes, it's not perfect; yes, you can end up with a dup instead of two movi's, but my argument is it's still a step forward,
> as the perfect solution doesn't seem to be possible at all with the way things are currently set up.

I agree there's no out-of-the-box way of getting what we want for the
original testcases.  It would require changes outside the target or
(if the worst comes to the worst) a target-specific pass.

>> Preventing this fold seems like a key part of being able to match the
>> *l2 forms in the testcase, since otherwise the “mov high” disappears and isn't
>> available for combining later.
>
> Yes, and by preventing the folding, combine should in principle be able to fold it back if it wasn't pushed into another
> instruction, but combine does not attempt to touch constants and selects on their own. If it did, this "regression" would be fixed.

The problem is that combine is limited to individual EBBs and only
combines def-use chains when there is a single use.  It's not a general
folding engine.
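
For instance, in something like the following (a minimal sketch, not one of the
testcases), the constant has two uses, so combine will not substitute it anywhere:

#include <arm_neon.h>

int8x16_t two_uses (int8x16_t a, int8x16_t b)
{
  /* cst feeds two multiplies; combine only substitutes a definition into
     its use when there is a single use, so it leaves this alone and a
     pass like CSE has to deal with the duplication instead.  */
  int8x16_t cst = vdupq_n_s8 (3);
  return vaddq_s8 (vmulq_s8 (a, cst), vmulq_s8 (b, cst));
}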

> I'm not really sure what we're arguing about.  I did think about all three possible cases when making this:
>
> https://godbolt.org/z/hjWhWq1v1
>
> Of the three cases the compiler currently only generates something good for test2.  Both test1 and test0 are deficient.
> The patch doesn't change test2, significantly improves test0, and whether test1 is a regression is likely uArch specific.
>
> On Arm Cortex CPUs it is not a regression as a DUP on a SIMD scalar has the same throughput and latencies as a MOVI
> according to the Arm Performance Software Optimization guides.

Costing them as equal would be OK when they are equal.  It's the “DUP (lane)/
mov high is strictly cheaper bit” I'm concerned about.

> So to me this looks like an improvement overall.  And this is where we likely disagree?

Well, the disagreement isn't about whether the new compiler output for
these testcases is better than the old compiler output.  It's more a
question of how we're getting there.

>> > MOVI as I mentioned before is the one case where this is a toss up.
>> > But there are far more constants that cannot be created with a movi.
>> > A simple example is
>> >
>> > #include <arm_neon.h>
>> >
>> > int8x16_t square(int8x16_t full, int8x8_t small) {
>> >     int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15};
>> >     int8x8_t low = vget_high_s8 (cst);
>> >     int8x8_t res1 = vmul_s8 (small, low);
>> >     return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, res1));
>> > }
>> >
>> > Where in Gimple we get
>> >
>> >   <bb 2> [local count: 1073741824]:
>> >   _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
>> 13, 15, 0 });
>> >   _4 = _2 * small_3(D);
>> >   _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
>> >   _7 = __builtin_aarch64_combinev8qi (_4, _4);
>> >   _8 = _6 + _7;
>> >   return _8;
>> >
>> > Regardless of what happens to __builtin_aarch64_get_highv16qi nothing
>> > will recreate the relationship with cst, whether
>> __builtin_aarch64_get_highv16qi is lowered or not, constant prop will still
>> push in constants.
>> 
>> Yeah, constants are (by design) free in gimple.  But that's OK in itself,
>> because RTL optimisers have the job of removing any duplicates that end up
>> requiring separate moves.  I think we both agree on that.
>> 
>> E.g. for:
>> 
>> #include <arm_neon.h>
>> 
>> void foo(int8x16_t *x) {
>>   x[0] = vaddq_s8 (x[0], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
>>   x[1] = vaddq_s8 (x[1], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
>> }
>> 
>> the final gimple is:
>> 
>>   <bb 2> [local count: 1073741824]:
>>   _1 = *x_4(D);
>>   _5 = _1 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
>>   *x_4(D) = _5;
>>   _2 = MEM[(int8x16_t *)x_4(D) + 16B];
>>   _7 = _2 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
>>   MEM[(int8x16_t *)x_4(D) + 16B] = _7;
>>   return;
>> 
>> but cse1 removes the duplicated constant even before the patch.
>
> It doesn't for me, again an unmodified compiler:
>
> https://godbolt.org/z/qnvf7496h 

FWIW, the link for my example is:

  https://godbolt.org/z/G6vaE3nab

but it sounds like the disagreement wasn't where I thought it was.

> and CSE1 has as the final codegen:
>
> (insn 7 4 8 2 (set (reg:V16QI 99)
>         (const_vector:V16QI [
>                 (const_int 0 [0])
>                 (const_int 1 [0x1])
>                 (const_int 2 [0x2])
>                 (const_int 3 [0x3])
>                 (const_int 4 [0x4])
>                 (const_int 5 [0x5])
>                 (const_int 6 [0x6])
>                 (const_int 7 [0x7])
>                 (const_int 8 [0x8])
>                 (const_int 9 [0x9])
>                 (const_int 10 [0xa])
>                 (const_int 11 [0xb])
>                 (const_int 12 [0xc])
>                 (const_int 13 [0xd])
>                 (const_int 15 [0xf])
>                 (const_int 0 [0])
>             ]))
>
> (insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ])
>         (const_vector:V8QI [
>                 (const_int 8 [0x8])
>                 (const_int 9 [0x9])
>                 (const_int 10 [0xa])
>                 (const_int 11 [0xb])
>                 (const_int 12 [0xc])
>                 (const_int 13 [0xd])
>                 (const_int 15 [0xf])
>                 (const_int 0 [0])
>             ]))
>
> (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
>         (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
>                 (parallel:V16QI [
>                         (const_int 0 [0])
>                         (const_int 1 [0x1])
>                         (const_int 2 [0x2])
>                         (const_int 3 [0x3])
>                         (const_int 4 [0x4])
>                         (const_int 5 [0x5])
>                         (const_int 6 [0x6])
>                         (const_int 7 [0x7])
>                     ]))
>             (reg:V8QI 93 [ _4 ])))

Here, insn 8 is the folded version of the vget_high_s8 and insn 11
is part of the vcombine_s8.  With that caveat…

> So again same constant represented twice, which is reflected in the codegen.

…right, the above is also what I was saying that we generate before
the patch for your square example.

But as you say later this testcase is demonstrating the point that
constants loaded from memory should be more expensive than DUP (lane).
I agree with that.  The bit I don't agree with is costing the DUP (lane)
as zero, so that it's also strictly cheaper than MOVI.

So I think the disagreement is more about things like the first
example in the testcase:

  https://godbolt.org/z/xrMnezrse

Specifically: is it legitimate to fold:

(insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
        (vec_select:V8QI (reg:V16QI 117)
            (parallel:V16QI [
                    (const_int 8 [0x8])
                    (const_int 9 [0x9])
                    (const_int 10 [0xa])
                    (const_int 11 [0xb])
                    (const_int 12 [0xc])
                    (const_int 13 [0xd])
                    (const_int 14 [0xe])
                    (const_int 15 [0xf])
                ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-gnu/12.0.0/include/arm_neon.h":6477:22 1394 {aarch64_simd_mov_from_v16qihigh}
     (nil))

to:

(insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
        (const_vector:V8QI [
                (const_int 3 [0x3]) repeated x8
            ])) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-gnu/12.0.0/include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
     (expr_list:REG_DEAD (reg:V16QI 117)
        (nil)))

without first trying to get rid of the instruction some other way
(through combine)?

I think it is legitimate, since the new MOVI instruction is at least as
cheap as the original DUP.  Even if CSE didn't do the fold itself, and
just CSEd the two uses of the V16QI constant, I think it would be
legitimate for a later patch to fold the instruction to a constant
independently of CSE.

IMO:

  vget_high_s8(vdupq_n_u8(3))

is just a roundabout way of writing:

  vdup_n_u8(3)

We've described what vget_high_s8 does in target-independent rtl
(i.e. without unspecs) so it's natural that operations with constant
operands will themselves get folded to a constant.

I think we should accept that and try to generate the output we want in
an environment where such folds do happen, rather than trying to prevent
the folds from happening until during or after combine.

That approach could also work for autovec output, and cases where the
user wrote the 8-byte constants directly.  E.g. I think we should aim
to optimise:

void test0_mod (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0)
{
  uint8x8_t three_u8 = vdup_n_u8(3);
  uint8x16_t x = vld1q_u8(inptr0);
  uint8x16_t y = vld1q_u8(inptr1);
  uint16x8_t x_l = vmovl_u8(vget_low_u8(x));
  uint16x8_t x_h = vmovl_u8(vget_high_u8(x));
  uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), three_u8);
  uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), three_u8);
  vst1_u8(outptr0, vaddhn_u16(z_l, z_h));
}

in the same way as the original test0.  Similarly we should aim
to optimise:

int8x16_t square_mode(int8x16_t full, int8x8_t small) {
    int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15};
    int8x8_t low = {8,9,10,11,12,13,15};
    int8x8_t res1 = vmul_s8 (small, low);
    return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, res1));
}

in the same way as square.

>> so that there are no longer any duplicate constants (as far as the RTL code is
>> concerned).  Instead we have one 16-byte constant and one 8-byte constant.
>> 
>> The patch prevents the fold on insn 8 by making the “mov high”
>> strictly cheaper than the constant move, so we keep the “mov high”
>> and its 16-byte input.  Keeping the “mov high” means that we do have a
>> duplicate constant for CSE to remove.
>> 
>> What I meant…
>> 
>> >> Also, if I've understood correctly, it looks like we'd be relying on
>> >> the vget_high of a constant remaining unfolded until RTL cse1.
>> >> I think it's likely in future that we'd try to fold vget_high at the
>> >> gimple level instead, since that could expose more optimisations of a
>> >> different kind.  The gimple optimisers would then fold
>> >> vget_high(constant) in a similar way to
>> >> cse1 does now.
>> >>
>> >> So perhaps we should continue to allow the vget_high(constant) to be
>> >> folded in cse1 and come up with some way of coping with the folded
>> form.
>> 
>> …here was that, in future, the gimple optimisers might be able to fold the
>> vget_high themselves.  For your example, we'd then have:
>> 
>>   _4 = { 8, 9, 10, 11, 12, 13, 15, 0 } * small_3(D);
>>   _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
>>   _7 = __builtin_aarch64_combinev8qi (_4, _4);
>>   _8 = _6 + _7;
>>   return _8;
>> 
>> In this situation, we'd need to recreate the relationship between { 0, 1, 2, 3, 4,
>> 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 } and { 8, 9, 10, 11, 12, 13, 15, 0 }.  We can't
>> ensure that the relationship is never lost.
>> 
>> The same thing would be true for vget_low.  So a constant like:
>> 
>>   cst = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 }
>>   … vget_low* (cst) ..;
>>   … vget_high* (cst) …;
>> 
>> could be folded to two smaller constants:
>> 
>>   … { 0, 1, 2, 3, 4, 5, 6, 7 } …;
>>   … { 8, 9, 10, 11, 12, 13, 15, 0 } …;
>> 
>> We might then need to recreate the combined form, rather than relying on
>> the combined form already existing.
>
> Yes but this is what confuses me. My patch changes it so that CSE1 which is ran
> relatively early is able to find the relationship between the two constants.

Yeah, it does that for the case where the vector constant is a duplicate
of a single element.  My example above doesn't fall into that category
though.

What I was saying was: let's suppose that a vget_low/vget_high pair for
a general V16QI vector constant is folded at the gimple level (by later
patches).  Then the RTL optimisers just see two V8QI constants rather
than a single V16QI constant.  The optimisers would need to generate the
V16QI “from scratch” if they wanted to, as for test0_mod above.

> CSE1 shouldn't do any folding, it doesn't have enough information to do so.
> By CSE doing folding it makes it so combine is less efficient.

I don't agree with that as a general statement.  I agree that stopping
pre-combine passes from folding helps examples like test0, but I don't
think that means that pre-combine passes are doing the wrong thing by
folding.  IMO the problem is more that we are very opportunistic in
looking for high-part operations (and by-lane operations).  Legitimate
optimisations can easily defeat this opportunistic matching.

>> > CSE1 doesn't fold it, because for CSE the cost is too high to do so. Which is
>> what this costing was attempting to fix.
>> > CSE simply does not touch it. It leaves it as
>> >
>> > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
>> >         (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
>> >                 (parallel:V16QI [
>> >                         (const_int 0 [0])
>> >                         (const_int 1 [0x1])
>> >                         (const_int 2 [0x2])
>> >                         (const_int 3 [0x3])
>> >                         (const_int 4 [0x4])
>> >                         (const_int 5 [0x5])
>> >                         (const_int 6 [0x6])
>> >                         (const_int 7 [0x7])
>> >                     ]))
>> >             (reg:V8QI 93 [ _4 ]))) "":6506:10 1908
>> {aarch64_simd_move_hi_quad_v16qi}
>> >      (nil))
>> > (insn 12 11 13 2 (set (reg:V16QI 102)
>> >         (const_vector:V16QI [
>> >                 (const_int 0 [0])
>> >                 (const_int 1 [0x1])
>> >                 (const_int 2 [0x2])
>> >                 (const_int 3 [0x3])
>> >                 (const_int 4 [0x4])
>> >                 (const_int 5 [0x5])
>> >                 (const_int 6 [0x6])
>> >                 (const_int 7 [0x7])
>> >                 (const_int 8 [0x8])
>> >                 (const_int 9 [0x9])
>> >                 (const_int 10 [0xa])
>> >                 (const_int 11 [0xb])
>> >                 (const_int 12 [0xc])
>> >                 (const_int 13 [0xd])
>> >                 (const_int 15 [0xf])
>> >                 (const_int 0 [0])
>> >             ])) "":1466:14 1166 {*aarch64_simd_movv16qi}
>> >      (nil))
>> 
>> I don't think that's true for the unpatched compiler.  Are you sure this isn't
>> the “pre-CSE” part of the dump?  CSE is confusing (to me) in that it prints
>> each function twice, once in unoptimised form and later in optimised form.
>> 
>
> Yes I'm sure, see all the compiler explorer links above.

Ah, yeah, I misunderstood which insn you were quoting.  But insn 11 in:

  https://godbolt.org/z/rrbP14var

is part of the vcombine_s8.  The preceding instructions are:

(insn 9 8 10 2 (set (reg:V8QI 93 [ _4 ])
        (mult:V8QI (reg:V8QI 92 [ _2 ])
            (reg/v:V8QI 98 [ small ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-gnu/12.0.0/include/arm_neon.h":1402:14 1428 {mulv8qi3}
     (expr_list:REG_DEAD (reg/v:V8QI 98 [ small ])
        (expr_list:REG_DEAD (reg:V8QI 92 [ _2 ])
            (nil))))
(insn 10 9 11 2 (set (reg:V16QI 95 [ _7 ])
        (vec_concat:V16QI (reg:V8QI 93 [ _4 ])
            (const_vector:V8QI [
                    (const_int 0 [0]) repeated x8
                ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-gnu/12.0.0/include/arm_neon.h":6506:10 1892 {move_lo_quad_internal_v16qi}
     (nil))

and since the multiplication result is variable, we can't fold this.

The vget_high is insn 8, which does get folded (but it sounds like
we agree on that).

> > > And I don't see any way to fix this without having Gimple not push
> > constants in, which would lead to worse regressions.
> > > I can change the patch to cost the high as a dup which fixes this codegen at
> > least and has you rematerialize movi.   If that's
> > > not acceptable I can drop costing for High entirely then, it's not the main
> > thing I am fixing.
> > 
> > Costing the high as a dup leaves us in the same situation as before the
> > patch: the folded V8QI constant is cheaper than the unfolded mov high.
>
> Yes and the dup will reflect that. The argument that it's not the right cost no
> longer hold any water in that case.

Yeah, my concerns disappear in that case.

> In particular as I still maintain that is too
> early to do any constant folding in CSE1 for AArch64.
>
> Whether it's folded or not doesn't make any difference to combine which will
> Fold when combinations are possible with the folder version.
>
> So I have yet to see any actual regression.

Well, this isn't going to win any awards for realism :-), but:

#include <arm_neon.h>

int8x16_t foo() {
  int8x16_t a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  int8x8_t b = vget_high_s8 (a);
  int8x8_t c = { 4, 5, 6, 7, 8, 9, 10, 11 };
  int8x8_t d = vadd_s8 (b, c);
  int8x16_t e = vcombine_s8 (d, b);
  return vaddq_s8 (e, a);
}

is folded to a constant before the patch and isn't after the patch.
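
Working the byte arithmetic through by hand, the fully folded form would be
(a sketch of the expected result rather than something taken from a dump):

#include <arm_neon.h>

int8x16_t foo_folded (void)
{
  /* b = { 8, ..., 15 }, d = b + c = { 12, 14, 16, 18, 20, 22, 24, 26 },
     e = { d, b }, so e + a is:  */
  int8x16_t r = { 12, 15, 18, 21, 24, 27, 30, 33,
                  16, 18, 20, 22, 24, 26, 28, 30 };
  return r;
}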

Your examples are more realistic than that one, but I think this does
show why preventing folding can be counter-productive in some cases.

My hope is that one day gimple would fold that example to a constant.
But if it does, it will also fold the vget_highs and vget_lows in the
original testcase to constants, meaning that we can't rely on the
original V16QI constant existing as well.

Thanks,
Richard
Tamar Christina Oct. 26, 2021, 1:01 p.m. UTC | #12
Hi,

Following the discussion below, here's a revised patch.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/arm/aarch-common-protos.h (struct vector_cost_table): Add
	movi, dup and extract costing fields.
	* config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
	thunderx_extra_costs, thunderx2t99_extra_costs,
	thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use
	them.
	* config/arm/aarch-cost-tables.h (generic_extra_costs,
	cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
	exynosm1_extra_costs, xgene1_extra_costs): Likewise
	* config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
	* config/aarch64/aarch64.c (aarch64_simd_make_constant): Expose.
	(aarch64_rtx_costs): Add extra costs.
	(aarch64_simd_dup_constant): Support check only mode.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-cse-codegen.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* mult.  */
+    COSTS_N_INSNS (4),	/* mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
-	  (match_operand:<VEL> 1 "register_operand" "w")))]
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 699c105a42a613c06c462e2de686795279d85bc9..1fb4350916572c915e5af339102444daf324efc7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -303,6 +303,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static rtx aarch64_simd_make_constant (rtx, bool);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -12705,7 +12706,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -14006,8 +14007,52 @@ cost_plus:
 			     mode, MULT, 1, speed);
           return true;
         }
+	break;
+    case CONST_VECTOR:
+	{
+	  rtx gen_insn = aarch64_simd_make_constant (x, true);
+	  /* Not a valid const vector.  */
+	  if (!gen_insn)
+	    break;
 
-      /* Fall through.  */
+	  switch (GET_CODE (gen_insn))
+	  {
+	  case CONST_VECTOR:
+	    /* Load using MOVI/MVNI.  */
+	    if (aarch64_simd_valid_immediate (x, NULL))
+	      *cost += extra_cost->vect.movi;
+	    else /* Load using constant pool.  */
+	      *cost += extra_cost->ldst.load;
+	    break;
+	  /* Load using a DUP.  */
+	  case VEC_DUPLICATE:
+	    gcc_unreachable ();
+	    break;
+	  default:
+	    *cost += extra_cost->ldst.load;
+	    break;
+	  }
+	  return true;
+	}
+    case VEC_CONCAT:
+	/* depending on the operation, either DUP or INS.
+	   For now, keep default costing.  */
+	break;
+    case VEC_DUPLICATE:
+	*cost += extra_cost->vect.dup;
+	return true;
+    case VEC_SELECT:
+	{
+	  /* cost subreg of 0 as free, otherwise as DUP */
+	  rtx op1 = XEXP (x, 1);
+	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+	    ;
+	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+	    *cost += extra_cost->vect.dup;
+	  else
+	    *cost += extra_cost->vect.extract;
+	  return true;
+	}
     default:
       break;
     }
@@ -20654,9 +20699,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
 
 /* If VALS is a vector constant that can be loaded into a register
    using DUP, generate instructions to do so and return an RTX to
-   assign to the register.  Otherwise return NULL_RTX.  */
+   assign to the register.  Otherwise return NULL_RTX.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_dup_constant (rtx vals)
+aarch64_simd_dup_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -20668,7 +20716,8 @@ aarch64_simd_dup_constant (rtx vals)
   /* We can load this constant by using DUP and a constant in a
      single ARM register.  This will be cheaper than a vector
      load.  */
-  x = copy_to_mode_reg (inner_mode, x);
+  if (!check)
+    x = copy_to_mode_reg (inner_mode, x);
   return gen_vec_duplicate (mode, x);
 }
 
@@ -20676,9 +20725,12 @@ aarch64_simd_dup_constant (rtx vals)
 /* Generate code to load VALS, which is a PARALLEL containing only
    constants (for vec_init) or CONST_VECTOR, efficiently into a
    register.  Returns an RTX to copy into the register, or NULL_RTX
-   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_make_constant (rtx vals)
+aarch64_simd_make_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   rtx const_dup;
@@ -20710,7 +20762,7 @@ aarch64_simd_make_constant (rtx vals)
       && aarch64_simd_valid_immediate (const_vec, NULL))
     /* Load using MOVI/MVNI.  */
     return const_vec;
-  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
+  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
     /* Loaded using DUP.  */
     return const_dup;
   else if (const_vec != NULL_RTX)
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;
+  const int dup;
+  const int extract;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..f9edcda13d27bb3463da5b0170cfda7f41655b3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,97 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test1:
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	str	q[0-9]+, \[x[0-9]+\]
+**	fmov	x[0-9]+, d[0-9]+
+**	orr	x[0-9]+, x[0-9]+, x[0-9]+
+**	ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+  uint64_t res = a | arr[0];
+  uint64x2_t val = vld1q_u64 (arr);
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**	adrp	x[0-9]+, .LC[0-1]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	str	q[0-9]+, \[x[0-9]+\]
+**	fmov	x[0-9]+, d[0-9]+
+**	orr	x[0-9]+, x[0-9]+, x[0-9]+
+**	ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+  uint64_t arr = vgetq_lane_u64 (val, 0);
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test3:
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	str	q[0-9]+, \[x1\]
+**	fmov	w[0-9]+, s[0-9]+
+**	orr	w[0-9]+, w[0-9]+, w[0-9]+
+**	ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+  uint32_t res = a | arr[0];
+  uint32x4_t val = vld1q_u32 (arr);
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
+**	mov	x[0-9]+, 16512
+**	movk	x[0-9]+, 0x1020, lsl 16
+**	movk	x[0-9]+, 0x408, lsl 32
+**	movk	x[0-9]+, 0x102, lsl 48
+**	fmov	d[0-9]+, x[0-9]+
+**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
+**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
+**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
+**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+**	umov	w[0-9]+, v[0-9]+.h\[3\]
+**	ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, October 25, 2021 3:32 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>; Richard
> Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus
> Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants
> and operations
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Monday, October 25, 2021 10:54 AM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>;
> >> Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Marcus
> >> Shawcroft <Marcus.Shawcroft@arm.com>
> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector
> >> constants and operations
> >>
> >> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >> -----Original Message-----
> >> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> Sent: Saturday, October 23, 2021 11:40 AM
> >> >> To: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>
> >> >> Cc: Tamar Christina <Tamar.Christina@arm.com>; Richard Earnshaw
> >> >> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus
> Shawcroft
> >> >> <Marcus.Shawcroft@arm.com>
> >> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector
> >> >> constants and operations
> >> >>
> >> >> Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >> >> >> I'm still a bit sceptical about treating the high-part cost as lower.
> >> >> >> ISTM that the subreg cases are the ones that are truly “free”
> >> >> >> and any others should have a normal cost.  So if CSE handled
> >> >> >> the subreg case itself (to model how the rtx would actually be
> >> >> >> generated) then
> >> >> >> aarch64 code would have to do less work.  I imagine that will
> >> >> >> be true for
> >> >> other targets as well.
> >> >> >
> >> >> > I guess the main problem is that CSE lacks context because it's
> >> >> > not until after combine that the high part becomes truly "free"
> >> >> > when pushed
> >> >> into a high operation.
> >> >>
> >> >> Yeah.  And the aarch64 code is just being asked to cost the
> >> >> operation it's given, which could for example come from an
> >> >> existing aarch64_simd_mov_from_<mode>high.  I think we should try
> >> >> to ensure that a aarch64_simd_mov_from_<mode>high followed by
> some
> >> arithmetic
> >> >> on the result is more expensive than the fused operation (when
> >> >> fusing is possible).
> >> >>
> >> >> An analogy might be: if the cost code is given:
> >> >>
> >> >>   (add (reg X) (reg Y))
> >> >>
> >> >> then, at some later point, the (reg X) might be replaced with a
> >> >> multiplication, in which case we'd have a MADD operation and the
> >> >> addition is effectively free.  Something similar would happen if
> >> >> (reg
> >> >> X) became a shift by a small amount on newer cores, although I
> >> >> guess then you could argue either that the cost of the add
> >> >> disappears or that
> >> the cost of the shift disappears.
> >> >>
> >> >> But we shouldn't count ADD as free on the basis that it could be
> >> >> combined with a multiplication or shift in future.  We have to
> >> >> cost what we're given.  I think the same thing applies to the high part.
> >> >>
> >> >> Here we're trying to prevent cse1 from replacing a DUP (lane) with
> >> >> a MOVI by saying that the DUP is strictly cheaper than the MOVI.
> >> >> I don't think that's really true though, and the cost tables in
> >> >> the patch say that DUP is more expensive (rather than less
> >> >> expensive) than
> >> MOVI.
> >> >
> >> > No we're not. The front end has already pushed the constant into
> >> > each operation that needs it which is the entire problem.
> >>
> >> I think we're talking about different things here.  I'll come to the
> >> gimple stuff below, but I was talking purely about the effect on the
> >> RTL optimisers.  What I meant above is that, in the cse1 dumps, the patch
> leads to changes like:
> >>
> >>  (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
> >> -        (const_vector:V8QI [
> >> +        (vec_select:V8QI (reg:V16QI 116)
> >> +            (parallel:V16QI [
> >> +                    (const_int 8 [0x8])
> >> +                    (const_int 9 [0x9])
> >> +                    (const_int 10 [0xa])
> >> +                    (const_int 11 [0xb])
> >> +                    (const_int 12 [0xc])
> >> +                    (const_int 13 [0xd])
> >> +                    (const_int 14 [0xe])
> >> +                    (const_int 15 [0xf])
> >> +                ]))) "include/arm_neon.h":6477:22 1394
> >> {aarch64_simd_mov_from_v16qihigh}
> >> +     (expr_list:REG_EQUAL (const_vector:V8QI [
> >>                  (const_int 3 [0x3]) repeated x8
> >> -            ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
> >> -     (expr_list:REG_DEAD (reg:V16QI 117)
> >> -        (nil)))
> >> +            ])
> >> +        (expr_list:REG_DEAD (reg:V16QI 117)
> >> +            (nil))))
> >>
> >> The pre-cse1 code is:
> >>
> >> (insn 19 18 20 2 (set (reg:V16QI 117)
> >>         (const_vector:V16QI [
> >>                 (const_int 3 [0x3]) repeated x16
> >>             ])) "include/arm_neon.h":6477:22 1166 {*aarch64_simd_movv16qi}
> >>      (nil))
> >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
> >>         (vec_select:V8QI (reg:V16QI 117)
> >>             (parallel:V16QI [
> >>                     (const_int 8 [0x8])
> >>                     (const_int 9 [0x9])
> >>                     (const_int 10 [0xa])
> >>                     (const_int 11 [0xb])
> >>                     (const_int 12 [0xc])
> >>                     (const_int 13 [0xd])
> >>                     (const_int 14 [0xe])
> >>                     (const_int 15 [0xf])
> >>                 ]))) "include/arm_neon.h":6477:22 1394
> >> {aarch64_simd_mov_from_v16qihigh}
> >>      (nil))
> >>
> >> That is, before the patch, we folded insn 19 into insn 20 to get:
> >>
> >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
> >>         (const_vector:V8QI [
> >>                 (const_int 3 [0x3]) repeated x8
> >>             ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
> >>      (expr_list:REG_DEAD (reg:V16QI 117)
> >>         (nil)))
> >>
> >> After the patch we reject that because:
> >>
> >>   (set (reg:V8QI X) (const_vector:V8QI [3]))
> >>
> >> is costed as a MOVI (cost 4) and the original
> >> aarch64_simd_mov_from_v16qihigh is costed as zero.  In other words,
> >> the patch makes the DUP (lane) in the “mov high” strictly cheaper
> >> than a constant move (MOVI).
> >
> > Yes, this was done intentionally because as we talked about a month
> > ago there's no real way to cost this correctly. The use of `X` there
> > determines whether it's cheaper to use the movi over the dup.  The
> > MOVI not only prevents re-use of the value, it also prevents combining
> > into high operations.  All of which is impossible to tell currently in how CSE
> and costing are done.
> >
> > This is an unmodified compiler created from last night's trunk
> > https://godbolt.org/z/1saTP4xWs
> >
> > While yes, it did fold movi into the set, reg 19 wasn't dead, so you
> > now materialized the constant 3 times
> >
> > test0:
> >         ldr     q0, [x0]
> >         movi    v3.8b, 0x3  <<<< first
> >         ldr     q2, [x1]
> >         movi    v5.16b, 0x3 <<< second
> >         uxtl    v1.8h, v0.8b
> >         dup     d4, v2.d[1] <<< third
> >         uxtl2   v0.8h, v0.16b
> >         umlal   v1.8h, v2.8b, v5.8b
> >         umlal   v0.8h, v4.8b, v3.8b
> >         addhn   v0.8b, v1.8h, v0.8h
> >         str     d0, [x2]
> >         ret
> >
> > whilst my patch, generates
> >
> > test0:
> > 	movi	v2.16b, 0x3 <<< once
> > 	ldr	q0, \[x0\]
> > 	uxtl	v1.8h, v0.8b
> > 	uxtl2	v0.8h, v0.16b
> > 	ldr	q3, \[x1\]
> > 	umlal	v1.8h, v3.8b, v2.8b
> > 	umlal2	v0.8h, v3.16b, v2.16b
> > 	addhn	v0.8b, v1.8h, v0.8h
> > 	str	d0, \[x2\]
> > 	ret
> >
> > Yes it's not perfect, yes you can end up with a dup instead of two
> > movi's but my argument is it's still a step forward as the perfect solution
> doesn't seem to be possible at all with the way things are currently set up.
> 
> I agree there's no out-of-the-box way of getting what we want for the
> original testcases.  It would require changes outside the target or (if the
> worst comes to the worst) a target-specific pass.
> 
> >> Preventing this fold seems like a key part of being able to match the
> >> *l2 forms in the testcase, since otherwise the โ€œmov highโ€ disappears
> >> and isn't available for combining later.
> >
> > Yes, and by preventing the folding combine should in principle be able
> > to fold it back if it wasn't pushed into another instruction, but combine
> does not attempt to touch constants and selects on their own. If it did, this
> "regression" would be fixed.
> 
> The problem is that combine is limited to individual EBBs and only combines
> def-use chains when there is a single use.  It's not a general folding engine.
> 
> > I'm not really quite sure what we're arguing about..  I did think about all
> three possible cases when making this:
> >
> > https://godbolt.org/z/hjWhWq1v1
> >
> > Of the three cases the compiler currently only generates something good
> for test2.  Both test1 and test0 are deficient.
> > The patch doesn't change test2, significantly improves test0 and whether
> test1 is a regression is likely uArch specific.
> >
> > On Arm Cortex CPUs it is not a regression as a DUP on a SIMD scalar
> > has the same throughput and latencies as a MOVI according to the Arm
> Performance Software Optimization guides.
> 
> Costing them as equal would be OK when they are equal.  It's the “DUP
> (lane)/ mov high is strictly cheaper bit” I'm concerned about.
> 
> > So to me this looks like an improvement overall.  And this is where we likely
> disagree?
> 
> Well, the disagreement isn't about whether the new compiler output for
> these testcases is better than the old compiler output.  It's more a question
> of how we're getting there.
> 
> >> > MOVI as I mentioned before is the one case where this is a toss up.
> >> > But there are far more constants that cannot be created with a movi.
> >> > A simple example is
> >> >
> >> > #include <arm_neon.h>
> >> >
> >> > int8x16_t square(int8x16_t full, int8x8_t small) {
> >> >     int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15};
> >> >     int8x8_t low = vget_high_s8 (cst);
> >> >     int8x8_t res1 = vmul_s8 (small, low);
> >> >     return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1,
> >> > res1)); }
> >> >
> >> > Where in Gimple we get
> >> >
> >> >   <bb 2> [local count: 1073741824]:
> >> >   _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7,
> >> > 8, 9, 10, 11, 12,
> >> 13, 15, 0 });
> >> >   _4 = _2 * small_3(D);
> >> >   _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
> >> >   _7 = __builtin_aarch64_combinev8qi (_4, _4);
> >> >   _8 = _6 + _7;
> >> >   return _8;
> >> >
> >> > Regardless of what happens to __builtin_aarch64_get_highv16qi
> >> > nothing will recreate the relationship with cst, whether
> >> __builtin_aarch64_get_highv16qi is lowered or not, constant prop will
> >> still push in constants.
> >>
> >> Yeah, constants are (by design) free in gimple.  But that's OK in
> >> itself, because RTL optimisers have the job of removing any
> >> duplicates that end up requiring separate moves.  I think we both agree
> on that.
> >>
> >> E.g. for:
> >>
> >> #include <arm_neon.h>
> >>
> >> void foo(int8x16_t *x) {
> >>   x[0] = vaddq_s8 (x[0], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
> >>   x[1] = vaddq_s8 (x[1], (int8x16_t)
> >> {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
> >> }
> >>
> >> the final gimple is:
> >>
> >>   <bb 2> [local count: 1073741824]:
> >>   _1 = *x_4(D);
> >>   _5 = _1 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
> >>   *x_4(D) = _5;
> >>   _2 = MEM[(int8x16_t *)x_4(D) + 16B];
> >>   _7 = _2 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
> >>   MEM[(int8x16_t *)x_4(D) + 16B] = _7;
> >>   return;
> >>
> >> but cse1 removes the duplicated constant even before the patch.
> >
> > It doesn't for me, again an unmodified compiler:
> >
> > https://godbolt.org/z/qnvf7496h
> 
> FWIW, the link for my example is:
> 
>   https://godbolt.org/z/G6vaE3nab
> 
> but it sounds like the disagreement wasn't where I thought it was.
> 
> > and CSE1 has as the final codegen:
> >
> > (insn 7 4 8 2 (set (reg:V16QI 99)
> >         (const_vector:V16QI [
> >                 (const_int 0 [0])
> >                 (const_int 1 [0x1])
> >                 (const_int 2 [0x2])
> >                 (const_int 3 [0x3])
> >                 (const_int 4 [0x4])
> >                 (const_int 5 [0x5])
> >                 (const_int 6 [0x6])
> >                 (const_int 7 [0x7])
> >                 (const_int 8 [0x8])
> >                 (const_int 9 [0x9])
> >                 (const_int 10 [0xa])
> >                 (const_int 11 [0xb])
> >                 (const_int 12 [0xc])
> >                 (const_int 13 [0xd])
> >                 (const_int 15 [0xf])
> >                 (const_int 0 [0])
> >             ]))
> >
> > (insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ])
> >         (const_vector:V8QI [
> >                 (const_int 8 [0x8])
> >                 (const_int 9 [0x9])
> >                 (const_int 10 [0xa])
> >                 (const_int 11 [0xb])
> >                 (const_int 12 [0xc])
> >                 (const_int 13 [0xd])
> >                 (const_int 15 [0xf])
> >                 (const_int 0 [0])
> >             ]))
> >
> > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
> >         (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
> >                 (parallel:V16QI [
> >                         (const_int 0 [0])
> >                         (const_int 1 [0x1])
> >                         (const_int 2 [0x2])
> >                         (const_int 3 [0x3])
> >                         (const_int 4 [0x4])
> >                         (const_int 5 [0x5])
> >                         (const_int 6 [0x6])
> >                         (const_int 7 [0x7])
> >                     ]))
> >             (reg:V8QI 93 [ _4 ])))
> 
> Here, insn 8 is the folded version of the vget_high_s8 and insn 11 is part of
> the vcombine_s8.  With that caveat…
> 
> > So again same constant represented twice, which is reflected in the
> codegen.
> 
> …right, the above is also what I was saying that we generate before the patch
> for your square example.
> 
> But as you say later this testcase is demonstrating the point that constants
> loaded from memory should be more expensive than DUP (lane).
> I agree with that.  The bit I don't agree with is costing the DUP (lane) as zero,
> so that it's also strictly cheaper than MOVI.
> 
> So I think the disagreement is more about things like the first example in the
> testcase:
> 
>   https://godbolt.org/z/xrMnezrse
> 
> Specifically: is it legitimate to fold:
> 
> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
>         (vec_select:V8QI (reg:V16QI 117)
>             (parallel:V16QI [
>                     (const_int 8 [0x8])
>                     (const_int 9 [0x9])
>                     (const_int 10 [0xa])
>                     (const_int 11 [0xb])
>                     (const_int 12 [0xc])
>                     (const_int 13 [0xd])
>                     (const_int 14 [0xe])
>                     (const_int 15 [0xf])
>                 ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-
> unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-
> gnu/12.0.0/include/arm_neon.h":6477:22 1394
> {aarch64_simd_mov_from_v16qihigh}
>      (nil))
> 
> to:
> 
> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
>         (const_vector:V8QI [
>                 (const_int 3 [0x3]) repeated x8
>             ])) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-
> unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-
> gnu/12.0.0/include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
>      (expr_list:REG_DEAD (reg:V16QI 117)
>         (nil)))
> 
> without first trying to get rid of the instruction some other way (through
> combine)?
> 
> I think it is legitimate, since the new MOVI instruction is at least as cheap as
> the original DUP.  Even if CSE didn't do the fold itself, and just CSEd the two
> uses of the V16QI constant, I think it would be legitimate for a later patch to
> fold the instruction to a constant independently of CSE.
> 
> IMO:
> 
>   vget_high_s8(vdupq_n_u8(3))
> 
> is just a roundabout way of writing:
> 
>   vdup_n_u8(3)
> 
> We've described what vget_high_s8 does in target-independent rtl (i.e.
> without unspecs) so it's natural that operations with constant operands will
> themselves get folded to a constant.
> 
> I think we should accept that and try to generate the output we want in an
> environment where such folds do happen, rather than trying to prevent the
> folds from happening until during or after combine.
> 
> That approach could also work for autovec output, and cases where the user
> wrote the 8-byte constants directly.  E.g. I think we should aim to optimise:
> 
> void test0_mod (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0) {
>   uint8x8_t three_u8 = vdup_n_u8(3);
>   uint8x16_t x = vld1q_u8(inptr0);
>   uint8x16_t y = vld1q_u8(inptr1);
>   uint16x8_t x_l = vmovl_u8(vget_low_u8(x));
>   uint16x8_t x_h = vmovl_u8(vget_high_u8(x));
>   uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), three_u8);
>   uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), three_u8);
>   vst1_u8(outptr0, vaddhn_u16(z_l, z_h)); }
> 
> in the same way as the original test0.  Similarly we should aim to optimise:
> 
> int8x16_t square_mode(int8x16_t full, int8x8_t small) {
>     int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15};
>     int8x8_t low = {8,9,10,11,12,13,15};
>     int8x8_t res1 = vmul_s8 (small, low);
>     return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, res1)); }
> 
> in the same way as square.
> 
> >> so that there are no longer any duplicate constants (as far as the
> >> RTL code is concerned).  Instead we have one 16-byte constant and one 8-
> byte constant.
> >>
> >> The patch prevents the fold on insn 8 by making the “mov high”
> >> strictly cheaper than the constant move, so we keep the “mov high”
> >> and its 16-byte input.  Keeping the “mov high” means that we do have
> >> a duplicate constant for CSE to remove.
> >>
> >> What I meant…
> >>
> >> >> Also, if I've understood correctly, it looks like we'd be relying
> >> >> on the vget_high of a constant remaining unfolded until RTL cse1.
> >> >> I think it's likely in future that we'd try to fold vget_high at
> >> >> the gimple level instead, since that could expose more
> >> >> optimisations of a different kind.  The gimple optimisers would
> >> >> then fold
> >> >> vget_high(constant) in a similar way to
> >> >> cse1 does now.
> >> >>
> >> >> So perhaps we should continue to allow the vget_high(constant) to
> >> >> be foloded in cse1 and come up with some way of coping with the
> >> >> folded
> >> form.
> >>
> >> …here was that, in future, the gimple optimisers might be able to
> >> fold the vget_high themselves.  For your example, we'd then have:
> >>
> >>   _4 = { 8, 9, 10, 11, 12, 13, 15, 0 } * small_3(D);
> >>   _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
> >>   _7 = __builtin_aarch64_combinev8qi (_4, _4);
> >>   _8 = _6 + _7;
> >>   return _8;
> >>
> >> In this situation, we'd need to recreate the relationship between {
> >> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 } and { 8, 9, 10,
> >> 11, 12, 13, 15, 0 }.  We can't ensure that the relationship is never lost.
> >>
> >> The same thing would be true for vget_low.  So a constant like:
> >>
> >>   cst = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 }
> >>   … vget_low* (cst) ..;
> >>   … vget_high* (cst) …;
> >>
> >> could be folded to two smaller constants:
> >>
> >>   … { 0, 1, 2, 3, 4, 5, 6, 7 } …;
> >>   … { 8, 9, 10, 11, 12, 13, 15, 0 } …;
> >>
> >> We might then need to recreate the combined form, rather than relying
> >> on the combined form already existing.
> >
> > Yes but this is what confuses me. My patch changes it so that CSE1
> > which is run relatively early is able to find the relationship between the two
> constants.
> 
> Yeah, it does that for the case where the vector constant is a duplicate of a
> single element.  My example above doesn't fall into that category though.
> 
> What I was saying was: let's suppose that a vget_low/vget_high pair for a
> general V16QI vector constant is folded at the gimple level (by later patches).
> Then the RTL optimisers just see two V8QI constants rather than a single
> V16QI constant.  The optimisers would need to generate the V16QI “from
> scratch” if they wanted to, as for test0_mod above.
> 
> > CSE1 shouldn't do any folding, it doesn't have enough information to do so.
> > By CSE doing folding it makes it so combine is less efficient.
> 
> I don't agree with that as a general statement.  I agree that stopping pre-
> combine passes from folding helps examples like test0, but I don't think that
> means that pre-combine passes are doing the wrong thing by folding.  IMO
> the problem is more that we are very opportunistic in looking for high-part
> operations (and by-lane operations).  Legitimate optimisations can easily
> defeat this opportunistic matching.
> 
> >> > CSE1 doesn't fold it, because for CSE the cost is too high to do
> >> > so. Which is
> >> what this costing was attempting to fix.
> >> > CSE simply does not touch it. It leaves it as
> >> >
> >> > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
> >> >         (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
> >> >                 (parallel:V16QI [
> >> >                         (const_int 0 [0])
> >> >                         (const_int 1 [0x1])
> >> >                         (const_int 2 [0x2])
> >> >                         (const_int 3 [0x3])
> >> >                         (const_int 4 [0x4])
> >> >                         (const_int 5 [0x5])
> >> >                         (const_int 6 [0x6])
> >> >                         (const_int 7 [0x7])
> >> >                     ]))
> >> >             (reg:V8QI 93 [ _4 ]))) "":6506:10 1908
> >> {aarch64_simd_move_hi_quad_v16qi}
> >> >      (nil))
> >> > (insn 12 11 13 2 (set (reg:V16QI 102)
> >> >         (const_vector:V16QI [
> >> >                 (const_int 0 [0])
> >> >                 (const_int 1 [0x1])
> >> >                 (const_int 2 [0x2])
> >> >                 (const_int 3 [0x3])
> >> >                 (const_int 4 [0x4])
> >> >                 (const_int 5 [0x5])
> >> >                 (const_int 6 [0x6])
> >> >                 (const_int 7 [0x7])
> >> >                 (const_int 8 [0x8])
> >> >                 (const_int 9 [0x9])
> >> >                 (const_int 10 [0xa])
> >> >                 (const_int 11 [0xb])
> >> >                 (const_int 12 [0xc])
> >> >                 (const_int 13 [0xd])
> >> >                 (const_int 15 [0xf])
> >> >                 (const_int 0 [0])
> >> >             ])) "":1466:14 1166 {*aarch64_simd_movv16qi}
> >> >      (nil))
> >>
> >> I don't think that's true for the unpatched compiler.  Are you sure
> >> this isn't the “pre-CSE” part of the dump?  CSE is confusing (to me)
> >> in that it prints each function twice, once in unoptimised form and later in
> optimised form.
> >>
> >
> > Yes I'm sure, see all the compiler explorer links above.
> 
> Ah, yeah, I misunderstood which insn you were quoting.  But insn 11 in:
> 
>   https://godbolt.org/z/rrbP14var
> 
> is part of the vcombine_s8.  The preceding instructions are:
> 
> (insn 9 8 10 2 (set (reg:V8QI 93 [ _4 ])
>         (mult:V8QI (reg:V8QI 92 [ _2 ])
>             (reg/v:V8QI 98 [ small ]))) "/opt/compiler-explorer/arm64/gcc-trunk-
> 20211025/aarch64-unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-
> gnu/12.0.0/include/arm_neon.h":1402:14 1428 {mulv8qi3}
>      (expr_list:REG_DEAD (reg/v:V8QI 98 [ small ])
>         (expr_list:REG_DEAD (reg:V8QI 92 [ _2 ])
>             (nil))))
> (insn 10 9 11 2 (set (reg:V16QI 95 [ _7 ])
>         (vec_concat:V16QI (reg:V8QI 93 [ _4 ])
>             (const_vector:V8QI [
>                     (const_int 0 [0]) repeated x8
>                 ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-
> unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-
> gnu/12.0.0/include/arm_neon.h":6506:10 1892
> {move_lo_quad_internal_v16qi}
>      (nil))
> 
> and since the multiplication result is variable, we can't fold this.
> 
> The vget_high is insn 8, which does get folded (but it sounds like we agree on
> that).
> 
> > > > And I don't see any way to fix this without having Gimple not push
> > > constants in, which would lead to worse regressions.
> > > > I can change the patch to cost the high as a dup which fixes this
> > > > codegen at
> > > least and has you rematerialize movi.   If that's
> > > > not acceptable I can drop costing for High entirely then, it's not
> > > > the main
> > > thing I am fixing.
> > >
> > > Costing the high as a dup leaves us in the same situation as before
> > > the
> > > patch: the folded V8QI constant is cheaper than the unfolded mov high.
> >
> > Yes and the dup will reflect that. The argument that it's not the
> > right cost no longer holds any water in that case.
> 
> Yeah, my concerns disappear in that case.
> 
> > In particular as I still maintain that it is too early to do any constant
> > folding in CSE1 for AArch64.
> >
> > Whether it's folded or not doesn't make any difference to combine
> > which will Fold when combinations are possible with the folder version.
> >
> > So I have yet to see any actual regression.
> 
> Well, this isn't going to win any awards for realism :-), but:
> 
> #include <arm_neon.h>
> 
> int8x16_t foo() {
>   int8x16_t a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
>   int8x8_t b = vget_high_s8 (a);
>   int8x8_t c = { 4, 5, 6, 7, 8, 9, 10, 11 };
>   int8x8_t d = vadd_s8 (b, c);
>   int8x16_t e = vcombine_s8 (d, b);
>   return vaddq_s8 (e, a);
> }
> 
> is folded to a constant before the patch and isn't after the patch.
> 
> Your examples are more realistic than that one, but I think this does show
> why preventing folding can be counter-productive in some cases.
> 
> My hope is that one day gimple would fold that example to a constant.
> But if it does, it will also fold the vget_highs and vget_lows in the original
> testcase to constants, meaning that we can't rely on the original V16QI
> constant existing as well.
> 
> Thanks,
> Richard
Richard Sandiford Oct. 26, 2021, 2:46 p.m. UTC | #13
Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi,
>
> Following the discussion below here's a revised patch.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?

Looks good functionally, just got some comments about the implementation.

> @@ -14006,8 +14007,52 @@ cost_plus:
>  			     mode, MULT, 1, speed);
>            return true;
>          }
> +	break;
> +    case CONST_VECTOR:
> +	{
> +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> +	  /* Not a valid const vector.  */
> +	  if (!gen_insn)
> +	    break;
>  
> -      /* Fall through.  */
> +	  switch (GET_CODE (gen_insn))
> +	  {
> +	  case CONST_VECTOR:
> +	    /* Load using MOVI/MVNI.  */
> +	    if (aarch64_simd_valid_immediate (x, NULL))
> +	      *cost += extra_cost->vect.movi;
> +	    else /* Load using constant pool.  */
> +	      *cost += extra_cost->ldst.load;
> +	    break;
> +	  /* Load using a DUP.  */
> +	  case VEC_DUPLICATE:
> +	    gcc_unreachable ();
> +	    break;
> +	  default:
> +	    *cost += extra_cost->ldst.load;
> +	    break;
> +	  }
> +	  return true;
> +	}

This might be a problem (if it is a problem) with some of the existing
cases too, but: is using += rather than = the right behaviour here?
It means that we add our cost on top of whatever the target-independent
rtx_costs thought was a good default choice, whereas it looks like
these table entries specify the correct full cost.

If it's not clear-cut, then I think using = would be better.
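
To make the difference concrete, here is a standalone sketch of the
arithmetic (my own illustration with made-up names, not code from the
patch; if I read aarch64_rtx_costs right, it seeds *cost with a one-insn
default before the switch):

/* Rough sketch only.  COSTS_N_INSNS (N) is (N) * 4 in rtl.h.  */
#define COSTS_N_INSNS(N) ((N) * 4)

int
folded_movi_cost (int use_plus_equals)
{
  int cost = COSTS_N_INSNS (1);       /* default seeded by the caller   */
  int movi_entry = COSTS_N_INSNS (1); /* generic vect.movi table entry  */

  if (use_plus_equals)
    cost += movi_entry;  /* 8: the table entry stacks on top of the default.  */
  else
    cost = movi_entry;   /* 4: the table entry is taken as the full cost.  */
  return cost;
}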

Also, going back to an earlier part of the thread, I think the “inner”
CONST_VECTOR case is now a correct replacement for the “outer”
CONST_VECTOR case, meaning we don't need the aarch64_simd_make_constant
bits.  I.e. I think we can make the top-level case:

    case CONST_VECTOR:
      /* Load using MOVI/MVNI.  */
      if (aarch64_simd_valid_immediate (x, NULL))
        *cost = extra_cost->vect.movi;
      else /* Load using constant pool.  */
        *cost = extra_cost->ldst.load;
      break;

> +    case VEC_CONCAT:
> +	/* depending on the operation, either DUP or INS.
> +	   For now, keep default costing.  */
> +	break;
> +    case VEC_DUPLICATE:
> +	*cost += extra_cost->vect.dup;
> +	return true;

For this I think we should do:

  *cost = extra_cost->vect.dup;
  return false;

so that we cost the operand of the vec_duplicate as well.
This will have no effect if the operand is a REG, but would
affect more complex expressions.
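
One case where it would matter (an illustration of mine, not something
taken from this series) is a duplicate whose operand is a MEM rather
than a REG once combine has merged in the feeding load:

#include <arm_neon.h>

/* Illustrative only: after combine this becomes a single
   (vec_duplicate (mem ...)), i.e. an LD1R, so the cost of the MEM
   operand matters as well as the cost of the duplicate itself.  */
uint32x4_t
dup_from_mem (uint32_t *p)
{
  return vdupq_n_u32 (*p);
}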

> +    case VEC_SELECT:
> +	{

Here I think we should recurse on operand 0:

	  rtx op0 = XEXP (x, 0);
	  *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);

> +	  /* cost subreg of 0 as free, otherwise as DUP */
> +	  rtx op1 = XEXP (x, 1);
> +	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> +	    ;
> +	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> +	    *cost += extra_cost->vect.dup;
> +	  else
> +	    *cost += extra_cost->vect.extract;
> +	  return true;
> +	}
>      default:
>        break;
>      }
> @@ -20654,9 +20699,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
>  
>  /* If VALS is a vector constant that can be loaded into a register
>     using DUP, generate instructions to do so and return an RTX to
> -   assign to the register.  Otherwise return NULL_RTX.  */
> +   assign to the register.  Otherwise return NULL_RTX.
> +
> +   If CHECK then the resulting instruction may not be used in
> +   codegen but can be used for costing.  */
>  static rtx
> -aarch64_simd_dup_constant (rtx vals)
> +aarch64_simd_dup_constant (rtx vals, bool check = false)
>  {
>    machine_mode mode = GET_MODE (vals);
>    machine_mode inner_mode = GET_MODE_INNER (mode);
> @@ -20668,7 +20716,8 @@ aarch64_simd_dup_constant (rtx vals)
>    /* We can load this constant by using DUP and a constant in a
>       single ARM register.  This will be cheaper than a vector
>       load.  */
> -  x = copy_to_mode_reg (inner_mode, x);
> +  if (!check)
> +    x = copy_to_mode_reg (inner_mode, x);
>    return gen_vec_duplicate (mode, x);
>  }
>  
> @@ -20676,9 +20725,12 @@ aarch64_simd_dup_constant (rtx vals)
>  /* Generate code to load VALS, which is a PARALLEL containing only
>     constants (for vec_init) or CONST_VECTOR, efficiently into a
>     register.  Returns an RTX to copy into the register, or NULL_RTX
> -   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
> +   for a PARALLEL that cannot be converted into a CONST_VECTOR.
> +
> +   If CHECK then the resulting instruction may not be used in
> +   codegen but can be used for costing.  */
>  static rtx
> -aarch64_simd_make_constant (rtx vals)
> +aarch64_simd_make_constant (rtx vals, bool check = false)
>  {
>    machine_mode mode = GET_MODE (vals);
>    rtx const_dup;
> @@ -20710,7 +20762,7 @@ aarch64_simd_make_constant (rtx vals)
>        && aarch64_simd_valid_immediate (const_vec, NULL))
>      /* Load using MOVI/MVNI.  */
>      return const_vec;
> -  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
> +  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
>      /* Loaded using DUP.  */
>      return const_dup;
>    else if (const_vec != NULL_RTX)

With the inner CONST_VECTOR case replacing the outer one, I think we can
drop the aarch64_simd_dup_constant and aarch64_simd_make_constant bits.

> diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
> index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
> --- a/gcc/config/arm/aarch-common-protos.h
> +++ b/gcc/config/arm/aarch-common-protos.h
> @@ -133,6 +133,9 @@ struct vector_cost_table
>  {
>    const int alu;
>    const int mult;
> +  const int movi;
> +  const int dup;
> +  const int extract;
>  };
>  
>  struct cpu_cost_table
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
> index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),	/* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),	/* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (0),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (2),  /* alu.  */
> -    COSTS_N_INSNS (8)   /* mult.  */
> +    COSTS_N_INSNS (8),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f9edcda13d27bb3463da5b0170cfda7f41655b3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> @@ -0,0 +1,97 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */

Could you try this with -mabi=ilp32?  It looks like it might fail.
Skipping it is OK if so.

OK with those changes, if they work.

Thanks,
Richard

> +
> +#include <arm_neon.h>
> +
> +/*
> +**test1:
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	str	q[0-9]+, \[x[0-9]+\]
> +**	fmov	x[0-9]+, d[0-9]+
> +**	orr	x[0-9]+, x[0-9]+, x[0-9]+
> +**	ret
> +*/
> +
> +uint64_t
> +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
> +  uint64_t res = a | arr[0];
> +  uint64x2_t val = vld1q_u64 (arr);
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test2:
> +**	adrp	x[0-9]+, .LC[0-1]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	str	q[0-9]+, \[x[0-9]+\]
> +**	fmov	x[0-9]+, d[0-9]+
> +**	orr	x[0-9]+, x[0-9]+, x[0-9]+
> +**	ret
> +*/
> +
> +uint64_t
> +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
> +  uint64_t arr = vgetq_lane_u64 (val, 0);
> +  uint64_t res = a | arr;
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test3:
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**	str	q[0-9]+, \[x1\]
> +**	fmov	w[0-9]+, s[0-9]+
> +**	orr	w[0-9]+, w[0-9]+, w[0-9]+
> +**	ret
> +*/
> +
> +uint32_t
> +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
> +{
> +  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
> +  uint32_t res = a | arr[0];
> +  uint32x4_t val = vld1q_u32 (arr);
> +  *rt = vaddq_u32 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test4:
> +**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
> +**	mov	x[0-9]+, 16512
> +**	movk	x[0-9]+, 0x1020, lsl 16
> +**	movk	x[0-9]+, 0x408, lsl 32
> +**	movk	x[0-9]+, 0x102, lsl 48
> +**	fmov	d[0-9]+, x[0-9]+
> +**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
> +**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
> +**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
> +**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**	umov	w[0-9]+, v[0-9]+.h\[3\]
> +**	ret
> +*/
> +
> +uint64_t
> +test4 (uint8x16_t input)
> +{
> +    uint8x16_t bool_input = vshrq_n_u8(input, 7);
> +    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
> +    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> +                               vgetq_lane_p64(mask, 0));
> +    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
> +    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
> +    return vget_lane_u16((uint16x4_t)res, 3);
> +}
> +
Tamar Christina Oct. 27, 2021, 3:44 p.m. UTC | #14
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Tuesday, October 26, 2021 3:46 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>; Richard
> Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus
> Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants
> and operations
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> > Hi,
> >
> > Following the discussion below here's a revised patch.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> 
> Looks good functionally, just got some comments about the implementation.
> 
> > @@ -14006,8 +14007,52 @@ cost_plus:
> >  			     mode, MULT, 1, speed);
> >            return true;
> >          }
> > +	break;
> > +    case CONST_VECTOR:
> > +	{
> > +	  rtx gen_insn = aarch64_simd_make_constant (x, true);
> > +	  /* Not a valid const vector.  */
> > +	  if (!gen_insn)
> > +	    break;
> >
> > -      /* Fall through.  */
> > +	  switch (GET_CODE (gen_insn))
> > +	  {
> > +	  case CONST_VECTOR:
> > +	    /* Load using MOVI/MVNI.  */
> > +	    if (aarch64_simd_valid_immediate (x, NULL))
> > +	      *cost += extra_cost->vect.movi;
> > +	    else /* Load using constant pool.  */
> > +	      *cost += extra_cost->ldst.load;
> > +	    break;
> > +	  /* Load using a DUP.  */
> > +	  case VEC_DUPLICATE:
> > +	    gcc_unreachable ();
> > +	    break;
> > +	  default:
> > +	    *cost += extra_cost->ldst.load;
> > +	    break;
> > +	  }
> > +	  return true;
> > +	}
> 
> This might be a problem (if it is a problem) with some of the existing cases
> too, but: is using += rather than = the right behaviour here?
> It means that we add our cost on top of whatever the target-independent
> rtx_costs thought was a good default choice, whereas it looks like these table
> entries specify the correct full cost.
> 
> If it's not clear-cut, then I think using = would be better.

Switched to =

> 
> Also, going back to an earlier part of the thread, I think the “inner”
> CONST_VECTOR case is now a correct replacement for the “outer”
> CONST_VECTOR case, meaning we don't need the
> aarch64_simd_make_constant bits.  I.e. I think we can make the top-level
> case:
> 
>     case CONST_VECTOR:
>       /* Load using MOVI/MVNI.  */
>       if (aarch64_simd_valid_immediate (x, NULL))
>         *cost = extra_cost->vect.movi;
>       else /* Load using constant pool.  */
>         *cost = extra_cost->ldst.load;
>       break;
> 
> > +    case VEC_CONCAT:
> > +	/* depending on the operation, either DUP or INS.
> > +	   For now, keep default costing.  */
> > +	break;
> > +    case VEC_DUPLICATE:
> > +	*cost += extra_cost->vect.dup;
> > +	return true;
> 
> For this I think we should do:
> 
>   *cost = extra_cost->vect.dup;
>   return false;
> 
> so that we cost the operand of the vec_duplicate as well.
> This will have no effect if the operand is a REG, but would affect more
> complex expressions.
> 

Unfortunately returning false here had a negative effect on SVE, where the RTL
for some instructions has a complex vec_duplicate.

As an example

(note 11 8 12 2 NOTE_INSN_DELETED)
        (zero_extend:DI (unspec:SI [                                                  
                    (const_int 0 [0])                                                 
                    (const_int 2 [0x2])                                               
                    (const_int 1 [0x1])                                               
                ] UNSPEC_SVE_CNT_PAT))) "cntd_pat.c":10:153 8829 {aarch64_sve_cnt_pat}
     (nil))                                                                           

No longer gets pushed into a plus operator by the combiner due to the costing:

rejecting combination of insns 11, 12 and 13 
original costs 4 + 8 + 8 = 20                
replacement cost 24                          

vs what it was originally

allowing combination of insns 11, 12 and 13
original costs 4 + 4 + 8 = 16
replacement cost 12

which happens because the costing of the original insns doesn't take into account
that the instruction that semantically handles this operation doesn't actually do any of this.
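
For reference, a reduced example of the kind of code involved (my own
sketch, not the actual cntd_pat.c testcase, which may well differ;
compile with SVE enabled, e.g. -O2 -march=armv8.2-a+sve):

#include <arm_sve.h>

/* The element count is expanded through aarch64_sve_cnt_pat; whether it
   gets folded into the addition (ideally a single INCD) depends on
   combine accepting the replacement cost.  */
uint64_t
add_cntd (uint64_t x)
{
  return x + svcntd ();
}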

So now I have left it as true and added code for costing the VEC_SELECT of 0, which can happen if
lowpart_subreg fails.

P.S. Can you also take a look at [PATCH 1/2][GCC][middle-end] Teach CSE to be able to do vector extracts?
I believe that since you left the last comment on it, no other reviewer will look at it. ☹

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/arm/aarch-common-protos.h (struct vector_cost_table): Add
	movi, dup and extract costing fields.
	* config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
	thunderx_extra_costs, thunderx2t99_extra_costs,
	thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use
	them.
	* config/arm/aarch-cost-tables.h (generic_extra_costs,
	cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
	exynosm1_extra_costs, xgene1_extra_costs): Likewise
	* config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
	* config/aarch64/aarch64.c (aarch64_rtx_costs): Add extra costs.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-cse-codegen.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* mult.  */
+    COSTS_N_INSNS (4),	/* mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
-	  (match_operand:<VEL> 1 "register_operand" "w")))]
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 699c105a42a613c06c462e2de686795279d85bc9..10658424f9667f9479e2199eaa10f28eafd84082 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12705,7 +12705,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -14006,8 +14006,44 @@ cost_plus:
 			     mode, MULT, 1, speed);
           return true;
         }
-
-      /* Fall through.  */
+	break;
+    case CONST_VECTOR:
+	{
+	  /* Load using MOVI/MVNI.  */
+	  if (aarch64_simd_valid_immediate (x, NULL))
+	    *cost = extra_cost->vect.movi;
+	  else /* Load using constant pool.  */
+	    *cost = extra_cost->ldst.load;
+	  break;
+	}
+    case VEC_CONCAT:
+	/* depending on the operation, either DUP or INS.
+	   For now, keep default costing.  */
+	break;
+	/* Load using a DUP.  */
+    case VEC_DUPLICATE:
+	*cost = extra_cost->vect.dup;
+	return true;
+    case VEC_SELECT:
+	{
+	  rtx op0 = XEXP (x, 0);
+	  *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
+
+	  /* cost subreg of 0 as free, otherwise as DUP */
+	  rtx op1 = XEXP (x, 1);
+	  /* In certain cases we can't generate a subreg for
+	     index 0, in those cases we would have generated
+	     a vec_select instead.  */
+	  if (known_eq (INTVAL (op1),
+			ENDIAN_LANE_N (GET_MODE_NUNITS (mode), 0))
+	      || vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+	    ;
+	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+	    *cost = extra_cost->vect.dup;
+	  else
+	    *cost = extra_cost->vect.extract;
+	  return true;
+	}
     default:
       break;
     }
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;
+  const int dup;
+  const int extract;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..d025e989a1e67f00f4f4ce94897a961d38abfab7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,97 @@
+/* { dg-do compile  { target { lp64 } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test1:
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	str	q[0-9]+, \[x[0-9]+\]
+**	fmov	x[0-9]+, d[0-9]+
+**	orr	x[0-9]+, x[0-9]+, x[0-9]+
+**	ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+  uint64_t res = a | arr[0];
+  uint64x2_t val = vld1q_u64 (arr);
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**	adrp	x[0-9]+, .LC[0-1]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	str	q[0-9]+, \[x[0-9]+\]
+**	fmov	x[0-9]+, d[0-9]+
+**	orr	x[0-9]+, x[0-9]+, x[0-9]+
+**	ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+  uint64_t arr = vgetq_lane_u64 (val, 0);
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test3:
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	str	q[0-9]+, \[x1\]
+**	fmov	w[0-9]+, s[0-9]+
+**	orr	w[0-9]+, w[0-9]+, w[0-9]+
+**	ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+  uint32_t res = a | arr[0];
+  uint32x4_t val = vld1q_u32 (arr);
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
+**	mov	x[0-9]+, 16512
+**	movk	x[0-9]+, 0x1020, lsl 16
+**	movk	x[0-9]+, 0x408, lsl 32
+**	movk	x[0-9]+, 0x102, lsl 48
+**	fmov	d[0-9]+, x[0-9]+
+**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
+**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
+**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
+**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+**	umov	w[0-9]+, v[0-9]+.h\[3\]
+**	ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+
Tamar Christina Oct. 29, 2021, 3:03 p.m. UTC | #15
Hi All,

Attached is a new version that fixes the previous SVE fallout in a new way.

Ok for master?

Thanks,
Tamar

--- inline copy of patch ---


diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* mult.  */
+    COSTS_N_INSNS (4),	/* mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
-	  (match_operand:<VEL> 1 "register_operand" "w")))]
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 699c105a42a613c06c462e2de686795279d85bc9..542fc874a4e224fb2cbe94e64eab590458fe935b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12705,7 +12705,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -13466,8 +13466,7 @@ cost_plus:
 
 	 we must cost the explicit register move.  */
       if (mode == DImode
-	  && GET_MODE (op0) == SImode
-	  && outer == SET)
+	  && GET_MODE (op0) == SImode)
 	{
 	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
 
@@ -14006,8 +14005,39 @@ cost_plus:
 			     mode, MULT, 1, speed);
           return true;
         }
+	break;
+    case CONST_VECTOR:
+	{
+	  /* Load using MOVI/MVNI.  */
+	  if (aarch64_simd_valid_immediate (x, NULL))
+	    *cost = extra_cost->vect.movi;
+	  else /* Load using constant pool.  */
+	    *cost = extra_cost->ldst.load;
+	  break;
+	}
+    case VEC_CONCAT:
+	/* depending on the operation, either DUP or INS.
+	   For now, keep default costing.  */
+	break;
+	/* Load using a DUP.  */
+    case VEC_DUPLICATE:
+	*cost = extra_cost->vect.dup;
+	return false;
+    case VEC_SELECT:
+	{
+	  rtx op0 = XEXP (x, 0);
+	  *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
 
-      /* Fall through.  */
+	  /* cost subreg of 0 as free, otherwise as DUP */
+	  rtx op1 = XEXP (x, 1);
+	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+	    ;
+	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+	    *cost = extra_cost->vect.dup;
+	  else
+	    *cost = extra_cost->vect.extract;
+	  return true;
+	}
     default:
       break;
     }
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;
+  const int dup;
+  const int extract;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..d025e989a1e67f00f4f4ce94897a961d38abfab7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,97 @@
+/* { dg-do compile  { target { lp64 } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test1:
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	str	q[0-9]+, \[x[0-9]+\]
+**	fmov	x[0-9]+, d[0-9]+
+**	orr	x[0-9]+, x[0-9]+, x[0-9]+
+**	ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+  uint64_t res = a | arr[0];
+  uint64x2_t val = vld1q_u64 (arr);
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**	adrp	x[0-9]+, .LC[0-1]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	str	q[0-9]+, \[x[0-9]+\]
+**	fmov	x[0-9]+, d[0-9]+
+**	orr	x[0-9]+, x[0-9]+, x[0-9]+
+**	ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+  uint64_t arr = vgetq_lane_u64 (val, 0);
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test3:
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	str	q[0-9]+, \[x1\]
+**	fmov	w[0-9]+, s[0-9]+
+**	orr	w[0-9]+, w[0-9]+, w[0-9]+
+**	ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+  uint32_t res = a | arr[0];
+  uint32x4_t val = vld1q_u32 (arr);
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
+**	mov	x[0-9]+, 16512
+**	movk	x[0-9]+, 0x1020, lsl 16
+**	movk	x[0-9]+, 0x408, lsl 32
+**	movk	x[0-9]+, 0x102, lsl 48
+**	fmov	d[0-9]+, x[0-9]+
+**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
+**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
+**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
+**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+**	umov	w[0-9]+, v[0-9]+.h\[3\]
+**	ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+
Richard Sandiford Oct. 29, 2021, 3:23 p.m. UTC | #16
Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi All,
>
> Attached is a new version that fixes the previous SVE fallouts in a new way.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> --- inline copy of patch ---
>
>
> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
> index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
> --- a/gcc/config/aarch64/aarch64-cost-tables.h
> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> @@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),	/* Alu.  */
> -    COSTS_N_INSNS (4)	/* mult.  */
> +    COSTS_N_INSNS (4),	/* mult.  */
> +    COSTS_N_INSNS (1),	/* movi.  */
> +    COSTS_N_INSNS (2),	/* dup.  */
> +    COSTS_N_INSNS (2)	/* extract.  */
>    }
>  };
>  
> @@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),	/* Alu.  */
> -    COSTS_N_INSNS (4)	/* Mult.  */
> +    COSTS_N_INSNS (4),	/* Mult.  */
> +    COSTS_N_INSNS (1),	/* movi.  */
> +    COSTS_N_INSNS (2),	/* dup.  */
> +    COSTS_N_INSNS (2)	/* extract.  */
>    }
>  };
>  
> @@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),	/* Alu.  */
> -    COSTS_N_INSNS (4)	/* Mult.  */
> +    COSTS_N_INSNS (4),	/* Mult.  */
> +    COSTS_N_INSNS (1),	/* movi.  */
> +    COSTS_N_INSNS (2),	/* dup.  */
> +    COSTS_N_INSNS (2)	/* extract.  */
>    }
>  };
>  
> @@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
>  )
>  
>  (define_insn "aarch64_simd_dup<mode>"
> -  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
> +  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
>  	(vec_duplicate:VDQF_F16
> -	  (match_operand:<VEL> 1 "register_operand" "w")))]
> +	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
>    "TARGET_SIMD"
> -  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
> -  [(set_attr "type" "neon_dup<q>")]
> +  "@
> +   dup\\t%0.<Vtype>, %1.<Vetype>[0]
> +   dup\\t%0.<Vtype>, %<vw>1"
> +  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
>  )
>  
>  (define_insn "aarch64_dup_lane<mode>"
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 699c105a42a613c06c462e2de686795279d85bc9..542fc874a4e224fb2cbe94e64eab590458fe935b 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -12705,7 +12705,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>    rtx op0, op1, op2;
>    const struct cpu_cost_table *extra_cost
>      = aarch64_tune_params.insn_extra_cost;
> -  int code = GET_CODE (x);
> +  rtx_code code = GET_CODE (x);
>    scalar_int_mode int_mode;
>  
>    /* By default, assume that everything has equivalent cost to the
> @@ -13466,8 +13466,7 @@ cost_plus:
>  
>  	 we must cost the explicit register move.  */
>        if (mode == DImode
> -	  && GET_MODE (op0) == SImode
> -	  && outer == SET)
> +	  && GET_MODE (op0) == SImode)
>  	{
>  	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
>  
> @@ -14006,8 +14005,39 @@ cost_plus:
>  			     mode, MULT, 1, speed);
>            return true;
>          }
> +	break;
> +    case CONST_VECTOR:
> +	{
> +	  /* Load using MOVI/MVNI.  */
> +	  if (aarch64_simd_valid_immediate (x, NULL))
> +	    *cost = extra_cost->vect.movi;
> +	  else /* Load using constant pool.  */
> +	    *cost = extra_cost->ldst.load;
> +	  break;
> +	}
> +    case VEC_CONCAT:
> +	/* depending on the operation, either DUP or INS.
> +	   For now, keep default costing.  */
> +	break;
> +	/* Load using a DUP.  */
> +    case VEC_DUPLICATE:

Ultra minor nit, but: putting the comment after the case would be
more consistent with surrounding code.
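
Concretely the suggestion is just to move the comment below the case label,
i.e. something along these lines (layout sketch only, not the full hunk):

    case VEC_DUPLICATE:
      /* Load using a DUP.  */
      *cost = extra_cost->vect.dup;
      return false;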

OK with that change, and thanks for your patience.

Richard

> +	*cost = extra_cost->vect.dup;
> +	return false;
> +    case VEC_SELECT:
> +	{
> +	  rtx op0 = XEXP (x, 0);
> +	  *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
>  
> -      /* Fall through.  */
> +	  /* cost subreg of 0 as free, otherwise as DUP */
> +	  rtx op1 = XEXP (x, 1);
> +	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> +	    ;
> +	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> +	    *cost = extra_cost->vect.dup;
> +	  else
> +	    *cost = extra_cost->vect.extract;
> +	  return true;
> +	}
>      default:
>        break;
>      }
> diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
> index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
> --- a/gcc/config/arm/aarch-common-protos.h
> +++ b/gcc/config/arm/aarch-common-protos.h
> @@ -133,6 +133,9 @@ struct vector_cost_table
>  {
>    const int alu;
>    const int mult;
> +  const int movi;
> +  const int dup;
> +  const int extract;
>  };
>  
>  struct cpu_cost_table
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
> index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),	/* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),	/* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (0),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (2),  /* alu.  */
> -    COSTS_N_INSNS (8)   /* mult.  */
> +    COSTS_N_INSNS (8),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d025e989a1e67f00f4f4ce94897a961d38abfab7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> @@ -0,0 +1,97 @@
> +/* { dg-do compile  { target { lp64 } } } */
> +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**test1:
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	str	q[0-9]+, \[x[0-9]+\]
> +**	fmov	x[0-9]+, d[0-9]+
> +**	orr	x[0-9]+, x[0-9]+, x[0-9]+
> +**	ret
> +*/
> +
> +uint64_t
> +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
> +  uint64_t res = a | arr[0];
> +  uint64x2_t val = vld1q_u64 (arr);
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test2:
> +**	adrp	x[0-9]+, .LC[0-1]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	str	q[0-9]+, \[x[0-9]+\]
> +**	fmov	x[0-9]+, d[0-9]+
> +**	orr	x[0-9]+, x[0-9]+, x[0-9]+
> +**	ret
> +*/
> +
> +uint64_t
> +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
> +  uint64_t arr = vgetq_lane_u64 (val, 0);
> +  uint64_t res = a | arr;
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test3:
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**	str	q[0-9]+, \[x1\]
> +**	fmov	w[0-9]+, s[0-9]+
> +**	orr	w[0-9]+, w[0-9]+, w[0-9]+
> +**	ret
> +*/
> +
> +uint32_t
> +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
> +{
> +  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
> +  uint32_t res = a | arr[0];
> +  uint32x4_t val = vld1q_u32 (arr);
> +  *rt = vaddq_u32 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test4:
> +**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
> +**	mov	x[0-9]+, 16512
> +**	movk	x[0-9]+, 0x1020, lsl 16
> +**	movk	x[0-9]+, 0x408, lsl 32
> +**	movk	x[0-9]+, 0x102, lsl 48
> +**	fmov	d[0-9]+, x[0-9]+
> +**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
> +**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
> +**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
> +**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**	umov	w[0-9]+, v[0-9]+.h\[3\]
> +**	ret
> +*/
> +
> +uint64_t
> +test4 (uint8x16_t input)
> +{
> +    uint8x16_t bool_input = vshrq_n_u8(input, 7);
> +    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
> +    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> +                               vgetq_lane_p64(mask, 0));
> +    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
> +    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
> +    return vget_lane_u16((uint16x4_t)res, 3);
> +}
> +
Christophe Lyon Nov. 2, 2021, 10:39 a.m. UTC | #17
Hi Tamar,


On Fri, Oct 29, 2021 at 5:23 PM Richard Sandiford via Gcc-patches <
gcc-patches@gcc.gnu.org> wrote:

> Tamar Christina <Tamar.Christina@arm.com> writes:
> > Hi All,
> >
> > Attached is a new version that fixes the previous SVE fallouts in a new
> way.
> >
> > Ok for master?
>


Looks like you forgot to try building for arm* targets; your patch breaks
the build:
 gcc/config/arm/arm.c:1194:1: error: uninitialized const member 'vector_cost_table::movi'
[....]

You probably need to initialize the new field for arm targets too.

Can you check?
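
For reference, the fix presumably just mirrors the aarch-cost-tables.h hunks:
each vector_cost_table initializer in gcc/config/arm/arm.c needs the three new
entries added to its /* Vector */ block, something like the sketch below (the
cost values here are only placeholders, to be chosen per core):

  /* Vector */
  {
    COSTS_N_INSNS (1),  /* alu.  */
    COSTS_N_INSNS (4),  /* mult.  */
    COSTS_N_INSNS (1),  /* movi.  */
    COSTS_N_INSNS (2),  /* dup.  */
    COSTS_N_INSNS (2)   /* extract.  */
  }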

Thanks,

Christophe



>
> > Thanks,
> > Tamar
> >
> > --- inline copy of patch ---
> >
> >
> > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h
> b/gcc/config/aarch64/aarch64-cost-tables.h
> > index
> dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389
> 100644
> > --- a/gcc/config/aarch64/aarch64-cost-tables.h
> > +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> > @@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),  /* alu.  */
> > -    COSTS_N_INSNS (4)   /* mult.  */
> > +    COSTS_N_INSNS (4),  /* mult.  */
> > +    COSTS_N_INSNS (1),  /* movi.  */
> > +    COSTS_N_INSNS (2),  /* dup.  */
> > +    COSTS_N_INSNS (2)   /* extract.  */
> >    }
> >  };
> >
> > @@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),       /* Alu.  */
> > -    COSTS_N_INSNS (4)        /* mult.  */
> > +    COSTS_N_INSNS (4),       /* mult.  */
> > +    COSTS_N_INSNS (1),       /* movi.  */
> > +    COSTS_N_INSNS (2),       /* dup.  */
> > +    COSTS_N_INSNS (2)        /* extract.  */
> >    }
> >  };
> >
> > @@ -333,7 +339,10 @@ const struct cpu_cost_table
> thunderx2t99_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),       /* Alu.  */
> > -    COSTS_N_INSNS (4)        /* Mult.  */
> > +    COSTS_N_INSNS (4),       /* Mult.  */
> > +    COSTS_N_INSNS (1),       /* movi.  */
> > +    COSTS_N_INSNS (2),       /* dup.  */
> > +    COSTS_N_INSNS (2)        /* extract.  */
> >    }
> >  };
> >
> > @@ -437,7 +446,10 @@ const struct cpu_cost_table
> thunderx3t110_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),       /* Alu.  */
> > -    COSTS_N_INSNS (4)        /* Mult.  */
> > +    COSTS_N_INSNS (4),       /* Mult.  */
> > +    COSTS_N_INSNS (1),       /* movi.  */
> > +    COSTS_N_INSNS (2),       /* dup.  */
> > +    COSTS_N_INSNS (2)        /* extract.  */
> >    }
> >  };
> >
> > @@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),  /* alu.  */
> > -    COSTS_N_INSNS (4)   /* mult.  */
> > +    COSTS_N_INSNS (4),  /* mult.  */
> > +    COSTS_N_INSNS (1),  /* movi.  */
> > +    COSTS_N_INSNS (2),  /* dup.  */
> > +    COSTS_N_INSNS (2)   /* extract.  */
> >    }
> >  };
> >
> > @@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),  /* alu.  */
> > -    COSTS_N_INSNS (4)   /* mult.  */
> > +    COSTS_N_INSNS (4),  /* mult.  */
> > +    COSTS_N_INSNS (1),  /* movi.  */
> > +    COSTS_N_INSNS (2),  /* dup.  */
> > +    COSTS_N_INSNS (2)   /* extract.  */
> >    }
> >  };
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> > index
> 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7
> 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
> >  )
> >
> >  (define_insn "aarch64_simd_dup<mode>"
> > -  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
> > +  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
> >       (vec_duplicate:VDQF_F16
> > -       (match_operand:<VEL> 1 "register_operand" "w")))]
> > +       (match_operand:<VEL> 1 "register_operand" "w,r")))]
> >    "TARGET_SIMD"
> > -  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
> > -  [(set_attr "type" "neon_dup<q>")]
> > +  "@
> > +   dup\\t%0.<Vtype>, %1.<Vetype>[0]
> > +   dup\\t%0.<Vtype>, %<vw>1"
> > +  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
> >  )
> >
> >  (define_insn "aarch64_dup_lane<mode>"
> > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > index
> 699c105a42a613c06c462e2de686795279d85bc9..542fc874a4e224fb2cbe94e64eab590458fe935b
> 100644
> > --- a/gcc/config/aarch64/aarch64.c
> > +++ b/gcc/config/aarch64/aarch64.c
> > @@ -12705,7 +12705,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int
> outer ATTRIBUTE_UNUSED,
> >    rtx op0, op1, op2;
> >    const struct cpu_cost_table *extra_cost
> >      = aarch64_tune_params.insn_extra_cost;
> > -  int code = GET_CODE (x);
> > +  rtx_code code = GET_CODE (x);
> >    scalar_int_mode int_mode;
> >
> >    /* By default, assume that everything has equivalent cost to the
> > @@ -13466,8 +13466,7 @@ cost_plus:
> >
> >        we must cost the explicit register move.  */
> >        if (mode == DImode
> > -       && GET_MODE (op0) == SImode
> > -       && outer == SET)
> > +       && GET_MODE (op0) == SImode)
> >       {
> >         int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
> >
> > @@ -14006,8 +14005,39 @@ cost_plus:
> >                            mode, MULT, 1, speed);
> >            return true;
> >          }
> > +     break;
> > +    case CONST_VECTOR:
> > +     {
> > +       /* Load using MOVI/MVNI.  */
> > +       if (aarch64_simd_valid_immediate (x, NULL))
> > +         *cost = extra_cost->vect.movi;
> > +       else /* Load using constant pool.  */
> > +         *cost = extra_cost->ldst.load;
> > +       break;
> > +     }
> > +    case VEC_CONCAT:
> > +     /* depending on the operation, either DUP or INS.
> > +        For now, keep default costing.  */
> > +     break;
> > +     /* Load using a DUP.  */
> > +    case VEC_DUPLICATE:
>
> Ultra minor nit, but: putting the comment after the case would be
> more consistent with surrounding code.
>
> OK with that change, and thanks for you patience.
>
> Richard
>
> > +     *cost = extra_cost->vect.dup;
> > +     return false;
> > +    case VEC_SELECT:
> > +     {
> > +       rtx op0 = XEXP (x, 0);
> > +       *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
> >
> > -      /* Fall through.  */
> > +       /* cost subreg of 0 as free, otherwise as DUP */
> > +       rtx op1 = XEXP (x, 1);
> > +       if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> > +         ;
> > +       else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> > +         *cost = extra_cost->vect.dup;
> > +       else
> > +         *cost = extra_cost->vect.extract;
> > +       return true;
> > +     }
> >      default:
> >        break;
> >      }
> > diff --git a/gcc/config/arm/aarch-common-protos.h
> b/gcc/config/arm/aarch-common-protos.h
> > index
> 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400
> 100644
> > --- a/gcc/config/arm/aarch-common-protos.h
> > +++ b/gcc/config/arm/aarch-common-protos.h
> > @@ -133,6 +133,9 @@ struct vector_cost_table
> >  {
> >    const int alu;
> >    const int mult;
> > +  const int movi;
> > +  const int dup;
> > +  const int extract;
> >  };
> >
> >  struct cpu_cost_table
> > diff --git a/gcc/config/arm/aarch-cost-tables.h
> b/gcc/config/arm/aarch-cost-tables.h
> > index
> 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef
> 100644
> > --- a/gcc/config/arm/aarch-cost-tables.h
> > +++ b/gcc/config/arm/aarch-cost-tables.h
> > @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),       /* alu.  */
> > -    COSTS_N_INSNS (4)   /* mult.  */
> > +    COSTS_N_INSNS (4),  /* mult.  */
> > +    COSTS_N_INSNS (1),  /* movi.  */
> > +    COSTS_N_INSNS (2),  /* dup.  */
> > +    COSTS_N_INSNS (2)   /* extract.  */
> >    }
> >  };
> >
> > @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),       /* alu.  */
> > -    COSTS_N_INSNS (4)   /* mult.  */
> > +    COSTS_N_INSNS (4),  /* mult.  */
> > +    COSTS_N_INSNS (1),  /* movi.  */
> > +    COSTS_N_INSNS (2),  /* dup.  */
> > +    COSTS_N_INSNS (2)   /* extract.  */
> >    }
> >  };
> >
> > @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),  /* alu.  */
> > -    COSTS_N_INSNS (4)   /* mult.  */
> > +    COSTS_N_INSNS (4),  /* mult.  */
> > +    COSTS_N_INSNS (1),  /* movi.  */
> > +    COSTS_N_INSNS (2),  /* dup.  */
> > +    COSTS_N_INSNS (2)   /* extract.  */
> >    }
> >  };
> >
> > @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (1),  /* alu.  */
> > -    COSTS_N_INSNS (4)   /* mult.  */
> > +    COSTS_N_INSNS (4),  /* mult.  */
> > +    COSTS_N_INSNS (1),  /* movi.  */
> > +    COSTS_N_INSNS (2),  /* dup.  */
> > +    COSTS_N_INSNS (2)   /* extract.  */
> >    }
> >  };
> >
> > @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (0),  /* alu.  */
> > -    COSTS_N_INSNS (4)   /* mult.  */
> > +    COSTS_N_INSNS (4),  /* mult.  */
> > +    COSTS_N_INSNS (1),  /* movi.  */
> > +    COSTS_N_INSNS (2),  /* dup.  */
> > +    COSTS_N_INSNS (2)   /* extract.  */
> >    }
> >  };
> >
> > @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
> >    /* Vector */
> >    {
> >      COSTS_N_INSNS (2),  /* alu.  */
> > -    COSTS_N_INSNS (8)   /* mult.  */
> > +    COSTS_N_INSNS (8),  /* mult.  */
> > +    COSTS_N_INSNS (1),  /* movi.  */
> > +    COSTS_N_INSNS (2),  /* dup.  */
> > +    COSTS_N_INSNS (2)   /* extract.  */
> >    }
> >  };
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> > new file mode 100644
> > index
> 0000000000000000000000000000000000000000..d025e989a1e67f00f4f4ce94897a961d38abfab7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> > @@ -0,0 +1,97 @@
> > +/* { dg-do compile  { target { lp64 } } } */
> > +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto
> -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } }
> */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +**test1:
> > +**   adrp    x[0-9]+, .LC[0-9]+
> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> > +**   add     v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > +**   str     q[0-9]+, \[x[0-9]+\]
> > +**   fmov    x[0-9]+, d[0-9]+
> > +**   orr     x[0-9]+, x[0-9]+, x[0-9]+
> > +**   ret
> > +*/
> > +
> > +uint64_t
> > +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> > +{
> > +  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
> > +  uint64_t res = a | arr[0];
> > +  uint64x2_t val = vld1q_u64 (arr);
> > +  *rt = vaddq_u64 (val, b);
> > +  return res;
> > +}
> > +
> > +/*
> > +**test2:
> > +**   adrp    x[0-9]+, .LC[0-1]+
> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> > +**   add     v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> > +**   str     q[0-9]+, \[x[0-9]+\]
> > +**   fmov    x[0-9]+, d[0-9]+
> > +**   orr     x[0-9]+, x[0-9]+, x[0-9]+
> > +**   ret
> > +*/
> > +
> > +uint64_t
> > +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> > +{
> > +  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
> > +  uint64_t arr = vgetq_lane_u64 (val, 0);
> > +  uint64_t res = a | arr;
> > +  *rt = vaddq_u64 (val, b);
> > +  return res;
> > +}
> > +
> > +/*
> > +**test3:
> > +**   adrp    x[0-9]+, .LC[0-9]+
> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> > +**   add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > +**   str     q[0-9]+, \[x1\]
> > +**   fmov    w[0-9]+, s[0-9]+
> > +**   orr     w[0-9]+, w[0-9]+, w[0-9]+
> > +**   ret
> > +*/
> > +
> > +uint32_t
> > +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
> > +{
> > +  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
> > +  uint32_t res = a | arr[0];
> > +  uint32x4_t val = vld1q_u32 (arr);
> > +  *rt = vaddq_u32 (val, b);
> > +  return res;
> > +}
> > +
> > +/*
> > +**test4:
> > +**   ushr    v[0-9]+.16b, v[0-9]+.16b, 7
> > +**   mov     x[0-9]+, 16512
> > +**   movk    x[0-9]+, 0x1020, lsl 16
> > +**   movk    x[0-9]+, 0x408, lsl 32
> > +**   movk    x[0-9]+, 0x102, lsl 48
> > +**   fmov    d[0-9]+, x[0-9]+
> > +**   pmull   v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
> > +**   dup     v[0-9]+.2d, v[0-9]+.d\[0\]
> > +**   pmull2  v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
> > +**   trn2    v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +**   umov    w[0-9]+, v[0-9]+.h\[3\]
> > +**   ret
> > +*/
> > +
> > +uint64_t
> > +test4 (uint8x16_t input)
> > +{
> > +    uint8x16_t bool_input = vshrq_n_u8(input, 7);
> > +    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
> > +    poly64_t prodL =
> vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> > +                               vgetq_lane_p64(mask, 0));
> > +    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
> > +    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
> > +    return vget_lane_u16((uint16x4_t)res, 3);
> > +}
> > +
>

Patch

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@  const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@  const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* mult.  */
+    COSTS_N_INSNS (4),	/* mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@  const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@  const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@  const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@  const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5638d096fa84a27b4ea397f62cd0d05a28e7c8c..6814dae079c9ff40aaa2bb625432bf9eb8906b73 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@  (define_insn "aarch64_simd_dup<mode>"
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
-	  (match_operand:<VEL> 1 "register_operand" "w")))]
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f80de2ca8971086d6a4bf3aa7793d0cda953b5c8..26d78ffe98a3445dcc490c93849c46a8c2595cf8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -302,6 +302,7 @@  static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
 					    aarch64_addr_query_type);
 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static rtx aarch64_simd_make_constant (rtx, bool);
 
 /* Major revision number of the ARM Architecture implemented by the target.  */
 unsigned aarch64_architecture_version;
@@ -12665,7 +12666,7 @@  aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -13936,8 +13937,65 @@  cost_plus:
 			     mode, MULT, 1, speed);
           return true;
         }
+	break;
+    case PARALLEL:
+      /* Fall through */
+    case CONST_VECTOR:
+	{
+	  rtx gen_insn = aarch64_simd_make_constant (x, true);
+	  /* Not a valid const vector.  */
+	  if (!gen_insn)
+	    break;
 
-      /* Fall through.  */
+	  switch (GET_CODE (gen_insn))
+	  {
+	  case CONST_VECTOR:
+	    /* Load using MOVI/MVNI.  */
+	    if (aarch64_simd_valid_immediate (x, NULL))
+	      *cost += extra_cost->vect.movi;
+	    else /* Load using constant pool.  */
+	      *cost += extra_cost->ldst.load;
+	    break;
+	  /* Load using a DUP.  */
+	  case VEC_DUPLICATE:
+	    *cost += extra_cost->vect.dup;
+	    break;
+	  default:
+	    *cost += extra_cost->ldst.load;
+	    break;
+	  }
+	  return true;
+	}
+    case VEC_CONCAT:
+	/* depending on the operation, either DUP or INS.
+	   For now, keep default costing.  */
+	break;
+    case VEC_DUPLICATE:
+	*cost += extra_cost->vect.dup;
+	return true;
+    case VEC_SELECT:
+	{
+	  /* cost subreg of 0 as free, otherwise as DUP */
+	  rtx op1 = XEXP (x, 1);
+	  int nelts;
+	  if ((op1 == const0_rtx && !BYTES_BIG_ENDIAN)
+	      || (BYTES_BIG_ENDIAN
+		  && GET_MODE_NUNITS (mode).is_constant(&nelts)
+		  && INTVAL (op1) == nelts - 1))
+	    ;
+	  else if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+	    ;
+	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+	  /* Selecting the high part is not technically free, but we lack
+	     enough information to decide that here.  For instance, selecting the
+	     high part of a vec_dup *is* free, as is feeding it into any _high
+	     instruction, but we cannot tell that here.  Costing it as free gives
+	     us a better chance to optimize a dup vs multiple constants.  */
+	    ;
+	  else
+	    *cost += extra_cost->vect.extract;
+	  return true;
+	}
     default:
       break;
     }
@@ -20663,9 +20721,12 @@  aarch64_builtin_support_vector_misalignment (machine_mode mode,
 
 /* If VALS is a vector constant that can be loaded into a register
    using DUP, generate instructions to do so and return an RTX to
-   assign to the register.  Otherwise return NULL_RTX.  */
+   assign to the register.  Otherwise return NULL_RTX.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_dup_constant (rtx vals)
+aarch64_simd_dup_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -20677,7 +20738,8 @@  aarch64_simd_dup_constant (rtx vals)
   /* We can load this constant by using DUP and a constant in a
      single ARM register.  This will be cheaper than a vector
      load.  */
-  x = copy_to_mode_reg (inner_mode, x);
+  if (!check)
+    x = copy_to_mode_reg (inner_mode, x);
   return gen_vec_duplicate (mode, x);
 }
 
@@ -20685,9 +20747,12 @@  aarch64_simd_dup_constant (rtx vals)
 /* Generate code to load VALS, which is a PARALLEL containing only
    constants (for vec_init) or CONST_VECTOR, efficiently into a
    register.  Returns an RTX to copy into the register, or NULL_RTX
-   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.
+
+   If CHECK then the resulting instruction may not be used in
+   codegen but can be used for costing.  */
 static rtx
-aarch64_simd_make_constant (rtx vals)
+aarch64_simd_make_constant (rtx vals, bool check = false)
 {
   machine_mode mode = GET_MODE (vals);
   rtx const_dup;
@@ -20719,7 +20784,7 @@  aarch64_simd_make_constant (rtx vals)
       && aarch64_simd_valid_immediate (const_vec, NULL))
     /* Load using MOVI/MVNI.  */
     return const_vec;
-  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
+  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
     /* Loaded using DUP.  */
     return const_dup;
   else if (const_vec != NULL_RTX)
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@  struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;
+  const int dup;
+  const int extract;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@  const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@  const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@  const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@  const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@  const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@  const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..36e468aacfadd7701c6a7cd432bee81472111a16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,127 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test0:
+**	movi	v2.16b, 0x3
+**	ldr	q0, \[x0\]
+**	uxtl	v1.8h, v0.8b
+**	uxtl2	v0.8h, v0.16b
+**	ldr	q3, \[x1\]
+**	umlal	v1.8h, v3.8b, v2.8b
+**	umlal2	v0.8h, v3.16b, v2.16b
+**	addhn	v0.8b, v1.8h, v0.8h
+**	str	d0, \[x2\]
+**	ret
+*/
+
+void test0 (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0)
+{
+  uint8x16_t three_u8 = vdupq_n_u8(3);
+  uint8x16_t x = vld1q_u8(inptr0);
+  uint8x16_t y = vld1q_u8(inptr1);
+  uint16x8_t x_l = vmovl_u8(vget_low_u8(x));
+  uint16x8_t x_h = vmovl_u8(vget_high_u8(x));
+  uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), vget_low_u8(three_u8));
+  uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), vget_high_u8(three_u8));
+  vst1_u8(outptr0, vaddhn_u16(z_l, z_h));
+}
+
+/*
+**test1:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC0
+**	ldr	q1, \[x2, #:lo12:.LC0\]
+**	add	v0.2d, v1.2d, v0.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+  uint64_t res = a | arr[0];
+  uint64x2_t val = vld1q_u64 (arr);
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**	adrp	x2, .LC1
+**	ldr	q1, \[x2, #:lo12:.LC1\]
+**	add	v0.2d, v0.2d, v1.2d
+**	str	q0, \[x1\]
+**	fmov	x1, d1
+**	orr	x0, x0, x1
+**	ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+  uint64_t arr = vgetq_lane_u64 (val, 0);
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test3:
+**	sub	sp, sp, #16
+**	adrp	x2, .LC2
+**	ldr	q1, \[x2, #:lo12:.LC2\]
+**	add	v0.4s, v1.4s, v0.4s
+**	str	q0, \[x1\]
+**	fmov	w1, s1
+**	orr	w0, w0, w1
+**	add	sp, sp, 16
+**	ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+  uint32_t res = a | arr[0];
+  uint32x4_t val = vld1q_u32 (arr);
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**	ushr	v0.16b, v0.16b, 7
+**	mov	x0, 16512
+**	movk	x0, 0x1020, lsl 16
+**	movk	x0, 0x408, lsl 32
+**	movk	x0, 0x102, lsl 48
+**	fmov	d1, x0
+**	pmull	v2.1q, v0.1d, v1.1d
+**	dup	v1.2d, v1.d\[0\]
+**	pmull2	v0.1q, v0.2d, v1.2d
+**	trn2	v2.8b, v2.8b, v0.8b
+**	umov	w0, v2.h\[3\]
+**	ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+