
PR rtl-optimization/87763 - generate more bfi instructions on aarch64

Message ID 73a131005a775d06e344d035031005dc765403c4.camel@marvell.com
State New
Series PR rtl-optimization/87763 - generate more bfi instructions on aarch64

Commit Message

Steve Ellcey Jan. 24, 2019, 11:17 p.m. UTC
Here is my attempt at creating a couple of new instructions to
generate more bfi instructions on aarch64.  I haven't finished
testing this but it helps with gcc.target/aarch64/combine_bfi_1.c.

Before I went any further with it I wanted to see if anyone
else was working on something like this and if this seems like
a reasonable approach.

Steve Ellcey
sellcey@marvell.com


2018-01-24  Steve Ellcey  <sellcey@marvell.com>

	PR rtl-optimization/87763
	* config/aarch64/aarch64-protos.h
	(aarch64_masks_and_shift_for_aarch64_bfi_p): New prototype.
	* config/aarch64/aarch64.c (aarch64_masks_and_shift_for_aarch64_bfi_p):
	New function.
	* config/aarch64/aarch64.md (*aarch64_bfi<GPI:mode>4_shift):
	New instruction.
	(*aarch64_bfi<GPI:mode>4_noshift): Ditto.
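
As a hedged illustration (not part of the patch; the function name and
constants below are invented), this is the kind of C idiom the new
patterns are meant to catch: x keeps every bit outside a contiguous
field, and the low bits of y are shifted into that field.

/* Hypothetical example, in the spirit of combine_bfi_1.c: insert the
   low 16 bits of y at bit position 16 of x.  The two masks are
   complementary and the shifted mask is a contiguous run starting at
   the shift amount, so the *aarch64_bfi<GPI:mode>4_shift pattern should
   match and emit "bfi x0, x1, 16, 16" rather than a longer
   and/lsl/orr sequence.  */
unsigned long
insert_field (unsigned long x, unsigned long y)
{
  return (x & ~0xffff0000UL) | ((y << 16) & 0xffff0000UL);
}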

Comments

Richard Earnshaw (lists) Jan. 25, 2019, 10:32 a.m. UTC | #1
On 24/01/2019 23:17, Steve Ellcey wrote:
> Here is my attempt at creating a couple of new instructions to
> generate more bfi instructions on aarch64.  I haven't finished
> testing this but it helps with gcc.target/aarch64/combine_bfi_1.c.
> 
> Before I went any further with it I wanted to see if anyone
> else was working on something like this and if this seems like
> a reasonable approach.
> 
> Steve Ellcey
> sellcey@marvell.com
> 
> 
> 2018-01-24  Steve Ellcey  <sellcey@marvell.com>
> 
> 	PR rtl-optimization/87763
> 	* config/aarch64/aarch64-protos.h
> 	(aarch64_masks_and_shift_for_aarch64_bfi_p): New prototype.
> 	* config/aarch64/aarch64.c (aarch64_masks_and_shift_for_aarch64_bfi_p):
> 	New function.
> 	* config/aarch64/aarch64.md (*aarch64_bfi<GPI:mode>4_shift):
> 	New instruction.
> 	(*aarch64_bfi<GPI:mode>4_noshift): Ditto.
> 
> 
> 
> combine.patch
> 
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index b035e35..ec90053 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -429,6 +429,7 @@ bool aarch64_label_mentioned_p (rtx);
>  void aarch64_declare_function_name (FILE *, const char*, tree);
>  bool aarch64_legitimate_pic_operand_p (rtx);
>  bool aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode, rtx, rtx);
> +bool aarch64_masks_and_shift_for_aarch64_bfi_p (scalar_int_mode, rtx, rtx, rtx);
>  bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
>  bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
>  opt_machine_mode aarch64_sve_pred_mode (unsigned int);
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 5df5a8b..69cc69f 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -9294,6 +9294,44 @@ aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
>  	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
>  }
>  
> +/* Return true if the masks and a shift amount from an RTX of the form
> +   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
> +   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */
> +
> +bool
> +aarch64_masks_and_shift_for_aarch64_bfi_p (scalar_int_mode mode, rtx mask1,
> +					   rtx shft_amnt, rtx mask2)
> +{
> +  unsigned HOST_WIDE_INT m1, m2, s, t;
> +
> +  if (!CONST_INT_P (mask1) || !CONST_INT_P (mask2) || !CONST_INT_P (shft_amnt))
> +    return false;

Why not change the callers to pass HWI values directly?  They all have
either immediate-only restrictions in the patterns or pass a constructed
immediate value anyway.
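
In concrete terms, one sketch of that interface (this is the shape the
revised patch below adopts, which also drops the duplicated "aarch64"
from the function name) would be:

bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
					unsigned HOST_WIDE_INT mask1,
					unsigned HOST_WIDE_INT shft_amnt,
					unsigned HOST_WIDE_INT mask2);

so the pattern conditions can pass UINTVAL (operands[N]) or a literal 0
directly and the CONST_INT_P checks disappear.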

> +
> +  m1 = UINTVAL (mask1);
> +  m2 = UINTVAL (mask2);
> +  s = UINTVAL (shft_amnt);
> +
> +  /* Verify that there is no overlap in what bits are set in the two masks.  */
> +  if ((m1 + m2 + 1) != 0)
> +    return false;
> +
> +  /* Verify that the shift amount is less than the mode size.  */
> +  if (s >= GET_MODE_BITSIZE (mode))
> +    return false;
> +
> +  /* Verify that the mask being shifted is contigious and would be in the

contiguous

> +     least significant bits after shifting by creating a mask 't' based on
> +     the number of bits set in mask2 and the shift amount for mask2 and
> +     comparing that to the actual mask2.  */
> +  t = popcount_hwi (m2);
> +  t = (1 << t) - 1;
> +  t = t << s;
> +  if (t != m2)
> +    return false;
> +  
> +  return true;
> +}
> +
>  /* Calculate the cost of calculating X, storing it in *COST.  Result
>     is true if the total cost of the operation has now been calculated.  */
>  static bool
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index b7f6fe0..e1f526b 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5476,6 +5476,41 @@
>    [(set_attr "type" "bfm")]
>  )
>  
> +;;  Match a bfi instruction where the shift of OP3 means that we are
> +;;  actually copying the least significant bits of OP3 into OP0 by way
> +;;  of the AND masks and the IOR instruction.
> +
> +(define_insn "*aarch64_bfi<GPI:mode>4_shift"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
> +                          (match_operand:GPI 2 "const_int_operand" "n"))
> +                 (and:GPI (ashift:GPI
> +                           (match_operand:GPI 3 "register_operand" "r")
> +                           (match_operand:GPI 4 "aarch64_simd_shift_imm_<mode>" "n"))
> +                          (match_operand:GPI 5 "const_int_operand" "n"))))]
> +  "aarch64_masks_and_shift_for_aarch64_bfi_p (<MODE>mode, operands[2], operands[4], operands[5])"
> +{
> +  return "bfi\t%<GPI:w>0, %<GPI:w>3, %4, %P5";

You don't need C code for this instruction printer; just use a textual
pattern.

> +}
> +  [(set_attr "type" "bfm")]
> +)
> +
> +;; Like the above instruction but with no shifting, we are just copying the
> +;; least significant bits of OP3 to OP0.
> +
> +(define_insn "*aarch64_bfi<GPI:mode>4_noshift"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
> +                          (match_operand:GPI 2 "const_int_operand" "n"))
> +                 (and:GPI (match_operand:GPI 3 "register_operand" "r")
> +                          (match_operand:GPI 4 "const_int_operand" "n"))))]
> +  "aarch64_masks_and_shift_for_aarch64_bfi_p (<MODE>mode, operands[2], const0_rtx, operands[4])"
> +{
> +  return "bfi\t%<GPI:w>0, %<GPI:w>3, 0, %P4";
> +}
Likewise.

Do we need another variant pattern to handle the case where the
insertion is into the top of the destination?  In that case the
immediate mask on the shifted operand is technically redundant as the
bottom bits are known zero and there are no top bits.
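
A hedged illustration of that case (function name and constants
invented here): when the inserted field ends at the top bit of the
destination, the left shift already clears the low bits and there are
no bits above the field, so the redundant AND on the shifted operand is
likely to have been simplified away and the pattern above, which
requires it, would not match.

/* Hypothetical example: insert the low 16 bits of y into bits 48..63
   of x.  (x & 0x0000ffffffffffffUL) | ((y << 48) & 0xffff000000000000UL)
   simplifies to the form below because the shift by 48 already
   guarantees the low 48 bits are zero, so matching it and emitting
   "bfi x0, x1, 48, 16" would need a variant pattern without the
   second AND.  */
unsigned long
insert_top (unsigned long x, unsigned long y)
{
  return (x & 0x0000ffffffffffffUL) | (y << 48);
}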


> +  [(set_attr "type" "bfm")]
> +)
> +
>  (define_insn "*extr_insv_lower_reg<mode>"
>    [(set (zero_extract:GPI (match_operand:GPI 0 "register_operand" "+r")
>  			  (match_operand 1 "const_int_operand" "n")
> 

R.
Steve Ellcey Jan. 25, 2019, 9:55 p.m. UTC | #2
On Fri, 2019-01-25 at 10:32 +0000, Richard Earnshaw (lists) wrote:
> 
> Do we need another variant pattern to handle the case where the
> insertion is into the top of the destination?  In that case the
> immediate mask on the shifted operand is technically redundant as the
> bottom bits are known zero and there are no top bits.

I am not sure about this.  Do you have an example where this might be
needed?

I updated my patch to address your other comments and have bootstrapped
and tested this on aarch64.  Does this version look good for checkin?
I had to modify the two tests because with my new instructions we
sometimes generate bfi instructions where we used to generate bfxil
instructions.  The only regression this fixes is combine_bfi_1.c;
combine_bfxil.c was passing before but still needed to be changed in
order to keep passing.

Steve Ellcey
sellcey@marvell.com


2018-01-25  Steve Ellcey  <sellcey@marvell.com>

	PR rtl-optimization/87763
	* config/aarch64/aarch64-protos.h (aarch64_masks_and_shift_for_bfi_p):
	New prototype.
	* config/aarch64/aarch64.c (aarch64_masks_and_shift_for_bfi_p):
	New function.
	* config/aarch64/aarch64.md (*aarch64_bfi<GPI:mode>4_shift):
	New instruction.
	(*aarch64_bfi<GPI:mode>4_noshift): Ditto.


2018-01-25  Steve Ellcey  <sellcey@marvell.com>

	PR rtl-optimization/87763
	* gcc.target/aarch64/combine_bfi_1.c: Change some bfxil checks
	to bfi.
	* gcc.target/aarch64/combine_bfxil.c: Ditto.
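
As a hedged aside on those test changes (example invented here, not
from the thread): with an insert position of zero the two mnemonics do
the same thing, so flipping the expectation from bfxil to bfi does not
change what the test verifies.

/* Hypothetical example: copy the low 16 bits of y into the low 16 bits
   of x.  "bfxil x0, x1, 0, 16" moves bits 0..15 of x1 into bits 0..15
   of x0, and "bfi x0, x1, 0, 16" inserts bits 0..15 of x1 at bit 0 of
   x0; with an lsb of zero both perform the identical copy.  */
unsigned long
copy_low16 (unsigned long x, unsigned long y)
{
  return (x & ~0xffffUL) | (y & 0xffffUL);
}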
Jakub Jelinek Jan. 25, 2019, 11 p.m. UTC | #3
On Thu, Jan 24, 2019 at 11:17:45PM +0000, Steve Ellcey wrote:
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -9294,6 +9294,44 @@ aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
>  	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
>  }
>  
> +/* Return true if the masks and a shift amount from an RTX of the form
> +   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
> +   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */
> +
> +bool
> +aarch64_masks_and_shift_for_aarch64_bfi_p (scalar_int_mode mode, rtx mask1,
> +					   rtx shft_amnt, rtx mask2)
> +{
> +  unsigned HOST_WIDE_INT m1, m2, s, t;
> +
> +  if (!CONST_INT_P (mask1) || !CONST_INT_P (mask2) || !CONST_INT_P (shft_amnt))
> +    return false;
> +
> +  m1 = UINTVAL (mask1);
> +  m2 = UINTVAL (mask2);
> +  s = UINTVAL (shft_amnt);
> +
> +  /* Verify that there is no overlap in what bits are set in the two masks.  */
> +  if ((m1 + m2 + 1) != 0)
> +    return false;

Wouldn't that be clearer to test
  if (m1 + m2 != HOST_WIDE_INT_1U)
    return false;
?
You IMHO also should test
  if ((m1 & m2) != 0)
    return false;

> +
> +  /* Verify that the shift amount is less than the mode size.  */
> +  if (s >= GET_MODE_BITSIZE (mode))
> +    return false;
> +
> +  /* Verify that the mask being shifted is contigious and would be in the
> +     least significant bits after shifting by creating a mask 't' based on
> +     the number of bits set in mask2 and the shift amount for mask2 and
> +     comparing that to the actual mask2.  */
> +  t = popcount_hwi (m2);
> +  t = (1 << t) - 1;

This should be (HOST_WIDE_INT_1U << t), otherwise if popcount of m2 is
>= 32, there will be UB.

> +  t = t << s;
> +  if (t != m2)
> +    return false;
> +  
> +  return true;
> +}
> +

> +;;  Match a bfi instruction where the shift of OP3 means that we are
> +;;  actually copying the least significant bits of OP3 into OP0 by way
> +;;  of the AND masks and the IOR instruction.
> +
> +(define_insn "*aarch64_bfi<GPI:mode>4_shift"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
> +                          (match_operand:GPI 2 "const_int_operand" "n"))
> +                 (and:GPI (ashift:GPI
> +                           (match_operand:GPI 3 "register_operand" "r")
> +                           (match_operand:GPI 4 "aarch64_simd_shift_imm_<mode>" "n"))
> +                          (match_operand:GPI 5 "const_int_operand" "n"))))]
> +  "aarch64_masks_and_shift_for_aarch64_bfi_p (<MODE>mode, operands[2], operands[4], operands[5])"

Too long lines.

> +{
> +  return "bfi\t%<GPI:w>0, %<GPI:w>3, %4, %P5";
> +}
> +  [(set_attr "type" "bfm")]
> +)

As mentioned in rs6000.md, I believe you also need a similar pattern where
the two ANDs are swapped, because they have the same priority.

> +
> +;; Like the above instruction but with no shifting, we are just copying the
> +;; least significant bits of OP3 to OP0.
> +
> +(define_insn "*aarch64_bfi<GPI:mode>4_noshift"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
> +                          (match_operand:GPI 2 "const_int_operand" "n"))
> +                 (and:GPI (match_operand:GPI 3 "register_operand" "r")
> +                          (match_operand:GPI 4 "const_int_operand" "n"))))]
> +  "aarch64_masks_and_shift_for_aarch64_bfi_p (<MODE>mode, operands[2], const0_rtx, operands[4])"

Too long line.
I guess this one has a similar issue: you might need two patterns for
both AND orderings (though the "0" needs to be on the right argument in
each case).

	Jakub
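
To make the checks discussed above concrete, here is a small worked
example (standalone, with values chosen purely for illustration); it
walks through the complementary-mask, overlap, shift-range and
contiguity tests with a DImode mask pair that the shift pattern would
accept.

#include <assert.h>

int
main (void)
{
  /* Hypothetical DImode case: insert 16 bits at bit position 16.  */
  unsigned long long m1 = 0xffffffff0000ffffULL;  /* bits kept from x */
  unsigned long long m2 = 0x00000000ffff0000ULL;  /* bits from y << s */
  unsigned long long s = 16;
  unsigned long long t;

  assert (m1 + m2 == ~0ULL);	/* masks are complementary...  */
  assert ((m1 & m2) == 0);	/* ...and do not overlap.  */
  assert (s < 64);		/* shift fits within the mode.  */

  /* Contiguity check, mirroring the helper function: a run of
     popcount (m2) one bits shifted left by s must reproduce m2.  */
  t = ((1ULL << __builtin_popcountll (m2)) - 1) << s;
  assert (t == m2);		/* so a "bfi ..., 16, 16" insert is valid.  */
  return 0;
}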
Steve Ellcey Jan. 29, 2019, 12:11 a.m. UTC | #4
On Sat, 2019-01-26 at 00:00 +0100, Jakub Jelinek wrote:
> 
> > +  /* Verify that there is no overlap in what bits are set in the two masks.  */
> > +  if ((m1 + m2 + 1) != 0)
> > +    return false;
> 
> Wouldn't that be clearer to test
>   if (m1 + m2 != HOST_WIDE_INT_1U)
>     return false;
> ?
> You IMHO also should test
>   if ((m1 & m2) != 0)
>     return false;

I think you mean HOST_WIDE_INT_M1U, not HOST_WIDE_INT_1U.  That does
seem clearer and I made that change as well as adding the second test.

> > 
> > +  t = popcount_hwi (m2);
> > +  t = (1 << t) - 1;
> 
> This should be (HOST_WIDE_INT_1U << t), otherwise if popcount of m2 is
> >= 32, there will be UB.

Fixed.

> As mentioned in rs6000.md, I believe you also need a similar pattern where
> the two ANDs are swapped, because they have the same priority.

I fixed the long lines in aarch64.md and I added a second pattern for
the *aarch64_bfi<GPI:mode>4_noshift pattern, swapping the order of the
IOR operands.  I did not swap the AND operands; I assume the compiler
would ensure that the register was always before the constant or that
it would check both orderings.

I tried adding a second version of *aarch64_bfi<GPI:mode>5_shift as
well, but when I tested it, it never got used during a bootstrap build
or a GCC test run, so I decided it was not needed.

Tested on aarch64 with a bootstrap build and a GCC testsuite run.
OK to checkin?


Steve Ellcey
sellcey@marvell.com


2018-01-28  Steve Ellcey  <sellcey@marvell.com>

	PR rtl-optimization/87763
	* config/aarch64/aarch64-protos.h (aarch64_masks_and_shift_for_bfi_p):
	New prototype.
	* config/aarch64/aarch64.c (aarch64_masks_and_shift_for_bfi_p):
	New function.
	* config/aarch64/aarch64.md (*aarch64_bfi<GPI:mode>5_shift):
	New instruction.
	(*aarch64_bfi<GPI:mode>4_noshift): Ditto.
	(*aarch64_bfi<GPI:mode>4_noshift_alt): Ditto.


2018-01-28  Steve Ellcey  <sellcey@marvell.com>

	PR rtl-optimization/87763
	* gcc.target/aarch64/combine_bfxil.c: Change some bfxil checks
	to bfi.
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b035e35f33b..b6c0d0a8eb6 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -429,6 +429,9 @@ bool aarch64_label_mentioned_p (rtx);
 void aarch64_declare_function_name (FILE *, const char*, tree);
 bool aarch64_legitimate_pic_operand_p (rtx);
 bool aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode, rtx, rtx);
+bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT,
+					unsigned HOST_WIDE_INT,
+					unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
 bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d7c453cdad0..9a3080b5db8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9330,6 +9330,41 @@ aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
 	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
 }
 
+/* Return true if the masks and a shift amount from an RTX of the form
+   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
+   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */
+
+bool
+aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
+				   unsigned HOST_WIDE_INT mask1,
+				   unsigned HOST_WIDE_INT shft_amnt,
+				   unsigned HOST_WIDE_INT mask2)
+{
+  unsigned HOST_WIDE_INT t;
+
+  /* Verify that there is no overlap in what bits are set in the two masks.  */
+  if ((mask1 + mask2) != HOST_WIDE_INT_M1U)
+    return false;
+  if ((mask1 & mask2) != 0)
+    return false;
+
+  /* Verify that the shift amount is less than the mode size.  */
+  if (shft_amnt >= GET_MODE_BITSIZE (mode))
+    return false;
+
+  /* Verify that the mask being shifted is contiguous and would be in the
+     least significant bits after shifting by creating a mask 't' based on
+     the number of bits set in mask2 and the shift amount for mask2 and
+     comparing that to the actual mask2.  */
+  t = popcount_hwi (mask2);
+  t = (HOST_WIDE_INT_1U << t) - 1;
+  t = t << shft_amnt;
+  if (t != mask2)
+    return false;
+  
+  return true;
+}
+
 /* Calculate the cost of calculating X, storing it in *COST.  Result
    is true if the total cost of the operation has now been calculated.  */
 static bool
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b7f6fe0f135..6b5339092ba 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5476,6 +5476,56 @@
   [(set_attr "type" "bfm")]
 )
 
+;;  Match a bfi instruction where the shift of OP3 means that we are
+;;  actually copying the least significant bits of OP3 into OP0 by way
+;;  of the AND masks and the IOR instruction.  A similar instruction
+;;  with the two parts of the IOR swapped around was never triggered
+;;  in a bootstrap build and test of GCC so it was not included.
+
+(define_insn "*aarch64_bfi<GPI:mode>5_shift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (ashift:GPI
+                           (match_operand:GPI 3 "register_operand" "r")
+                           (match_operand:GPI 4 "aarch64_simd_shift_imm_<mode>" "n"))
+                          (match_operand:GPI 5 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]),
+				      UINTVAL (operands[4]),
+				      UINTVAL(operands[5]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>3, %4, %P5"
+  [(set_attr "type" "bfm")]
+)
+
+;; Like the above instruction but with no shifting, we are just copying
+;; the least significant bits of OP3 to OP0.  In this case we do need
+;; two versions of the instruction to handle different checks on the
+;; constant values.
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noshift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (match_operand:GPI 3 "register_operand" "r")
+                          (match_operand:GPI 4 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]), 0,
+				      UINTVAL (operands[4]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>3, 0, %P4"
+  [(set_attr "type" "bfm")]
+)
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noshift_alt"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (match_operand:GPI 3 "register_operand" "0")
+                          (match_operand:GPI 4 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[4]), 0,
+				      UINTVAL (operands[2]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>1, 0, %P2"
+  [(set_attr "type" "bfm")]
+)
+
 (define_insn "*extr_insv_lower_reg<mode>"
   [(set (zero_extract:GPI (match_operand:GPI 0 "register_operand" "+r")
 			  (match_operand 1 "const_int_operand" "n")
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
index 109f989a2f0..c3b5fc58800 100644
--- a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
@@ -114,4 +114,5 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-assembler-times "bfxil\\t" 18 } } */
+/* { dg-final { scan-assembler-times "bfxil\\t" 3 } } */
+/* { dg-final { scan-assembler-times "bfi\\t" 15 } } */
Jakub Jelinek Jan. 29, 2019, 12:22 a.m. UTC | #5
On Tue, Jan 29, 2019 at 12:11:46AM +0000, Steve Ellcey wrote:
> > As mentioned in rs6000.md, I believe you also need a similar pattern where
> > the two ANDs are swapped, because they have the same priority.
> 
> I fixed the long lines in aarch64.md and I added a second pattern for
> the *aarch64_bfi<GPI:mode>4_noshift pattern, swapping the order of the
> IOR operands.  I did not swap the AND operands, I assume the compiler
> would ensure that the register was always before the constant or that
> it would check both orderings.

Yes, you can look at commutative_operand_precedence and
swap_commutative_operands_p.  The issue is just if commutative_operand_precedence
of both operands is equal (and that function doesn't recurse for
suboperands).

> I tried adding a second version of *aarch64_bfi<GPI:mode>5_shift as
> well but when I tested it, it never got used during a bootstrap build
> or a GCC test run so I decided it was not needed.

I'll try tomorrow to see whether I can construct a testcase.

In any case, you want an aarch64 maintainer to ack this.

Thanks for working on this.

	Jakub
Steve Ellcey Feb. 4, 2019, 4:55 p.m. UTC | #6
Ping.  And adding the AArch64 maintainers.


On Mon, 2019-01-28 at 16:11 -0800, Steve Ellcey wrote:
> On Sat, 2019-01-26 at 00:00 +0100, Jakub Jelinek wrote:
> > 
> > > +  /* Verify that there is no overlap in what bits are set in the
> > > two masks.  */
> > > +  if ((m1 + m2 + 1) != 0)
> > > +    return false;
> > 
> > Wouldn't that be clearer to test
> >   if (m1 + m2 != HOST_WIDE_INT_1U)
> >     return false;
> > ?
> > You IMHO also should test
> >   if ((m1 & m2) != 0)
> >     return false;
> 
> I think you mean HOST_WIDE_INT_M1U, not HOST_WIDE_INT_1U.  That does
> seem clearer and I made that change as well as adding the second
> test.
> 
> > > 
> > > +  t = popcount_hwi (m2);
> > > +  t = (1 << t) - 1;
> > 
> > This should be (HOST_WIDE_INT_1U << t), otherwise if popcount of m2
> > is
> > >= 32, there will be UB.
> 
> Fixed.
> 
> > As mentioned in rs6000.md, I believe you also need a similar
> > pattern where
> > the two ANDs are swapped, because they have the same priority.
> 
> I fixed the long lines in aarch64.md and I added a second pattern for
> the *aarch64_bfi<GPI:mode>4_noshift pattern, swapping the order of
> the
> IOR operands.  I did not swap the AND operands, I assume the compiler
> would ensure that the register was always before the constant or that
> it would check both orderings.
> 
> I tried adding a second version of *aarch64_bfi<GPI:mode>5_shift as
> well but when I tested it, it never got used during a bootstrap build
> or a GCC test run so I decided it was not needed.
> 
> Tested on aarch64 with a bootstrap build and a GCC testsuite run.
> OK to checkin?
> 
> 
> Steve Ellcey
> sellcey@marvell.com
> 
> 
> 2018-01-28  Steve Ellcey  <sellcey@marvell.com>
> 
> 	PR rtl-optimization/87763
> 	* config/aarch64/aarch64-protos.h
> (aarch64_masks_and_shift_for_bfi_p):
> 	New prototype.
> 	* config/aarch64/aarch64.c (aarch64_masks_and_shift_for_bfi_p):
> 	New function.
> 	* config/aarch64/aarch64.md (*aarch64_bfi<GPI:mode>5_shift):
> 	New instruction.
> 	(*aarch64_bfi<GPI:mode>4_noshift): Ditto.
> 	(*aarch64_bfi<GPI:mode>4_noshift_alt): Ditto.
> 
> 
> 2018-01-28  Steve Ellcey  <sellcey@marvell.com>
> 
> 	PR rtl-optimization/87763
> 	* gcc.target/aarch64/combine_bfxil.c: Change some bfxil checks
> 	to bfi.
> 
>
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b035e35f33b..b6c0d0a8eb6 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -429,6 +429,9 @@ bool aarch64_label_mentioned_p (rtx);
 void aarch64_declare_function_name (FILE *, const char*, tree);
 bool aarch64_legitimate_pic_operand_p (rtx);
 bool aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode, rtx, rtx);
+bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT,
+					unsigned HOST_WIDE_INT,
+					unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
 bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index d7c453cdad0..9a3080b5db8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9330,6 +9330,41 @@ aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
 	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
 }
 
+/* Return true if the masks and a shift amount from an RTX of the form
+   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
+   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */
+
+bool
+aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
+				   unsigned HOST_WIDE_INT mask1,
+				   unsigned HOST_WIDE_INT shft_amnt,
+				   unsigned HOST_WIDE_INT mask2)
+{
+  unsigned HOST_WIDE_INT t;
+
+  /* Verify that there is no overlap in what bits are set in the two masks.  */
+  if ((mask1 + mask2) != HOST_WIDE_INT_M1U)
+    return false;
+  if ((mask1 & mask2) != 0)
+    return false;
+
+  /* Verify that the shift amount is less than the mode size.  */
+  if (shft_amnt >= GET_MODE_BITSIZE (mode))
+    return false;
+
+  /* Verify that the mask being shifted is contiguous and would be in the
+     least significant bits after shifting by creating a mask 't' based on
+     the number of bits set in mask2 and the shift amount for mask2 and
+     comparing that to the actual mask2.  */
+  t = popcount_hwi (mask2);
+  t = (HOST_WIDE_INT_1U << t) - 1;
+  t = t << shft_amnt;
+  if (t != mask2)
+    return false;
+  
+  return true;
+}
+
 /* Calculate the cost of calculating X, storing it in *COST.  Result
    is true if the total cost of the operation has now been calculated.  */
 static bool
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b7f6fe0f135..6b5339092ba 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5476,6 +5476,56 @@
   [(set_attr "type" "bfm")]
 )
 
+;;  Match a bfi instruction where the shift of OP3 means that we are
+;;  actually copying the least significant bits of OP3 into OP0 by way
+;;  of the AND masks and the IOR instruction.  A similar instruction
+;;  with the two parts of the IOR swapped around was never triggered
+;;  in a bootstrap build and test of GCC so it was not included.
+
+(define_insn "*aarch64_bfi<GPI:mode>5_shift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (ashift:GPI
+                           (match_operand:GPI 3 "register_operand" "r")
+                           (match_operand:GPI 4 "aarch64_simd_shift_imm_<mode>" "n"))
+                          (match_operand:GPI 5 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]),
+				      UINTVAL (operands[4]),
+				      UINTVAL(operands[5]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>3, %4, %P5"
+  [(set_attr "type" "bfm")]
+)
+
+;; Like the above instruction but with no shifting, we are just copying
+;; the least significant bits of OP3 to OP0.  In this case we do need
+;; two versions of the instruction to handle different checks on the
+;; constant values.
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noshift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (match_operand:GPI 3 "register_operand" "r")
+                          (match_operand:GPI 4 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]), 0,
+				      UINTVAL (operands[4]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>3, 0, %P4"
+  [(set_attr "type" "bfm")]
+)
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noshift_alt"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (match_operand:GPI 3 "register_operand" "0")
+                          (match_operand:GPI 4 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[4]), 0,
+				      UINTVAL (operands[2]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>1, 0, %P2"
+  [(set_attr "type" "bfm")]
+)
+
 (define_insn "*extr_insv_lower_reg<mode>"
   [(set (zero_extract:GPI (match_operand:GPI 0 "register_operand" "+r")
 			  (match_operand 1 "const_int_operand" "n")
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
index 109f989a2f0..c3b5fc58800 100644
--- a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
@@ -114,4 +114,5 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-assembler-times "bfxil\\t" 18 } } */
+/* { dg-final { scan-assembler-times "bfxil\\t" 3 } } */
+/* { dg-final { scan-assembler-times "bfi\\t" 15 } } */