[committed,AArch64] Canonicalise SVE predicate constants
diff mbox series

Message ID mpt1rxoqikh.fsf@arm.com
State New
Headers show
Series
  • [committed,AArch64] Canonicalise SVE predicate constants
Related show

Commit Message

Richard Sandiford Aug. 14, 2019, 8:04 a.m. UTC
This patch makes sure that we build all SVE predicate constants as
VNx16BI before RA, to encourage similar constants to be reused
between modes.  This is also useful for the ACLE, where the single
predicate type svbool_t is always a VNx16BI.

Also, and again to encourage reuse, the patch makes us use a .B PTRUE
for all ptrue-predicated operations, rather than (for example) using
a .S PTRUE for 32-bit operations and a .D PTRUE for 64-bit operations.

The only current case in which a .H, .S or .D operation needs to be
predicated by a "strict" .H/.S/.D PTRUE is the PTEST in a conditional
branch, which an earlier patch fixed to use an appropriate VNx16BI
constant.

Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
Applied as r274415.

Richard


2019-08-14  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/aarch64.c (aarch64_target_reg): New function.
	(aarch64_emit_set_immediate): Likewise.
	(aarch64_ptrue_reg): Build a VNx16BI constant and then bitcast it.
	(aarch64_pfalse_reg): Likewise.
	(aarch64_convert_sve_data_to_pred): New function.
	(aarch64_sve_move_pred_via_while): Take an optional target register
	and the required register mode.
	(aarch64_expand_sve_const_pred_1): New function.
	(aarch64_expand_sve_const_pred): Likewise.
	(aarch64_expand_mov_immediate): Build an all-true predicate
	if the significant bits of the immediate are all true.  Use
	aarch64_expand_sve_const_pred for all compile-time predicate constants.
	(aarch64_mov_operand_p): Force predicate constants to be VNx16BI
	before register allocation.
	* config/aarch64/aarch64-sve.md (*vec_duplicate<mode>_reg): Use
	a VNx16BI PTRUE when splitting the memory alternative.
	(vec_duplicate<mode>): Update accordingly.
	(*pred_cmp<cmp_op><mode>): Rename to...
	(@aarch64_pred_cmp<cmp_op><mode>): ...this.

gcc/testsuite/
	* gcc.target/aarch64/sve/spill_4.c: Expect all ptrues to be .Bs.
	* gcc.target/aarch64/sve/single_1.c: Likewise.
	* gcc.target/aarch64/sve/single_2.c: Likewise.
	* gcc.target/aarch64/sve/single_3.c: Likewise.
	* gcc.target/aarch64/sve/single_4.c: Likewise.

Patch
diff mbox series

Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c	2019-08-14 08:58:06.353767448 +0100
+++ gcc/config/aarch64/aarch64.c	2019-08-14 09:00:55.960509992 +0100
@@ -2546,6 +2546,36 @@  aarch64_zero_extend_const_eq (machine_mo
 }
 			      
 
+/* Return TARGET if it is nonnull and a register of mode MODE.
+   Otherwise, return a fresh register of mode MODE if we can,
+   or TARGET reinterpreted as MODE if we can't.  */
+
+static rtx
+aarch64_target_reg (rtx target, machine_mode mode)
+{
+  if (target && REG_P (target) && GET_MODE (target) == mode)
+    return target;
+  if (!can_create_pseudo_p ())
+    {
+      gcc_assert (target);
+      return gen_lowpart (mode, target);
+    }
+  return gen_reg_rtx (mode);
+}
+
+/* Return a register that contains the constant in BUILDER, given that
+   the constant is a legitimate move operand.  Use TARGET as the register
+   if it is nonnull and convenient.  */
+
+static rtx
+aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
+{
+  rtx src = builder.build ();
+  target = aarch64_target_reg (target, GET_MODE (src));
+  emit_insn (gen_rtx_SET (target, src));
+  return target;
+}
+
 static rtx
 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
 {
@@ -2721,7 +2751,8 @@  aarch64_ptrue_all (unsigned int elt_size
 aarch64_ptrue_reg (machine_mode mode)
 {
   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
-  return force_reg (mode, CONSTM1_RTX (mode));
+  rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
+  return gen_lowpart (mode, reg);
 }
 
 /* Return an all-false predicate register of mode MODE.  */
@@ -2730,7 +2761,26 @@  aarch64_ptrue_reg (machine_mode mode)
 aarch64_pfalse_reg (machine_mode mode)
 {
   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
-  return force_reg (mode, CONST0_RTX (mode));
+  rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
+  return gen_lowpart (mode, reg);
+}
+
+/* Use a comparison to convert integer vector SRC into MODE, which is
+   the corresponding SVE predicate mode.  Use TARGET for the result
+   if it's nonnull and convenient.  */
+
+static rtx
+aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
+{
+  machine_mode src_mode = GET_MODE (src);
+  insn_code icode = code_for_aarch64_pred_cmp (NE, src_mode);
+  expand_operand ops[4];
+  create_output_operand (&ops[0], target, mode);
+  create_input_operand (&ops[1], CONSTM1_RTX (mode), mode);
+  create_input_operand (&ops[2], src, src_mode);
+  create_input_operand (&ops[3], CONST0_RTX (src_mode), src_mode);
+  expand_insn (icode, 4, ops);
+  return ops[0].value;
 }
 
 /* Return true if we can move VALUE into a register using a single
@@ -3633,15 +3683,80 @@  aarch64_expand_sve_const_vector (rtx tar
   return target;
 }
 
-/* Use WHILE to set predicate register DEST so that the first VL bits
-   are set and the rest are clear.  */
+/* Use WHILE to set a predicate register of mode MODE in which the first
+   VL bits are set and the rest are clear.  Use TARGET for the register
+   if it's nonnull and convenient.  */
 
-static void
-aarch64_sve_move_pred_via_while (rtx dest, unsigned int vl)
+static rtx
+aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
+				 unsigned int vl)
 {
   rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
-  emit_insn (gen_while_ult (DImode, GET_MODE (dest),
-			    dest, const0_rtx, limit));
+  target = aarch64_target_reg (target, mode);
+  emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
+  return target;
+}
+
+/* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
+   constant in BUILDER into an SVE predicate register.  Return the register
+   on success, otherwise return null.  Use TARGET for the register if
+   nonnull and convenient.  */
+
+static rtx
+aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder)
+{
+  if (builder.encoded_nelts () == 1)
+    /* A PFALSE or a PTRUE .B ALL.  */
+    return aarch64_emit_set_immediate (target, builder);
+
+  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
+  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
+    {
+      /* If we can load the constant using PTRUE, use it as-is.  */
+      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
+      if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
+	return aarch64_emit_set_immediate (target, builder);
+
+      /* Otherwise use WHILE to set the first VL bits.  */
+      return aarch64_sve_move_pred_via_while (target, mode, vl);
+    }
+
+  return NULL_RTX;
+}
+
+/* Return an SVE predicate register that contains the VNx16BImode
+   constant in BUILDER, without going through the move expanders.
+
+   The returned register can have whatever mode seems most natural
+   given the contents of BUILDER.  Use TARGET for the result if
+   convenient.  */
+
+static rtx
+aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
+{
+  /* Try loading the constant using pure predicate operations.  */
+  if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder))
+    return res;
+
+  /* Try forcing the constant to memory.  */
+  if (builder.full_nelts ().is_constant ())
+    if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
+      {
+	target = aarch64_target_reg (target, VNx16BImode);
+	emit_move_insn (target, mem);
+	return target;
+      }
+
+  /* The last resort is to load the constant as an integer and then
+     compare it against zero.  Use -1 for set bits in order to increase
+     the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
+  rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
+				  builder.nelts_per_pattern ());
+  for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
+    int_builder.quick_push (INTVAL (builder.elt (i))
+			    ? constm1_rtx : const0_rtx);
+  return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
+					   int_builder.build ());
 }
 
 /* Set DEST to immediate IMM.  */
@@ -3770,6 +3885,32 @@  aarch64_expand_mov_immediate (rtx dest,
 
   if (!CONST_INT_P (imm))
     {
+      if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
+	{
+	  /* Only the low bit of each .H, .S and .D element is defined,
+	     so we can set the upper bits to whatever we like.  If the
+	     predicate is all-true in MODE, prefer to set all the undefined
+	     bits as well, so that we can share a single .B predicate for
+	     all modes.  */
+	  if (imm == CONSTM1_RTX (mode))
+	    imm = CONSTM1_RTX (VNx16BImode);
+
+	  /* All methods for constructing predicate modes wider than VNx16BI
+	     will set the upper bits of each element to zero.  Expose this
+	     by moving such constants as a VNx16BI, so that all bits are
+	     significant and so that constants for different modes can be
+	     shared.  The wider constant will still be available as a
+	     REG_EQUAL note.  */
+	  rtx_vector_builder builder;
+	  if (aarch64_get_sve_pred_bits (builder, imm))
+	    {
+	      rtx res = aarch64_expand_sve_const_pred (dest, builder);
+	      if (dest != res)
+		emit_move_insn (dest, gen_lowpart (mode, res));
+	      return;
+	    }
+	}
+
       if (GET_CODE (imm) == HIGH
 	  || aarch64_simd_valid_immediate (imm, NULL))
 	{
@@ -3777,19 +3918,6 @@  aarch64_expand_mov_immediate (rtx dest,
 	  return;
 	}
 
-      rtx_vector_builder builder;
-      if (GET_MODE_CLASS (GET_MODE (imm)) == MODE_VECTOR_BOOL
-	  && aarch64_get_sve_pred_bits (builder, imm))
-	{
-	  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
-	  int vl = aarch64_partial_ptrue_length (builder, elt_size);
-	  if (vl > 0)
-	    {
-	      aarch64_sve_move_pred_via_while (dest, vl);
-	      return;
-	    }
-	}
-
       if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
 	if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
 	  {
@@ -15178,7 +15306,17 @@  aarch64_mov_operand_p (rtx x, machine_mo
     return true;
 
   if (VECTOR_MODE_P (GET_MODE (x)))
-    return aarch64_simd_valid_immediate (x, NULL);
+    {
+      /* Require predicate constants to be VNx16BI before RA, so that we
+	 force everything to have a canonical form.  */
+      if (!lra_in_progress
+	  && !reload_completed
+	  && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
+	  && GET_MODE (x) != VNx16BImode)
+	return false;
+
+      return aarch64_simd_valid_immediate (x, NULL);
+    }
 
   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
     return true;
Index: gcc/config/aarch64/aarch64-sve.md
===================================================================
--- gcc/config/aarch64/aarch64-sve.md	2019-08-14 08:58:06.349767478 +0100
+++ gcc/config/aarch64/aarch64-sve.md	2019-08-14 09:00:55.956510022 +0100
@@ -846,7 +846,7 @@  (define_expand "vec_duplicate<mode>"
     [(set (match_operand:SVE_ALL 0 "register_operand")
 	  (vec_duplicate:SVE_ALL
 	    (match_operand:<VEL> 1 "aarch64_sve_dup_operand")))
-     (clobber (scratch:<VPRED>))])]
+     (clobber (scratch:VNx16BI))])]
   "TARGET_SVE"
   {
     if (MEM_P (operands[1]))
@@ -867,7 +867,7 @@  (define_insn_and_split "*vec_duplicate<m
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
 	(vec_duplicate:SVE_ALL
 	  (match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty")))
-   (clobber (match_scratch:<VPRED> 2 "=X, X, Upl"))]
+   (clobber (match_scratch:VNx16BI 2 "=X, X, Upl"))]
   "TARGET_SVE"
   "@
    mov\t%0.<Vetype>, %<vwcore>1
@@ -877,9 +877,10 @@  (define_insn_and_split "*vec_duplicate<m
   [(const_int 0)]
   {
     if (GET_CODE (operands[2]) == SCRATCH)
-      operands[2] = gen_reg_rtx (<VPRED>mode);
-    emit_move_insn (operands[2], CONSTM1_RTX (<VPRED>mode));
-    emit_insn (gen_sve_ld1r<mode> (operands[0], operands[2], operands[1],
+      operands[2] = gen_reg_rtx (VNx16BImode);
+    emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode));
+    rtx gp = gen_lowpart (<VPRED>mode, operands[2]);
+    emit_insn (gen_sve_ld1r<mode> (operands[0], gp, operands[1],
 				   CONST0_RTX (<MODE>mode)));
     DONE;
   }
@@ -2971,7 +2972,7 @@  (define_insn_and_split "*pred_cmp<cmp_op
 )
 
 ;; Predicated integer comparisons.
-(define_insn "*pred_cmp<cmp_op><mode>"
+(define_insn "@aarch64_pred_cmp<cmp_op><mode>"
   [(set (match_operand:<VPRED> 0 "register_operand" "=Upa, Upa")
 	(and:<VPRED>
 	  (SVE_INT_CMP:<VPRED>
Index: gcc/testsuite/gcc.target/aarch64/sve/spill_4.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/spill_4.c	2019-08-13 11:39:54.753376024 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/spill_4.c	2019-08-14 09:00:55.960509992 +0100
@@ -24,9 +24,10 @@  TEST_LOOP (uint16_t, 0x1234);
 TEST_LOOP (uint32_t, 0x12345);
 TEST_LOOP (uint64_t, 0x123456);
 
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.b,} 6 } } */
 /* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.h,} } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s,} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d,} 3 } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.s,} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.d,} } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 3 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/single_1.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/single_1.c	2019-05-29 10:49:34.428716761 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/single_1.c	2019-08-14 09:00:55.960509992 +0100
@@ -40,10 +40,7 @@  TEST_LOOP (double, 3.0)
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */
 
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl16\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl8\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl4\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 11 } } */
 
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/single_2.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/single_2.c	2019-05-29 10:49:34.428716761 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/single_2.c	2019-08-14 09:00:55.960509992 +0100
@@ -16,10 +16,7 @@  #define N 64
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */
 
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl64\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl32\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl16\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl8\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl64\n} 11 } } */
 
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/single_3.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/single_3.c	2019-05-29 10:49:34.428716761 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/single_3.c	2019-08-14 09:00:55.960509992 +0100
@@ -16,10 +16,7 @@  #define N 128
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */
 
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl128\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl64\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl32\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl16\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl128\n} 11 } } */
 
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/single_4.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/single_4.c	2019-05-29 10:49:34.428716761 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/single_4.c	2019-08-14 09:00:55.960509992 +0100
@@ -16,10 +16,7 @@  #define N 256
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } */
 
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl256\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl128\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, vl64\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl32\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl256\n} 11 } } */
 
 /* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */
 /* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */