diff mbox

[ARM] Don't force vget_lane returning a 64-bit result to transfer to core registers

Message ID 4F69B96D.7010108@arm.com
State New
Headers show

Commit Message

Richard Earnshaw March 21, 2012, 11:20 a.m. UTC
Semantically the neon intrinsic vgetq_lane_[su]64 returns a 64 bit
sub-object of a 128-bit vector; there's no real need for the intrinsic
to map onto a specific machine instruction.

Indeed, if force a particular instruction that moves the result into a
core register, but then want to use the result in the vector unit, we
don't really want to have to move the result back to the other register
bank.  However, that's what we do today.

This patch changes the way we expand these operations so that we
no-longer force selection of the get-lane operation.

A side effect of this change is that we now spit out the fmrrd mnemonic
rather than the vmov equivalent.  As a consequence I've updated the
testsuite to allow for this change.  The changes to the ML files are
pretty mechanical, but I don't speak ML so it would be helpful if
another pair of eyes could check that bit over and tell me if I've
missed something subtle.

Tested on trunk and gcc-4.7, but only installed on trunk.

R.

	* neon.md (neon_vget_lanev2di): Use gen_lowpart and gen_highpart.
	* config/arm/neon.ml (Fixed_return_reg): Renamed to fixed_vector_reg.
	All callers changed.
	(Fixed_core_reg): New feature.
	(Vget_lane [sizes S64 and U64]): Add Fixed_core_reg.  Allow
	fmrrd in disassembly.
	* neon-testgen.ml: Handle Fixed_core_reg.

testsuite/
	* gcc.target/arm/neon/vgetQ_laneu64.c: Regenerated.
	* gcc.target/arm/neon/vgetQ_lanes64.c: Likewise.

Comments

Julian Brown March 23, 2012, 9:53 a.m. UTC | #1
On Wed, 21 Mar 2012 11:20:13 +0000
Richard Earnshaw <rearnsha@arm.com> wrote:

> Semantically the neon intrinsic vgetq_lane_[su]64 returns a 64 bit
> sub-object of a 128-bit vector; there's no real need for the intrinsic
> to map onto a specific machine instruction.
> 
> Indeed, if force a particular instruction that moves the result into a
> core register, but then want to use the result in the vector unit, we
> don't really want to have to move the result back to the other
> register bank.  However, that's what we do today.
> 
> This patch changes the way we expand these operations so that we
> no-longer force selection of the get-lane operation.
> 
> A side effect of this change is that we now spit out the fmrrd
> mnemonic rather than the vmov equivalent.  As a consequence I've
> updated the testsuite to allow for this change.  The changes to the
> ML files are pretty mechanical, but I don't speak ML so it would be
> helpful if another pair of eyes could check that bit over and tell me
> if I've missed something subtle.

The Ocaml bits look fine to me (the compiler won't accept incorrect
programs, as I'm sure you've noticed ;-)).

> Tested on trunk and gcc-4.7, but only installed on trunk.

Don't forget to check big-endian mode too...

Cheers,

Julian
diff mbox

Patch

--- config/arm/neon-testgen.ml	(revision 185587)
+++ config/arm/neon-testgen.ml	(local)
@@ -79,9 +79,12 @@  let emit_automatics chan c_types feature
           (* The intrinsic returns a value.  We need to do explict register
              allocation for vget_low tests or they fail because of copy
              elimination.  *)
-          ((if List.mem Fixed_return_reg features then
+          ((if List.mem Fixed_vector_reg features then
               Printf.fprintf chan "  register %s out_%s asm (\"d18\");\n"
                              return_ty return_ty
+            else if List.mem Fixed_core_reg features then
+              Printf.fprintf chan "  register %s out_%s asm (\"r0\");\n"
+                             return_ty return_ty
             else
               Printf.fprintf chan "  %s out_%s;\n" return_ty return_ty);
 	   emit ())
--- config/arm/neon.md	(revision 185587)
+++ config/arm/neon.md	(local)
@@ -2720,14 +2720,24 @@  (define_expand "neon_vget_lanedi"
 })
 
 (define_expand "neon_vget_lanev2di"
-  [(match_operand:DI 0 "s_register_operand" "=r")
-   (match_operand:V2DI 1 "s_register_operand" "w")
-   (match_operand:SI 2 "immediate_operand" "i")
-   (match_operand:SI 3 "immediate_operand" "i")]
+  [(match_operand:DI 0 "s_register_operand" "")
+   (match_operand:V2DI 1 "s_register_operand" "")
+   (match_operand:SI 2 "immediate_operand" "")
+   (match_operand:SI 3 "immediate_operand" "")]
   "TARGET_NEON"
 {
-  neon_lane_bounds (operands[2], 0, 2);
-  emit_insn (gen_vec_extractv2di (operands[0], operands[1], operands[2]));
+  switch (INTVAL (operands[2]))
+    {
+    case 0:
+      emit_move_insn (operands[0], gen_lowpart (DImode, operands[1]));
+      break;
+    case 1:
+      emit_move_insn (operands[0], gen_highpart (DImode, operands[1]));
+      break;
+    default:
+      neon_lane_bounds (operands[2], 0, 1);
+      FAIL;
+    }
   DONE;
 })
 
--- config/arm/neon.ml	(revision 185587)
+++ config/arm/neon.ml	(local)
@@ -234,7 +234,8 @@  type features =
        cases.  The function supplied must return the integer to be written
        into the testcase for the argument number (0-based) supplied to it.  *)
   | Const_valuator of (int -> int)
-  | Fixed_return_reg
+  | Fixed_vector_reg
+  | Fixed_core_reg
 
 exception MixedMode of elts * elts
 
@@ -1009,7 +1010,8 @@  let ops =
     Vget_lane,
       [InfoWord;
        Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
-       Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
+       Instruction_name ["vmov"; "fmrrd"]; Const_valuator (fun _ -> 0);
+       Fixed_core_reg],
       Use_operands [| Corereg; Qreg; Immed |],
       "vgetQ_lane", notype_2, [S64; U64];
 
@@ -1125,7 +1127,7 @@  let ops =
       notype_1, pf_su_8_64;
     Vget_low, [Instruction_name ["vmov"];
                Disassembles_as [Use_operands [| Dreg; Dreg |]];
-	       Fixed_return_reg],
+	       Fixed_vector_reg],
       Use_operands [| Dreg; Qreg |], "vget_low",
       notype_1, pf_su_8_32;
      Vget_low, [No_op],
--- testsuite/gcc.target/arm/neon/vgetQ_lanes64.c	(revision 185587)
+++ testsuite/gcc.target/arm/neon/vgetQ_lanes64.c	(local)
@@ -10,11 +10,11 @@ 
 
 void test_vgetQ_lanes64 (void)
 {
-  int64_t out_int64_t;
+  register int64_t out_int64_t asm ("r0");
   int64x2_t arg0_int64x2_t;
 
   out_int64_t = vgetq_lane_s64 (arg0_int64x2_t, 0);
 }
 
-/* { dg-final { scan-assembler "vmov\[ 	\]+\[rR\]\[0-9\]+, \[rR\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "((vmov)|(fmrrd))\[ 	\]+\[rR\]\[0-9\]+, \[rR\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */
--- testsuite/gcc.target/arm/neon/vgetQ_laneu64.c	(revision 185587)
+++ testsuite/gcc.target/arm/neon/vgetQ_laneu64.c	(local)
@@ -10,11 +10,11 @@ 
 
 void test_vgetQ_laneu64 (void)
 {
-  uint64_t out_uint64_t;
+  register uint64_t out_uint64_t asm ("r0");
   uint64x2_t arg0_uint64x2_t;
 
   out_uint64_t = vgetq_lane_u64 (arg0_uint64x2_t, 0);
 }
 
-/* { dg-final { scan-assembler "vmov\[ 	\]+\[rR\]\[0-9\]+, \[rR\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { scan-assembler "((vmov)|(fmrrd))\[ 	\]+\[rR\]\[0-9\]+, \[rR\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ 	\]+@\[a-zA-Z0-9 \]+\)?\n" } } */
 /* { dg-final { cleanup-saved-temps } } */