diff mbox

[AArch64] Add vector pattern for __builtin_ctz

Message ID 54662FF1.1030503@arm.com
State New
Headers show

Commit Message

Jiong Wang Nov. 14, 2014, 4:38 p.m. UTC
This patch adds a vector pattern for __builtin_ctz.

As with __builtin_clz, only the 32-bit version of ctz is supported.

For the scalar version of ctz, we expand it into:

   rbit
   clz

Reverse the bits first, which turns counting trailing zeros into counting leading zeros.

For the vector version, however, rbit only supports byte granularity (.8B and .16B);
there are no half-word or word forms. So we first need to reverse the bytes within each word,
then reverse the bits within each byte. Thus the generated instruction sequences are:

/* For each of the four elements of A, store the result of
   __builtin_ctz on that element into the matching slot of B.  */
void
count_tz_v4si (unsigned *__restrict a, int *__restrict b)
{
   for (int lane = 0; lane < 4; ++lane)
     {
       const unsigned word = a[lane];
       b[lane] = __builtin_ctz (word);
     }
}

/* For each of the two elements of A, store the result of
   __builtin_ctz on that element into the matching slot of B.  */
void
count_tz_v2si (unsigned *__restrict a, int *__restrict b)
{
   for (int lane = 0; lane < 2; ++lane)
     {
       const unsigned word = a[lane];
       b[lane] = __builtin_ctz (word);
     }
}

count_tz_v4si:
         ldr     q0, [x0]
         rev32   v0.16b, v0.16b
         rbit    v0.16b, v0.16b
         clz     v0.4s, v0.4s
         str     q0, [x1]
         ret

count_tz_v2si:
         ldr     d0, [x0]
         rev32   v0.8b, v0.8b
         rbit    v0.8b, v0.8b
         clz     v0.2s, v0.2s
         str     d0, [x1]
         ret

No regressions when testing aarch64-none-linux-gnu under QEMU.

OK for trunk?

Thanks.

gcc/
   * config/aarch64/iterators.md (VS): New mode iterator.
   (vsi2qi): New mode attribute.
   (VSI2QI): Likewise.
   * config/aarch64/aarch64-simd-builtins.def: New entry for ctz.
   * config/aarch64/aarch64-simd.md (ctz<mode>2): New pattern for ctz.
   * config/aarch64/aarch64-builtins.c
   (aarch64_builtin_vectorized_function): Support BUILT_IN_CTZ.

gcc/testsuite/
   * gcc.target/aarch64/vect_ctz_1.c: New testcase.

Comments

Marcus Shawcroft Nov. 21, 2014, 4:31 p.m. UTC | #1
On 14 November 2014 16:38, Jiong Wang <jiong.wang@arm.com> wrote:
>
> gcc/
>   * config/aarch64/iterators.md (VS): New mode iterator.
>   (vsi2qi): New mode attribute.
>   (VSI2QI): Likewise.
>   * config/aarch64/aarch64-simd-builtins.def: New entry for ctz.
>   * config/aarch64/aarch64-simd.md (ctz<mode>2): New pattern for ctz.
>   * config/aarch64/aarch64-builtins.c
>   (aarch64_builtin_vectorized_function): Support BUILT_IN_CTZ.
>
> gcc/testsuite/
>   * gcc.target/aarch64/vect_ctz_1.c: New testcase.

OK /Marcus
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 527445c..3250f3c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1097,6 +1097,14 @@  aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
               return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_clzv4si];
             return NULL_TREE;
           }
+	case BUILT_IN_CTZ:
+          {
+	    if (AARCH64_CHECK_BUILTIN_MODE (2, S))
+	      return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_ctzv2si];
+	    else if (AARCH64_CHECK_BUILTIN_MODE (4, S))
+	      return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_ctzv4si];
+	    return NULL_TREE;
+          }
 #undef AARCH64_CHECK_BUILTIN_MODE
 #define AARCH64_CHECK_BUILTIN_MODE(C, N) \
   (out_mode == N##Imode && out_n == C \
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 62b7f33..c611b5c 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -46,6 +46,7 @@ 
   BUILTIN_VD_BHSI (BINOP, addp, 0)
   VAR1 (UNOP, addp, 0, di)
   BUILTIN_VDQ_BHSI (UNOP, clz, 2)
+  BUILTIN_VS (UNOP, ctz, 2)

   BUILTIN_VALL (GETLANE, be_checked_get_lane, 0)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ef196e4..5ee960f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -303,6 +303,20 @@ 
   [(set_attr "type" "neon_rbit")]
 )
 
+(define_expand "ctz<mode>2"
+  [(set (match_operand:VS 0 "register_operand")
+        (ctz:VS (match_operand:VS 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+     emit_insn (gen_bswap<mode> (operands[0], operands[1]));
+     rtx op0_castsi2qi = simplify_gen_subreg(<VS:VSI2QI>mode, operands[0],
+					     <MODE>mode, 0);
+     emit_insn (gen_aarch64_rbit<VS:vsi2qi> (op0_castsi2qi, op0_castsi2qi));
+     emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
+     DONE;
+  }
+)
+
 (define_insn "*aarch64_mul3_elt<mode>"
  [(set (match_operand:VMUL 0 "register_operand" "=w")
     (mult:VMUL
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 9935167..b416e6a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -183,6 +183,9 @@ 
 ;; All byte modes.
 (define_mode_iterator VB [V8QI V16QI])
 
+;; 2 and 4 lane SI modes.
+(define_mode_iterator VS [V2SI V4SI])
+
 (define_mode_iterator TX [TI TF])
 
 ;; Opaque structure modes.
@@ -670,6 +673,9 @@ 
 		      (V2DI  "p") (V2DF  "p")
 		      (V2SF "p") (V4SF  "v")])
 
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+
 ;; -------------------------------------------------------------------
 ;; Code Iterators
 ;; -------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c b/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
new file mode 100644
index 0000000..40823b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
@@ -0,0 +1,41 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fno-inline" } */
+
+extern void abort ();
+
+#define TEST(name, subname, count) \
+void \
+count_tz_##name (unsigned *__restrict a, int *__restrict b) \
+{ \
+  int i; \
+  for (i = 0; i < count; i++) \
+    b[i] = __builtin_##subname (a[i]); \
+}
+
+#define CHECK(name, count, input, output) \
+  count_tz_##name (input, output); \
+  for (i = 0; i < count; i++) \
+    { \
+      if (output[i] != r[i]) \
+	abort (); \
+    }
+
+TEST (v4si, ctz, 4)
+TEST (v2si, ctz, 2)
+/* { dg-final { scan-assembler "clz\tv\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "clz\tv\[0-9\]+\.2s" } } */
+
+int
+main ()
+{
+  unsigned int x4[4] = { 0x0, 0xFF80, 0x1FFFF, 0xFF000000 };
+  int r[4] = { 32, 7, 0, 24 };
+  int d[4], i;
+
+  CHECK (v4si, 4, x4, d);
+  CHECK (v2si, 2, x4, d);
+
+  return 0;
+}
+
+/* { dg-final { cleanup-saved-temps } } */