
[AArch64] NEON vclz intrinsic modified

Message ID 525436A8.30905@arm.com
State New

Commit Message

Alex Velenko Oct. 8, 2013, 4:45 p.m. UTC
Hi,

This patch reimplements the NEON intrinsics vclz[q]_[s,u][8,16,32] in terms
of compiler builtins instead of inline assembly and adds a regression test
for them.  No problems were found when running the aarch64-none-elf
regression tests.
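
For illustration (not part of the patch), the reimplemented intrinsics should
behave as in the following minimal sketch, assuming an AArch64 target where
<arm_neon.h> is available; lane access uses GCC's vector subscripting, and a
zero lane yields the full lane width, matching the CLZ_DEFINED_VALUE_AT_ZERO
change below:

#include <arm_neon.h>

extern void abort (void);

int
main (void)
{
  /* Two 32-bit lanes: 0x1 has 31 leading zeros, 0x0 yields the lane width.  */
  uint32x2_t in = { 0x1, 0x0 };
  uint32x2_t out = vclz_u32 (in);

  if (out[0] != 31 || out[1] != 32)
    abort ();

  return 0;
}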

Is the patch OK?

Thanks,
Alex

gcc/testsuite/

2013-10-08  Alex Velenko  <Alex.Velenko@arm.com>

	    * gcc.target/aarch64/vclz.c: New testcase.

gcc/

2013-10-08  Alex Velenko  <Alex.Velenko@arm.com>

	    * config/aarch64/arm_neon.h (vclz_s8): Asm replaced with C.
	      (vclz_s16): Likewise.
	      (vclz_s32): Likewise.
	      (vclzq_s8): Likewise.
	      (vclzq_s16): Likewise.
	      (vclzq_s32): Likewise.
	      (vclz_u8): Likewise.
	      (vclz_u16): Likewise.
	      (vclz_u32): Likewise.
	      (vclzq_u8): Likewise.
	      (vclzq_u16): Likewise.
	      (vclzq_u32): Likewise.

	    * config/aarch64/aarch64.h (CLZ_DEFINED_VALUE_AT_ZERO): Use
	      GET_MODE_UNIT_BITSIZE so the macro also covers the vector
	      modes used by clz.

	    * config/aarch64/aarch64-simd-builtins.def (VAR1 (UNOP, clz, 0,
	      v4si)): Replaced with the BUILTIN_VDQ_BHSI iterator.

Comments

Marcus Shawcroft Oct. 9, 2013, 12:10 p.m. UTC | #1
On 8 October 2013 17:45, Alex Velenko <Alex.Velenko@arm.com> wrote:
>
> 2013-10-08  Alex Velenko  <Alex.Velenko@arm.com>
>
>             * gcc.target/aarch64/vclz.c: New testcase.
>
> gcc/
>
> 2013-10-08  Alex Velenko  <Alex.Velenko@arm.com>
>
>             * config/aarch64/arm_neon.h (vclz_s8): Asm replaced with C
>               (vclz_s16): Likewise.
>               (vclz_s32): Likewise.
>               (vclzq_s8): Likewise.
>               (vclzq_s16): Likewise.
>               (vclzq_s32): Likewise.
>               (vclz_u8): Likewise.
>               (vclz_u16): Likewise.
>               (vclz_u32): Likewise.
>               (vclzq_u8): Likewise.
>               (vclzq_u16): Likewise.
>               (vclzq_u32): Likewise.
>
>             * config/aarch64/aarch64.h (CLZ_DEFINED_VALUE_AT_ZERO): Macro
> fixed for clz.
>
>             * config/aarch64/aarch64-simd-builtins.def (VAR1 (UNOP, clz, 0,
> v4si)): Replaced with iterator.

OK, committed.

/Marcus

Patch

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 35897f3939556d7bb804d4b4ae692a300b103681..c18b150a1f5f2131deb54e3f66f93330c43bcefd 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -45,7 +45,7 @@ 
   BUILTIN_VDQF (UNOP, sqrt, 2)
   BUILTIN_VD_BHSI (BINOP, addp, 0)
   VAR1 (UNOP, addp, 0, di)
-  VAR1 (UNOP, clz, 2, v4si)
+  BUILTIN_VDQ_BHSI (UNOP, clz, 2)
 
   BUILTIN_VALL (GETLANE, get_lane, 0)
   VAR1 (GETLANE, get_lane, 0, di)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index da2b46d14cf02814f93aeda1535461c242174aae..7a80e96385f935e032bc0421d1aeea52de7bcd1d 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -739,7 +739,7 @@  do {									     \
    : reverse_condition (CODE))
 
 #define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
-  ((VALUE) = ((MODE) == SImode ? 32 : 64), 2)
+  ((VALUE) = GET_MODE_UNIT_BITSIZE (MODE))
 #define CTZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE) \
   ((VALUE) = ((MODE) == SImode ? 32 : 64), 2)
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index db9bf28227e87072b48f5dca8835be8007c6b93d..482d7d03ed4995d46bef14a0c2c42903aafc6986 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -5158,138 +5158,6 @@  vclsq_s32 (int32x4_t a)
   return result;
 }
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vclz_s8 (int8x8_t a)
-{
-  int8x8_t result;
-  __asm__ ("clz %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vclz_s16 (int16x4_t a)
-{
-  int16x4_t result;
-  __asm__ ("clz %0.4h,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vclz_s32 (int32x2_t a)
-{
-  int32x2_t result;
-  __asm__ ("clz %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclz_u8 (uint8x8_t a)
-{
-  uint8x8_t result;
-  __asm__ ("clz %0.8b,%1.8b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclz_u16 (uint16x4_t a)
-{
-  uint16x4_t result;
-  __asm__ ("clz %0.4h,%1.4h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclz_u32 (uint32x2_t a)
-{
-  uint32x2_t result;
-  __asm__ ("clz %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vclzq_s8 (int8x16_t a)
-{
-  int8x16_t result;
-  __asm__ ("clz %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vclzq_s16 (int16x8_t a)
-{
-  int16x8_t result;
-  __asm__ ("clz %0.8h,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vclzq_s32 (int32x4_t a)
-{
-  int32x4_t result;
-  __asm__ ("clz %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vclzq_u8 (uint8x16_t a)
-{
-  uint8x16_t result;
-  __asm__ ("clz %0.16b,%1.16b"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vclzq_u16 (uint16x8_t a)
-{
-  uint16x8_t result;
-  __asm__ ("clz %0.8h,%1.8h"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vclzq_u32 (uint32x4_t a)
-{
-  uint32x4_t result;
-  __asm__ ("clz %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vcnt_p8 (poly8x8_t a)
 {
@@ -17934,6 +17802,80 @@  vcltzd_f64 (float64_t __a)
   return __a < 0.0 ? -1ll : 0ll;
 }
 
+/* vclz.  */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vclz_s8 (int8x8_t __a)
+{
+  return __builtin_aarch64_clzv8qi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vclz_s16 (int16x4_t __a)
+{
+  return __builtin_aarch64_clzv4hi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vclz_s32 (int32x2_t __a)
+{
+  return __builtin_aarch64_clzv2si (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclz_u8 (uint8x8_t __a)
+{
+  return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclz_u16 (uint16x4_t __a)
+{
+  return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclz_u32 (uint32x2_t __a)
+{
+  return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vclzq_s8 (int8x16_t __a)
+{
+  return __builtin_aarch64_clzv16qi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vclzq_s16 (int16x8_t __a)
+{
+  return __builtin_aarch64_clzv8hi (__a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vclzq_s32 (int32x4_t __a)
+{
+  return __builtin_aarch64_clzv4si (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vclzq_u8 (uint8x16_t __a)
+{
+  return (uint8x16_t)__builtin_aarch64_clzv16qi ((int8x16_t)__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vclzq_u16 (uint16x8_t __a)
+{
+  return (uint16x8_t)__builtin_aarch64_clzv8hi ((int16x8_t)__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vclzq_u32 (uint32x4_t __a)
+{
+  return (uint32x4_t)__builtin_aarch64_clzv4si ((int32x4_t)__a);
+}
+
 /* vcvt (double -> float).  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
diff --git a/gcc/testsuite/gcc.target/aarch64/vclz.c b/gcc/testsuite/gcc.target/aarch64/vclz.c
new file mode 100644
index 0000000000000000000000000000000000000000..006f80d77b5ad9a67e68fb48fc0b5e26e6b2fa5e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vclz.c
@@ -0,0 +1,574 @@ 
+/* Test vclz works correctly.  */
+/* { dg-do run } */
+/* { dg-options "-std=gnu99 -O3 -Wno-div-by-zero --save-temps" } */
+#include <arm_neon.h>
+
+extern void abort (void);
+
+/* Tests in binary should look like:
+   0
+   1
+   10
+   101
+   1010
+   10101
+   etc.  */
+
+#define TEST0 0
+#define TEST1 0x1
+#define TEST2 0x2
+#define TEST3 0x5
+#define TEST4 0xa
+#define TEST5 0x15
+#define TEST6 0x2a
+#define TEST7 0x55
+#define TEST8 0xaa
+#define TEST9 0x155
+#define TEST10 0x2aa
+#define TEST11 0x555
+#define TEST12 0xaaa
+#define TEST13 0x1555
+#define TEST14 0x2aaa
+#define TEST15 0x5555
+#define TEST16 0xaaaa
+#define TEST17 0x15555
+#define TEST18 0x2aaaa
+#define TEST19 0x55555
+#define TEST20 0xaaaaa
+#define TEST21 0x155555
+#define TEST22 0x2aaaaa
+#define TEST23 0x555555
+#define TEST24 0xaaaaaa
+#define TEST25 0x1555555
+#define TEST26 0x2aaaaaa
+#define TEST27 0x5555555
+#define TEST28 0xaaaaaaa
+#define TEST29 0x15555555
+#define TEST30 0x2aaaaaaa
+#define TEST31 0x55555555
+#define TEST32 0xaaaaaaaa
+
+#define INHIB_OPTIMIZATION asm volatile ("" : : : "memory")
+
+#define CONCAT(a, b) a##b
+#define CONCAT1(a, b) CONCAT (a, b)
+#define REG_INFEX64 _
+#define REG_INFEX128 q_
+#define SIGNED0 u
+#define SIGNED1 s
+#define SIGNED(x) SIGNED##x
+#define REG_INFEX(reg_len) REG_INFEX##reg_len
+#define POSTFIX(reg_len, data_len, is_signed) \
+  CONCAT1 (REG_INFEX (reg_len), CONCAT1 (SIGNED (is_signed), data_len))
+#define DATA_TYPE(data_len) DATA_TYPE_##data_len
+#define LOAD_INST(reg_len, data_len, is_signed) \
+  CONCAT1 (vld1, POSTFIX (reg_len, data_len, is_signed))
+#define CLZ_INST(reg_len, data_len, is_signed) \
+  CONCAT1 (vclz, POSTFIX (reg_len, data_len, is_signed))
+
+#define RUN_TEST(test_set, answ_set, reg_len, data_len, is_signed, n)	\
+  a = LOAD_INST (reg_len, data_len, is_signed) (test_set);		\
+  b = LOAD_INST (reg_len, data_len, is_signed) (answ_set);	        \
+  INHIB_OPTIMIZATION;							\
+  a = CLZ_INST (reg_len, data_len, is_signed) (a);			\
+  for (i = 0; i < n; i++)						\
+    {									\
+      INHIB_OPTIMIZATION;						\
+      if (a [i] != b [i])						\
+        {								\
+          return 1;							\
+        }								\
+    }
+
+int
+test_vclz_s8 ()
+{
+  int i;
+  int8x8_t a;
+  int8x8_t b;
+
+  int8_t test_set0[8] = {
+    TEST0, TEST1, TEST2, TEST3,
+    TEST4, TEST5, TEST6, TEST7
+  };
+  int8_t test_set1[8] = {
+    TEST8, TEST8, TEST8, TEST8,
+    TEST8, TEST8, TEST8, TEST8
+  };
+  int8_t answ_set0[8] = {
+    8, 7, 6, 5,
+    4, 3, 2, 1
+  };
+  int8_t answ_set1[8] = {
+    0, 0, 0, 0,
+    0, 0, 0, 0
+  };
+  RUN_TEST (test_set0, answ_set0, 64, 8, 1, 8);
+  RUN_TEST (test_set1, answ_set1, 64, 8, 1, 1);
+
+  return 0;
+}
+
+/* Double scan-assembler-times to take account of unsigned functions.  */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.8b, v\[0-9\]+\.8b" 4 } } */
+
+int
+test_vclz_s16 ()
+{
+  int i;
+  int16x4_t a;
+  int16x4_t b;
+
+  int16_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
+  int16_t test_set1[4] = { TEST4, TEST5, TEST6, TEST7 };
+  int16_t test_set2[4] = { TEST8, TEST9, TEST10, TEST11 };
+  int16_t test_set3[4] = { TEST12, TEST13, TEST14, TEST15 };
+  int16_t test_set4[4] = { TEST16, TEST16, TEST16, TEST16 };
+
+  int16_t answ_set0[4] = { 16, 15, 14, 13 };
+  int16_t answ_set1[4] = { 12, 11, 10, 9 };
+  int16_t answ_set2[4] = { 8, 7, 6, 5 };
+  int16_t answ_set3[4] = { 4, 3, 2, 1 };
+  int16_t answ_set4[4] = { 0, 0, 0, 0 };
+
+  RUN_TEST (test_set0, answ_set0, 64, 16, 1, 4);
+  RUN_TEST (test_set1, answ_set1, 64, 16, 1, 4);
+  RUN_TEST (test_set2, answ_set2, 64, 16, 1, 4);
+  RUN_TEST (test_set3, answ_set3, 64, 16, 1, 4);
+  RUN_TEST (test_set4, answ_set4, 64, 16, 1, 1);
+
+  return 0;
+}
+
+/* Double scan-assembler-times to take account of unsigned functions.  */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.4h, v\[0-9\]+\.4h" 10 } } */
+
+int
+test_vclz_s32 ()
+{
+  int i;
+  int32x2_t a;
+  int32x2_t b;
+
+  int32_t test_set0[2] = { TEST0, TEST1 };
+  int32_t test_set1[2] = { TEST2, TEST3 };
+  int32_t test_set2[2] = { TEST4, TEST5 };
+  int32_t test_set3[2] = { TEST6, TEST7 };
+  int32_t test_set4[2] = { TEST8, TEST9 };
+  int32_t test_set5[2] = { TEST10, TEST11 };
+  int32_t test_set6[2] = { TEST12, TEST13 };
+  int32_t test_set7[2] = { TEST14, TEST15 };
+  int32_t test_set8[2] = { TEST16, TEST17 };
+  int32_t test_set9[2] = { TEST18, TEST19 };
+  int32_t test_set10[2] = { TEST20, TEST21 };
+  int32_t test_set11[2] = { TEST22, TEST23 };
+  int32_t test_set12[2] = { TEST24, TEST25 };
+  int32_t test_set13[2] = { TEST26, TEST27 };
+  int32_t test_set14[2] = { TEST28, TEST29 };
+  int32_t test_set15[2] = { TEST30, TEST31 };
+  int32_t test_set16[2] = { TEST32, TEST32 };
+
+  int32_t answ_set0[2] = { 32, 31 };
+  int32_t answ_set1[2] = { 30, 29 };
+  int32_t answ_set2[2] = { 28, 27 };
+  int32_t answ_set3[2] = { 26, 25 };
+  int32_t answ_set4[2] = { 24, 23 };
+  int32_t answ_set5[2] = { 22, 21 };
+  int32_t answ_set6[2] = { 20, 19 };
+  int32_t answ_set7[2] = { 18, 17 };
+  int32_t answ_set8[2] = { 16, 15 };
+  int32_t answ_set9[2] = { 14, 13 };
+  int32_t answ_set10[2] = { 12, 11 };
+  int32_t answ_set11[2] = { 10, 9 };
+  int32_t answ_set12[2] = { 8, 7 };
+  int32_t answ_set13[2] = { 6, 5 };
+  int32_t answ_set14[2] = { 4, 3 };
+  int32_t answ_set15[2] = { 2, 1 };
+  int32_t answ_set16[2] = { 0, 0 };
+
+  RUN_TEST (test_set0, answ_set0, 64, 32, 1, 2);
+  RUN_TEST (test_set1, answ_set1, 64, 32, 1, 2);
+  RUN_TEST (test_set2, answ_set2, 64, 32, 1, 2);
+  RUN_TEST (test_set3, answ_set3, 64, 32, 1, 2);
+  RUN_TEST (test_set4, answ_set4, 64, 32, 1, 2);
+  RUN_TEST (test_set5, answ_set5, 64, 32, 1, 2);
+  RUN_TEST (test_set6, answ_set6, 64, 32, 1, 2);
+  RUN_TEST (test_set7, answ_set7, 64, 32, 1, 2);
+  RUN_TEST (test_set8, answ_set8, 64, 32, 1, 2);
+  RUN_TEST (test_set9, answ_set9, 64, 32, 1, 2);
+  RUN_TEST (test_set10, answ_set10, 64, 32, 1, 2);
+  RUN_TEST (test_set11, answ_set11, 64, 32, 1, 2);
+  RUN_TEST (test_set12, answ_set12, 64, 32, 1, 2);
+  RUN_TEST (test_set13, answ_set13, 64, 32, 1, 2);
+  RUN_TEST (test_set14, answ_set14, 64, 32, 1, 2);
+  RUN_TEST (test_set15, answ_set15, 64, 32, 1, 2);
+  RUN_TEST (test_set16, answ_set16, 64, 32, 1, 1);
+
+  return 0;
+}
+
+/* Double scan-assembler-times to take account of unsigned functions.  */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.2s, v\[0-9\]+\.2s" 34 } } */
+
+int
+test_vclzq_s8 ()
+{
+  int i;
+  int8x16_t a;
+  int8x16_t b;
+
+  int8_t test_set0[16] = {
+    TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, TEST6, TEST7,
+    TEST8, TEST8, TEST8, TEST8, TEST8, TEST8, TEST8, TEST8
+  };
+  int8_t answ_set0[16] = {
+    8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0
+  };
+  RUN_TEST (test_set0, answ_set0, 128, 8, 1, 9);
+  return 0;
+}
+
+/* Double scan-assembler-times to take account of unsigned functions.  */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.16b, v\[0-9\]+\.16b" 2 } } */
+
+int
+test_vclzq_s16 ()
+{
+  int i;
+  int16x8_t a;
+  int16x8_t b;
+
+  int16_t test_set0[8] = {
+    TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, TEST6, TEST7
+  };
+  int16_t test_set1[8] = {
+    TEST8, TEST9, TEST10, TEST11, TEST12, TEST13, TEST14, TEST15
+  };
+  int16_t test_set2[8] = {
+    TEST16, TEST16, TEST16, TEST16, TEST16, TEST16, TEST16, TEST16
+  };
+
+  int16_t answ_set0[8] = {
+    16, 15, 14, 13, 12, 11, 10, 9
+  };
+  int16_t answ_set1[8] = {
+    8, 7, 6, 5, 4, 3, 2, 1
+  };
+  int16_t answ_set2[8] = {
+    0, 0, 0, 0, 0, 0, 0, 0
+  };
+  RUN_TEST (test_set0, answ_set0, 128, 16, 1, 8);
+  RUN_TEST (test_set1, answ_set1, 128, 16, 1, 8);
+  RUN_TEST (test_set2, answ_set2, 128, 16, 1, 1);
+
+  return 0;
+}
+
+/* Double scan-assembler-times to take account of unsigned functions.  */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.8h, v\[0-9\]+\.8h" 6 } } */
+
+int
+test_vclzq_s32 ()
+{
+  int i;
+  int32x4_t a;
+  int32x4_t b;
+
+  int32_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
+  int32_t test_set1[4] = { TEST4, TEST5, TEST6, TEST7 };
+  int32_t test_set2[4] = { TEST8, TEST9, TEST10, TEST11 };
+  int32_t test_set3[4] = { TEST12, TEST13, TEST14, TEST15 };
+  int32_t test_set4[4] = { TEST16, TEST17, TEST18, TEST19 };
+  int32_t test_set5[4] = { TEST20, TEST21, TEST22, TEST23 };
+  int32_t test_set6[4] = { TEST24, TEST25, TEST26, TEST27 };
+  int32_t test_set7[4] = { TEST28, TEST29, TEST30, TEST31 };
+  int32_t test_set8[4] = { TEST32, TEST32, TEST32, TEST32 };
+
+  int32_t answ_set0[4] = { 32, 31, 30, 29 };
+  int32_t answ_set1[4] = { 28, 27, 26, 25 };
+  int32_t answ_set2[4] = { 24, 23, 22, 21 };
+  int32_t answ_set3[4] = { 20, 19, 18, 17 };
+  int32_t answ_set4[4] = { 16, 15, 14, 13 };
+  int32_t answ_set5[4] = { 12, 11, 10, 9 };
+  int32_t answ_set6[4] = { 8, 7, 6, 5 };
+  int32_t answ_set7[4] = { 4, 3, 2, 1 };
+  int32_t answ_set8[4] = { 0, 0, 0, 0 };
+
+  RUN_TEST (test_set0, answ_set0, 128, 32, 1, 4);
+  RUN_TEST (test_set1, answ_set1, 128, 32, 1, 4);
+  RUN_TEST (test_set2, answ_set2, 128, 32, 1, 4);
+  RUN_TEST (test_set3, answ_set3, 128, 32, 1, 4);
+  RUN_TEST (test_set4, answ_set4, 128, 32, 1, 1);
+
+  return 0;
+}
+
+/* Double scan-assembler-times to take account of unsigned functions.  */
+/* { dg-final { scan-assembler-times "clz\\tv\[0-9\]+\.4s, v\[0-9\]+\.4s" 10 } } */
+
+/* Unsigned versions.  */
+
+int
+test_vclz_u8 ()
+{
+  int i;
+  uint8x8_t a;
+  uint8x8_t b;
+
+  uint8_t test_set0[8] = {
+    TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, TEST6, TEST7
+  };
+  uint8_t test_set1[8] = {
+    TEST8, TEST8, TEST8, TEST8, TEST8, TEST8, TEST8, TEST8
+  };
+  uint8_t answ_set0[8] = {
+    8, 7, 6, 5, 4, 3, 2, 1
+  };
+  uint8_t answ_set1[8] = {
+    0, 0, 0, 0, 0, 0, 0, 0
+  };
+
+  RUN_TEST (test_set0, answ_set0, 64, 8, 0, 8);
+  RUN_TEST (test_set1, answ_set1, 64, 8, 0, 1);
+
+  return 0;
+}
+
+/* ASM scan near test for signed version.  */
+
+int
+test_vclz_u16 ()
+{
+  int i;
+  uint16x4_t a;
+  uint16x4_t b;
+
+  uint16_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
+  uint16_t test_set1[4] = { TEST4, TEST5, TEST6, TEST7 };
+  uint16_t test_set2[4] = { TEST8, TEST9, TEST10, TEST11 };
+  uint16_t test_set3[4] = { TEST12, TEST13, TEST14, TEST15 };
+  uint16_t test_set4[4] = { TEST16, TEST16, TEST16, TEST16 };
+
+  uint16_t answ_set0[4] = { 16, 15, 14, 13 };
+  uint16_t answ_set1[4] = { 12, 11, 10, 9 };
+  uint16_t answ_set2[4] = { 8, 7, 6, 5 };
+  uint16_t answ_set3[4] = { 4, 3, 2, 1 };
+  uint16_t answ_set4[4] = { 0, 0, 0, 0 };
+
+  RUN_TEST (test_set0, answ_set0, 64, 16, 0, 4);
+  RUN_TEST (test_set1, answ_set1, 64, 16, 0, 4);
+  RUN_TEST (test_set2, answ_set2, 64, 16, 0, 4);
+  RUN_TEST (test_set3, answ_set3, 64, 16, 0, 4);
+  RUN_TEST (test_set4, answ_set4, 64, 16, 0, 1);
+
+  return 0;
+}
+
+/* ASM scan near test for signed version.  */
+
+int
+test_vclz_u32 ()
+{
+  int i;
+  uint32x2_t a;
+  uint32x2_t b;
+
+  uint32_t test_set0[2] = { TEST0, TEST1 };
+  uint32_t test_set1[2] = { TEST2, TEST3 };
+  uint32_t test_set2[2] = { TEST4, TEST5 };
+  uint32_t test_set3[2] = { TEST6, TEST7 };
+  uint32_t test_set4[2] = { TEST8, TEST9 };
+  uint32_t test_set5[2] = { TEST10, TEST11 };
+  uint32_t test_set6[2] = { TEST12, TEST13 };
+  uint32_t test_set7[2] = { TEST14, TEST15 };
+  uint32_t test_set8[2] = { TEST16, TEST17 };
+  uint32_t test_set9[2] = { TEST18, TEST19 };
+  uint32_t test_set10[2] = { TEST20, TEST21 };
+  uint32_t test_set11[2] = { TEST22, TEST23 };
+  uint32_t test_set12[2] = { TEST24, TEST25 };
+  uint32_t test_set13[2] = { TEST26, TEST27 };
+  uint32_t test_set14[2] = { TEST28, TEST29 };
+  uint32_t test_set15[2] = { TEST30, TEST31 };
+  uint32_t test_set16[2] = { TEST32, TEST32 };
+
+  uint32_t answ_set0[2] = { 32, 31 };
+  uint32_t answ_set1[2] = { 30, 29 };
+  uint32_t answ_set2[2] = { 28, 27 };
+  uint32_t answ_set3[2] = { 26, 25 };
+  uint32_t answ_set4[2] = { 24, 23 };
+  uint32_t answ_set5[2] = { 22, 21 };
+  uint32_t answ_set6[2] = { 20, 19 };
+  uint32_t answ_set7[2] = { 18, 17 };
+  uint32_t answ_set8[2] = { 16, 15 };
+  uint32_t answ_set9[2] = { 14, 13 };
+  uint32_t answ_set10[2] = { 12, 11 };
+  uint32_t answ_set11[2] = { 10, 9 };
+  uint32_t answ_set12[2] = { 8, 7 };
+  uint32_t answ_set13[2] = { 6, 5 };
+  uint32_t answ_set14[2] = { 4, 3 };
+  uint32_t answ_set15[2] = { 2, 1 };
+  uint32_t answ_set16[2] = { 0, 0 };
+
+  RUN_TEST (test_set0, answ_set0, 64, 32, 0, 2);
+  RUN_TEST (test_set1, answ_set1, 64, 32, 0, 2);
+  RUN_TEST (test_set2, answ_set2, 64, 32, 0, 2);
+  RUN_TEST (test_set3, answ_set3, 64, 32, 0, 2);
+  RUN_TEST (test_set4, answ_set4, 64, 32, 0, 2);
+  RUN_TEST (test_set5, answ_set5, 64, 32, 0, 2);
+  RUN_TEST (test_set6, answ_set6, 64, 32, 0, 2);
+  RUN_TEST (test_set7, answ_set7, 64, 32, 0, 2);
+  RUN_TEST (test_set8, answ_set8, 64, 32, 0, 2);
+  RUN_TEST (test_set9, answ_set9, 64, 32, 0, 2);
+  RUN_TEST (test_set10, answ_set10, 64, 32, 0, 2);
+  RUN_TEST (test_set11, answ_set11, 64, 32, 0, 2);
+  RUN_TEST (test_set12, answ_set12, 64, 32, 0, 2);
+  RUN_TEST (test_set13, answ_set13, 64, 32, 0, 2);
+  RUN_TEST (test_set14, answ_set14, 64, 32, 0, 2);
+  RUN_TEST (test_set15, answ_set15, 64, 32, 0, 2);
+  RUN_TEST (test_set16, answ_set16, 64, 32, 0, 1);
+
+  return 0;
+}
+
+/* ASM scan near test for signed version.  */
+
+int
+test_vclzq_u8 ()
+{
+  int i;
+  uint8x16_t a;
+  uint8x16_t b;
+
+  uint8_t test_set0[16] = {
+    TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, TEST6, TEST7,
+    TEST8, TEST8, TEST8, TEST8, TEST8, TEST8, TEST8, TEST8
+  };
+  uint8_t answ_set0[16] = {
+    8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0
+  };
+  RUN_TEST (test_set0, answ_set0, 128, 8, 0, 9);
+
+  return 0;
+}
+
+/* ASM scan near test for signed version.  */
+
+int
+test_vclzq_u16 ()
+{
+  int i;
+  uint16x8_t a;
+  uint16x8_t b;
+
+  uint16_t test_set0[8] = {
+    TEST0, TEST1, TEST2, TEST3, TEST4, TEST5, TEST6, TEST7
+  };
+  uint16_t test_set1[8] = {
+    TEST8, TEST9, TEST10, TEST11, TEST12, TEST13, TEST14, TEST15
+  };
+  uint16_t test_set2[8] = {
+    TEST16, TEST16, TEST16, TEST16, TEST16, TEST16, TEST16, TEST16
+  };
+
+  uint16_t answ_set0[8] = {
+    16, 15, 14, 13, 12, 11, 10, 9
+  };
+
+  uint16_t answ_set1[8] = {
+    8, 7, 6, 5, 4, 3, 2, 1
+  };
+  uint16_t answ_set2[8] = {
+    0, 0, 0, 0, 0, 0, 0, 0
+  };
+
+  RUN_TEST (test_set0, answ_set0, 128, 16, 0, 8);
+  RUN_TEST (test_set1, answ_set1, 128, 16, 0, 8);
+  RUN_TEST (test_set2, answ_set2, 128, 16, 0, 1);
+
+  return 0;
+}
+
+/* ASM scan near test for signed version.  */
+
+int
+test_vclzq_u32 ()
+{
+  int i;
+  uint32x4_t a;
+  uint32x4_t b;
+
+  uint32_t test_set0[4] = { TEST0, TEST1, TEST2, TEST3 };
+  uint32_t test_set1[4] = { TEST4, TEST5, TEST6, TEST7 };
+  uint32_t test_set2[4] = { TEST8, TEST9, TEST10, TEST11 };
+  uint32_t test_set3[4] = { TEST12, TEST13, TEST14, TEST15 };
+  uint32_t test_set4[4] = { TEST16, TEST17, TEST18, TEST19 };
+  uint32_t test_set5[4] = { TEST20, TEST21, TEST22, TEST23 };
+  uint32_t test_set6[4] = { TEST24, TEST25, TEST26, TEST27 };
+  uint32_t test_set7[4] = { TEST28, TEST29, TEST30, TEST31 };
+  uint32_t test_set8[4] = { TEST32, TEST32, TEST32, TEST32 };
+
+  uint32_t answ_set0[4] = { 32, 31, 30, 29 };
+  uint32_t answ_set1[4] = { 28, 27, 26, 25 };
+  uint32_t answ_set2[4] = { 24, 23, 22, 21 };
+  uint32_t answ_set3[4] = { 20, 19, 18, 17 };
+  uint32_t answ_set4[4] = { 16, 15, 14, 13 };
+  uint32_t answ_set5[4] = { 12, 11, 10, 9 };
+  uint32_t answ_set6[4] = { 8, 7, 6, 5 };
+  uint32_t answ_set7[4] = { 4, 3, 2, 1 };
+  uint32_t answ_set8[4] = { 0, 0, 0, 0 };
+
+  RUN_TEST (test_set0, answ_set0, 128, 32, 0, 4);
+  RUN_TEST (test_set1, answ_set1, 128, 32, 0, 4);
+  RUN_TEST (test_set2, answ_set2, 128, 32, 0, 4);
+  RUN_TEST (test_set3, answ_set3, 128, 32, 0, 4);
+  RUN_TEST (test_set4, answ_set4, 128, 32, 0, 1);
+
+  return 0;
+}
+
+/* ASM scan near test for signed version.  */
+
+int
+main (int argc, char **argv)
+{
+
+  if (test_vclz_s8 ())
+    abort ();
+
+  if (test_vclz_s16 ())
+    abort ();
+
+  if (test_vclz_s32 ())
+    abort ();
+
+  if (test_vclzq_s8 ())
+    abort ();
+
+  if (test_vclzq_s16 ())
+    abort ();
+
+  if (test_vclzq_s32 ())
+    abort ();
+
+  if (test_vclz_u8 ())
+    abort ();
+
+  if (test_vclz_u16 ())
+    abort ();
+
+  if (test_vclz_u32 ())
+    abort ();
+
+  if (test_vclzq_u8 ())
+    abort ();
+
+  if (test_vclzq_u16 ())
+    abort ();
+
+  if (test_vclzq_u32 ())
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { cleanup-saved-temps } } */
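
For readers tracing the token-pasting machinery in the test, here is a hedged
sketch of what a single RUN_TEST (test_set0, answ_set0, 64, 8, 1, 8) invocation
in test_vclz_s8 amounts to after preprocessing: the 64/8/1 arguments select the
"_s8" suffix via REG_INFEX64 and SIGNED1, so vld1_s8 and vclz_s8 are used.  The
function name below is hypothetical; the array values are TEST0..TEST7 from the
test file.

#include <arm_neon.h>

#define INHIB_OPTIMIZATION asm volatile ("" : : : "memory")

int
expanded_run_test_s8 (void)
{
  /* TEST0..TEST7 and their expected leading-zero counts.  */
  int8_t test_set0[8] = { 0, 0x1, 0x2, 0x5, 0xa, 0x15, 0x2a, 0x55 };
  int8_t answ_set0[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };
  int8x8_t a = vld1_s8 (test_set0);   /* Load the inputs.  */
  int8x8_t b = vld1_s8 (answ_set0);   /* Load the expected answers.  */
  int i;

  INHIB_OPTIMIZATION;                 /* Barrier so the clz is not folded away.  */
  a = vclz_s8 (a);                    /* Intrinsic under test.  */
  for (i = 0; i < 8; i++)
    {
      INHIB_OPTIMIZATION;
      if (a[i] != b[i])               /* Lane-wise compare against the answers.  */
        return 1;
    }
  return 0;
}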