Patchwork [ARM] Implement vceq_p64 and vtst_p64 intrinsics in arm_neon.h

login
register
mail settings
Submitter Kyrylo Tkachov
Date Dec. 6, 2013, 5:19 p.m.
Message ID <52A2070D.3050307@arm.com>
Download mbox | patch
Permalink /patch/298118/
State New
Headers show

Comments

Kyrylo Tkachov - Dec. 6, 2013, 5:19 p.m.
Hi all,

Following the implementation of the Crypto intrinsics I posted earlier this 
week, this patch implements the vceq_p64 and vtst_p64 intrinsics that operate on 
the new poly64_t type. They do not have a regular form and can thus not be 
autogenerated from our beloved ML scripts and are therefore synthesised as a 
vceq_u32 or vtst_u32 operation, followed by a pairwise reduce with min or max 
respectively.

These intrinsics are only available when the crypto intrinsics are available 
(i.e. -mfpu=crypto-neon-fp-armv8 and -mfloat-abi=(hard|softfp)).

I've added two runtime tests to make sure they generate correct results.

Ok for trunk?

Thanks,
Kyrill

2013-12-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * config/arm/neon.ml (crypto_intrinsics): Add vceq_64 and vtst_p64.
     * config/arm/arm_neon.h: Regenerate.
     * config/arm/neon-docgen.ml: Add vceq_p64 and vtst_p64.
     * doc/arm-neon-intrinsics.texi: Regenerate.

2013-12-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * gcc.target/arm/neon-vceq_p64.c: New test.
     * gcc.target/arm/neon-vtst_p64.c: Likewise.
Kyrylo Tkachov - Dec. 19, 2013, 6:02 p.m.
Ping.

Thanks,
Kyrill

On 06/12/13 17:19, Kyrill Tkachov wrote:
> Hi all,
>
> Following the implementation of the Crypto intrinsics I posted earlier this
> week, this patch implements the vceq_p64 and vtst_p64 intrinsics that operate on
> the new poly64_t type. They do not have a regular form and can thus not be
> autogenerated from our beloved ML scripts and are therefore synthesised as a
> vceq_u32 or vtst_u32 operation, followed by a pairwise reduce with min or max
> respectively.
>
> These intrinsics are only available when the crypto intrinsics are available
> (i.e. -mfpu=crypto-neon-fp-armv8 and -mfloat-abi=(hard|softfp)).
>
> I've added two runtime tests to make sure they generate correct results.
>
> Ok for trunk?
>
> Thanks,
> Kyrill
>
> 2013-12-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>       * config/arm/neon.ml (crypto_intrinsics): Add vceq_64 and vtst_p64.
>       * config/arm/arm_neon.h: Regenerate.
>       * config/arm/neon-docgen.ml: Add vceq_p64 and vtst_p64.
>       * doc/arm-neon-intrinsics.texi: Regenerate.
>
> 2013-12-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>       * gcc.target/arm/neon-vceq_p64.c: New test.
>       * gcc.target/arm/neon-vtst_p64.c: Likewise.
Ramana Radhakrishnan - Dec. 19, 2013, 6:21 p.m.
On 06/12/13 17:19, Kyrill Tkachov wrote:
> Hi all,
>
> Following the implementation of the Crypto intrinsics I posted earlier this
> week, this patch implements the vceq_p64 and vtst_p64 intrinsics that operate on
> the new poly64_t type. They do not have a regular form and can thus not be
> autogenerated from our beloved ML scripts and are therefore synthesised as a
> vceq_u32 or vtst_u32 operation, followed by a pairwise reduce with min or max
> respectively.
>
> These intrinsics are only available when the crypto intrinsics are available
> (i.e. -mfpu=crypto-neon-fp-armv8 and -mfloat-abi=(hard|softfp)).
>
> I've added two runtime tests to make sure they generate correct results.
>
> Ok for trunk?

Can you add a comment as a follow-up describing how this works. FTR this 
works nicely because of the properties with vceq_u32 and vtst_u32 
setting bits appropriately and then pmax and pmin doing the right thing. 
Also shouldn't there also be a vcne_p64 as well ?

Also, please add a big bold comment that says that this file is no 
longer fully autogenerated and adjust the file such that there is a 
clear delineation between the auto-generated and non-autogenerated parts.

Such a rearrangement is pre-approved.

This patch is OK with that change.

regards
Ramana

>
> Thanks,
> Kyrill
>
> 2013-12-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>       * config/arm/neon.ml (crypto_intrinsics): Add vceq_64 and vtst_p64.
>       * config/arm/arm_neon.h: Regenerate.
>       * config/arm/neon-docgen.ml: Add vceq_p64 and vtst_p64.
>       * doc/arm-neon-intrinsics.texi: Regenerate.
>
> 2013-12-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
>
>       * gcc.target/arm/neon-vceq_p64.c: New test.
>       * gcc.target/arm/neon-vtst_p64.c: Likewise.
>

Patch

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 59ef22c..cc3f56c 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -13278,6 +13278,26 @@  vstrq_p128 (poly128_t * __ptr, poly128_t __val)
 #endif
 }
 
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceq_p64 (poly64x1_t a, poly64x1_t b)
+{
+  uint32x2_t t_a = vreinterpret_u32_p64 (a);
+  uint32x2_t t_b = vreinterpret_u32_p64 (b);
+  uint32x2_t c = vceq_u32 (t_a, t_b);
+  uint32x2_t m = vpmin_u32 (c, c);
+  return vreinterpret_u64_u32 (m);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vtst_p64 (poly64x1_t a, poly64x1_t b)
+{
+  uint32x2_t t_a = vreinterpret_u32_p64 (a);
+  uint32x2_t t_b = vreinterpret_u32_p64 (b);
+  uint32x2_t c = vtst_u32 (t_a, t_b);
+  uint32x2_t m = vpmax_u32 (c, c);
+  return vreinterpret_u64_u32 (m);
+}
+
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
 {
diff --git a/gcc/config/arm/neon-docgen.ml b/gcc/config/arm/neon-docgen.ml
index 41ae059..8945da7 100644
--- a/gcc/config/arm/neon-docgen.ml
+++ b/gcc/config/arm/neon-docgen.ml
@@ -340,6 +340,14 @@  let crypto_doc =
 @end itemize
 
 @itemize @bullet
+@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t)
+@end itemize
+
+@itemize @bullet
+@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t)
+@end itemize
+
+@itemize @bullet
 @item uint32_t vsha1h_u32 (uint32_t)
 @*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}}
 @end itemize
diff --git a/gcc/config/arm/neon.ml b/gcc/config/arm/neon.ml
index 968c171..69618d0 100644
--- a/gcc/config/arm/neon.ml
+++ b/gcc/config/arm/neon.ml
@@ -2208,6 +2208,26 @@  vstrq_p128 (poly128_t * __ptr, poly128_t __val)
 #endif
 }
 
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vceq_p64 (poly64x1_t a, poly64x1_t b)
+{
+  uint32x2_t t_a = vreinterpret_u32_p64 (a);
+  uint32x2_t t_b = vreinterpret_u32_p64 (b);
+  uint32x2_t c = vceq_u32 (t_a, t_b);
+  uint32x2_t m = vpmin_u32 (c, c);
+  return vreinterpret_u64_u32 (m);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vtst_p64 (poly64x1_t a, poly64x1_t b)
+{
+  uint32x2_t t_a = vreinterpret_u32_p64 (a);
+  uint32x2_t t_b = vreinterpret_u32_p64 (b);
+  uint32x2_t c = vtst_u32 (t_a, t_b);
+  uint32x2_t m = vpmax_u32 (c, c);
+  return vreinterpret_u64_u32 (m);
+}
+
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vaeseq_u8 (uint8x16_t __data, uint8x16_t __key)
 {
diff --git a/gcc/doc/arm-neon-intrinsics.texi b/gcc/doc/arm-neon-intrinsics.texi
index 610892d..b146868 100644
--- a/gcc/doc/arm-neon-intrinsics.texi
+++ b/gcc/doc/arm-neon-intrinsics.texi
@@ -11939,6 +11939,14 @@ 
 @end itemize
 
 @itemize @bullet
+@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t)
+@end itemize
+
+@itemize @bullet
+@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t)
+@end itemize
+
+@itemize @bullet
 @item uint32_t vsha1h_u32 (uint32_t)
 @*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}}
 @end itemize
diff --git a/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c b/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c
new file mode 100644
index 0000000..21a6a78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c
@@ -0,0 +1,38 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_crypto } */
+
+#include "arm_neon.h"
+#include <stdio.h>
+
+extern void abort (void);
+
+int
+main (void)
+{
+  uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff,
+                      ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 };
+  int i, j;
+
+  for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i)
+    {
+       for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j)
+         {
+           uint64_t a1 = args[i];
+           uint64_t a2 = args[j];
+           uint64_t res = vceq_p64 (vreinterpret_p64_u64 (a1),
+                                    vreinterpret_p64_u64 (a2));
+           uint64_t exp = (a1 == a2) ? ~0x0 : 0x0;
+
+           if (res != exp)
+             {
+               fprintf (stderr, "vceq_p64 (a1= %lx, a2= %lx)"
+                                " returned %lx, expected %lx\n",
+                                 a1, a2, res, exp);
+               abort ();
+             }
+         }
+    }
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c b/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c
new file mode 100644
index 0000000..3a0b117
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c
@@ -0,0 +1,38 @@ 
+/* { dg-do run } */
+/* { dg-require-effective-target arm_crypto_ok } */
+/* { dg-require-effective-target arm_neon_hw } */
+/* { dg-add-options arm_crypto } */
+
+#include "arm_neon.h"
+#include <stdio.h>
+
+extern void abort (void);
+
+int
+main (void)
+{
+  uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff,
+                      ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 };
+  int i, j;
+
+  for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i)
+    {
+       for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j)
+         {
+           uint64_t a1 = args[i];
+           uint64_t a2 = args[j];
+           uint64_t res = vtst_p64 (vreinterpret_p64_u64 (a1),
+                                    vreinterpret_p64_u64 (a2));
+           uint64_t exp = (a1 & a2) ? ~0x0 : 0x0;
+
+           if (res != exp)
+             {
+               fprintf (stderr, "vtst_p64 (a1= %lx, a2= %lx)"
+                                " returned %lx, expected %lx\n",
+                                 a1, a2, res, exp);
+               abort ();
+             }
+         }
+    }
+  return 0;
+}