
[AArch64] Optimized implementation of search_line_fast for the CPP lexer

Message ID 7b1f910a-628f-089a-eed3-23476c1bda9e@arm.com
State New

Commit Message

Richard Earnshaw (lists) Nov. 7, 2016, 1:39 p.m. UTC
This patch contains an implementation of search_line_fast for the CPP
lexer.  It's based in part on the AArch32 (ARM) code but incorporates
new instructions available in AArch64 (reduction add operations) plus
some tricks for reducing the realignment overheads.  We assume a page
size of 4k, but that's a safe assumption -- AArch64 systems can never
have a smaller page size than that: on systems with larger pages we will
go through the realignment code more often than strictly necessary, but
it's still likely to be in the noise (less than 0.5% of the time).
Bootstrapped on aarch64-none-linux-gnu.
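
(As a quick sanity check on that figure: with 16-byte loads, the slow path
is only taken when the start address falls within the last 15 bytes of a 4k
block, so for uniformly random start positions the rate is

  15 / 4096 ~= 0.37%

comfortably inside the 0.5% bound quoted above.)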


Although this is AArch64-specific and therefore I don't think it
requires approval from anyone else, I'll wait 24 hours for comments.

	* lex.c (search_line_fast): New implementation for AArch64.

R.

Comments

James Greenhalgh Nov. 8, 2016, 9:46 a.m. UTC | #1
On Mon, Nov 07, 2016 at 01:39:53PM +0000, Richard Earnshaw (lists) wrote:
> This patch contains an implementation of search_line_fast for the CPP
> lexer.  It's based in part on the AArch32 (ARM) code but incorporates
> new instructions available in AArch64 (reduction add operations) plus
> some tricks for reducing the realignment overheads.  We assume a page
> size of 4k, but that's a safe assumption -- AArch64 systems can never
> have a smaller page size than that: on systems with larger pages we will
> go through the realignment code more often than strictly necessary, but
> it's still likely to be in the noise (less than 0.5% of the time).
> Bootstrapped on aarch64-none-linux-gnu.

Some very minor nits wrt. style for the Advanced SIMD intrinsics, otherwise
OK from me.

> 
> +  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);


It is a pedantic point, but these casts are a GNU extension; the "portable"
way to write this would be:

  vreinterpretq_u8_u64 (vdupq_n_u64 (0x8040201008040201ULL));

> +
> +#ifdef __AARCH64EB
> +  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};

This sort of vector initialisation is a bit scary for user programmers, as
we shouldn't generally mix Neon intrinsics with the GNU extensions (for
exactly the reason you have here: keeping BE and LE straight is extra
effort).

This could be written portably as:

  vcombine_u16 (vdup_n_u16 (8), vdup_n_u16 (0));

Or if you prefer to be explicit about the elements:

  int16_t buf[] = {8, 8, 8, 8, 0, 0, 0, 0};
  int16x8_t shift = vld1q_s16 (buf);

> +#else
> +  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
> +#endif
> +
> +  unsigned int found;
> +  const uint8_t *p;
> +  uint8x16_t data;
> +  uint8x16_t t;
> +  uint16x8_t m;
> +  uint8x16_t u, v, w;
> +
> +  /* Align the source pointer.  */
> +  p = (const uint8_t *)((uintptr_t)s & -16);
> +
> +  /* Assuming random string start positions, with a 4k page size we'll take
> +     the slow path about 0.37% of the time.  */
> +  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
> +			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
> +			< 16, 0))
> +    {
> +      /* Slow path: the string starts near a possible page boundary.  */
> +      uint32_t misalign, mask;
> +
> +      misalign = (uintptr_t)s & 15;
> +      mask = (-1u << misalign) & 0xffff;
> +      data = vld1q_u8 (p);
> +      t = vceqq_u8 (data, repl_nl);
> +      u = vceqq_u8 (data, repl_cr);
> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
> +      t = vorrq_u8 (v, w);

Can you trust the compiler to perform the reassociation, rather than doing
it manually here?  That would let you write this in the more natural form:

      t = vceqq_u8 (data, repl_nl);
      t = vorrq_u8 (t, vceqq_u8 (data, repl_cr));
      t = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      t = vorrq_u8 (t, vceqq_u8 (data, repl_qm));

> +      t = vandq_u8 (t, xmask);
> +      m = vpaddlq_u8 (t);
> +      m = vshlq_u16 (m, shift);
> +      found = vaddvq_u16 (m);
> +      found &= mask;
> +      if (found)
> +	return (const uchar*)p + __builtin_ctz (found);
> +    }
> +  else
> +    {
> +      data = vld1q_u8 ((const uint8_t *) s);
> +      t = vceqq_u8 (data, repl_nl);
> +      u = vceqq_u8 (data, repl_cr);
> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
> +      t = vorrq_u8 (v, w);
> +      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
> +	goto done;

As above, this cast is a GNU extension:

    if (__builtin_expect (vpaddd_u64 (vreinterpretq_u64_u8 (t)), 0))

> +    }
> +
> +  do
> +    {
> +      p += 16;
> +      data = vld1q_u8 (p);
> +      t = vceqq_u8 (data, repl_nl);
> +      u = vceqq_u8 (data, repl_cr);
> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
> +      t = vorrq_u8 (v, w);
> +    } while (!vpaddd_u64 ((uint64x2_t)t));

Likewise here.

Thanks,
James
Richard Earnshaw (lists) Nov. 8, 2016, 10:27 a.m. UTC | #2
On 08/11/16 09:46, James Greenhalgh wrote:
> On Mon, Nov 07, 2016 at 01:39:53PM +0000, Richard Earnshaw (lists) wrote:
>> This patch contains an implementation of search_line_fast for the CPP
>> lexer.  It's based in part on the AArch32 (ARM) code but incorporates
>> new instructions available in AArch64 (reduction add operations) plus
>> some tricks for reducing the realignment overheads.  We assume a page
>> size of 4k, but that's a safe assumption -- AArch64 systems can never
>> have a smaller page size than that: on systems with larger pages we will
>> go through the realignment code more often than strictly necessary, but
>> it's still likely to be in the noise (less than 0.5% of the time).
>> Bootstrapped on aarch64-none-linux-gnu.
> 
> Some very minor nits wrt. style for the Advanced SIMD intrinsics, otherwise
> OK from me.
> 
>>
>> +  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
> 
> 
> It is a pedantic point, but these casts are a GNU extension; the "portable"
> way to write this would be:
> 
>   vreinterpretq_u8_u64 (vdupq_n_u64 (0x8040201008040201ULL));

We've used GNU-style casts in the original code and never encountered
problems.  I personally find the reinterpret casts less readable.

> 
>> +
>> +#ifdef __AARCH64EB
>> +  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
> 
> This sort of vector initialisation is a bit scary for user programmers, as
> we shouldn't generally mix Neon intrinsics with the GNU extensions (for
> exactly the reason you have here: keeping BE and LE straight is extra
> effort).
> 
> This could be written portably as:
> 
>   vcombine_u16 (vdup_n_u16 (8), vdup_n_u16 (0));
> 

Nice idea, but that's the wrong way around and fixing it currently
generates *terrible* code.
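
(For illustration only -- this is a guess at the intended form, not code
proposed in the thread: assuming vcombine places its first argument in the
low half, the portable spelling of the little-endian constant would swap the
two halves, roughly

  const int16x8_t shift = vcombine_s16 (vdup_n_s16 (0), vdup_n_s16 (8));

Sticking to intrinsics for the whole constant would also sidestep the
lane-ordering question behind the __AARCH64EB conditional, which seems to be
James's underlying point, though it doesn't address the code-generation
problem described above.)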

> Or if you prefer to be explicit about the elements:
> 
>   int16_t buf[] = {8, 8, 8, 8, 0, 0, 0, 0};
>   int16x8_t shift = vld1q_s16 (buf);
> 
>> +#else
>> +  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
>> +#endif
>> +
>> +  unsigned int found;
>> +  const uint8_t *p;
>> +  uint8x16_t data;
>> +  uint8x16_t t;
>> +  uint16x8_t m;
>> +  uint8x16_t u, v, w;
>> +
>> +  /* Align the source pointer.  */
>> +  p = (const uint8_t *)((uintptr_t)s & -16);
>> +
>> +  /* Assuming random string start positions, with a 4k page size we'll take
>> +     the slow path about 0.37% of the time.  */
>> +  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
>> +			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
>> +			< 16, 0))
>> +    {
>> +      /* Slow path: the string starts near a possible page boundary.  */
>> +      uint32_t misalign, mask;
>> +
>> +      misalign = (uintptr_t)s & 15;
>> +      mask = (-1u << misalign) & 0xffff;
>> +      data = vld1q_u8 (p);
>> +      t = vceqq_u8 (data, repl_nl);
>> +      u = vceqq_u8 (data, repl_cr);
>> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
>> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
>> +      t = vorrq_u8 (v, w);
> 
> Can you trust the compiler to perform the reassociation, rather than doing
> it manually here?  That would let you write this in the more natural form:
> 
>       t = vceqq_u8 (data, repl_nl);
>       t = vorrq_u8 (t, vceqq_u8 (data, repl_cr));
>       t = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
>       t = vorrq_u8 (t, vceqq_u8 (data, repl_qm));
> 

Maybe, but we have plenty of spare registers (this is target-specific
code; I know what's happening).

Either way, the reassoc code is currently messing with this and
serializing the VORRQ operations.
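
(To spell out the difference, as an illustration rather than text from the
thread: the hand-paired form in the patch builds a two-level OR tree,

  v = t | (data == '\\');   w = u | (data == '?');   t = v | w;

giving a critical path of one compare plus two ORs, whereas serializing the
VORRQs yields one compare plus three dependent ORs:

  t = ((t | u) | (data == '\\')) | (data == '?');

which is presumably the point of keeping the balanced form.)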

>> +      t = vandq_u8 (t, xmask);
>> +      m = vpaddlq_u8 (t);
>> +      m = vshlq_u16 (m, shift);
>> +      found = vaddvq_u16 (m);
>> +      found &= mask;
>> +      if (found)
>> +	return (const uchar*)p + __builtin_ctz (found);
>> +    }
>> +  else
>> +    {
>> +      data = vld1q_u8 ((const uint8_t *) s);
>> +      t = vceqq_u8 (data, repl_nl);
>> +      u = vceqq_u8 (data, repl_cr);
>> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
>> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
>> +      t = vorrq_u8 (v, w);
>> +      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
>> +	goto done;
> 
> As above, this cast is a GNU extension:
> 
>     if (__builtin_expect (vpaddd_u64 (vreinterpretq_u64_u8 (t)), 0))
> 
>> +    }
>> +
>> +  do
>> +    {
>> +      p += 16;
>> +      data = vld1q_u8 (p);
>> +      t = vceqq_u8 (data, repl_nl);
>> +      u = vceqq_u8 (data, repl_cr);
>> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
>> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
>> +      t = vorrq_u8 (v, w);
>> +    } while (!vpaddd_u64 ((uint64x2_t)t));
> 
> Likewise here.
> 
> Thanks,
> James
>
Andreas Schwab March 20, 2017, 2:53 p.m. UTC | #3
On Nov 07 2016, "Richard Earnshaw (lists)" <Richard.Earnshaw@arm.com> wrote:

> This patch contains an implementation of search_line_fast for the CPP
> lexer.  It's based in part on the AArch32 (ARM) code but incorporates
> new instructions available in AArch64 (reduction add operations) plus
> some tricks for reducing the realignment overheads.

I'm getting erroneous behaviour when building the compiler in ILP32
mode.

build/genmatch --gimple ../../gcc/match.pd \
    > tmp-gimple-match.c
/home/abuild/rpmbuild/BUILD/gcc-7.0.1-r246083/obj-aarch64-suse-linux/gcc/cfn-operators.pd:91:5 error: expected (, got NAME
(define_operator_list EXPM1
    ^

Some part of the code appears to depend on LP64.  When I insert three
newlines before this line, the parser gets further, but reports a
similar error later on.

Andreas.
Richard Earnshaw (lists) March 20, 2017, 3:17 p.m. UTC | #4
On 20/03/17 14:53, Andreas Schwab wrote:
> On Nov 07 2016, "Richard Earnshaw (lists)" <Richard.Earnshaw@arm.com> wrote:
> 
>> This patch contains an implementation of search_line_fast for the CPP
>> lexer.  It's based in part on the AArch32 (ARM) code but incorporates
>> new instructions available in AArch64 (reduction add operations) plus
>> some tricks for reducing the realignment overheads.
> 
> I'm getting erroneous behaviour when building the compiler in ILP32
> mode.
> 
> build/genmatch --gimple ../../gcc/match.pd \
>     > tmp-gimple-match.c
> /home/abuild/rpmbuild/BUILD/gcc-7.0.1-r246083/obj-aarch64-suse-linux/gcc/cfn-operators.pd:91:5 error: expected (, got NAME
> (define_operator_list EXPM1
>     ^
> 
> Some part of the code appears to depend on LP64.  When I insert three
> newlines before this line then the parser goes on further, but reports a
> similar error later on.
> 
> Andreas.
> 

Please file a PR.

I don't have access to an ILP32 run-time environment, so I'm not sure
how I'll be able to check this out.  There are some pointer checks in
the code, so it's possible something is going awry.  Can you compare the
assembly output for ILP32 and LP64 to see if there's anything obvious?


R.
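
For anyone less familiar with the xmask / vpaddlq / vshlq / vaddvq idiom in
the patch below, here is a rough scalar sketch of what that sequence computes
for each 16-byte chunk.  It is illustrative only and not part of the patch:

  static unsigned int
  found_mask_scalar (const unsigned char *p)
  {
    unsigned int mask = 0;
    for (int i = 0; i < 16; i++)
      {
        unsigned char c = p[i];
        if (c == '\n' || c == '\r' || c == '\\' || c == '?')
          mask |= 1u << i;
      }
    /* Bit i is set iff byte i is one of the characters of interest;
       __builtin_ctz (mask) then gives the offset of the first match.  */
    return mask;
  }

The vector code produces the same 16-bit mask: the compares give 0xff per
matching byte, the AND with xmask keeps one distinct bit per byte, and the
pairwise-add / shift / reduction-add steps fold those bits into a single
16-bit value for __builtin_ctz.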

Patch

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 6f65fa1..cea8848 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -752,6 +752,101 @@  search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
   }
 }
 
+#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
+#include "arm_neon.h"
+
+/* This doesn't have to be the exact page size, but no system may use
+   a size smaller than this.  ARMv8 requires a minimum page size of
+   4k.  The impact of being conservative here is a small number of
+   cases will take the slightly slower entry path into the main
+   loop.  */
+
+#define AARCH64_MIN_PAGE_SIZE 4096
+
+static const uchar *
+search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
+  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
+  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
+  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
+  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
+
+#ifdef __AARCH64EB
+  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
+#else
+  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
+#endif
+
+  unsigned int found;
+  const uint8_t *p;
+  uint8x16_t data;
+  uint8x16_t t;
+  uint16x8_t m;
+  uint8x16_t u, v, w;
+
+  /* Align the source pointer.  */
+  p = (const uint8_t *)((uintptr_t)s & -16);
+
+  /* Assuming random string start positions, with a 4k page size we'll take
+     the slow path about 0.37% of the time.  */
+  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
+			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
+			< 16, 0))
+    {
+      /* Slow path: the string starts near a possible page boundary.  */
+      uint32_t misalign, mask;
+
+      misalign = (uintptr_t)s & 15;
+      mask = (-1u << misalign) & 0xffff;
+      data = vld1q_u8 (p);
+      t = vceqq_u8 (data, repl_nl);
+      u = vceqq_u8 (data, repl_cr);
+      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+      t = vorrq_u8 (v, w);
+      t = vandq_u8 (t, xmask);
+      m = vpaddlq_u8 (t);
+      m = vshlq_u16 (m, shift);
+      found = vaddvq_u16 (m);
+      found &= mask;
+      if (found)
+	return (const uchar*)p + __builtin_ctz (found);
+    }
+  else
+    {
+      data = vld1q_u8 ((const uint8_t *) s);
+      t = vceqq_u8 (data, repl_nl);
+      u = vceqq_u8 (data, repl_cr);
+      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+      t = vorrq_u8 (v, w);
+      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
+	goto done;
+    }
+
+  do
+    {
+      p += 16;
+      data = vld1q_u8 (p);
+      t = vceqq_u8 (data, repl_nl);
+      u = vceqq_u8 (data, repl_cr);
+      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+      t = vorrq_u8 (v, w);
+    } while (!vpaddd_u64 ((uint64x2_t)t));
+
+done:
+  /* Now that we've found the terminating substring, work out precisely where
+     we need to stop.  */
+  t = vandq_u8 (t, xmask);
+  m = vpaddlq_u8 (t);
+  m = vshlq_u16 (m, shift);
+  found = vaddvq_u16 (m);
+  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
+	  + __builtin_ctz (found));
+}
+
 #elif defined (__ARM_NEON)
 #include "arm_neon.h"