diff mbox

[rs6000,libcpp] Revise search_line_fast to avoid old unaligned load sequences

Message ID 1412021724.2986.19.camel@gnopaine
State New
Headers show

Commit Message

Bill Schmidt Sept. 29, 2014, 8:15 p.m. UTC
Hi,

The vec_lvsl and vec_lvsr interfaces are deprecated for little-endian
Power, and really should not be used on big-endian Power either when the
target CPU is power8 or above.  The lexer in libcpp currently makes use
of these interfaces in search_line_fast().  This patch provides a new
version of search_line_fast() that allows unaligned loads to be handled
by the hardware.

The new version is used when _ARCH_PWR8 and __ALTIVEC__ are defined.
Otherwise, the older version may be used; however it is now restricted
for use only on big-endian systems.  If we are targeting little-endian
(which requires P8 or higher) and either Altivec support or Power8
architecture support has been disabled, then we revert to a slower
search routine.  This prevents ever using the deprecated instructions
for little-endian code generation.

I haven't added a new test case, as bootstrapping GCC is an excellent
test of search_line_fast(), and that appears to be all we do at present
for the existing implementations.

Bootstrapped and tested on powerpc64le-unknown-linux-gnu and
powerpc64-unknown-linux-gnu with no new regressions.  Is this ok for
trunk?

Thanks,
Bill


2014-09-29  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* lex.c (search_line_fast): Add new version to be used for Power8
	and later targets when Altivec is enabled.  Restrict the existing
	Altivec version to big-endian systems so that lvsr is not used on
	little endian, where it is deprecated.  Remove LE-specific code
	from the now-BE-only version.

Comments

David Edelsohn Oct. 3, 2014, 5:40 p.m. UTC | #1
On Mon, Sep 29, 2014 at 4:15 PM, Bill Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> Hi,
>
> The vec_lvsl and vec_lvsr interfaces are deprecated for little-endian
> Power, and really should not be used on big-endian Power either when the
> target CPU is power8 or above.  The lexer in libcpp currently makes use
> of these interfaces in search_line_fast().  This patch provides a new
> version of search_line_fast() that allows unaligned loads to be handled
> by the hardware.
>
> The new version is used when _ARCH_PWR8 and __ALTIVEC__ are defined.
> Otherwise, the older version may be used; however it is now restricted
> for use only on big-endian systems.  If we are targeting little-endian
> (which requires P8 or higher) and either Altivec support or Power8
> architecture support has been disabled, then we revert to a slower
> search routine.  This prevents ever using the deprecated instructions
> for little-endian code generation.
>
> I haven't added a new test case, as bootstrapping GCC is an excellent
> test of search_line_fast(), and that appears to be all we do at present
> for the existing implementations.
>
> Bootstrapped and tested on powerpc64le-unknown-linux-gnu and
> powerpc64-unknown-linux-gnu with no new regressions.  Is this ok for
> trunk?
>
> Thanks,
> Bill
>
>
> 2014-09-29  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
>
>         * lex.c (search_line_fast): Add new version to be used for Power8
>         and later targets when Altivec is enabled.  Restrict the existing
>         Altivec version to big-endian systems so that lvsr is not used on
>         little endian, where it is deprecated.  Remove LE-specific code
>         from the now-BE-only version.

The code is POWER-specific.  It's okay with me.  The GCC front-end
maintainers do not seem to have any concerns or objections.

Thanks, David
diff mbox

Patch

Index: libcpp/lex.c
===================================================================
--- libcpp/lex.c	(revision 215683)
+++ libcpp/lex.c	(working copy)
@@ -513,9 +513,111 @@  init_vectorized_lexer (void)
   search_line_fast = impl;
 }
 
-#elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
+#elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 
-/* A vection of the fast scanner using AltiVec vectorized byte compares.  */
+/* A version of the fast scanner using AltiVec vectorized byte compares
+   and VSX unaligned loads (when VSX is available).  This is otherwise
+   the same as the pre-GCC 5 version.  */
+
+static const uchar *
+search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  typedef __attribute__((altivec(vector))) unsigned char vc;
+
+  const vc repl_nl = {
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
+  };
+  const vc repl_cr = {
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', 
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
+  };
+  const vc repl_bs = {
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', 
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
+  };
+  const vc repl_qm = {
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+    '?', '?', '?', '?', '?', '?', '?', '?', 
+  };
+  const vc zero = { 0 };
+
+  vc data, t;
+
+  /* Main loop processing 16 bytes at a time.  */
+  do
+    {
+      vc m_nl, m_cr, m_bs, m_qm;
+
+      data = *((const vc *)s);
+      s += 16;
+
+      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
+      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
+      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
+      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
+      t = (m_nl | m_cr) | (m_bs | m_qm);
+
+      /* T now contains 0xff in bytes for which we matched one of the relevant
+	 characters.  We want to exit the loop if any byte in T is non-zero.
+	 Below is the expansion of vec_any_ne(t, zero).  */
+    }
+  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
+
+  /* Restore s to point to the 16 bytes we just processed.  */
+  s -= 16;
+
+  {
+#define N  (sizeof(vc) / sizeof(long))
+
+    union {
+      vc v;
+      /* Statically assert that N is 2 or 4.  */
+      unsigned long l[(N == 2 || N == 4) ? N : -1];
+    } u;
+    unsigned long l, i = 0;
+
+    u.v = t;
+
+    /* Find the first word of T that is non-zero.  */
+    switch (N)
+      {
+      case 4:
+	l = u.l[i++];
+	if (l != 0)
+	  break;
+	s += sizeof(unsigned long);
+	l = u.l[i++];
+	if (l != 0)
+	  break;
+	s += sizeof(unsigned long);
+      case 2:
+	l = u.l[i++];
+	if (l != 0)
+	  break;
+	s += sizeof(unsigned long);
+	l = u.l[i];
+      }
+
+    /* L now contains 0xff in bytes for which we matched one of the
+       relevant characters.  We can find the byte index by finding
+       its bit index and dividing by 8.  */
+#ifdef __BIG_ENDIAN__
+    l = __builtin_clzl(l) >> 3;
+#else
+    l = __builtin_ctzl(l) >> 3;
+#endif
+    return s + l;
+
+#undef N
+  }
+}
+
+#elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
+
+/* A version of the fast scanner using AltiVec vectorized byte compares.
+   This cannot be used for little endian because vec_lvsl/lvsr are
+   deprecated for little endian and the code won't work properly.  */
 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
    so we can't compile this function without -maltivec on the command line
    (or implied by some other switch).  */
@@ -557,13 +659,8 @@  search_line_fast (const uchar *s, const uchar *end
      beginning with all ones and shifting in zeros according to the
      mis-alignment.  The LVSR instruction pulls the exact shift we
      want from the address.  */
-#ifdef __BIG_ENDIAN__
   mask = __builtin_vec_lvsr(0, s);
   mask = __builtin_vec_perm(zero, ones, mask);
-#else
-  mask = __builtin_vec_lvsl(0, s);
-  mask = __builtin_vec_perm(ones, zero, mask);
-#endif
   data &= mask;
 
   /* While altivec loads mask addresses, we still need to align S so
@@ -627,11 +724,7 @@  search_line_fast (const uchar *s, const uchar *end
     /* L now contains 0xff in bytes for which we matched one of the
        relevant characters.  We can find the byte index by finding
        its bit index and dividing by 8.  */
-#ifdef __BIG_ENDIAN__
     l = __builtin_clzl(l) >> 3;
-#else
-    l = __builtin_ctzl(l) >> 3;
-#endif
     return s + l;
 
 #undef N