diff mbox

[06/14] S390: Optimize iso-8859-1 to ibm037 iconv-module.

Message ID nfapai$1bi$4@ger.gmane.org
State New
Headers show

Commit Message

Stefan Liebler April 21, 2016, 2:49 p.m. UTC
Here is an updated patch in which the labels in the inline assembly are
out-dented, as suggested by Florian.

On 02/23/2016 10:21 AM, Stefan Liebler wrote:
> This patch reworks the s390 specific module which used the z900
> translate one to one instruction. Now the g5 translate instruction is used,
> because it outperforms the troo instruction.
>
> ChangeLog:
>
> 	* sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c (TROO_LOOP):
> 	Rename to TR_LOOP and usage of tr instead of troo instruction.
> ---
>   sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c | 93 +++++++++++++++++-----------
>   1 file changed, 56 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
> index c59f87f..4d79bbf 100644
> --- a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
> +++ b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
> @@ -1,7 +1,6 @@
>   /* Conversion between ISO 8859-1 and IBM037.
>
> -   This module uses the Z900 variant of the Translate One To One
> -   instruction.
> +   This module uses the translate instruction.
>      Copyright (C) 1997-2016 Free Software Foundation, Inc.
>
>      Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
> @@ -176,50 +175,70 @@ __attribute__ ((aligned (8))) =
>   #define MIN_NEEDED_FROM		1
>   #define MIN_NEEDED_TO		1
>
> -/* The Z900 variant of troo forces us to always specify a test
> -   character which ends the translation.  So if we run into the
> -   situation where the translation has been interrupted due to the
> -   test character we translate the character by hand and jump back
> -   into the instruction.  */
> -
> -#define TROO_LOOP(TABLE)						\
> +#define TR_LOOP(TABLE)							\
>     {									\
> -    register const unsigned char test __asm__ ("0") = 0;		\
> -    register const unsigned char *pTable __asm__ ("1") = TABLE;		\
> -    register unsigned char *pOutput __asm__ ("2") = outptr;		\
> -    register uint64_t length __asm__ ("3");				\
> -    const unsigned char* pInput = inptr;				\
> -    uint64_t tmp;							\
> -									\
> -    length = (inend - inptr < outend - outptr				\
> -	      ? inend - inptr : outend - outptr);			\
> +    size_t length = (inend - inptr < outend - outptr			\
> +		     ? inend - inptr : outend - outptr);		\
>   									\
> -    __asm__ volatile ("0:                        \n\t"			\
> -		      "  troo    %0,%1           \n\t"			\
> -		      "  jz      1f              \n\t"			\
> -		      "  jo      0b              \n\t"			\
> -		      "  llgc    %3,0(%1)        \n\t"			\
> -		      "  la      %3,0(%3,%4)     \n\t"			\
> -		      "  mvc     0(1,%0),0(%3)   \n\t"			\
> -		      "  aghi    %1,1            \n\t"			\
> -		      "  aghi    %0,1            \n\t"			\
> -		      "  aghi    %2,-1           \n\t"			\
> -		      "  j       0b              \n\t"			\
> -		      "1:                        \n"			\
> +    /* Process in 256 byte blocks.  */					\
> +    if (__builtin_expect (length >= 256, 0))				\
> +      {									\
> +	size_t blocks = length / 256;					\
> +	__asm__ __volatile__("0: mvc 0(256,%[R_OUT]),0(%[R_IN])\n\t"	\
> +			     "tr 0(256,%[R_OUT]),0(%[R_TBL])\n\t"	\
> +			     "la %[R_IN],256(%[R_IN])\n\t"		\
> +			     "la %[R_OUT],256(%[R_OUT])\n\t"		\
> +			     "brctg %[R_LI],0b\n\t"			\
> +			     : /* outputs */ [R_IN] "+a" (inptr)	\
> +			       , [R_OUT] "+a" (outptr), [R_LI] "+d" (blocks) \
> +			     : /* inputs */ [R_TBL] "a" (TABLE)		\
> +			     : /* clobber list */ "memory"		\
> +			     );						\
> +	length = length % 256;						\
> +      }									\
>   									\
> -     : "+a" (pOutput), "+a" (pInput), "+d" (length), "=&a" (tmp)        \
> -     : "a" (pTable), "d" (test)						\
> -     : "cc");								\
> +    /* Process remaining 0...248 bytes in 8byte blocks.  */		\
> +    if (length >= 8)							\
> +      {									\
> +	size_t blocks = length / 8;					\
> +	for (int i = 0; i < blocks; i++)				\
> +	  {								\
> +	    outptr[0] = TABLE[inptr[0]];				\
> +	    outptr[1] = TABLE[inptr[1]];				\
> +	    outptr[2] = TABLE[inptr[2]];				\
> +	    outptr[3] = TABLE[inptr[3]];				\
> +	    outptr[4] = TABLE[inptr[4]];				\
> +	    outptr[5] = TABLE[inptr[5]];				\
> +	    outptr[6] = TABLE[inptr[6]];				\
> +	    outptr[7] = TABLE[inptr[7]];				\
> +	    inptr += 8;							\
> +	    outptr += 8;						\
> +	  }								\
> +	length = length % 8;						\
> +      }									\
>   									\
> -    inptr = pInput;							\
> -    outptr = pOutput;							\
> +    /* Process remaining 0...7 bytes.  */				\
> +    switch (length)							\
> +      {									\
> +      case 7: outptr[6] = TABLE[inptr[6]];				\
> +      case 6: outptr[5] = TABLE[inptr[5]];				\
> +      case 5: outptr[4] = TABLE[inptr[4]];				\
> +      case 4: outptr[3] = TABLE[inptr[3]];				\
> +      case 3: outptr[2] = TABLE[inptr[2]];				\
> +      case 2: outptr[1] = TABLE[inptr[1]];				\
> +      case 1: outptr[0] = TABLE[inptr[0]];				\
> +      case 0: break;							\
> +      }									\
> +    inptr += length;							\
> +    outptr += length;							\
>     }
>
> +
>   /* First define the conversion function from ISO 8859-1 to CP037.  */
>   #define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
>   #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
>   #define LOOPFCT			FROM_LOOP
> -#define BODY TROO_LOOP (table_iso8859_1_to_cp037)
> +#define BODY			TR_LOOP (table_iso8859_1_to_cp037)
>
>   #include <iconv/loop.c>
>
> @@ -228,7 +247,7 @@ __attribute__ ((aligned (8))) =
>   #define MIN_NEEDED_INPUT	MIN_NEEDED_TO
>   #define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
>   #define LOOPFCT			TO_LOOP
> -#define BODY TROO_LOOP (table_cp037_iso8859_1);
> +#define BODY			TR_LOOP (table_cp037_iso8859_1);
>
>   #include <iconv/loop.c>
>
>
diff mbox

Patch

From d489351c09c82994adb872049fcb33bf189f86af Mon Sep 17 00:00:00 2001
From: Stefan Liebler <stli@linux.vnet.ibm.com>
Date: Thu, 21 Apr 2016 12:42:49 +0200
Subject: [PATCH 06/14] S390: Optimize iso-8859-1 to ibm037 iconv-module.

This patch reworks the s390-specific module, which used the z900
Translate One to One (troo) instruction.  It now uses the g5 translate
(tr) instruction instead, because tr outperforms troo.

ChangeLog:

	* sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c (TROO_LOOP):
	Rename to TR_LOOP and use the tr instead of the troo instruction.
---
 sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c | 93 +++++++++++++++++-----------
 1 file changed, 56 insertions(+), 37 deletions(-)

diff --git a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
index c59f87f..3b63e6a 100644
--- a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
+++ b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
@@ -1,7 +1,6 @@ 
 /* Conversion between ISO 8859-1 and IBM037.
 
-   This module uses the Z900 variant of the Translate One To One
-   instruction.
+   This module uses the translate instruction.
    Copyright (C) 1997-2016 Free Software Foundation, Inc.
 
    Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
@@ -176,50 +175,70 @@  __attribute__ ((aligned (8))) =
 #define MIN_NEEDED_FROM		1
 #define MIN_NEEDED_TO		1
 
-/* The Z900 variant of troo forces us to always specify a test
-   character which ends the translation.  So if we run into the
-   situation where the translation has been interrupted due to the
-   test character we translate the character by hand and jump back
-   into the instruction.  */
-
-#define TROO_LOOP(TABLE)						\
+#define TR_LOOP(TABLE)							\
   {									\
-    register const unsigned char test __asm__ ("0") = 0;		\
-    register const unsigned char *pTable __asm__ ("1") = TABLE;		\
-    register unsigned char *pOutput __asm__ ("2") = outptr;		\
-    register uint64_t length __asm__ ("3");				\
-    const unsigned char* pInput = inptr;				\
-    uint64_t tmp;							\
-									\
-    length = (inend - inptr < outend - outptr				\
-	      ? inend - inptr : outend - outptr);			\
+    size_t length = (inend - inptr < outend - outptr			\
+		     ? inend - inptr : outend - outptr);		\
 									\
-    __asm__ volatile ("0:                        \n\t"			\
-		      "  troo    %0,%1           \n\t"			\
-		      "  jz      1f              \n\t"			\
-		      "  jo      0b              \n\t"			\
-		      "  llgc    %3,0(%1)        \n\t"			\
-		      "  la      %3,0(%3,%4)     \n\t"			\
-		      "  mvc     0(1,%0),0(%3)   \n\t"			\
-		      "  aghi    %1,1            \n\t"			\
-		      "  aghi    %0,1            \n\t"			\
-		      "  aghi    %2,-1           \n\t"			\
-		      "  j       0b              \n\t"			\
-		      "1:                        \n"			\
+    /* Process in 256 byte blocks.  */					\
+    if (__builtin_expect (length >= 256, 0))				\
+      {									\
+	size_t blocks = length / 256;					\
+	__asm__ __volatile__("0: mvc 0(256,%[R_OUT]),0(%[R_IN])\n\t"	\
+			     "   tr 0(256,%[R_OUT]),0(%[R_TBL])\n\t"	\
+			     "   la %[R_IN],256(%[R_IN])\n\t"		\
+			     "   la %[R_OUT],256(%[R_OUT])\n\t"		\
+			     "   brctg %[R_LI],0b\n\t"			\
+			     : /* outputs */ [R_IN] "+a" (inptr)	\
+			       , [R_OUT] "+a" (outptr), [R_LI] "+d" (blocks) \
+			     : /* inputs */ [R_TBL] "a" (TABLE)		\
+			     : /* clobber list */ "memory"		\
+			     );						\
+	length = length % 256;						\
+      }									\
 									\
-     : "+a" (pOutput), "+a" (pInput), "+d" (length), "=&a" (tmp)        \
-     : "a" (pTable), "d" (test)						\
-     : "cc");								\
+    /* Process remaining 0...248 bytes in 8byte blocks.  */		\
+    if (length >= 8)							\
+      {									\
+	size_t blocks = length / 8;					\
+	for (int i = 0; i < blocks; i++)				\
+	  {								\
+	    outptr[0] = TABLE[inptr[0]];				\
+	    outptr[1] = TABLE[inptr[1]];				\
+	    outptr[2] = TABLE[inptr[2]];				\
+	    outptr[3] = TABLE[inptr[3]];				\
+	    outptr[4] = TABLE[inptr[4]];				\
+	    outptr[5] = TABLE[inptr[5]];				\
+	    outptr[6] = TABLE[inptr[6]];				\
+	    outptr[7] = TABLE[inptr[7]];				\
+	    inptr += 8;							\
+	    outptr += 8;						\
+	  }								\
+	length = length % 8;						\
+      }									\
 									\
-    inptr = pInput;							\
-    outptr = pOutput;							\
+    /* Process remaining 0...7 bytes.  */				\
+    switch (length)							\
+      {									\
+      case 7: outptr[6] = TABLE[inptr[6]];				\
+      case 6: outptr[5] = TABLE[inptr[5]];				\
+      case 5: outptr[4] = TABLE[inptr[4]];				\
+      case 4: outptr[3] = TABLE[inptr[3]];				\
+      case 3: outptr[2] = TABLE[inptr[2]];				\
+      case 2: outptr[1] = TABLE[inptr[1]];				\
+      case 1: outptr[0] = TABLE[inptr[0]];				\
+      case 0: break;							\
+      }									\
+    inptr += length;							\
+    outptr += length;							\
   }
 
+
 /* First define the conversion function from ISO 8859-1 to CP037.  */
 #define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
 #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
 #define LOOPFCT			FROM_LOOP
-#define BODY TROO_LOOP (table_iso8859_1_to_cp037)
+#define BODY			TR_LOOP (table_iso8859_1_to_cp037)
 
 #include <iconv/loop.c>
 
@@ -228,7 +247,7 @@  __attribute__ ((aligned (8))) =
 #define MIN_NEEDED_INPUT	MIN_NEEDED_TO
 #define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
 #define LOOPFCT			TO_LOOP
-#define BODY TROO_LOOP (table_cp037_iso8859_1);
+#define BODY			TR_LOOP (table_cp037_iso8859_1);
 
 #include <iconv/loop.c>
 
-- 
2.5.5