This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 06/14] S390: Optimize iso-8859-1 to ibm037 iconv-module.


Here is an updated patch in which the labels in the inline assembly are out-dented, as suggested by Florian.

On 02/23/2016 10:21 AM, Stefan Liebler wrote:
This patch reworks the s390-specific module, which used the z900
translate-one-to-one (troo) instruction. The g5 translate (tr) instruction
is now used instead, because it outperforms the troo instruction.

ChangeLog:

	* sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c (TROO_LOOP):
	Rename to TR_LOOP and use the tr instruction instead of troo.
---
  sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c | 93 +++++++++++++++++-----------
  1 file changed, 56 insertions(+), 37 deletions(-)

diff --git a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
index c59f87f..4d79bbf 100644
--- a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
+++ b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
@@ -1,7 +1,6 @@
  /* Conversion between ISO 8859-1 and IBM037.

-   This module uses the Z900 variant of the Translate One To One
-   instruction.
+   This module uses the translate instruction.
     Copyright (C) 1997-2016 Free Software Foundation, Inc.

     Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
@@ -176,50 +175,70 @@ __attribute__ ((aligned (8))) =
  #define MIN_NEEDED_FROM		1
  #define MIN_NEEDED_TO		1

-/* The Z900 variant of troo forces us to always specify a test
-   character which ends the translation.  So if we run into the
-   situation where the translation has been interrupted due to the
-   test character we translate the character by hand and jump back
-   into the instruction.  */
-
-#define TROO_LOOP(TABLE)						\
+#define TR_LOOP(TABLE)							\
    {									\
-    register const unsigned char test __asm__ ("0") = 0;		\
-    register const unsigned char *pTable __asm__ ("1") = TABLE;		\
-    register unsigned char *pOutput __asm__ ("2") = outptr;		\
-    register uint64_t length __asm__ ("3");				\
-    const unsigned char* pInput = inptr;				\
-    uint64_t tmp;							\
-									\
-    length = (inend - inptr < outend - outptr				\
-	      ? inend - inptr : outend - outptr);			\
+    size_t length = (inend - inptr < outend - outptr			\
+		     ? inend - inptr : outend - outptr);		\
  									\
-    __asm__ volatile ("0:                        \n\t"			\
-		      "  troo    %0,%1           \n\t"			\
-		      "  jz      1f              \n\t"			\
-		      "  jo      0b              \n\t"			\
-		      "  llgc    %3,0(%1)        \n\t"			\
-		      "  la      %3,0(%3,%4)     \n\t"			\
-		      "  mvc     0(1,%0),0(%3)   \n\t"			\
-		      "  aghi    %1,1            \n\t"			\
-		      "  aghi    %0,1            \n\t"			\
-		      "  aghi    %2,-1           \n\t"			\
-		      "  j       0b              \n\t"			\
-		      "1:                        \n"			\
+    /* Process in 256 byte blocks.  */					\
+    if (__builtin_expect (length >= 256, 0))				\
+      {									\
+	size_t blocks = length / 256;					\
+	__asm__ __volatile__("0: mvc 0(256,%[R_OUT]),0(%[R_IN])\n\t"	\
+			     "tr 0(256,%[R_OUT]),0(%[R_TBL])\n\t"	\
+			     "la %[R_IN],256(%[R_IN])\n\t"		\
+			     "la %[R_OUT],256(%[R_OUT])\n\t"		\
+			     "brctg %[R_LI],0b\n\t"			\
+			     : /* outputs */ [R_IN] "+a" (inptr)	\
+			       , [R_OUT] "+a" (outptr), [R_LI] "+d" (blocks) \
+			     : /* inputs */ [R_TBL] "a" (TABLE)		\
+			     : /* clobber list */ "memory"		\
+			     );						\
+	length = length % 256;						\
+      }									\
  									\
-     : "+a" (pOutput), "+a" (pInput), "+d" (length), "=&a" (tmp)        \
-     : "a" (pTable), "d" (test)						\
-     : "cc");								\
+    /* Process remaining 0...248 bytes in 8byte blocks.  */		\
+    if (length >= 8)							\
+      {									\
+	size_t blocks = length / 8;					\
+	for (int i = 0; i < blocks; i++)				\
+	  {								\
+	    outptr[0] = TABLE[inptr[0]];				\
+	    outptr[1] = TABLE[inptr[1]];				\
+	    outptr[2] = TABLE[inptr[2]];				\
+	    outptr[3] = TABLE[inptr[3]];				\
+	    outptr[4] = TABLE[inptr[4]];				\
+	    outptr[5] = TABLE[inptr[5]];				\
+	    outptr[6] = TABLE[inptr[6]];				\
+	    outptr[7] = TABLE[inptr[7]];				\
+	    inptr += 8;							\
+	    outptr += 8;						\
+	  }								\
+	length = length % 8;						\
+      }									\
  									\
-    inptr = pInput;							\
-    outptr = pOutput;							\
+    /* Process remaining 0...7 bytes.  */				\
+    switch (length)							\
+      {									\
+      case 7: outptr[6] = TABLE[inptr[6]];				\
+      case 6: outptr[5] = TABLE[inptr[5]];				\
+      case 5: outptr[4] = TABLE[inptr[4]];				\
+      case 4: outptr[3] = TABLE[inptr[3]];				\
+      case 3: outptr[2] = TABLE[inptr[2]];				\
+      case 2: outptr[1] = TABLE[inptr[1]];				\
+      case 1: outptr[0] = TABLE[inptr[0]];				\
+      case 0: break;							\
+      }									\
+    inptr += length;							\
+    outptr += length;							\
    }

+
  /* First define the conversion function from ISO 8859-1 to CP037.  */
  #define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
  #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
  #define LOOPFCT			FROM_LOOP
-#define BODY TROO_LOOP (table_iso8859_1_to_cp037)
+#define BODY			TR_LOOP (table_iso8859_1_to_cp037)

  #include <iconv/loop.c>

@@ -228,7 +247,7 @@ __attribute__ ((aligned (8))) =
  #define MIN_NEEDED_INPUT	MIN_NEEDED_TO
  #define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
  #define LOOPFCT			TO_LOOP
-#define BODY TROO_LOOP (table_cp037_iso8859_1);
+#define BODY			TR_LOOP (table_cp037_iso8859_1);

  #include <iconv/loop.c>


>From d489351c09c82994adb872049fcb33bf189f86af Mon Sep 17 00:00:00 2001
From: Stefan Liebler <stli@linux.vnet.ibm.com>
Date: Thu, 21 Apr 2016 12:42:49 +0200
Subject: [PATCH 06/14] S390: Optimize iso-8859-1 to ibm037 iconv-module.

This patch reworks the s390-specific module, which used the z900
translate-one-to-one (troo) instruction. The g5 translate (tr) instruction
is now used instead, because it outperforms the troo instruction.

ChangeLog:

	* sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c (TROO_LOOP):
	Rename to TR_LOOP and use the tr instruction instead of troo.
---
 sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c | 93 +++++++++++++++++-----------
 1 file changed, 56 insertions(+), 37 deletions(-)

diff --git a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
index c59f87f..3b63e6a 100644
--- a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
+++ b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c
@@ -1,7 +1,6 @@
 /* Conversion between ISO 8859-1 and IBM037.
 
-   This module uses the Z900 variant of the Translate One To One
-   instruction.
+   This module uses the translate instruction.
    Copyright (C) 1997-2016 Free Software Foundation, Inc.
 
    Author: Andreas Krebbel  <Andreas.Krebbel@de.ibm.com>
@@ -176,50 +175,70 @@ __attribute__ ((aligned (8))) =
 #define MIN_NEEDED_FROM		1
 #define MIN_NEEDED_TO		1
 
-/* The Z900 variant of troo forces us to always specify a test
-   character which ends the translation.  So if we run into the
-   situation where the translation has been interrupted due to the
-   test character we translate the character by hand and jump back
-   into the instruction.  */
-
-#define TROO_LOOP(TABLE)						\
+#define TR_LOOP(TABLE)							\
   {									\
-    register const unsigned char test __asm__ ("0") = 0;		\
-    register const unsigned char *pTable __asm__ ("1") = TABLE;		\
-    register unsigned char *pOutput __asm__ ("2") = outptr;		\
-    register uint64_t length __asm__ ("3");				\
-    const unsigned char* pInput = inptr;				\
-    uint64_t tmp;							\
-									\
-    length = (inend - inptr < outend - outptr				\
-	      ? inend - inptr : outend - outptr);			\
+    size_t length = (inend - inptr < outend - outptr			\
+		     ? inend - inptr : outend - outptr);		\
 									\
-    __asm__ volatile ("0:                        \n\t"			\
-		      "  troo    %0,%1           \n\t"			\
-		      "  jz      1f              \n\t"			\
-		      "  jo      0b              \n\t"			\
-		      "  llgc    %3,0(%1)        \n\t"			\
-		      "  la      %3,0(%3,%4)     \n\t"			\
-		      "  mvc     0(1,%0),0(%3)   \n\t"			\
-		      "  aghi    %1,1            \n\t"			\
-		      "  aghi    %0,1            \n\t"			\
-		      "  aghi    %2,-1           \n\t"			\
-		      "  j       0b              \n\t"			\
-		      "1:                        \n"			\
+    /* Process in 256 byte blocks.  */					\
+    if (__builtin_expect (length >= 256, 0))				\
+      {									\
+	size_t blocks = length / 256;					\
+	__asm__ __volatile__("0: mvc 0(256,%[R_OUT]),0(%[R_IN])\n\t"	\
+			     "   tr 0(256,%[R_OUT]),0(%[R_TBL])\n\t"	\
+			     "   la %[R_IN],256(%[R_IN])\n\t"		\
+			     "   la %[R_OUT],256(%[R_OUT])\n\t"		\
+			     "   brctg %[R_LI],0b\n\t"			\
+			     : /* outputs */ [R_IN] "+a" (inptr)	\
+			       , [R_OUT] "+a" (outptr), [R_LI] "+d" (blocks) \
+			     : /* inputs */ [R_TBL] "a" (TABLE)		\
+			     : /* clobber list */ "memory"		\
+			     );						\
+	length = length % 256;						\
+      }									\
 									\
-     : "+a" (pOutput), "+a" (pInput), "+d" (length), "=&a" (tmp)        \
-     : "a" (pTable), "d" (test)						\
-     : "cc");								\
+    /* Process remaining 0...248 bytes in 8byte blocks.  */		\
+    if (length >= 8)							\
+      {									\
+	size_t blocks = length / 8;					\
+	for (int i = 0; i < blocks; i++)				\
+	  {								\
+	    outptr[0] = TABLE[inptr[0]];				\
+	    outptr[1] = TABLE[inptr[1]];				\
+	    outptr[2] = TABLE[inptr[2]];				\
+	    outptr[3] = TABLE[inptr[3]];				\
+	    outptr[4] = TABLE[inptr[4]];				\
+	    outptr[5] = TABLE[inptr[5]];				\
+	    outptr[6] = TABLE[inptr[6]];				\
+	    outptr[7] = TABLE[inptr[7]];				\
+	    inptr += 8;							\
+	    outptr += 8;						\
+	  }								\
+	length = length % 8;						\
+      }									\
 									\
-    inptr = pInput;							\
-    outptr = pOutput;							\
+    /* Process remaining 0...7 bytes.  */				\
+    switch (length)							\
+      {									\
+      case 7: outptr[6] = TABLE[inptr[6]];				\
+      case 6: outptr[5] = TABLE[inptr[5]];				\
+      case 5: outptr[4] = TABLE[inptr[4]];				\
+      case 4: outptr[3] = TABLE[inptr[3]];				\
+      case 3: outptr[2] = TABLE[inptr[2]];				\
+      case 2: outptr[1] = TABLE[inptr[1]];				\
+      case 1: outptr[0] = TABLE[inptr[0]];				\
+      case 0: break;							\
+      }									\
+    inptr += length;							\
+    outptr += length;							\
   }
 
+
 /* First define the conversion function from ISO 8859-1 to CP037.  */
 #define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
 #define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
 #define LOOPFCT			FROM_LOOP
-#define BODY TROO_LOOP (table_iso8859_1_to_cp037)
+#define BODY			TR_LOOP (table_iso8859_1_to_cp037)
 
 #include <iconv/loop.c>
 
@@ -228,7 +247,7 @@ __attribute__ ((aligned (8))) =
 #define MIN_NEEDED_INPUT	MIN_NEEDED_TO
 #define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
 #define LOOPFCT			TO_LOOP
-#define BODY TROO_LOOP (table_cp037_iso8859_1);
+#define BODY			TR_LOOP (table_cp037_iso8859_1);
 
 #include <iconv/loop.c>
 
-- 
2.5.5


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]