This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
GNU C Library master sources branch ibm/2.18-new/master created. glibc-2.18-81-ge7788aa

From: azanella at sourceware dot org
To: glibc-cvs at sourceware dot org
Date: 15 Nov 2013 18:08:33 -0000
Subject: GNU C Library master sources branch ibm/2.18-new/master created. glibc-2.18-81-ge7788aa
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, ibm/2.18-new/master has been created
        at  e7788aa8c0f54c1586b95fb135f065f3e9d71a24 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e7788aa8c0f54c1586b95fb135f065f3e9d71a24

commit e7788aa8c0f54c1586b95fb135f065f3e9d71a24
Author: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Date:   Fri Nov 15 12:04:30 2013 -0600

    PowerPC64 ELFv2 ABI 6/6: Bump ld.so soname version number
    
    To avoid having an ELFv2 binary accidentally picking up an old ABI ld.so,
    this patch bumps the soname to ld64.so.2.
    
    In theory (or for testing purposes) this will also allow co-installing
    ld.so versions for both ABIs on the same system.  Note that the kernel
    will already be able to load executables of both ABIs.  However, there
    is currently no plan to use that theoretical possibility in any
    supported distribution environment ...
    
    Note that in order to check which ABI to use, we need to invoke the
    compiler to check the _CALL_ELF macro; this is done in a new configure
    check in sysdeps/unix/sysv/linux/powerpc/powerpc64/configure.ac,
    replacing the hard-coded value of default-abi in the Makefile.

diff --git a/ChangeLog b/ChangeLog
index 1632319..3e7c663 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,23 @@
 2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
+
+	* sysdeps/unix/sysv/linux/powerpc/Makefile (abi-variants): Rename
+	"64" to "64-v1".  Add "64-v2".
+	(abi-64-options): Rename to ...
+	(abi-64-v1-options): ... this.   Redefine _CALL_ELF.
+	(abi-64-condition): Rename to ...
+	(abi-64-v1-condition): ... this.  Add _CALL_ELF check.
+	(abi-64-ld-soname): Rename to ...
+	(abi-64-v1-ld-soname): ... this.
+	(abi-64-v2-options): Define.
+	(abi-64-v2-condition): Likewise.
+	(abi-64-v2-ld-soname): Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/ldconfig.h
+	(SYSDEP_KNOWN_INTERPRETER_NAMES): Add "/lib64/ld64.so.2".
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/Makefile: Delete file.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/configure.ac: New file.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/configure: Generate.
+
+2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
 	    Alan Modra  <amodra@gmail.com>
 
 	* sysdeps/powerpc/bits/link.h (La_ppc64v2_regs, La_ppc64v2_retval):
diff --git a/sysdeps/unix/sysv/linux/powerpc/Makefile b/sysdeps/unix/sysv/linux/powerpc/Makefile
index cf4de97..395342f 100644
--- a/sysdeps/unix/sysv/linux/powerpc/Makefile
+++ b/sysdeps/unix/sysv/linux/powerpc/Makefile
@@ -1,9 +1,12 @@
-abi-variants := 32 64
+abi-variants := 32 64-v1 64-v2
 abi-32-options := -U__powerpc64__
 abi-32-condition := __WORDSIZE == 32
-abi-64-options := -D__powerpc64__
-abi-64-condition := __WORDSIZE == 64
-abi-64-ld-soname := ld64.so.1
+abi-64-v1-options := -D__powerpc64__ -U_CALL_ELF -D_CALL_ELF=1
+abi-64-v1-condition := __WORDSIZE == 64 && _CALL_ELF != 2
+abi-64-v1-ld-soname := ld64.so.1
+abi-64-v2-options := -D__powerpc64__ -U_CALL_ELF -D_CALL_ELF=2
+abi-64-v2-condition := __WORDSIZE == 64 && _CALL_ELF == 2
+abi-64-v2-ld-soname := ld64.so.2
 
 ifeq ($(subdir),rt)
 librt-routines += rt-sysdep
diff --git a/sysdeps/unix/sysv/linux/powerpc/ldconfig.h b/sysdeps/unix/sysv/linux/powerpc/ldconfig.h
index 2c5f50d..92ea723 100644
--- a/sysdeps/unix/sysv/linux/powerpc/ldconfig.h
+++ b/sysdeps/unix/sysv/linux/powerpc/ldconfig.h
@@ -20,7 +20,8 @@
 
 #define SYSDEP_KNOWN_INTERPRETER_NAMES \
   { "/lib/ld.so.1", FLAG_ELF_LIBC6 },	\
-  { "/lib64/ld64.so.1", FLAG_ELF_LIBC6 },
+  { "/lib64/ld64.so.1", FLAG_ELF_LIBC6 }, \
+  { "/lib64/ld64.so.2", FLAG_ELF_LIBC6 },
 #define SYSDEP_KNOWN_LIBRARY_NAMES \
   { "libc.so.6", FLAG_ELF_LIBC6 },	\
   { "libm.so.6", FLAG_ELF_LIBC6 },
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/Makefile b/sysdeps/unix/sysv/linux/powerpc/powerpc64/Makefile
deleted file mode 100644
index 3ba3b1f..0000000
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# See Makeconfig regarding the use of default-abi.
-default-abi := 64
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/configure b/sysdeps/unix/sysv/linux/powerpc/powerpc64/configure
new file mode 100644
index 0000000..dc7c1a7
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/configure
@@ -0,0 +1,167 @@
+# This file is generated from configure.ac by Autoconf.  DO NOT EDIT!
+ # Local configure fragment for sysdeps/unix/sysv/linux/powerpc/powerpc64/.
+
+# Define default-abi according to compiler flags.
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
+$as_echo_n "checking for grep that handles long lines and -e... " >&6; }
+if ${ac_cv_path_GREP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -z "$GREP"; then
+  ac_path_GREP_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in grep ggrep; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+      { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue
+# Check for GNU ac_path_GREP and select it if it is found.
+  # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+  ac_count=0
+  $as_echo_n 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    $as_echo 'GREP' >> "conftest.nl"
+    "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_GREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_GREP="$ac_path_GREP"
+      ac_path_GREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_GREP_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_GREP"; then
+    as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+  fi
+else
+  ac_cv_path_GREP=$GREP
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
+$as_echo "$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5
+$as_echo_n "checking for egrep... " >&6; }
+if ${ac_cv_path_EGREP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
+   then ac_cv_path_EGREP="$GREP -E"
+   else
+     if test -z "$EGREP"; then
+  ac_path_EGREP_found=false
+  # Loop through the user's path and test for each of PROGNAME-LIST
+  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_prog in egrep; do
+    for ac_exec_ext in '' $ac_executable_extensions; do
+      ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
+      { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue
+# Check for GNU ac_path_EGREP and select it if it is found.
+  # Check for GNU $ac_path_EGREP
+case `"$ac_path_EGREP" --version 2>&1` in
+*GNU*)
+  ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
+*)
+  ac_count=0
+  $as_echo_n 0123456789 >"conftest.in"
+  while :
+  do
+    cat "conftest.in" "conftest.in" >"conftest.tmp"
+    mv "conftest.tmp" "conftest.in"
+    cp "conftest.in" "conftest.nl"
+    $as_echo 'EGREP' >> "conftest.nl"
+    "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+    diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    if test $ac_count -gt ${ac_path_EGREP_max-0}; then
+      # Best one so far, save it but keep looking for a better one
+      ac_cv_path_EGREP="$ac_path_EGREP"
+      ac_path_EGREP_max=$ac_count
+    fi
+    # 10*(2^10) chars as input seems more than enough
+    test $ac_count -gt 10 && break
+  done
+  rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+      $ac_path_EGREP_found && break 3
+    done
+  done
+  done
+IFS=$as_save_IFS
+  if test -z "$ac_cv_path_EGREP"; then
+    as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+  fi
+else
+  ac_cv_path_EGREP=$EGREP
+fi
+
+   fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5
+$as_echo "$ac_cv_path_EGREP" >&6; }
+ EGREP="$ac_cv_path_EGREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the compiler is using the PowerPC64 ELFv2 ABI" >&5
+$as_echo_n "checking whether the compiler is using the PowerPC64 ELFv2 ABI... " >&6; }
+if ${libc_cv_ppc64_elfv2_abi+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#if _CALL_ELF == 2
+                      yes
+                     #endif
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+  $EGREP "yes" >/dev/null 2>&1; then :
+  libc_cv_ppc64_elfv2_abi=yes
+else
+  libc_cv_ppc64_elfv2_abi=no
+fi
+rm -f conftest*
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_ppc64_elfv2_abi" >&5
+$as_echo "$libc_cv_ppc64_elfv2_abi" >&6; }
+if test $libc_cv_ppc64_elfv2_abi = yes; then
+  config_vars="$config_vars
+default-abi = 64-v2"
+else
+  config_vars="$config_vars
+default-abi = 64-v1"
+fi
+
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/configure.ac b/sysdeps/unix/sysv/linux/powerpc/powerpc64/configure.ac
new file mode 100644
index 0000000..52052d2
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/configure.ac
@@ -0,0 +1,16 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/unix/sysv/linux/powerpc/powerpc64/.
+
+# Define default-abi according to compiler flags.
+AC_CACHE_CHECK([whether the compiler is using the PowerPC64 ELFv2 ABI],
+  [libc_cv_ppc64_elfv2_abi],
+  [AC_EGREP_CPP(yes,[#if _CALL_ELF == 2
+                      yes
+                     #endif
+  ], libc_cv_ppc64_elfv2_abi=yes, libc_cv_ppc64_elfv2_abi=no)])
+if test $libc_cv_ppc64_elfv2_abi = yes; then
+  LIBC_CONFIG_VAR([default-abi], [64-v2])
+else
+  LIBC_CONFIG_VAR([default-abi], [64-v1])
+fi
+

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3e1d1af62ffe085247e84f3d70cfebfbf144f859

commit 3e1d1af62ffe085247e84f3d70cfebfbf144f859
Author: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Date:   Fri Nov 15 12:01:33 2013 -0600

    PowerPC64 ELFv2 ABI 5/6: LD_AUDIT interface changes
    
    The ELFv2 ABI changes the calling convention by passing and returning
    structures in registers in more cases than the old ABI:
    http://gcc.gnu.org/ml/gcc-patches/2013-11/msg01145.html
    http://gcc.gnu.org/ml/gcc-patches/2013-11/msg01147.html
    
    For the most part, this does not affect glibc, since glibc assembler
    files do not use structure parameters / return values.  However, one
    place is affected: the LD_AUDIT interface provides a structure to
    the audit routine that contains all registers holding function
    argument and return values for the intercepted PLT call.
    
    Since the new ABI now sometimes uses registers to return values
    that were never used for this purpose in the old ABI, this structure
    has to be extended.  To force audit routines to be modified for the
    new ABI if necessary, the patch defines v2 variants of the la_ppc64
    types and routines.
    
    In addition, the patch contains two unrelated changes to the
    PLT trampoline routines: it fixes a bug where FPR return values
    were stored in the wrong place, and it removes the unnecessary
    save/restore of CR.

diff --git a/ChangeLog b/ChangeLog
index d3f51a2..1632319 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,27 @@
 2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
+	    Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/powerpc/bits/link.h (La_ppc64v2_regs, La_ppc64v2_retval):
+	New versions for use with the ELFv2 ABI.
+	(la_ppc64v2_gnu_pltenter, la_ppc64v2_gnu_pltexit): Add prototypes.
+	* sysdeps/powerpc/ldsodefs.h (struct La_ppc64v2_regs): Add forward
+	declaration.
+	(struct La_ppc64v2_retval): Likewise.
+	(ARCH_PLTENTER_MEMBERS): Add ppc64v2_gnu_pltenter.
+	(ARCH_PLTEXIT_MEMBERS): Add ppc64v2_gnu_pltexit.
+	* sysdeps/powerpc/powerpc64/dl-machine.h (ARCH_LA_PLTENTER): Define
+	to ppc64v2_gnu_pltenter if _CALL_ELF == 2.
+	(ARCH_LA_PLTEXIT): Define to ppc64v2_gnu_pltexit if _CALL_ELF == 2.
+	* sysdeps/powerpc/powerpc64/dl-trampoline.S (_dl_runtime_resolve):
+	Do not save or restore CR.
+	(FRAME_SIZE, VR_RTN): Provide updated values for _CALL_ELF == 2.
+	(_dl_profile_resolve): Do not save or restore CR.  Support extended
+	return values for ELFv2 ABI.  Fix location of FPR return registers.
+	* sysdeps/powerpc/powerpc64/tst-audit.h (pltenter, pltexit): Provide
+	updated values for _CALL_ELF == 2.
+	(La_regs, La_retval, int_retval): Likewise.
+
+2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/sysdep.h (FRAME_MIN_SIZE): Define.
 	(FRAME_MIN_SIZE_PARM): Likewise.
diff --git a/sysdeps/powerpc/bits/link.h b/sysdeps/powerpc/bits/link.h
index f06092f..2f1da8b 100644
--- a/sysdeps/powerpc/bits/link.h
+++ b/sysdeps/powerpc/bits/link.h
@@ -63,7 +63,7 @@ extern unsigned int la_ppc32_gnu_pltexit (Elf32_Sym *__sym,
 
 __END_DECLS
 
-#else
+#elif _CALL_ELF != 2
 
 /* Registers for entry into PLT on PPC64.  */
 typedef struct La_ppc64_regs
@@ -107,4 +107,48 @@ extern unsigned int la_ppc64_gnu_pltexit (Elf64_Sym *__sym,
 
 __END_DECLS
 
+#else
+
+/* Registers for entry into PLT on PPC64 in the ELFv2 ABI.  */
+typedef struct La_ppc64v2_regs
+{
+  uint64_t lr_reg[8];
+  double lr_fp[13];
+  uint32_t __padding;
+  uint32_t lr_vrsave;
+  uint32_t lr_vreg[12][4] __attribute__ ((aligned (16)));
+  uint64_t lr_r1;
+  uint64_t lr_lr;
+} La_ppc64v2_regs;
+
+/* Return values for calls from PLT on PPC64 in the ELFv2 ABI.  */
+typedef struct La_ppc64v2_retval
+{
+  uint64_t lrv_r3;
+  uint64_t lrv_r4;
+  double lrv_fp[10];
+  uint32_t lrv_vreg[8][4] __attribute__ ((aligned (16)));
+} La_ppc64v2_retval;
+
+
+__BEGIN_DECLS
+
+extern Elf64_Addr la_ppc64v2_gnu_pltenter (Elf64_Sym *__sym,
+					   unsigned int __ndx,
+					   uintptr_t *__refcook,
+					   uintptr_t *__defcook,
+					   La_ppc64v2_regs *__regs,
+					   unsigned int *__flags,
+					   const char *__symname,
+					   long int *__framesizep);
+extern unsigned int la_ppc64v2_gnu_pltexit (Elf64_Sym *__sym,
+					    unsigned int __ndx,
+					    uintptr_t *__refcook,
+					    uintptr_t *__defcook,
+					    const La_ppc64v2_regs *__inregs,
+					    La_ppc64v2_retval *__outregs,
+					    const char *__symname);
+
+__END_DECLS
+
 #endif
diff --git a/sysdeps/powerpc/ldsodefs.h b/sysdeps/powerpc/ldsodefs.h
index ef849e9..435821c 100644
--- a/sysdeps/powerpc/ldsodefs.h
+++ b/sysdeps/powerpc/ldsodefs.h
@@ -25,6 +25,8 @@ struct La_ppc32_regs;
 struct La_ppc32_retval;
 struct La_ppc64_regs;
 struct La_ppc64_retval;
+struct La_ppc64v2_regs;
+struct La_ppc64v2_retval;
 
 #define ARCH_PLTENTER_MEMBERS						\
     Elf32_Addr (*ppc32_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \
@@ -34,7 +36,12 @@ struct La_ppc64_retval;
     Elf64_Addr (*ppc64_gnu_pltenter) (Elf64_Sym *, unsigned int, uintptr_t *, \
 				      uintptr_t *, struct La_ppc64_regs *, \
 				      unsigned int *, const char *name,	\
-				      long int *framesizep)
+				      long int *framesizep);		\
+    Elf64_Addr (*ppc64v2_gnu_pltenter) (Elf64_Sym *, unsigned int,	\
+					uintptr_t *,  uintptr_t *,	\
+					struct La_ppc64v2_regs *,	\
+					unsigned int *, const char *name, \
+					long int *framesizep)
 
 #define ARCH_PLTEXIT_MEMBERS						\
     unsigned int (*ppc32_gnu_pltexit) (Elf32_Sym *, unsigned int,	\
@@ -47,7 +54,14 @@ struct La_ppc64_retval;
 				       uintptr_t *,			\
 				       uintptr_t *,			\
 				       const struct La_ppc64_regs *,	\
-				       struct La_ppc64_retval *, const char *)
+				       struct La_ppc64_retval *,	\
+				       const char *);			\
+    unsigned int (*ppc64v2_gnu_pltexit) (Elf64_Sym *, unsigned int,	\
+					 uintptr_t *,			\
+					 uintptr_t *,			\
+					 const struct La_ppc64v2_regs *,\
+					 struct La_ppc64v2_retval *,	\
+					 const char *)
 
 #include_next <ldsodefs.h>
 
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index eccfbb3..36f3916 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -545,8 +545,13 @@ elf_machine_plt_value (struct link_map *map, const Elf64_Rela *reloc,
 
 
 /* Names of the architecture-specific auditing callback functions.  */
+#if _CALL_ELF != 2
 #define ARCH_LA_PLTENTER ppc64_gnu_pltenter
 #define ARCH_LA_PLTEXIT ppc64_gnu_pltexit
+#else
+#define ARCH_LA_PLTENTER ppc64v2_gnu_pltenter
+#define ARCH_LA_PLTEXIT ppc64v2_gnu_pltexit
+#endif
 
 #endif /* dl_machine_h */
 
diff --git a/sysdeps/powerpc/powerpc64/dl-trampoline.S b/sysdeps/powerpc/powerpc64/dl-trampoline.S
index 18c8a3a..69ce523 100644
--- a/sysdeps/powerpc/powerpc64/dl-trampoline.S
+++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S
@@ -50,11 +50,8 @@ EALIGN(_dl_runtime_resolve, 4, 0)
 /* Store the LR in the LR Save area.  */
 	std	r0,FRAME_SIZE+FRAME_LR_SAVE(r1)
 	cfi_offset (lr, FRAME_LR_SAVE)
-	mfcr	r0
 	std	r9,INT_PARMS+48(r1)
 	std	r10,INT_PARMS+56(r1)
-/* I'm almost certain we don't have to save cr...  be safe.  */
-	std	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	bl	JUMPTARGET(_dl_fixup)
 #ifndef SHARED
 	nop
@@ -66,11 +63,9 @@ EALIGN(_dl_runtime_resolve, 4, 0)
 	ld	r8,INT_PARMS+40(r1)
 	ld	r7,INT_PARMS+32(r1)
 	mtlr	r0
-	ld	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	ld	r6,INT_PARMS+24(r1)
 	ld	r5,INT_PARMS+16(r1)
 	ld	r4,INT_PARMS+8(r1)
-	mtcrf	0xFF,r0
 /* Prepare for calling the function returned by fixup.  */
 	PPC64_LOAD_FUNCPTR r3
 	ld	r3,INT_PARMS+0(r1)
@@ -85,18 +80,30 @@ END(_dl_runtime_resolve)
 #undef FRAME_SIZE
 #undef INT_PARMS
 
-	/* Stack layout:
-	   (Note: some of these are not required for the ELFv2 ABI.)
-	  +592   previous backchain
-	  +584   spill_r31
-	  +576   spill_r30
-	  +560   v1
-	  +552   fp4
-	  +544   fp3
-	  +536   fp2
-	  +528   fp1
-	  +520   r4
-	  +512   r3
+	/* Stack layout:		ELFv2 ABI.
+					+752   previous backchain
+					+744   spill_r31
+					+736   spill_r30
+					+720   v8
+					+704   v7
+					+688   v6
+					+672   v5
+					+656   v4
+					+640   v3
+					+624   v2
+					+608   v1
+					+600   fp10
+	  ELFv1 ABI			+592   fp9
+	  +592   previous backchain	+584   fp8
+	  +584   spill_r31		+576   fp7
+	  +576   spill_r30		+568   fp6
+	  +560   v1			+560   fp5
+	  +552   fp4			+552   fp4
+	  +544   fp3			+544   fp3
+	  +536   fp2			+536   fp2
+	  +528   fp1			+528   fp1
+	  +520   r4			+520   r4
+	  +512   r3			+512   r3
 	   return values
           +504   free
 	  +496   stackframe
@@ -157,10 +164,15 @@ END(_dl_runtime_resolve)
 	  +8     CR save area
 	r1+0     stack back chain
 	*/
-#define FRAME_SIZE 592
+#if _CALL_ELF == 2
+# define FRAME_SIZE 752
+# define VR_RTN 608
+#else
+# define FRAME_SIZE 592
+# define VR_RTN 560
+#endif
 #define INT_RTN 512
 #define FPR_RTN 528
-#define VR_RTN 560
 #define STACK_FRAME 496
 #define CALLING_LR 488
 #define CALLING_SP 480
@@ -205,18 +217,14 @@ EALIGN(_dl_profile_resolve, 4, 0)
 	mflr	r5
 	std	r7,INT_PARMS+32(r1)
 	std	r8,INT_PARMS+40(r1)
-/* Store the LR in the LR Save area of the previous frame.  */
-/* XXX Do we have to do this?  */
+/* Store the LR in the LR Save area.  */
 	la	r8,FRAME_SIZE(r1)
 	std	r5,FRAME_SIZE+FRAME_LR_SAVE(r1)
 	cfi_offset (lr, FRAME_LR_SAVE)
 	std	r5,CALLING_LR(r1)
-	mfcr	r0
 	std	r9,INT_PARMS+48(r1)
 	std	r10,INT_PARMS+56(r1)
 	std	r8,CALLING_SP(r1)
-/* I'm almost certain we don't have to save cr...  be safe.  */
-	std	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	ld	r12,.LC__dl_hwcap@toc(r2)
 #ifdef SHARED
 	/* Load _rtld_local_ro._dl_hwcap.  */
@@ -319,11 +327,9 @@ L(restoreFXR):
 	ld	r8,INT_PARMS+40(r1)
 	ld	r7,INT_PARMS+32(r1)
 	mtlr	r0
-	ld	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	ld	r6,INT_PARMS+24(r1)
 	ld	r5,INT_PARMS+16(r1)
 	ld	r4,INT_PARMS+8(r1)
-	mtcrf	0xFF,r0
 /* Prepare for calling the function returned by fixup.  */
 	PPC64_LOAD_FUNCPTR r3
 	ld	r3,INT_PARMS+0(r1)
@@ -346,10 +352,11 @@ L(restoreFXR):
 	lfd	fp12,FPR_PARMS+88(r1)
 	lfd	fp13,FPR_PARMS+96(r1)
 /* Unwind the stack frame, and jump.  */
-	ld	r31,584(r1)
-	ld	r30,576(r1)
+	ld	r31,FRAME_SIZE-8(r1)
+	ld	r30,FRAME_SIZE-16(r1)
 	addi	r1,r1,FRAME_SIZE
 	bctr
+
 L(do_pltexit):
 	la	r10,(VR_PARMS+0)(r1)
 	la	r9,(VR_PARMS+16)(r1)
@@ -383,11 +390,9 @@ L(restoreFXR2):
 	ld	r8,INT_PARMS+40(r1)
 	ld	r7,INT_PARMS+32(r1)
 	mtlr	r0
-	ld	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	ld	r6,INT_PARMS+24(r1)
 	ld	r5,INT_PARMS+16(r1)
 	ld	r4,INT_PARMS+8(r1)
-	mtcrf	0xFF,r0
 /* Prepare for calling the function returned by fixup.  */
 	std	r2,FRAME_TOC_SAVE(r1)
 	PPC64_LOAD_FUNCPTR r3
@@ -413,16 +418,37 @@ L(restoreFXR2):
 /* But return here and store the return values.  */
 	std	r3,INT_RTN(r1)
 	std	r4,INT_RTN+8(r1)
-	stfd	fp1,FPR_PARMS+0(r1)
-	stfd	fp2,FPR_PARMS+8(r1)
+	stfd	fp1,FPR_RTN+0(r1)
+	stfd	fp2,FPR_RTN+8(r1)
 	cmpdi	cr0,r12,0
 	la	r10,VR_RTN(r1)
-	stfd	fp3,FPR_PARMS+16(r1)
-	stfd	fp4,FPR_PARMS+24(r1)
+	stfd	fp3,FPR_RTN+16(r1)
+	stfd	fp4,FPR_RTN+24(r1)
+#if _CALL_ELF == 2
+	la	r12,VR_RTN+16(r1)
+	stfd	fp5,FPR_RTN+32(r1)
+	stfd	fp6,FPR_RTN+40(r1)
+	li	r5,32
+	li	r6,64
+	stfd	fp7,FPR_RTN+48(r1)
+	stfd	fp8,FPR_RTN+56(r1)
+	stfd	fp9,FPR_RTN+64(r1)
+	stfd	fp10,FPR_RTN+72(r1)
+#endif
 	mr	r3,r31
 	mr	r4,r30
 	beq	L(callpltexit)
 	stvx	v2,0,r10
+#if _CALL_ELF == 2
+	stvx	v3,0,r12
+	stvx	v4,r5,r10
+	stvx	v5,r5,r12
+	addi	r5,r5,64
+	stvx	v6,r6,r10
+	stvx	v7,r6,r12
+	stvx	v8,r5,r10
+	stvx	v9,r5,r12
+#endif
 L(callpltexit):
 	addi	r5,r1,INT_PARMS
 	addi	r6,r1,INT_RTN
@@ -434,18 +460,39 @@ L(callpltexit):
 	lwz	r12,VR_VRSAVE(r1)
 	ld	r3,INT_RTN(r1)
 	ld	r4,INT_RTN+8(r1)
-	lfd	fp1,FPR_PARMS+0(r1)
-	lfd	fp2,FPR_PARMS+8(r1)
+	lfd	fp1,FPR_RTN+0(r1)
+	lfd	fp2,FPR_RTN+8(r1)
 	cmpdi	cr0,r12,0
-	la	r10,VR_RTN(r1)
-	lfd	fp3,FPR_PARMS+16(r1)
-	lfd	fp4,FPR_PARMS+24(r1)
+	la	r11,VR_RTN(r1)
+	lfd	fp3,FPR_RTN+16(r1)
+	lfd	fp4,FPR_RTN+24(r1)
+#if _CALL_ELF == 2
+	la	r12,VR_RTN+16(r1)
+	lfd	fp5,FPR_RTN+32(r1)
+	lfd	fp6,FPR_RTN+40(r1)
+	li	r30,32
+	li	r31,64
+	lfd	fp7,FPR_RTN+48(r1)
+	lfd	fp8,FPR_RTN+56(r1)
+	lfd	fp9,FPR_RTN+64(r1)
+	lfd	fp10,FPR_RTN+72(r1)
+#endif
 	beq	L(pltexitreturn)
-	lvx	v2,0,r10
+	lvx	v2,0,r11
+#if _CALL_ELF == 2
+	lvx	v3,0,r12
+	lvx	v4,r30,r11
+	lvx	v5,r30,r12
+	addi	r30,r30,64
+	lvx	v6,r31,r11
+	lvx	v7,r31,r12
+	lvx	v8,r30,r11
+	lvx	v9,r30,r12
+#endif
 L(pltexitreturn):
 	ld	r0,FRAME_SIZE+FRAME_LR_SAVE(r1)
-	ld	r31,584(r1)
-	ld	r30,576(r1)
+	ld	r31,FRAME_SIZE-8(r1)
+	ld	r30,FRAME_SIZE-16(r1)
 	mtlr	r0
 	ld	r1,0(r1)
 	blr
diff --git a/sysdeps/powerpc/powerpc64/tst-audit.h b/sysdeps/powerpc/powerpc64/tst-audit.h
index ad6545e..0fbe1fe 100644
--- a/sysdeps/powerpc/powerpc64/tst-audit.h
+++ b/sysdeps/powerpc/powerpc64/tst-audit.h
@@ -18,8 +18,16 @@
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
+#if _CALL_ELF != 2
 #define pltenter la_ppc64_gnu_pltenter
 #define pltexit la_ppc64_gnu_pltexit
 #define La_regs La_ppc64_regs
 #define La_retval La_ppc64_retval
 #define int_retval lrv_r3
+#else
+#define pltenter la_ppc64v2_gnu_pltenter
+#define pltexit la_ppc64v2_gnu_pltexit
+#define La_regs La_ppc64v2_regs
+#define La_retval La_ppc64v2_retval
+#define int_retval lrv_r3
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1e9e44f367d335c263ab432af03a66e6f383b021

commit 1e9e44f367d335c263ab432af03a66e6f383b021
Author: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Date:   Fri Nov 15 11:59:39 2013 -0600

    PowerPC64 ELFv2 ABI 4/6: Stack frame layout changes
    
    This updates glibc for the changes in the ELFv2 ABI relating to the
    stack frame layout.  These are described in more detail here:
    http://gcc.gnu.org/ml/gcc-patches/2013-11/msg01149.html
    http://gcc.gnu.org/ml/gcc-patches/2013-11/msg01146.html
    
    Specifically, the "compiler and linker doublewords" were removed,
    which has the effect that the save slot for the TOC register is
    now at offset 24 rather than 40 to the stack pointer.
    
    In addition, a function may now no longer necessarily assume that
    its caller has set up a 64-byte register save area for its use.
    
    To address the first change, the patch goes through all assembler
    files and replaces immediate offsets in instructions accessing the
    ABI-defined stack slots by symbolic offsets.  Those already were
    defined in ucontext_i.sym and used in some of the context routines,
    but that doesn't really seem like the right place for those defines.
    
    The patch instead defines those symbolic offsets in sysdep.h,
    in two variants for the old and new ABI, and uses them systematically
    in all assembler files, not just the context routines.
    
    The second change only affected a few assembler files that used
    the save area to temporarily store some registers.  In those
    cases where this happens within a leaf function, this patch
    changes the code to store those registers to the "red zone"
    below the stack pointer.  Otherwise, the functions already allocate
    a stack frame, and the patch changes them to add extra space in
    these frames as temporary space for the ELFv2 ABI.

diff --git a/ChangeLog b/ChangeLog
index a9496d8..d3f51a2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,54 @@
 2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/sysdep.h (FRAME_MIN_SIZE): Define.
+	(FRAME_MIN_SIZE_PARM): Likewise.
+	(FRAME_BACKCHAIN): Likewise.
+	(FRAME_CR_SAVE): Likewise.
+	(FRAME_LR_SAVE): Likewise.
+	(FRAME_TOC_SAVE): Likewise.
+	(FRAME_PARM_SAVE): Likewise.
+	(FRAME_PARM1_SAVE, FRAME_PARM2_SAVE, FRAME_PARM3_SAVE,
+	FRAME_PARM4_SAVE, FRAME_PARM5_SAVE, FRAME_PARM6_SAVE,
+	FRAME_PARM7_SAVE, FRAME_PARM8_SAVE, FRAME_PARM9_SAVE): Likewise.
+	(call_mcount_parm_offset): New macro.
+	(SAVE_ARG, REST_ARG, CFI_SAVE_ARG): Use it.
+	(PROF): Use symbolic stack frame offsets.
+	(TAIL_CALL_SYSCALL_ERROR): Likewise.
+	* sysdeps/powerpc/powerpc64/dl-trampoline.S (FRAME_SIZE, INT_PARMS):
+	Redefine in terms of FRAME_MIN_SIZE.
+	(_dl_runtime_resolve): Use symbolic stack frame offsets.
+	(_dl_profile_resolve): Likewise.  Update comment.
+	* sysdeps/powerpc/powerpc64/setjmp-common.S (__GI__setjmp): Use
+	symbolic stack frame offsets.
+	(__sigsetjmp): Likewise.
+	* sysdeps/powerpc/powerpc64/__longjmp-common.S (__longjmp): Likewise.
+	* sysdeps/powerpc/powerpc64/ppc-mcount.S (_mcount): Likewise.
+	* sysdeps/powerpc/powerpc64/crti.S (_init, _fini): Likewise.
+	* sysdeps/powerpc/powerpc64/crtn.S (_init, _fini): Likewise.
+
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/ucontext_i.sym
+	(FRAME_BACKCHAIN): Remove.
+	(FRAME_CR_SAVE): Likewise.
+	(FRAME_LR_SAVE): Likewise.
+	(FRAME_COMPILER_DW): Likewise.
+	(FRAME_LINKER_DW): Likewise.
+	(FRAME_TOC_SAVE): Likewise.
+	(FRAME_PARM_SAVE): Likewise.
+	(FRAME_PARM1_SAVE, FRAME_PARM2_SAVE, FRAME_PARM3_SAVE,
+	FRAME_PARM4_SAVE, FRAME_PARM5_SAVE, FRAME_PARM6_SAVE,
+	FRAME_PARM7_SAVE, FRAME_PARM8_SAVE, FRAME_PARM9_SAVE): Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/____longjmp_chk.S
+	(CHECK_SP): Use symbolic stack frame offsets.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/brk.S (__brk): Use "red
+	zone" instead of caller's parameter save area for temp storage.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S (__clone):
+	Likewise.  Also, use symbolic stack frame offsets.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/socket.S (FRAMESIZE,
+	stackblock): Redefine for _CALL_ELF == 2 to save parameters into
+	our own stack frame instead of the caller's.
+	(__socket): Use symbolic stack frame offsets.
+
+2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
 	    Alan Modra  <amodra@gmail.com>
 
 	* elf/elf.h (DT_PPC64_OPT, PPC64_OPT_TLS, PPC64_OPT_MULTI_TOC):
diff --git a/nptl/ChangeLog b/nptl/ChangeLog
index 3ac857a..4c84d1e 100644
--- a/nptl/ChangeLog
+++ b/nptl/ChangeLog
@@ -1,5 +1,18 @@
 2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
 
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep-cancel.h
+	(CANCEL_FRAMESIZE, CANCEL_PARM_SAVE): New macros to save parameters
+	into our own stack frame instead of the caller's.
+	(PSEUDO): Use them.  Use symbolic stack frame offsets.
+	(DOCARGS_1, UNDOCARGS_1): Use CANCEL_PARM_SAVE.
+	(DOCARGS_2, UNDOCARGS_2): Likewise.
+	(DOCARGS_3, UNDOCARGS_3): Likewise.
+	(DOCARGS_4, UNDOCARGS_4): Likewise.
+	(DOCARGS_5, UNDOCARGS_5): Likewise.
+	(DOCARGS_6, UNDOCARGS_6): Likewise.
+
+2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
+
 	* sysdeps/powerpc/tls.h (tcbhead_t): Add __private_ss field.
 
 2013-10-03  Siddhesh Poyarekar  <siddhesh@redhat.com>
diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep-cancel.h b/nptl/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep-cancel.h
index 51e021d..d711dc6 100644
--- a/nptl/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep-cancel.h
+++ b/nptl/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep-cancel.h
@@ -31,6 +31,14 @@
 #  define DASHDASHPFX(str) __##str
 # endif
 
+#if _CALL_ELF == 2
+#define CANCEL_FRAMESIZE (FRAME_MIN_SIZE+16+48)
+#define CANCEL_PARM_SAVE (FRAME_MIN_SIZE+16)
+#else
+#define CANCEL_FRAMESIZE (FRAME_MIN_SIZE+16)
+#define CANCEL_PARM_SAVE (CANCEL_FRAMESIZE+FRAME_PARM_SAVE)
+#endif
+
 # undef PSEUDO
 # define PSEUDO(name, syscall_name, args)				\
   .section ".text";							\
@@ -44,52 +52,52 @@
     PSEUDO_RET;								\
   .size DASHDASHPFX(syscall_name##_nocancel),.-DASHDASHPFX(syscall_name##_nocancel);	\
   .Lpseudo_cancel:							\
-    stdu 1,-128(1);							\
-    cfi_adjust_cfa_offset (128);					\
+    stdu 1,-CANCEL_FRAMESIZE(1);					\
+    cfi_adjust_cfa_offset (CANCEL_FRAMESIZE);				\
     mflr 9;								\
-    std  9,128+16(1);							\
-    cfi_offset (lr, 16);						\
+    std  9,CANCEL_FRAMESIZE+FRAME_LR_SAVE(1);				\
+    cfi_offset (lr, FRAME_LR_SAVE);					\
     DOCARGS_##args;	/* save syscall args around CENABLE.  */	\
     CENABLE;								\
-    std  3,112(1);	/* store CENABLE return value (MASK).  */	\
+    std  3,FRAME_MIN_SIZE(1); /* store CENABLE return value (MASK).  */	\
     UNDOCARGS_##args;	/* restore syscall args.  */			\
     DO_CALL (SYS_ify (syscall_name));					\
     mfcr 0;		/* save CR/R3 around CDISABLE.  */		\
-    std  3,120(1);							\
-    std  0,128+8(1);							\
-    cfi_offset (cr, 8);							\
-    ld   3,112(1);	/* pass MASK to CDISABLE.  */			\
+    std  3,FRAME_MIN_SIZE+8(1);						\
+    std  0,CANCEL_FRAMESIZE+FRAME_CR_SAVE(1);				\
+    cfi_offset (cr, FRAME_CR_SAVE);					\
+    ld   3,FRAME_MIN_SIZE(1); /* pass MASK to CDISABLE.  */		\
     CDISABLE;								\
-    ld   9,128+16(1);							\
-    ld   0,128+8(1);	/* restore CR/R3. */				\
-    ld   3,120(1);							\
+    ld   9,CANCEL_FRAMESIZE+FRAME_LR_SAVE(1);				\
+    ld   0,CANCEL_FRAMESIZE+FRAME_CR_SAVE(1); /* restore CR/R3. */	\
+    ld   3,FRAME_MIN_SIZE+8(1);						\
     mtlr 9;								\
     mtcr 0;								\
-    addi 1,1,128;							\
-    cfi_adjust_cfa_offset (-128);					\
+    addi 1,1,CANCEL_FRAMESIZE;						\
+    cfi_adjust_cfa_offset (-CANCEL_FRAMESIZE);				\
     cfi_restore (lr);							\
     cfi_restore (cr)
 
 # define DOCARGS_0
 # define UNDOCARGS_0
 
-# define DOCARGS_1	std 3,128+48(1); DOCARGS_0
-# define UNDOCARGS_1	ld 3,128+48(1); UNDOCARGS_0
+# define DOCARGS_1	std 3,CANCEL_PARM_SAVE(1); DOCARGS_0
+# define UNDOCARGS_1	ld 3,CANCEL_PARM_SAVE(1); UNDOCARGS_0
 
-# define DOCARGS_2	std 4,128+56(1); DOCARGS_1
-# define UNDOCARGS_2	ld 4,128+56(1); UNDOCARGS_1
+# define DOCARGS_2	std 4,CANCEL_PARM_SAVE+8(1); DOCARGS_1
+# define UNDOCARGS_2	ld 4,CANCEL_PARM_SAVE+8(1); UNDOCARGS_1
 
-# define DOCARGS_3	std 5,128+64(1); DOCARGS_2
-# define UNDOCARGS_3	ld 5,128+64(1); UNDOCARGS_2
+# define DOCARGS_3	std 5,CANCEL_PARM_SAVE+16(1); DOCARGS_2
+# define UNDOCARGS_3	ld 5,CANCEL_PARM_SAVE+16(1); UNDOCARGS_2
 
-# define DOCARGS_4	std 6,128+72(1); DOCARGS_3
-# define UNDOCARGS_4	ld 6,128+72(1); UNDOCARGS_3
+# define DOCARGS_4	std 6,CANCEL_PARM_SAVE+24(1); DOCARGS_3
+# define UNDOCARGS_4	ld 6,CANCEL_PARM_SAVE+24(1); UNDOCARGS_3
 
-# define DOCARGS_5	std 7,128+80(1); DOCARGS_4
-# define UNDOCARGS_5	ld 7,128+80(1); UNDOCARGS_4
+# define DOCARGS_5	std 7,CANCEL_PARM_SAVE+32(1); DOCARGS_4
+# define UNDOCARGS_5	ld 7,CANCEL_PARM_SAVE+32(1); UNDOCARGS_4
 
-# define DOCARGS_6	std 8,128+88(1); DOCARGS_5
-# define UNDOCARGS_6	ld 8,128+88(1); UNDOCARGS_5
+# define DOCARGS_6	std 8,CANCEL_PARM_SAVE+40(1); DOCARGS_5
+# define UNDOCARGS_6	ld 8,CANCEL_PARM_SAVE+40(1); UNDOCARGS_5
 
 # ifdef IS_IN_libpthread
 #  ifdef SHARED
diff --git a/sysdeps/powerpc/powerpc64/__longjmp-common.S b/sysdeps/powerpc/powerpc64/__longjmp-common.S
index 4f1e3c8..ce5a018 100644
--- a/sysdeps/powerpc/powerpc64/__longjmp-common.S
+++ b/sysdeps/powerpc/powerpc64/__longjmp-common.S
@@ -130,7 +130,7 @@ L(no_vmx):
 	ld r14,((JB_GPRS+0)*8)(r3)
 	lfd fp14,((JB_FPRS+0)*8)(r3)
 #if defined SHARED && !defined IS_IN_rtld
-	std r2,40(r1)	/* Restore the callers TOC save area.  */
+	std r2,FRAME_TOC_SAVE(r1)	/* Restore the callers TOC save area.  */
 #endif
 	ld r15,((JB_GPRS+1)*8)(r3)
 	lfd fp15,((JB_FPRS+1)*8)(r3)
@@ -148,7 +148,7 @@ L(no_vmx):
 	PTR_DEMANGLE2 (r0, r25)
 #endif
 	mtlr r0
-/* 	std r2,40(r1)	Restore the TOC save area.  */
+/* 	std r2,FRAME_TOC_SAVE(r1)	Restore the TOC save area.  */
 	ld r21,((JB_GPRS+7)*8)(r3)
 	lfd fp21,((JB_FPRS+7)*8)(r3)
 	ld r22,((JB_GPRS+8)*8)(r3)
diff --git a/sysdeps/powerpc/powerpc64/crti.S b/sysdeps/powerpc/powerpc64/crti.S
index 7eff7fd..6e1ece8 100644
--- a/sysdeps/powerpc/powerpc64/crti.S
+++ b/sysdeps/powerpc/powerpc64/crti.S
@@ -66,8 +66,8 @@
 BODY_LABEL (_init):
 	LOCALENTRY(_init)
 	mflr 0
-	std 0, 16(r1)
-	stdu r1, -112(r1)
+	std 0, FRAME_LR_SAVE(r1)
+	stdu r1, -FRAME_MIN_SIZE_PARM(r1)
 #if PREINIT_FUNCTION_WEAK
 	addis r9, r2, .LC0@toc@ha
 	ld r0, .LC0@toc@l(r9)
@@ -84,5 +84,5 @@ BODY_LABEL (_init):
 BODY_LABEL (_fini):
 	LOCALENTRY(_fini)
 	mflr 0
-	std 0, 16(r1)
-	stdu r1, -112(r1)
+	std 0, FRAME_LR_SAVE(r1)
+	stdu r1, -FRAME_MIN_SIZE_PARM(r1)
diff --git a/sysdeps/powerpc/powerpc64/crtn.S b/sysdeps/powerpc/powerpc64/crtn.S
index 364e53a..cdd3b0f 100644
--- a/sysdeps/powerpc/powerpc64/crtn.S
+++ b/sysdeps/powerpc/powerpc64/crtn.S
@@ -39,13 +39,13 @@
 #include <sysdep.h>
 
 	.section .init,"ax",@progbits
-	addi r1, r1, 112
-	ld r0, 16(r1)
+	addi r1, r1, FRAME_MIN_SIZE_PARM
+	ld r0, FRAME_LR_SAVE(r1)
 	mtlr r0
 	blr
 
 	.section .fini,"ax",@progbits
-	addi r1, r1, 112
-	ld r0, 16(r1)
+	addi r1, r1, FRAME_MIN_SIZE_PARM
+	ld r0, FRAME_LR_SAVE(r1)
 	mtlr r0
 	blr
diff --git a/sysdeps/powerpc/powerpc64/dl-trampoline.S b/sysdeps/powerpc/powerpc64/dl-trampoline.S
index e31311c..18c8a3a 100644
--- a/sysdeps/powerpc/powerpc64/dl-trampoline.S
+++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S
@@ -26,13 +26,13 @@
    parm1 (r3) and the index (r0) need to be converted to an offset
    (index * 24) in parm2 (r4).  */
 
-#define FRAME_SIZE 176
+#define FRAME_SIZE (FRAME_MIN_SIZE+64)
 /* We need to save the registers used to pass parameters, ie. r3 thru
    r10;  Use local var space rather than the parameter save area,
    because gcc as of 2010/05 doesn't allocate a proper stack frame for
    a function that makes no calls except for __tls_get_addr and we
    might be here resolving the __tls_get_addr call.  */
-#define INT_PARMS 112
+#define INT_PARMS FRAME_MIN_SIZE
 EALIGN(_dl_runtime_resolve, 4, 0)
 	stdu	r1,-FRAME_SIZE(r1)
 	cfi_adjust_cfa_offset (FRAME_SIZE)
@@ -48,25 +48,25 @@ EALIGN(_dl_runtime_resolve, 4, 0)
 	mflr	r0
 	std	r8,INT_PARMS+40(r1)
 /* Store the LR in the LR Save area.  */
-	std	r0,FRAME_SIZE+16(r1)
-	cfi_offset (lr, 16)
+	std	r0,FRAME_SIZE+FRAME_LR_SAVE(r1)
+	cfi_offset (lr, FRAME_LR_SAVE)
 	mfcr	r0
 	std	r9,INT_PARMS+48(r1)
 	std	r10,INT_PARMS+56(r1)
 /* I'm almost certain we don't have to save cr...  be safe.  */
-	std	r0,FRAME_SIZE+8(r1)
+	std	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	bl	JUMPTARGET(_dl_fixup)
 #ifndef SHARED
 	nop
 #endif
 /* Put the registers back.  */
-	ld	r0,FRAME_SIZE+16(r1)
+	ld	r0,FRAME_SIZE+FRAME_LR_SAVE(r1)
 	ld	r10,INT_PARMS+56(r1)
 	ld	r9,INT_PARMS+48(r1)
 	ld	r8,INT_PARMS+40(r1)
 	ld	r7,INT_PARMS+32(r1)
 	mtlr	r0
-	ld	r0,FRAME_SIZE+8(r1)
+	ld	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	ld	r6,INT_PARMS+24(r1)
 	ld	r5,INT_PARMS+16(r1)
 	ld	r4,INT_PARMS+8(r1)
@@ -76,7 +76,7 @@ EALIGN(_dl_runtime_resolve, 4, 0)
 	ld	r3,INT_PARMS+0(r1)
 #if _CALL_ELF == 2
 /* Restore the caller's TOC in case we jump to a local entry point.  */
-	ld	r2,FRAME_SIZE+40(r1)
+	ld	r2,FRAME_SIZE+FRAME_TOC_SAVE(r1)
 #endif
 /* Unwind the stack frame, and jump.  */
 	addi	r1,r1,FRAME_SIZE
@@ -86,6 +86,7 @@ END(_dl_runtime_resolve)
 #undef INT_PARMS
 
 	/* Stack layout:
+	   (Note: some of these are not required for the ELFv2 ABI.)
 	  +592   previous backchain
 	  +584   spill_r31
 	  +576   spill_r30
@@ -147,10 +148,11 @@ END(_dl_runtime_resolve)
 	  +64    parm3
 	  +56    parm2
 	  +48    parm1
-	 * Parameter save area, Allocated by the call, at least 8 double words
-	  +40    TOC save area
-	  +32    Reserved for linker
-	  +24    Reserved for compiler
+	 * Parameter save area
+	 * (v1 ABI: Allocated by the call, at least 8 double words)
+	  +40    v1 ABI: TOC save area
+	  +32    v1 ABI: Reserved for linker
+	  +24    v1 ABI: Reserved for compiler / v2 ABI: TOC save area
 	  +16    LR save area
 	  +8     CR save area
 	r1+0     stack back chain
@@ -206,15 +208,15 @@ EALIGN(_dl_profile_resolve, 4, 0)
 /* Store the LR in the LR Save area of the previous frame.  */
 /* XXX Do we have to do this?  */
 	la	r8,FRAME_SIZE(r1)
-	std	r5,FRAME_SIZE+16(r1)
-	cfi_offset (lr, 16)
+	std	r5,FRAME_SIZE+FRAME_LR_SAVE(r1)
+	cfi_offset (lr, FRAME_LR_SAVE)
 	std	r5,CALLING_LR(r1)
 	mfcr	r0
 	std	r9,INT_PARMS+48(r1)
 	std	r10,INT_PARMS+56(r1)
 	std	r8,CALLING_SP(r1)
 /* I'm almost certain we don't have to save cr...  be safe.  */
-	std	r0,FRAME_SIZE+8(r1)
+	std	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	ld	r12,.LC__dl_hwcap@toc(r2)
 #ifdef SHARED
 	/* Load _rtld_local_ro._dl_hwcap.  */
@@ -311,13 +313,13 @@ L(saveFP):
 	lvx	v12,r11,r10
 	lvx	v13,r11,r9
 L(restoreFXR):
-	ld	r0,FRAME_SIZE+16(r1)
+	ld	r0,FRAME_SIZE+FRAME_LR_SAVE(r1)
 	ld	r10,INT_PARMS+56(r1)
 	ld	r9,INT_PARMS+48(r1)
 	ld	r8,INT_PARMS+40(r1)
 	ld	r7,INT_PARMS+32(r1)
 	mtlr	r0
-	ld	r0,FRAME_SIZE+8(r1)
+	ld	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	ld	r6,INT_PARMS+24(r1)
 	ld	r5,INT_PARMS+16(r1)
 	ld	r4,INT_PARMS+8(r1)
@@ -327,7 +329,7 @@ L(restoreFXR):
 	ld	r3,INT_PARMS+0(r1)
 #if _CALL_ELF == 2
 /* Restore the caller's TOC in case we jump to a local entry point.  */
-	ld	r2,FRAME_SIZE+40(r1)
+	ld	r2,FRAME_SIZE+FRAME_TOC_SAVE(r1)
 #endif
 /* Load the floating point registers.  */
 	lfd	fp1,FPR_PARMS+0(r1)
@@ -375,19 +377,19 @@ L(do_pltexit):
 	lvx	v12,r11,r10
 	lvx	v13,r11,r9
 L(restoreFXR2):
-	ld	r0,FRAME_SIZE+16(r1)
+	ld	r0,FRAME_SIZE+FRAME_LR_SAVE(r1)
 	ld	r10,INT_PARMS+56(r1)
 	ld	r9,INT_PARMS+48(r1)
 	ld	r8,INT_PARMS+40(r1)
 	ld	r7,INT_PARMS+32(r1)
 	mtlr	r0
-	ld	r0,FRAME_SIZE+8(r1)
+	ld	r0,FRAME_SIZE+FRAME_CR_SAVE(r1)
 	ld	r6,INT_PARMS+24(r1)
 	ld	r5,INT_PARMS+16(r1)
 	ld	r4,INT_PARMS+8(r1)
 	mtcrf	0xFF,r0
 /* Prepare for calling the function returned by fixup.  */
-	std	r2,40(r1)
+	std	r2,FRAME_TOC_SAVE(r1)
 	PPC64_LOAD_FUNCPTR r3
 	ld	r3,INT_PARMS+0(r1)
 /* Load the floating point registers.  */
@@ -406,7 +408,7 @@ L(restoreFXR2):
 	lfd	fp13,FPR_PARMS+96(r1)
 /* Call the target function.  */
 	bctrl
-	ld	r2,40(r1)
+	ld	r2,FRAME_TOC_SAVE(r1)
 	lwz	r12,VR_VRSAVE(r1)
 /* But return here and store the return values.  */
 	std	r3,INT_RTN(r1)
@@ -441,7 +443,7 @@ L(callpltexit):
 	beq	L(pltexitreturn)
 	lvx	v2,0,r10
 L(pltexitreturn):
-	ld	r0,FRAME_SIZE+16(r1)
+	ld	r0,FRAME_SIZE+FRAME_LR_SAVE(r1)
 	ld	r31,584(r1)
 	ld	r30,576(r1)
 	mtlr	r0
diff --git a/sysdeps/powerpc/powerpc64/ppc-mcount.S b/sysdeps/powerpc/powerpc64/ppc-mcount.S
index 3d21a70..9824a55 100644
--- a/sysdeps/powerpc/powerpc64/ppc-mcount.S
+++ b/sysdeps/powerpc/powerpc64/ppc-mcount.S
@@ -24,16 +24,16 @@
 ENTRY(_mcount)
 	mflr		 r4
 	ld		 r11, 0(r1)
-	stdu		 r1,-112(r1)
-	cfi_adjust_cfa_offset (112)
-	std		 r4, 128(r1)
-	cfi_offset (lr, 16)
-	ld		 r3, 16(r11)
+	stdu		 r1,-FRAME_MIN_SIZE(r1)
+	cfi_adjust_cfa_offset (FRAME_MIN_SIZE)
+	std		 r4, FRAME_MIN_SIZE+FRAME_LR_SAVE(r1)
+	cfi_offset (lr, FRAME_LR_SAVE)
+	ld		 r3, FRAME_LR_SAVE(r11)
 	bl		 JUMPTARGET(__mcount_internal)
 	nop
-	ld		 r0, 128(r1)
+	ld		 r0, FRAME_MIN_SIZE+FRAME_LR_SAVE(r1)
 	mtlr		 r0
-	addi		 r1,r1,112
+	addi		 r1,r1,FRAME_MIN_SIZE
 	blr
 END(_mcount)
 
diff --git a/sysdeps/powerpc/powerpc64/setjmp-common.S b/sysdeps/powerpc/powerpc64/setjmp-common.S
index db4b349..9e4fb02 100644
--- a/sysdeps/powerpc/powerpc64/setjmp-common.S
+++ b/sysdeps/powerpc/powerpc64/setjmp-common.S
@@ -54,7 +54,7 @@ END (setjmp)
    bugz #269.  __GI__setjmp is used in csu/libc-start.c when
    HAVE_CLEANUP_JMP_BUF is defined.  */
 ENTRY (__GI__setjmp)
-	std r2,40(r1)		/* Save the callers TOC in the save area.  */
+	std r2,FRAME_TOC_SAVE(r1)		/* Save the callers TOC in the save area.  */
 	CALL_MCOUNT 1
 	li r4,0			/* Set second argument to 0.  */
 	b JUMPTARGET (GLUE(__sigsetjmp,_ent))
@@ -80,7 +80,7 @@ JUMPTARGET(GLUE(__sigsetjmp,_ent)):
 #endif
 	mflr r0
 #if defined SHARED && !defined IS_IN_rtld
-	ld   r5,40(r1)	/* Retrieve the callers TOC.  */
+	ld   r5,FRAME_TOC_SAVE(r1)	/* Retrieve the callers TOC.  */
 	std  r5,(JB_GPR2*8)(3)
 #else
 	std  r2,(JB_GPR2*8)(3)
@@ -216,14 +216,14 @@ L(no_vmx):
 	b	JUMPTARGET (__sigjmp_save)
 #else
 	mflr	r0
-	std	r0,16(r1)
-	stdu	r1,-112(r1)
-	cfi_adjust_cfa_offset(112)
-	cfi_offset(lr,16)
+	std	r0,FRAME_LR_SAVE(r1)
+	stdu	r1,-FRAME_MIN_SIZE(r1)
+	cfi_adjust_cfa_offset(FRAME_MIN_SIZE)
+	cfi_offset(lr,FRAME_LR_SAVE)
 	bl	JUMPTARGET (__sigjmp_save)
 	nop
-	ld	r0,112+16(r1)
-	addi	r1,r1,112
+	ld	r0,FRAME_MIN_SIZE+FRAME_LR_SAVE(r1)
+	addi	r1,r1,FRAME_MIN_SIZE
 	mtlr	r0
 	blr
 #endif
diff --git a/sysdeps/powerpc/powerpc64/sysdep.h b/sysdeps/powerpc/powerpc64/sysdep.h
index 779fd90..112e418 100644
--- a/sysdeps/powerpc/powerpc64/sysdep.h
+++ b/sysdeps/powerpc/powerpc64/sysdep.h
@@ -20,25 +20,67 @@
 
 #ifdef __ASSEMBLER__
 
+/* Stack frame offsets.  */
+#if _CALL_ELF != 2
+#define FRAME_MIN_SIZE		112
+#define FRAME_MIN_SIZE_PARM	112
+#define FRAME_BACKCHAIN		0
+#define FRAME_CR_SAVE		8
+#define FRAME_LR_SAVE		16
+#define FRAME_TOC_SAVE		40
+#define FRAME_PARM_SAVE		48
+#define FRAME_PARM1_SAVE	48
+#define FRAME_PARM2_SAVE	56
+#define FRAME_PARM3_SAVE	64
+#define FRAME_PARM4_SAVE	72
+#define FRAME_PARM5_SAVE	80
+#define FRAME_PARM6_SAVE	88
+#define FRAME_PARM7_SAVE	96
+#define FRAME_PARM8_SAVE	104
+#define FRAME_PARM9_SAVE	112
+#else
+#define FRAME_MIN_SIZE		32
+#define FRAME_MIN_SIZE_PARM	96
+#define FRAME_BACKCHAIN		0
+#define FRAME_CR_SAVE		8
+#define FRAME_LR_SAVE		16
+#define FRAME_TOC_SAVE		24
+#define FRAME_PARM_SAVE		32
+#define FRAME_PARM1_SAVE	32
+#define FRAME_PARM2_SAVE	40
+#define FRAME_PARM3_SAVE	48
+#define FRAME_PARM4_SAVE	56
+#define FRAME_PARM5_SAVE	64
+#define FRAME_PARM6_SAVE	72
+#define FRAME_PARM7_SAVE	80
+#define FRAME_PARM8_SAVE	88
+#define FRAME_PARM9_SAVE	96
+#endif
+
 /* Support macros for CALL_MCOUNT.  */
+#if _CALL_ELF == 2
+#define call_mcount_parm_offset (-64)
+#else
+#define call_mcount_parm_offset FRAME_PARM_SAVE
+#endif
 	.macro SAVE_ARG NARG
 	.if \NARG
 	SAVE_ARG \NARG-1
-	std	2+\NARG,40+8*(\NARG)(1)
+	std	2+\NARG,call_mcount_parm_offset-8+8*(\NARG)(1)
 	.endif
 	.endm
 
 	.macro REST_ARG NARG
 	.if \NARG
 	REST_ARG \NARG-1
-	ld	2+\NARG,112+40+8*(\NARG)(1)
+	ld	2+\NARG,FRAME_MIN_SIZE_PARM+call_mcount_parm_offset-8+8*(\NARG)(1)
 	.endif
 	.endm
 
 	.macro CFI_SAVE_ARG NARG
 	.if \NARG
 	CFI_SAVE_ARG \NARG-1
-	cfi_offset(2+\NARG,40+8*(\NARG))
+	cfi_offset(2+\NARG,call_mcount_parm_offset-8+8*(\NARG))
 	.endif
 	.endm
 
@@ -55,20 +97,20 @@
 #ifdef	PROF
 	mflr	r0
 	SAVE_ARG \NARG
-	std	r0,16(r1)
-	stdu	r1,-112(r1)
-	cfi_adjust_cfa_offset(112)
-	cfi_offset(lr,16)
+	std	r0,FRAME_LR_SAVE(r1)
+	stdu	r1,-FRAME_MIN_SIZE_PARM(r1)
+	cfi_adjust_cfa_offset(FRAME_MIN_SIZE_PARM)
+	cfi_offset(lr,FRAME_LR_SAVE)
 	CFI_SAVE_ARG \NARG
 	bl	JUMPTARGET (_mcount)
 #ifndef SHARED
 	nop
 #endif
-	ld	r0,128(r1)
+	ld	r0,FRAME_MIN_SIZE_PARM+FRAME_LR_SAVE(r1)
 	REST_ARG \NARG
 	mtlr	r0
-	addi	r1,r1,112
-	cfi_adjust_cfa_offset(-112)
+	addi	r1,r1,FRAME_MIN_SIZE_PARM
+	cfi_adjust_cfa_offset(-FRAME_MIN_SIZE_PARM)
 	cfi_restore(lr)
 	CFI_REST_ARG \NARG
 #endif
@@ -267,15 +309,15 @@ LT_LABELSUFFIX(name,_name_end): ; \
     .else; \
 .Local_syscall_error: \
     mflr 0; \
-    std 0,16(1); \
-    stdu 1,-112(1); \
-    cfi_adjust_cfa_offset(112); \
-    cfi_offset(lr,16); \
+    std 0,FRAME_LR_SAVE(1); \
+    stdu 1,-FRAME_MIN_SIZE(1); \
+    cfi_adjust_cfa_offset(FRAME_MIN_SIZE); \
+    cfi_offset(lr,FRAME_LR_SAVE); \
     bl JUMPTARGET(__syscall_error); \
     nop; \
-    ld 0,112+16(1); \
-    addi 1,1,112; \
-    cfi_adjust_cfa_offset(-112); \
+    ld 0,FRAME_MIN_SIZE+FRAME_LR_SAVE(1); \
+    addi 1,1,FRAME_MIN_SIZE; \
+    cfi_adjust_cfa_offset(-FRAME_MIN_SIZE); \
     mtlr 0; \
     cfi_restore(lr); \
     blr; \
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/____longjmp_chk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/____longjmp_chk.S
index 270e21e..ae576d6 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/____longjmp_chk.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/____longjmp_chk.S
@@ -33,24 +33,24 @@
 	cmpld	reg, r1;				\
 	bge+	.Lok;					\
 	mflr	r0;					\
-	std	r0,16(r1);				\
+	std	r0,FRAME_LR_SAVE(r1);			\
 	mr	r31,r3;					\
 	mr	r30,r4;					\
-	stdu	r1,-144(r1);				\
+	stdu	r1,-FRAME_MIN_SIZE-32(r1);		\
 	cfi_remember_state;				\
-	cfi_adjust_cfa_offset (144);			\
-	cfi_offset (lr, 16);				\
+	cfi_adjust_cfa_offset (FRAME_MIN_SIZE+32);	\
+	cfi_offset (lr, FRAME_LR_SAVE);			\
 	li	r3,0;					\
-	addi	r4,r1,112;				\
+	addi	r4,r1,FRAME_MIN_SIZE;			\
 	li	r0,__NR_sigaltstack;			\
 	sc;						\
 	/* Without working sigaltstack we cannot perform the test.  */ \
 	bso	.Lok2;					\
-	lwz	r0,112+8(r1);				\
+	lwz	r0,FRAME_MIN_SIZE+8(r1);		\
 	andi.	r4,r0,1;				\
 	beq	.Lfail;					\
-	ld	r0,112+16(r1);				\
-	ld	r4,112(r1);				\
+	ld	r0,FRAME_MIN_SIZE+16(r1);		\
+	ld	r4,FRAME_MIN_SIZE(r1);			\
 	add	r4,r4,r0;				\
 	sub	r3,r3,reg;				\
 	cmpld	r3,r0;					\
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/brk.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/brk.S
index 348aeb5..33cdf25 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/brk.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/brk.S
@@ -28,9 +28,9 @@
 ENTRY (__brk)
 	CALL_MCOUNT 1
 
-	std	r3,48(r1)
+	std	r3,-8(r1)
 	DO_CALL(SYS_ify(brk))
-	ld	r6,48(r1)
+	ld	r6,-8(r1)
 	ld	r5,.LC__curbrk@toc(r2)
 	std     r3,0(r5)
 	cmpld   r6,r3
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S
index 4151d15..37d9d24 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S
@@ -40,22 +40,22 @@ ENTRY (__clone)
 	cror	cr0*4+eq,cr1*4+eq,cr0*4+eq
 	beq-	cr0,L(badargs)
 
-	/* Save some regs in parm save area.  */
+	/* Save some regs in the "red zone".  */
 #ifdef RESET_PID
-	std	r29,48(r1)
+	std	r29,-24(r1)
 #endif
-	std	r30,56(r1)
-	std	r31,64(r1)
+	std	r30,-16(r1)
+	std	r31,-8(r1)
 #ifdef RESET_PID
-	cfi_offset(r29,48)
+	cfi_offset(r29,-24)
 #endif
-	cfi_offset(r30,56)
-	cfi_offset(r31,64)
+	cfi_offset(r30,-16)
+	cfi_offset(r31,-8)
 
 	/* Set up stack frame for child.  */
 	clrrdi	r4,r4,4
 	li	r0,0
-	stdu	r0,-112(r4) /* min stack frame is 112 bytes per ABI */
+	stdu	r0,-FRAME_MIN_SIZE_PARM(r4)
 
 	/* Save fn, args, stack across syscall.  */
 	mr	r30,r3			/* Function in r30.  */
@@ -97,12 +97,12 @@ L(nomoregetpid):
 L(oldpid):
 #endif
 
-	std	r2,40(r1)
+	std	r2,FRAME_TOC_SAVE(r1)
 	/* Call procedure.  */
 	PPC64_LOAD_FUNCPTR r30
 	mr	r3,r31
 	bctrl
-	ld	r2,40(r1)
+	ld	r2,FRAME_TOC_SAVE(r1)
 	/* Call _exit with result from procedure.  */
 #ifdef SHARED
 	b	JUMPTARGET(__GI__exit)
@@ -121,15 +121,15 @@ L(badargs):
 L(parent):
 	/* Parent.  Restore registers & return.  */
 #ifdef RESET_PID
-	cfi_offset(r29,48)
+	cfi_offset(r29,-24)
 #endif
-	cfi_offset(r30,56)
-	cfi_offset(r31,64)
+	cfi_offset(r30,-16)
+	cfi_offset(r31,-8)
 #ifdef RESET_PID
-	ld	r29,48(r1)
+	ld	r29,-24(r1)
 #endif
-	ld	r30,56(r1)
-	ld	r31,64(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
 #ifdef RESET_PID
 	cfi_restore(r29)
 #endif
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/socket.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/socket.S
index 018e55c..aba2d80 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/socket.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/socket.S
@@ -46,8 +46,13 @@
 # endif
 #endif
 
-#define FRAMESIZE 128
-#define stackblock FRAMESIZE+48 /* offset to parm save area.  */
+#if _CALL_ELF == 2
+#define FRAMESIZE (FRAME_MIN_SIZE+16+64)
+#define stackblock (FRAME_MIN_SIZE+16)
+#else
+#define FRAMESIZE (FRAME_MIN_SIZE+16)
+#define stackblock (FRAMESIZE+FRAME_PARM_SAVE) /* offset to parm save area.  */
+#endif
 
 	.text
 ENTRY(__socket)
@@ -98,22 +103,22 @@ ENTRY(__socket)
 .Lsocket_cancel:
 	cfi_adjust_cfa_offset(FRAMESIZE)
 	mflr	r9
-	std	r9,FRAMESIZE+16(r1)
-	cfi_offset (lr, 16)
+	std	r9,FRAMESIZE+FRAME_LR_SAVE(r1)
+	cfi_offset (lr, FRAME_LR_SAVE)
 	CENABLE
-	std	r3,120(r1)
+	std	r3,FRAME_MIN_SIZE+8(r1)
 	li	r3,P(SOCKOP_,socket)
 	addi	r4,r1,stackblock
 	DO_CALL(SYS_ify(socketcall))
 	mfcr	r0
-	std	r3,112(r1)
-	std	r0,FRAMESIZE+8(r1)
-	cfi_offset (cr, 8)
-	ld  	r3,120(r1)
+	std	r3,FRAME_MIN_SIZE(r1)
+	std	r0,FRAMESIZE+FRAME_CR_SAVE(r1)
+	cfi_offset (cr, FRAME_CR_SAVE)
+	ld  	r3,FRAME_MIN_SIZE+8(r1)
 	CDISABLE
-	ld	r4,FRAMESIZE+16(r1)
-	ld	r0,FRAMESIZE+8(r1)
-	ld	r3,112(r1)
+	ld	r4,FRAMESIZE+FRAME_LR_SAVE(r1)
+	ld	r0,FRAMESIZE+FRAME_CR_SAVE(r1)
+	ld	r3,FRAME_MIN_SIZE(r1)
 	mtlr	r4
 	mtcr	r0
 	addi	r1,r1,FRAMESIZE
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/ucontext_i.sym b/sysdeps/unix/sysv/linux/powerpc/powerpc64/ucontext_i.sym
index a35418d..8364e46 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/ucontext_i.sym
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/ucontext_i.sym
@@ -8,27 +8,6 @@ SIG_BLOCK
 SIG_SETMASK
 
 
--- Offsets of the fields in the powerpc64 ABI stack frame.
--- XXX Do these correspond to some struct?
-
-FRAME_BACKCHAIN		0
-FRAME_CR_SAVE		8
-FRAME_LR_SAVE		16
-FRAME_COMPILER_DW	24
-FRAME_LINKER_DW		32
-FRAME_TOC_SAVE		40
-FRAME_PARM_SAVE		48
-FRAME_PARM1_SAVE	48
-FRAME_PARM2_SAVE	56
-FRAME_PARM3_SAVE	64
-FRAME_PARM4_SAVE	72
-FRAME_PARM5_SAVE	80
-FRAME_PARM6_SAVE	88
-FRAME_PARM7_SAVE	96
-FRAME_PARM8_SAVE	104
-FRAME_PARM9_SAVE	112
-
-
 -- Offsets of the fields in the ucontext_t structure.
 #define ucontext(member)	offsetof (ucontext_t, member)
 #define mcontext(member)	ucontext (uc_mcontext.member)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f52b81721f85a00fdd9e69702dc5f64dac9ce460

commit f52b81721f85a00fdd9e69702dc5f64dac9ce460
Author: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Date:   Fri Nov 15 11:56:31 2013 -0600

    PowerPC64 ELFv2 ABI 3/6: PLT local entry point optimization
    
    This is a follow-on to the previous patch to support the ELFv2 ABI in the
    dynamic loader, split off into its own patch since it is just an optional
    optimization.
    
    In the ELFv2 ABI, most functions define both a global and a local entry
    point.  The local entry point requires r2 to be already set up by the
    caller to point to the callee's TOC.  The global entry point does not
    require the caller to know about the callee's TOC, but it does require
    the caller to load r12 with the callee's entry point address.
    
    Now, when setting up a PLT slot, the dynamic linker will usually need
    to enter the target function's global entry point.  However, if the
    linker can prove that the target function is in the same DSO as the
    PLT slot itself, and the whole DSO only uses a single TOC (which the
    linker will let ld.so know via a DT_PPC64_OPT entry), then it is
    possible to actually enter the local entry point address into the
    PLT slot, for a slight improvement in performance.
    
    Note that this uncovered a problem on the first call via _dl_runtime_resolve,
    because that routine neglected to restore the caller's TOC before calling
    the target function for the first time, since it assumed that function
    would always reload its own TOC anyway ...

diff --git a/ChangeLog b/ChangeLog
index 154f379..a9496d8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,23 @@
 2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
 	    Alan Modra  <amodra@gmail.com>
 
+	* elf/elf.h (DT_PPC64_OPT, PPC64_OPT_TLS, PPC64_OPT_MULTI_TOC):
+	Define.
+	(STO_PPC64_LOCAL_BIT, STO_PPC64_LOCAL_MASK,
+	PPC64_LOCAL_ENTRY_OFFSET): Define.
+	* sysdeps/powerpc/powerpc64/dl-machine.h (ppc64_local_entry_offset):
+	New function.
+	(elf_machine_fixup_plt): Call it.
+	(elf_machine_plt_conflict): Likewise.  Add map, sym_map, and
+	reloc arguments.
+	(elf_machine_rela): Update call to elf_machine_plt_conflict.
+	* sysdeps/powerpc/powerpc64/dl-trampoline.S (_dl_runtime_resolve,
+	_dl_profile_resolve) [_CALL_ELF == 2]: Restore caller's TOC into
+	r2 before calling target.
+
+2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
+	    Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc64/sysdep.h [_CALL_ELF == 2]
 	(PPC64_LOAD_FUNCPTR, DOT_LABEL, BODY_LABEL, ENTRY_2, END_2): New
 	versions of macros to support ELFv2 ABI.
diff --git a/elf/elf.h b/elf/elf.h
index 331ad3e..d3fea9d 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -2281,8 +2281,19 @@ typedef Elf32_Addr Elf32_Conflict;
 #define DT_PPC64_GLINK  (DT_LOPROC + 0)
 #define DT_PPC64_OPD	(DT_LOPROC + 1)
 #define DT_PPC64_OPDSZ	(DT_LOPROC + 2)
+#define DT_PPC64_OPT	(DT_LOPROC + 3)
 #define DT_PPC64_NUM    3
 
+/* PowerPC64 specific values for the DT_PPC64_OPT Dyn entry.  */
+#define PPC64_OPT_TLS		1
+#define PPC64_OPT_MULTI_TOC	2
+
+/* PowerPC64 specific values for the Elf64_Sym st_other field.  */
+#define STO_PPC64_LOCAL_BIT	5
+#define STO_PPC64_LOCAL_MASK	(7 << STO_PPC64_LOCAL_BIT)
+#define PPC64_LOCAL_ENTRY_OFFSET(other)				\
+ (((1 << (((other) & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT)) >> 2) << 2)
+
 
 /* ARM specific declarations */
 
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index f222bb0..eccfbb3 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -424,6 +424,42 @@ elf_machine_runtime_setup (struct link_map *map, int lazy, int profile)
   return lazy;
 }
 
+#if _CALL_ELF == 2
+/* If the PLT entry whose reloc is 'reloc' resolves to a function in
+   the same object, return the target function's local entry point
+   offset if usable.  */
+static inline Elf64_Addr __attribute__ ((always_inline))
+ppc64_local_entry_offset (struct link_map *map, lookup_t sym_map,
+			  const Elf64_Rela *reloc)
+{
+  const Elf64_Sym *symtab;
+  const Elf64_Sym *sym;
+
+  /* If the target function is in a different object, we cannot
+     use the local entry point.  */
+  if (sym_map != map)
+    return 0;
+
+  /* If the linker inserted multiple TOCs, we cannot use the
+     local entry point.  */
+  if (map->l_info[DT_PPC64(OPT)]
+      && (map->l_info[DT_PPC64(OPT)]->d_un.d_val & PPC64_OPT_MULTI_TOC))
+    return 0;
+
+  /* Otherwise, we can use the local entry point.  Retrieve its offset
+     from the symbol's ELF st_other field.  */
+  symtab = (const void *) D_PTR (map, l_info[DT_SYMTAB]);
+  sym = &symtab[ELFW(R_SYM) (reloc->r_info)];
+
+  /* If the target function is an ifunc then the local entry offset is
+     for the resolver, not the final destination.  */
+  if (__builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0))
+    return 0;
+
+  return PPC64_LOCAL_ENTRY_OFFSET (sym->st_other);
+}
+#endif
+
 /* Change the PLT entry whose reloc is 'reloc' to call the actual
    routine.  */
 static inline Elf64_Addr __attribute__ ((always_inline))
@@ -470,6 +506,7 @@ elf_machine_fixup_plt (struct link_map *map, lookup_t sym_map,
   PPC_DCBST (&plt->fd_func);
   PPC_ISYNC;
 #else
+  finaladdr += ppc64_local_entry_offset (map, sym_map, reloc);
   *reloc_addr = finaladdr;
 #endif
 
@@ -477,7 +514,9 @@ elf_machine_fixup_plt (struct link_map *map, lookup_t sym_map,
 }
 
 static inline void __attribute__ ((always_inline))
-elf_machine_plt_conflict (Elf64_Addr *reloc_addr, Elf64_Addr finaladdr)
+elf_machine_plt_conflict (struct link_map *map, lookup_t sym_map,
+			  const Elf64_Rela *reloc,
+			  Elf64_Addr *reloc_addr, Elf64_Addr finaladdr)
 {
 #if _CALL_ELF != 2
   Elf64_FuncDesc *plt = (Elf64_FuncDesc *) reloc_addr;
@@ -491,6 +530,7 @@ elf_machine_plt_conflict (Elf64_Addr *reloc_addr, Elf64_Addr finaladdr)
   PPC_DCBST (&plt->fd_toc);
   PPC_SYNC;
 #else
+  finaladdr += ppc64_local_entry_offset (map, sym_map, reloc);
   *reloc_addr = finaladdr;
 #endif
 }
@@ -646,7 +686,7 @@ elf_machine_rela (struct link_map *map,
       /* Fall thru */
     case R_PPC64_JMP_SLOT:
 #ifdef RESOLVE_CONFLICT_FIND_MAP
-      elf_machine_plt_conflict (reloc_addr, value);
+      elf_machine_plt_conflict (map, sym_map, reloc, reloc_addr, value);
 #else
       elf_machine_fixup_plt (map, sym_map, reloc, reloc_addr, value);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/dl-trampoline.S b/sysdeps/powerpc/powerpc64/dl-trampoline.S
index bffc4cb..e31311c 100644
--- a/sysdeps/powerpc/powerpc64/dl-trampoline.S
+++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S
@@ -74,6 +74,10 @@ EALIGN(_dl_runtime_resolve, 4, 0)
 /* Prepare for calling the function returned by fixup.  */
 	PPC64_LOAD_FUNCPTR r3
 	ld	r3,INT_PARMS+0(r1)
+#if _CALL_ELF == 2
+/* Restore the caller's TOC in case we jump to a local entry point.  */
+	ld	r2,FRAME_SIZE+40(r1)
+#endif
 /* Unwind the stack frame, and jump.  */
 	addi	r1,r1,FRAME_SIZE
 	bctr
@@ -321,6 +325,10 @@ L(restoreFXR):
 /* Prepare for calling the function returned by fixup.  */
 	PPC64_LOAD_FUNCPTR r3
 	ld	r3,INT_PARMS+0(r1)
+#if _CALL_ELF == 2
+/* Restore the caller's TOC in case we jump to a local entry point.  */
+	ld	r2,FRAME_SIZE+40(r1)
+#endif
 /* Load the floating point registers.  */
 	lfd	fp1,FPR_PARMS+0(r1)
 	lfd	fp2,FPR_PARMS+8(r1)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fe301b50f108bbf8d899e191ad68b857ddf8147c

commit fe301b50f108bbf8d899e191ad68b857ddf8147c
Author: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Date:   Fri Nov 15 11:54:51 2013 -0600

    PowerPC64 ELFv2 ABI 2/6: Remove function descriptors
    
    This patch adds support for the ELFv2 ABI feature to remove function
    descriptors.  See this GCC patch for in-depth discussion:
    http://gcc.gnu.org/ml/gcc-patches/2013-11/msg01141.html
    
    This mostly involves two types of changes: updating assembler source
    files to the new logic, and updating the dynamic loader.
    
    After the refactoring in the previous patch, most of the assembler source
    changes can be handled simply by providing ELFv2 versions of the
    macros in sysdep.h.   One somewhat non-obvious change is in __GI__setjmp:
    this used to "fall through" to the immediately following __setjmp ENTRY
    point.  This is no longer safe in the ELFv2 ABI since ENTRY defines both
    a global and a local entry point, and you cannot simply fall through
    to a global entry point as it requires r12 to be set up.
    
    Also, makecontext needs to be updated to set up registers according to
    the new ABI for calling into the context's start routine.
    
    The dynamic linker changes mostly consist of removing special code
    to handle function descriptors.  We also need to support the new PLT
    and glink format used by the ELFv2 linker, see:
    https://sourceware.org/ml/binutils/2013-10/msg00376.html
    
    In addition, the dynamic linker now verifies that the dynamic libraries
    it loads match its own ABI.
    
    The hack in VDSO_IFUNC_RET to "synthesize" a function descriptor
    for vDSO routines is also no longer necessary for ELFv2.

diff --git a/ChangeLog b/ChangeLog
index 9bd9834..154f379 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,37 @@
 2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
+	    Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/powerpc/powerpc64/sysdep.h [_CALL_ELF == 2]
+	(PPC64_LOAD_FUNCPTR, DOT_LABEL, BODY_LABEL, ENTRY_2, END_2): New
+	versions of macros to support ELFv2 ABI.
+	(LOCALENTRY): New macro.
+	(ENTRY, EALIGN): Use it.
+	* sysdeps/powerpc/powerpc64/crti.S (_init, _fini): Use LOCALENTRY.
+	* sysdeps/powerpc/powerpc64/setjmp-common.S (__GI__setjmp): Do not
+	fall through into ENTRY entry point.
+	* libc/sysdeps/powerpc/powerpc64/dl-machine.h (Elf64_FuncDesc):
+	Only define if _CALL_ELF != 2.
+
+	(elf_machine_matches_host): Verify ABI version matches.
+	(RTLD_START): Use LOCALENTRY.
+	(elf_machine_type_class): Use SHN_UNDEF PLT handling for ELFv2 ABI.
+	(PLT_INITIAL_ENTRY_WORDS): New version for _CALL_ELF != 2.
+	(PLT_ENTRY_WORDS): New macro.
+	(GLINK_INITIAL_ENTRY_WORDS, GLINK_ENTRY_WORDS): Likewise.
+	(elf_machine_runtime_setup): Support ELFv2 ABI.
+	(elf_machine_fixup_plt): Likewise.
+	(elf_machine_plt_conflict): Likewise.
+	(resolve_ifunc): Likewise.
+	* sysdeps/powerpc/powerpc64/dl-irel.h (elf_irela): Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h (VDSO_IFUNC_RET):
+	Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/ldsodefs.h
+	(DL_ADDR_SYM_MATCH): Only define if _CALL_ELF != 2.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
+	(makecontext): Support ELFv2 ABI.
+	* elf/elf.h (EF_PPC64_ABI): Define.
+
+2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/sysdep.h [ASSEMBLER] (PPC64_LOAD_FUNCPTR):
 	New assembler macro.
diff --git a/elf/elf.h b/elf/elf.h
index 98c722e..331ad3e 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -2271,6 +2271,12 @@ typedef Elf32_Addr Elf32_Conflict;
 #define R_PPC64_REL16_HI	251	/* half16   (sym+add-.)@h */
 #define R_PPC64_REL16_HA	252	/* half16   (sym+add-.)@ha */
 
+/* e_flags bits specifying ABI.
+   1 for original function descriptor using ABI,
+   2 for revised ABI without function descriptors,
+   0 for unspecified or not using any features affected by the differences.  */
+#define EF_PPC64_ABI	3
+
 /* PowerPC64 specific values for the Dyn d_tag field.  */
 #define DT_PPC64_GLINK  (DT_LOPROC + 0)
 #define DT_PPC64_OPD	(DT_LOPROC + 1)
diff --git a/sysdeps/powerpc/powerpc64/crti.S b/sysdeps/powerpc/powerpc64/crti.S
index 116199d..7eff7fd 100644
--- a/sysdeps/powerpc/powerpc64/crti.S
+++ b/sysdeps/powerpc/powerpc64/crti.S
@@ -64,6 +64,7 @@
 	ENTRY_2(_init)
 	.align ALIGNARG (2)
 BODY_LABEL (_init):
+	LOCALENTRY(_init)
 	mflr 0
 	std 0, 16(r1)
 	stdu r1, -112(r1)
@@ -81,6 +82,7 @@ BODY_LABEL (_init):
 	ENTRY_2(_fini)
 	.align ALIGNARG (2)
 BODY_LABEL (_fini):
+	LOCALENTRY(_fini)
 	mflr 0
 	std 0, 16(r1)
 	stdu r1, -112(r1)
diff --git a/sysdeps/powerpc/powerpc64/dl-irel.h b/sysdeps/powerpc/powerpc64/dl-irel.h
index d85c614..a500aa6 100644
--- a/sysdeps/powerpc/powerpc64/dl-irel.h
+++ b/sysdeps/powerpc/powerpc64/dl-irel.h
@@ -50,7 +50,11 @@ elf_irela (const Elf64_Rela *reloc)
     {
       Elf64_Addr *const reloc_addr = (void *) reloc->r_offset;
       Elf64_Addr value = elf_ifunc_invoke(reloc->r_addend);
+#if _CALL_ELF != 2
       *(Elf64_FuncDesc *) reloc_addr = *(Elf64_FuncDesc *) value;
+#else
+      *reloc_addr = value;
+#endif
     }
   else
     __libc_fatal ("unexpected reloc type in static binary");
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index a623be5..f222bb0 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -31,6 +31,7 @@
    in l_info array.  */
 #define DT_PPC64(x) (DT_PPC64_##x - DT_LOPROC + DT_NUM)
 
+#if _CALL_ELF != 2
 /* A PowerPC64 function descriptor.  The .plt (procedure linkage
    table) and .opd (official procedure descriptor) sections are
    arrays of these.  */
@@ -40,6 +41,7 @@ typedef struct
   Elf64_Addr fd_toc;
   Elf64_Addr fd_aux;
 } Elf64_FuncDesc;
+#endif
 
 #define ELF_MULT_MACHINES_SUPPORTED
 
@@ -47,6 +49,18 @@ typedef struct
 static inline int
 elf_machine_matches_host (const Elf64_Ehdr *ehdr)
 {
+  /* Verify that the binary matches our ABI version.  */
+  if ((ehdr->e_flags & EF_PPC64_ABI) != 0)
+    {
+#if _CALL_ELF != 2
+      if ((ehdr->e_flags & EF_PPC64_ABI) != 1)
+        return 0;
+#else
+      if ((ehdr->e_flags & EF_PPC64_ABI) != 2)
+        return 0;
+#endif
+    }
+
   return ehdr->e_machine == EM_PPC64;
 }
 
@@ -124,6 +138,7 @@ elf_machine_dynamic (void)
 "	.align	2\n"							\
 "	" ENTRY_2(_start) "\n"						\
 BODY_PREFIX "_start:\n"							\
+"	" LOCALENTRY(_start) "\n"						\
 /* We start with the following on the stack, from top:			\
    argc (4 bytes);							\
    arguments for program (terminated by NULL);				\
@@ -165,6 +180,7 @@ DL_STARTING_UP_DEF							\
    Changing these is strongly discouraged (not least because argc is	\
    passed by value!).  */						\
 BODY_PREFIX "_dl_start_user:\n"						\
+"	" LOCALENTRY(_dl_start_user) "\n"				\
 /* the address of _start in r30.  */					\
 "	mr	30,3\n"							\
 /* &_dl_argc in 29, &_dl_argv in 27, and _dl_loaded in 28.  */		\
@@ -256,8 +272,22 @@ BODY_PREFIX "_dl_start_user:\n"						\
    relocations behave "normally", ie. always use the real address
    like PLT relocations.  So always set ELF_RTYPE_CLASS_PLT.  */
 
+#if _CALL_ELF != 2
 #define elf_machine_type_class(type) \
   (ELF_RTYPE_CLASS_PLT | (((type) == R_PPC64_COPY) * ELF_RTYPE_CLASS_COPY))
+#else
+/* And now that you have read that large comment, you can disregard it
+   all for ELFv2.  ELFv2 does need the special SHN_UNDEF treatment.  */
+#define IS_PPC64_TLS_RELOC(R)						\
+  (((R) >= R_PPC64_TLS && (R) <= R_PPC64_DTPREL16_HIGHESTA)		\
+   || ((R) >= R_PPC64_TPREL16_HIGH && (R) <= R_PPC64_DTPREL16_HIGHA))
+
+#define elf_machine_type_class(type) \
+  ((((type) == R_PPC64_JMP_SLOT					\
+     || (type) == R_PPC64_ADDR24				\
+     || IS_PPC64_TLS_RELOC (type)) * ELF_RTYPE_CLASS_PLT)	\
+   | (((type) == R_PPC64_COPY) * ELF_RTYPE_CLASS_COPY))
+#endif
 
 /* A reloc type used for ld.so cmdline arg lookups to reject PLT entries.  */
 #define ELF_MACHINE_JMP_SLOT	R_PPC64_JMP_SLOT
@@ -266,8 +296,19 @@ BODY_PREFIX "_dl_start_user:\n"						\
 #define ELF_MACHINE_NO_REL 1
 
 /* Stuff for the PLT.  */
+#if _CALL_ELF != 2
 #define PLT_INITIAL_ENTRY_WORDS 3
+#define PLT_ENTRY_WORDS 3
+#define GLINK_INITIAL_ENTRY_WORDS 8
+/* The first 32k entries of glink can set an index and branch using two
+   instructions; past that point, glink uses three instructions.  */
+#define GLINK_ENTRY_WORDS(I) (((I) < 0x8000)? 2 : 3)
+#else
+#define PLT_INITIAL_ENTRY_WORDS 2
+#define PLT_ENTRY_WORDS 1
 #define GLINK_INITIAL_ENTRY_WORDS 8
+#define GLINK_ENTRY_WORDS(I) 1
+#endif
 
 #define PPC_DCBST(where) asm volatile ("dcbst 0,%0" : : "r"(where) : "memory")
 #define PPC_DCBT(where) asm volatile ("dcbt 0,%0" : : "r"(where) : "memory")
@@ -312,38 +353,45 @@ elf_machine_runtime_setup (struct link_map *map, int lazy, int profile)
 
       if (lazy)
 	{
-	  /* The function descriptor of the appropriate trampoline
-	     routine is used to set the 1st and 2nd doubleword of the
-	     plt_reserve.  */
-	  Elf64_FuncDesc *resolve_fd;
 	  Elf64_Word glink_offset;
-	  /* the plt_reserve area is the 1st 3 doublewords of the PLT */
-	  Elf64_FuncDesc *plt_reserve = (Elf64_FuncDesc *) plt;
 	  Elf64_Word offset;
+	  Elf64_Addr dlrr;
 
-	  resolve_fd = (Elf64_FuncDesc *) (profile ? _dl_profile_resolve
-					   : _dl_runtime_resolve);
+	  dlrr = (Elf64_Addr) (profile ? _dl_profile_resolve
+				       : _dl_runtime_resolve);
 	  if (profile && GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), map))
 	    /* This is the object we are looking for.  Say that we really
 	       want profiling and the timers are started.  */
 	    GL(dl_profile_map) = map;
 
-
+#if _CALL_ELF != 2
 	  /* We need to stuff the address/TOC of _dl_runtime_resolve
 	     into doublewords 0 and 1 of plt_reserve.  Then we need to
 	     stuff the map address into doubleword 2 of plt_reserve.
 	     This allows the GLINK0 code to transfer control to the
 	     correct trampoline which will transfer control to fixup
 	     in dl-machine.c.  */
-	  plt_reserve->fd_func = resolve_fd->fd_func;
-	  plt_reserve->fd_toc  = resolve_fd->fd_toc;
-	  plt_reserve->fd_aux  = (Elf64_Addr) map;
+	  {
+	    /* The plt_reserve area is the 1st 3 doublewords of the PLT.  */
+	    Elf64_FuncDesc *plt_reserve = (Elf64_FuncDesc *) plt;
+	    Elf64_FuncDesc *resolve_fd = (Elf64_FuncDesc *) dlrr;
+	    plt_reserve->fd_func = resolve_fd->fd_func;
+	    plt_reserve->fd_toc  = resolve_fd->fd_toc;
+	    plt_reserve->fd_aux  = (Elf64_Addr) map;
 #ifdef RTLD_BOOTSTRAP
-	  /* When we're bootstrapping, the opd entry will not have
-	     been relocated yet.  */
-	  plt_reserve->fd_func += l_addr;
-	  plt_reserve->fd_toc  += l_addr;
+	    /* When we're bootstrapping, the opd entry will not have
+	       been relocated yet.  */
+	    plt_reserve->fd_func += l_addr;
+	    plt_reserve->fd_toc  += l_addr;
+#endif
+	  }
+#else
+	  /* When we don't have function descriptors, the first doubleword
+	     of the PLT holds the address of _dl_runtime_resolve, and the
+	     second doubleword holds the map address.  */
+	  plt[0] = dlrr;
+	  plt[1] = (Elf64_Addr) map;
 #endif
 
 	  /* Set up the lazy PLT entries.  */
@@ -354,14 +402,8 @@ elf_machine_runtime_setup (struct link_map *map, int lazy, int profile)
 	    {
 
 	      plt[offset] = (Elf64_Xword) &glink[glink_offset];
-	      offset += 3;
-	      /* The first 32k entries of glink can set an index and
-		 branch using two instructions;  Past that point,
-		 glink uses three instructions.  */
-	      if (i < 0x8000)
-		glink_offset += 2;
-	      else
-		glink_offset += 3;
+	      offset += PLT_ENTRY_WORDS;
+	      glink_offset += GLINK_ENTRY_WORDS (i);
 	    }
 
 	  /* Now, we've modified data.  We need to write the changes from
@@ -389,6 +431,7 @@ elf_machine_fixup_plt (struct link_map *map, lookup_t sym_map,
 		       const Elf64_Rela *reloc,
 		       Elf64_Addr *reloc_addr, Elf64_Addr finaladdr)
 {
+#if _CALL_ELF != 2
   Elf64_FuncDesc *plt = (Elf64_FuncDesc *) reloc_addr;
   Elf64_FuncDesc *rel = (Elf64_FuncDesc *) finaladdr;
   Elf64_Addr offset = 0;
@@ -426,6 +469,9 @@ elf_machine_fixup_plt (struct link_map *map, lookup_t sym_map,
   plt->fd_func = rel->fd_func + offset;
   PPC_DCBST (&plt->fd_func);
   PPC_ISYNC;
+#else
+  *reloc_addr = finaladdr;
+#endif
 
   return finaladdr;
 }
@@ -433,6 +479,7 @@ elf_machine_fixup_plt (struct link_map *map, lookup_t sym_map,
 static inline void __attribute__ ((always_inline))
 elf_machine_plt_conflict (Elf64_Addr *reloc_addr, Elf64_Addr finaladdr)
 {
+#if _CALL_ELF != 2
   Elf64_FuncDesc *plt = (Elf64_FuncDesc *) reloc_addr;
   Elf64_FuncDesc *rel = (Elf64_FuncDesc *) finaladdr;
 
@@ -443,6 +490,9 @@ elf_machine_plt_conflict (Elf64_Addr *reloc_addr, Elf64_Addr finaladdr)
   PPC_DCBST (&plt->fd_aux);
   PPC_DCBST (&plt->fd_toc);
   PPC_SYNC;
+#else
+  *reloc_addr = finaladdr;
+#endif
 }
 
 /* Return the final value of a plt relocation.  */
@@ -512,6 +562,7 @@ auto inline Elf64_Addr __attribute__ ((always_inline))
 resolve_ifunc (Elf64_Addr value,
 	       const struct link_map *map, const struct link_map *sym_map)
 {
+#if _CALL_ELF != 2
 #ifndef RESOLVE_CONFLICT_FIND_MAP
   /* The function we are calling may not yet have its opd entry relocated.  */
   Elf64_FuncDesc opd;
@@ -529,6 +580,7 @@ resolve_ifunc (Elf64_Addr value,
       value = (Elf64_Addr) &opd;
     }
 #endif
+#endif
   return ((Elf64_Addr (*) (unsigned long int)) value) (GLRO(dl_hwcap));
 }
 
diff --git a/sysdeps/powerpc/powerpc64/setjmp-common.S b/sysdeps/powerpc/powerpc64/setjmp-common.S
index 1829b9a..db4b349 100644
--- a/sysdeps/powerpc/powerpc64/setjmp-common.S
+++ b/sysdeps/powerpc/powerpc64/setjmp-common.S
@@ -55,9 +55,10 @@ END (setjmp)
    HAVE_CLEANUP_JMP_BUF is defined.  */
 ENTRY (__GI__setjmp)
 	std r2,40(r1)		/* Save the callers TOC in the save area.  */
-	cfi_endproc
-END_2 (__GI__setjmp)
-/* Fall thru. */
+	CALL_MCOUNT 1
+	li r4,0			/* Set second argument to 0.  */
+	b JUMPTARGET (GLUE(__sigsetjmp,_ent))
+END (__GI__setjmp)
 #endif
 
 ENTRY (_setjmp)
diff --git a/sysdeps/powerpc/powerpc64/sysdep.h b/sysdeps/powerpc/powerpc64/sysdep.h
index cc89b3c..779fd90 100644
--- a/sysdeps/powerpc/powerpc64/sysdep.h
+++ b/sysdeps/powerpc/powerpc64/sysdep.h
@@ -74,6 +74,8 @@
 #endif
 	.endm
 
+#if _CALL_ELF != 2
+
 /* Macro to prepare for calling via a function pointer.  */
 	.macro PPC64_LOAD_FUNCPTR PTR
 	ld      r12,0(\PTR)
@@ -115,13 +117,37 @@ name##: OPD_ENT (name);				\
 	.size name,.-BODY_LABEL(name);		\
 	.size BODY_LABEL(name),.-BODY_LABEL(name);
 #endif
+#define LOCALENTRY(name)
+
+#else /* _CALL_ELF */
+
+/* Macro to prepare for calling via a function pointer.  */
+	.macro PPC64_LOAD_FUNCPTR PTR
+	mr	r12,\PTR
+	mtctr   r12
+	.endm
+
+#define DOT_LABEL(X) X
+#define BODY_LABEL(X) X
+#define ENTRY_2(name)	\
+	.globl name;				\
+	.type name,@function;
+#define END_2(name)	\
+	.size name,.-name;
+#define LOCALENTRY(name)	\
+1:      addis	r2,r12,.TOC.-1b@ha; \
+        addi	r2,r2,.TOC.-1b@l; \
+	.localentry name,.-name;
+
+#endif /* _CALL_ELF */
 
 #define ENTRY(name)	\
 	.section	".text";		\
 	ENTRY_2(name)				\
 	.align ALIGNARG(2);			\
 BODY_LABEL(name):				\
-	cfi_startproc;
+	cfi_startproc;				\
+	LOCALENTRY(name)
 
 #define EALIGN_W_0  /* No words to insert.  */
 #define EALIGN_W_1  nop
@@ -140,7 +166,8 @@ BODY_LABEL(name):				\
 	.align ALIGNARG(alignt);		\
 	EALIGN_W_##words;			\
 BODY_LABEL(name):				\
-	cfi_startproc;
+	cfi_startproc;				\
+	LOCALENTRY(name)
 
 /* Local labels stripped out by the linker.  */
 #undef L
@@ -295,6 +322,8 @@ LT_LABELSUFFIX(name,_name_end): ; \
 
 #else /* !__ASSEMBLER__ */
 
+#if _CALL_ELF != 2
+
 #define PPC64_LOAD_FUNCPTR(ptr) \
 	"ld 	12,0(" #ptr ");\n"					\
 	"ld	2,8(" #ptr ");\n"					\
@@ -335,5 +364,26 @@ LT_LABELSUFFIX(name,_name_end): ; \
 	".size " #name ",.-" BODY_PREFIX #name ";\n"			\
 	".size " BODY_PREFIX #name ",.-" BODY_PREFIX #name ";"
 #endif
+#define LOCALENTRY(name)
+
+#else /* _CALL_ELF */
+
+#define PPC64_LOAD_FUNCPTR(ptr) \
+	"mr	12," #ptr ";\n"						\
+	"mtctr 	12;"
+
+#define DOT_PREFIX ""
+#define BODY_PREFIX ""
+#define ENTRY_2(name)	\
+	".type " #name ",@function;\n"					\
+	".globl " #name ";"
+#define END_2(name)	\
+	".size " #name ",.-" #name ";"
+#define LOCALENTRY(name)	\
+	"1: addis 2,12,.TOC.-1b@ha;\n"					\
+	"addi	2,2,.TOC.-1b@l;\n"					\
+	".localentry " #name ",.-" #name ";"
+
+#endif /* _CALL_ELF */
 
 #endif	/* __ASSEMBLER__ */
diff --git a/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h b/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
index d189169..31dd15e 100644
--- a/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
+++ b/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
@@ -41,7 +41,7 @@ extern void *__vdso_sigtramp32;
 extern void *__vdso_sigtramp_rt32;
 #endif
 
-#if defined(__PPC64__) || defined(__powerpc64__)
+#if (defined(__PPC64__) || defined(__powerpc64__)) && _CALL_ELF != 2
 /* The correct solution is for _dl_vdso_vsym to return the address of the OPD
    for the kernel VDSO function.  That address would then be stored in the
    __vdso_* variables and returned as the result of the IFUNC resolver function.
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/ldsodefs.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/ldsodefs.h
index 4263b1a..d043968 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/ldsodefs.h
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/ldsodefs.h
@@ -23,6 +23,8 @@
 
 /* Now define our stuff.  */
 
+#if _CALL_ELF != 2
+
 static __always_inline bool
 _dl_ppc64_is_opd_sym (const struct link_map *l, const ElfW(Sym) *sym)
 {
@@ -73,4 +75,6 @@ _dl_ppc64_addr_sym_match (const struct link_map *l, const ElfW(Sym) *sym,
 #define DL_ADDR_SYM_MATCH(L, SYM, MATCHSYM, ADDR) \
   _dl_ppc64_addr_sym_match (L, SYM, MATCHSYM, ADDR)
 
+#endif
+
 #endif /* ldsodefs.h */
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
index a7b0a18..b6d82bd 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
@@ -111,6 +111,7 @@ L(parmloop):
 
 L(noparms):
 
+#if _CALL_ELF != 2
   /* Load the function address and TOC from the function descriptor
      and store them in the ucontext as NIP and r2.  Store the 3rd
      field of the function descriptor into the ucontext as r11 in case
@@ -121,6 +122,12 @@ L(noparms):
   std   r0,(SIGCONTEXT_GP_REGS+(PT_NIP*8))(r3)
   std   r10,(SIGCONTEXT_GP_REGS+(PT_R2*8))(r3)
   std   r9,(SIGCONTEXT_GP_REGS+(PT_R11*8))(r3)
+#else
+  /* In the ELFv2 ABI, the function pointer is already the address.
+     Store it as NIP and r12 as required by the ABI.  */
+  std   r4,(SIGCONTEXT_GP_REGS+(PT_NIP*8))(r3)
+  std   r4,(SIGCONTEXT_GP_REGS+(PT_R12*8))(r3)
+#endif
 
   /* If the target function returns we need to do some cleanup.  We use a
      code trick to get the address of our cleanup function into the link

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5446d084fc64c279212616305bbc3c727a6b6d70

commit 5446d084fc64c279212616305bbc3c727a6b6d70
Author: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Date:   Fri Nov 15 11:52:37 2013 -0600

    PowerPC64 ELFv2 ABI 1/6: Code refactoring
    
    This is the first patch to support the new ELFv2 ABI in glibc.
    
    As preparation, this patch simply refactors some of the powerpc64 assembler
    code to move all code related to creating function descriptors (.opd section)
    or using function descriptors (function pointer call) into a central place
    in sysdep.h.
    
    Note that most locations creating .opd entries were already using macros
    in sysdep.h, this patch simply extends this to the remaining places.
    
    No relevant change in generated code expected.

diff --git a/ChangeLog b/ChangeLog
index 1a0a8a1..9bd9834 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/sysdep.h [ASSEMBLER] (PPC64_LOAD_FUNCPTR):
+	New assembler macro.
+	[ASSEMBLER] (ENTRY_1): Do not switch to .text section here ...
+	[ASSEMBLER] (ENTRY): ... but instead here ...
+	[ASSEMBLER] (EALIGN): ... and here.
+	[!ASSEMBLER] (PPC64_LOAD_FUNCPTR): New macro.
+	[!ASSEMBLER] (ENTRY_1): New macro; set up .opd entry.
+	[!ASSEMBLER] (ENTRY_2): Use it.
+	* sysdeps/powerpc/powerpc64/dl-machine.h (RTLD_START): Update for
+	ENTRY_2 changes.  Use PPC64_LOAD_FUNCPTR.
+	* sysdeps/powerpc/powerpc64/dl-trampoline.S (_dl_runtime_resolve,
+	_dl_profile_resolve): Use PPC64_LOAD_FUNCPTR.
+	* sysdeps/powerpc/powerpc64/crti.S (_init, _fini): Use ENTRY_2.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S (clone):
+	Use PPC64_LOAD_FUNCPTR.
+
 2013-11-12  Alan Modra  <amodra@gmail.com>
 
 	* elf/elf.h (R_PPC64_TLSGD, R_PPC64_TLSLD, R_PPC64_TOCSAVE): Define.
diff --git a/sysdeps/powerpc/powerpc64/crti.S b/sysdeps/powerpc/powerpc64/crti.S
index 967dc66..116199d 100644
--- a/sysdeps/powerpc/powerpc64/crti.S
+++ b/sysdeps/powerpc/powerpc64/crti.S
@@ -60,18 +60,8 @@
 .LC0:
 	.tc PREINIT_FUNCTION[TC], PREINIT_FUNCTION
 #endif
-	.type BODY_LABEL (_init), @function
-	.globl _init
-	.section ".opd", "aw"
-	.align 3
-_init:	OPD_ENT (_init)
-#ifdef HAVE_ASM_GLOBAL_DOT_NAME
-	.globl BODY_LABEL (_init)
-	.size _init, 24
-#else
-	.type _init, @function
-#endif
 	.section ".init", "ax", @progbits
+	ENTRY_2(_init)
 	.align ALIGNARG (2)
 BODY_LABEL (_init):
 	mflr 0
@@ -87,18 +77,8 @@ BODY_LABEL (_init):
 	nop
 1:
 
-	.type BODY_LABEL (_fini), @function
-	.globl _fini
-	.section ".opd", "aw"
-	.align 3
-_fini:	OPD_ENT (_fini)
-#ifdef HAVE_ASM_GLOBAL_DOT_NAME
-	.globl BODY_LABEL (_fini)
-	.size _fini, 24
-#else
-	.type _fini, @function
-#endif
 	.section ".fini", "ax", @progbits
+	ENTRY_2(_fini)
 	.align ALIGNARG (2)
 BODY_LABEL (_fini):
 	mflr 0
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index 19fa4fa..a623be5 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -122,14 +122,7 @@ elf_machine_dynamic (void)
 #define RTLD_START \
   asm (".pushsection \".text\"\n"					\
 "	.align	2\n"							\
-"	.type	" BODY_PREFIX "_start,@function\n"			\
-"	.pushsection \".opd\",\"aw\"\n"					\
-"	.align	3\n"							\
-"	.globl	_start\n"						\
 "	" ENTRY_2(_start) "\n"						\
-"_start:\n"								\
-"	" OPD_ENT(_start) "\n"						\
-"	.popsection\n"							\
 BODY_PREFIX "_start:\n"							\
 /* We start with the following on the stack, from top:			\
    argc (4 bytes);							\
@@ -154,11 +147,6 @@ BODY_PREFIX "_start:\n"							\
 ".LT__start_name_end:\n"						\
 "	.align 2\n"							\
 "	" END_2(_start) "\n"						\
-"	.globl	_dl_start_user\n"					\
-"	.pushsection \".opd\",\"aw\"\n"					\
-"_dl_start_user:\n"							\
-"	" OPD_ENT(_dl_start_user) "\n"					\
-"	.popsection\n"							\
 "	.pushsection	\".toc\",\"aw\"\n"				\
 DL_STARTING_UP_DEF							\
 ".LC__rtld_local:\n"							\
@@ -170,7 +158,6 @@ DL_STARTING_UP_DEF							\
 ".LC__dl_fini:\n"							\
 "	.tc _dl_fini[TC],_dl_fini\n"					\
 "	.popsection\n"							\
-"	.type	" BODY_PREFIX "_dl_start_user,@function\n"		\
 "	" ENTRY_2(_dl_start_user) "\n"					\
 /* Now, we do our main work of calling initialisation procedures.	\
    The ELF ABI doesn't say anything about parameters for these,		\
@@ -228,10 +215,7 @@ BODY_PREFIX "_dl_start_user:\n"						\
 /* Now, call the start function descriptor at r30...  */		\
 "	.globl	._dl_main_dispatch\n"					\
 "._dl_main_dispatch:\n"							\
-"	ld	0,0(30)\n"						\
-"	ld	2,8(30)\n"						\
-"	mtctr	0\n"							\
-"	ld	11,16(30)\n"						\
+"	" PPC64_LOAD_FUNCPTR(30) "\n"					\
 "	bctr\n"								\
 ".LT__dl_start_user:\n"							\
 "	.long 0\n"							\
diff --git a/sysdeps/powerpc/powerpc64/dl-trampoline.S b/sysdeps/powerpc/powerpc64/dl-trampoline.S
index 4dde276..bffc4cb 100644
--- a/sysdeps/powerpc/powerpc64/dl-trampoline.S
+++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S
@@ -71,12 +71,8 @@ EALIGN(_dl_runtime_resolve, 4, 0)
 	ld	r5,INT_PARMS+16(r1)
 	ld	r4,INT_PARMS+8(r1)
 	mtcrf	0xFF,r0
-/* Load the target address, toc and static chain reg from the function
-   descriptor returned by fixup.  */
-	ld	r0,0(r3)
-	ld	r2,8(r3)
-	mtctr	r0
-	ld	r11,16(r3)
+/* Prepare for calling the function returned by fixup.  */
+	PPC64_LOAD_FUNCPTR r3
 	ld	r3,INT_PARMS+0(r1)
 /* Unwind the stack frame, and jump.  */
 	addi	r1,r1,FRAME_SIZE
@@ -322,13 +318,9 @@ L(restoreFXR):
 	ld	r5,INT_PARMS+16(r1)
 	ld	r4,INT_PARMS+8(r1)
 	mtcrf	0xFF,r0
-/* Load the target address, toc and static chain reg from the function
-   descriptor returned by fixup.  */
-	ld	r0,0(r3)
-	ld	r2,8(r3)
-	ld	r11,16(r3)
+/* Prepare for calling the function returned by fixup.  */
+	PPC64_LOAD_FUNCPTR r3
 	ld	r3,INT_PARMS+0(r1)
-	mtctr	r0
 /* Load the floating point registers.  */
 	lfd	fp1,FPR_PARMS+0(r1)
 	lfd	fp2,FPR_PARMS+8(r1)
@@ -386,14 +378,10 @@ L(restoreFXR2):
 	ld	r5,INT_PARMS+16(r1)
 	ld	r4,INT_PARMS+8(r1)
 	mtcrf	0xFF,r0
-/* Load the target address, toc and static chain reg from the function
-   descriptor returned by fixup.  */
-	ld	r0,0(r3)
+/* Prepare for calling the function returned by fixup.  */
 	std	r2,40(r1)
-	ld	r2,8(r3)
-	ld	r11,16(r3)
+	PPC64_LOAD_FUNCPTR r3
 	ld	r3,INT_PARMS+0(r1)
-	mtctr	r0
 /* Load the floating point registers.  */
 	lfd	fp1,FPR_PARMS+0(r1)
 	lfd	fp2,FPR_PARMS+8(r1)
diff --git a/sysdeps/powerpc/powerpc64/sysdep.h b/sysdeps/powerpc/powerpc64/sysdep.h
index 57fa8ba..cc89b3c 100644
--- a/sysdeps/powerpc/powerpc64/sysdep.h
+++ b/sysdeps/powerpc/powerpc64/sysdep.h
@@ -74,6 +74,14 @@
 #endif
 	.endm
 
+/* Macro to prepare for calling via a function pointer.  */
+	.macro PPC64_LOAD_FUNCPTR PTR
+	ld      r12,0(\PTR)
+	ld      r2,8(\PTR)
+	mtctr   r12
+	ld      r11,16(\PTR)
+	.endm
+
 #ifdef USE_PPC64_OVERLAPPING_OPD
 # define OPD_ENT(name)	.quad BODY_LABEL (name), .TOC.@tocbase
 #else
@@ -81,7 +89,6 @@
 #endif
 
 #define ENTRY_1(name)	\
-	.section	".text";		\
 	.type BODY_LABEL(name),@function;	\
 	.globl name;				\
 	.section ".opd","aw";			\
@@ -110,6 +117,7 @@ name##: OPD_ENT (name);				\
 #endif
 
 #define ENTRY(name)	\
+	.section	".text";		\
 	ENTRY_2(name)				\
 	.align ALIGNARG(2);			\
 BODY_LABEL(name):				\
@@ -127,6 +135,7 @@ BODY_LABEL(name):				\
 /* EALIGN is like ENTRY, but does alignment to 'words'*4 bytes
    past a 2^alignt boundary.  */
 #define EALIGN(name, alignt, words) \
+	.section	".text";		\
 	ENTRY_2(name)				\
 	.align ALIGNARG(alignt);		\
 	EALIGN_W_##words;			\
@@ -286,24 +295,42 @@ LT_LABELSUFFIX(name,_name_end): ; \
 
 #else /* !__ASSEMBLER__ */
 
+#define PPC64_LOAD_FUNCPTR(ptr) \
+	"ld 	12,0(" #ptr ");\n"					\
+	"ld	2,8(" #ptr ");\n"					\
+	"mtctr	12;\n"							\
+	"ld	11,16(" #ptr ");"
+
 #ifdef USE_PPC64_OVERLAPPING_OPD
 # define OPD_ENT(name)	".quad " BODY_PREFIX #name ", .TOC.@tocbase;"
 #else
 # define OPD_ENT(name)	".quad " BODY_PREFIX #name ", .TOC.@tocbase, 0;"
 #endif
 
+#define ENTRY_1(name)	\
+	".type   " BODY_PREFIX #name ",@function;\n"			\
+	".globl " #name ";\n"						\
+	".pushsection \".opd\",\"aw\";\n"				\
+	".align  3;\n"							\
+#name ":\n"								\
+	OPD_ENT (name) "\n"						\
+	".popsection;"
+
 #ifdef HAVE_ASM_GLOBAL_DOT_NAME
 # define DOT_PREFIX "."
 # define BODY_PREFIX "."
 # define ENTRY_2(name)	\
 	".globl " BODY_PREFIX #name ";\n"				\
+	ENTRY_1(name) "\n"						\
 	".size  " #name ", 24;"
 # define END_2(name)	\
 	".size " BODY_PREFIX #name ",.-" BODY_PREFIX #name ";"
 #else
 # define DOT_PREFIX ""
 # define BODY_PREFIX ".LY"
-# define ENTRY_2(name) ".type " #name ",@function;"
+# define ENTRY_2(name)	\
+	".type " #name ",@function;\n"					\
+	ENTRY_1(name)
 # define END_2(name)	\
 	".size " #name ",.-" BODY_PREFIX #name ";\n"			\
 	".size " BODY_PREFIX #name ",.-" BODY_PREFIX #name ";"
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S
index cf46856..4151d15 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone.S
@@ -99,9 +99,7 @@ L(oldpid):
 
 	std	r2,40(r1)
 	/* Call procedure.  */
-	ld	r0,0(r30)
-	ld	r2,8(r30)
-	mtctr	r0
+	PPC64_LOAD_FUNCPTR r30
 	mr	r3,r31
 	bctrl
 	ld	r2,40(r1)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=47a519a15c82486e7f5a15e0f81a58df301017d9

commit 47a519a15c82486e7f5a15e0f81a58df301017d9
Author: Alan Modra <amodra@gmail.com>
Date:   Fri Nov 15 11:51:18 2013 -0600

    PowerPC64: Report overflow on @h and @ha relocations
    
    This patch updates glibc in accordance with the binutils patch checked
    in here:
    https://sourceware.org/ml/binutils/2013-10/msg00372.html
    
    This changes the various R_PPC64_..._HI and _HA relocations to report
    32-bit overflows.  The motivation is that existing uses of @h / @ha
    are to build up 32-bit offsets (for the "medium model" TOC access
    that GCC now defaults to), and we'd really like to see failures at
    link / load time rather than silent truncations.
    
    For those rare cases where a modifier is needed to build up a 64-bit
    constant, new relocations _HIGH / _HIGHA are supported.
    
    The patch also fixes a bug in overflow checking for the R_PPC64_ADDR30
    and R_PPC64_ADDR32 relocations.

diff --git a/ChangeLog b/ChangeLog
index 7b4c702..1a0a8a1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2013-11-12  Alan Modra  <amodra@gmail.com>
+
+	* elf/elf.h (R_PPC64_TLSGD, R_PPC64_TLSLD, R_PPC64_TOCSAVE): Define.
+	(R_PPC64_ADDR16_HIGH, R_PPC64_ADDR16_HIGHA): Likewise.
+	(R_PPC64_TPREL16_HIGH, R_PPC64_TPREL16_HIGHA): Likewise.
+	(R_PPC64_DTPREL16_HIGH, R_PPC64_DTPREL16_HIGHA): Likewise.
+
+	* sysdeps/powerpc/powerpc64/dl-machine.h (elf_machine_rela): Add
+	overflow checking for R_PPC64_ADDR16_HI, R_PPC64_ADDR16_HA,
+	R_PPC64_TPREL16_HI, and R_PPC64_TPREL16_HA.
+	Support new R_PPC64_ADDR16_HIGH, R_PPC64_ADDR16_HIGHA,
+	R_PPC64_TPREL16_HIGH, and R_PPC64_TPREL16_HIGHA relocations.
+	Fix overflow checking for R_PPC64_ADDR30 and R_PPC64_ADDR32.
+
 2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
 
 	* sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
diff --git a/elf/elf.h b/elf/elf.h
index f372271..98c722e 100644
--- a/elf/elf.h
+++ b/elf/elf.h
@@ -2251,6 +2251,17 @@ typedef Elf32_Addr Elf32_Conflict;
 #define R_PPC64_DTPREL16_HIGHERA 104 /* half16	(sym+add)@dtprel@highera */
 #define R_PPC64_DTPREL16_HIGHEST 105 /* half16	(sym+add)@dtprel@highest */
 #define R_PPC64_DTPREL16_HIGHESTA 106 /* half16	(sym+add)@dtprel@highesta */
+#define R_PPC64_TLSGD		107 /* none	(sym+add)@tlsgd */
+#define R_PPC64_TLSLD		108 /* none	(sym+add)@tlsld */
+#define R_PPC64_TOCSAVE		109 /* none */
+
+/* Added when HA and HI relocs were changed to report overflows.  */
+#define R_PPC64_ADDR16_HIGH	110
+#define R_PPC64_ADDR16_HIGHA	111
+#define R_PPC64_TPREL16_HIGH	112
+#define R_PPC64_TPREL16_HIGHA	113
+#define R_PPC64_DTPREL16_HIGH	114
+#define R_PPC64_DTPREL16_HIGHA	115
 
 /* GNU extension to support local ifunc.  */
 #define R_PPC64_JMP_IREL	247
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index 18cf157..19fa4fa 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -669,11 +669,25 @@ elf_machine_rela (struct link_map *map,
 
     case R_PPC64_TPREL16_HI:
       value = elf_machine_tprel (map, sym_map, sym, reloc);
+      if (dont_expect (value + 0x80000000 >= 0x100000000LL))
+	_dl_reloc_overflow (map, "R_PPC64_TPREL16_HI", reloc_addr, refsym);
+      *(Elf64_Half *) reloc_addr = PPC_HI (value);
+      break;
+
+    case R_PPC64_TPREL16_HIGH:
+      value = elf_machine_tprel (map, sym_map, sym, reloc);
       *(Elf64_Half *) reloc_addr = PPC_HI (value);
       break;
 
     case R_PPC64_TPREL16_HA:
       value = elf_machine_tprel (map, sym_map, sym, reloc);
+      if (dont_expect (value + 0x80008000 >= 0x100000000LL))
+	_dl_reloc_overflow (map, "R_PPC64_TPREL16_HA", reloc_addr, refsym);
+      *(Elf64_Half *) reloc_addr = PPC_HA (value);
+      break;
+
+    case R_PPC64_TPREL16_HIGHA:
+      value = elf_machine_tprel (map, sym_map, sym, reloc);
       *(Elf64_Half *) reloc_addr = PPC_HA (value);
       break;
 
@@ -709,17 +723,23 @@ elf_machine_rela (struct link_map *map,
       break;
 
     case R_PPC64_ADDR16_HI:
+      if (dont_expect (value + 0x80000000 >= 0x100000000LL))
+	_dl_reloc_overflow (map, "R_PPC64_ADDR16_HI", reloc_addr, refsym);
+    case R_PPC64_ADDR16_HIGH:
       *(Elf64_Half *) reloc_addr = PPC_HI (value);
       break;
 
     case R_PPC64_ADDR16_HA:
+      if (dont_expect (value + 0x80008000 >= 0x100000000LL))
+	_dl_reloc_overflow (map, "R_PPC64_ADDR16_HA", reloc_addr, refsym);
+    case R_PPC64_ADDR16_HIGHA:
       *(Elf64_Half *) reloc_addr = PPC_HA (value);
       break;
 
     case R_PPC64_ADDR30:
       {
 	Elf64_Addr delta = value - (Elf64_Xword) reloc_addr;
-	if (dont_expect ((delta + 0x80000000) >= 0x10000000
+	if (dont_expect ((delta + 0x80000000) >= 0x100000000LL
 			 || (delta & 3) != 0))
 	  _dl_reloc_overflow (map, "R_PPC64_ADDR30", reloc_addr, refsym);
 	BIT_INSERT (*(Elf64_Word *) reloc_addr, delta, 0xfffffffc);
@@ -755,7 +775,7 @@ elf_machine_rela (struct link_map *map,
       return;
 
     case R_PPC64_ADDR32:
-      if (dont_expect ((value + 0x80000000) >= 0x10000000))
+      if (dont_expect ((value + 0x80000000) >= 0x100000000LL))
 	_dl_reloc_overflow (map, "R_PPC64_ADDR32", reloc_addr, refsym);
       *(Elf64_Word *) reloc_addr = value;
       return;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f0e592f61ef14f3633a35c9a761525e46c03629a

commit f0e592f61ef14f3633a35c9a761525e46c03629a
Author: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Date:   Fri Nov 15 11:49:21 2013 -0600

    PowerPC64: Fix incorrect CFI in *context routines
    
    The context established by "makecontext" has a link register pointing
    back to an error path within the makecontext routine.  This is currently
    covered by the CFI FDE for makecontext itself, which is simply wrong
    for the stack frame *inside* the context.  When trying to unwind (e.g.
    doing a backtrace) in a routine inside a context created by makecontext,
    this can lead to uninitialized stack slots being accessed, causing the
    unwinder to crash in the worst case.
    
    Similarly, during parts of the "setcontext" routine, when the stack
    pointer has already been switched to point to the new context, the
    address range is still covered by the CFI FDE for setcontext.  When
    trying to unwind in that situation (e.g. backtrace from an async
    signal handler for profiling), it is again possible that the unwinder
    crashes.
    
    These are all problems in existing code, but the changes in stack
    frame layout appear to make the "worst case" much more likely in
    the ELFv2 ABI context.  This causes regressions e.g. in the libgo
    testsuite on ELFv2.
    
    This patch fixes this by ending the makecontext/setcontext FDEs
    before those problematic parts of the assembler, similar to what
    is already done on other platforms.   This fixes the libgo
    regression on ELFv2.

diff --git a/ChangeLog b/ChangeLog
index 4ab2fc4..7b4c702 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
+
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
+	(__makecontext): Fix incorrect CFI when backtracing out of
+	context created via makecontext.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/setcontext.S
+	(__setcontext): Fix incorrect CFI during switch to new context.
+	(__novec_setcontext): Likewise.
+
 2013-11-08  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h (VDSO_IFUNC_RET):
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
index 32fc47c..a7b0a18 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
@@ -129,6 +129,10 @@ L(noparms):
      the cpu link stack used to predict blr return addresses.  */
   bcl	20,31,L(gotexitcodeaddr);
 
+  /* End FDE now, because while executing on the context's stack
+     the unwind info would be wrong otherwise.  */
+  cfi_endproc
+
 	/* This is the helper code which gets called if a function which
 	   is registered with 'makecontext' returns.  In this case we
 	   have to install the context listed in the uc_link element of
@@ -157,6 +161,11 @@ L(do_exit):
 #endif
 	b    L(do_exit)
 
+  /* Re-establish FDE for the rest of the actual makecontext routine.  */
+  cfi_startproc
+  cfi_offset (lr, FRAME_LR_SAVE)
+  cfi_adjust_cfa_offset (128)
+
   /* The address of the exit code is in the link register.  Store the lr
      in the ucontext as LNK so the target function will return to our
      exit code.  */
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/setcontext.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/setcontext.S
index e1f0b86..5ec19ba 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/setcontext.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/setcontext.S
@@ -129,6 +129,10 @@ ENTRY(__novec_setcontext)
   lfd  fp1,(SIGCONTEXT_FP_REGS+(PT_R1*8))(r31)
   lfd  fp0,(SIGCONTEXT_FP_REGS+(PT_R0*8))(r31)
 
+  /* End FDE now, because the unwind info would be wrong while
+     we're reloading registers to switch to the new context.  */
+  cfi_endproc
+
   ld   r0,(SIGCONTEXT_GP_REGS+(PT_LNK*8))(r31)
   ld   r1,(SIGCONTEXT_GP_REGS+(PT_R1*8))(r31)
   mtlr r0
@@ -177,6 +181,11 @@ ENTRY(__novec_setcontext)
   ld   r31,(SIGCONTEXT_GP_REGS+(PT_R31*8))(r31)
   bctr
 
+  /* Re-establish FDE for the rest of the actual setcontext routine.  */
+  cfi_startproc
+  cfi_offset (lr, FRAME_LR_SAVE)
+  cfi_adjust_cfa_offset (128)
+
 L(nv_error_exit):
   ld   r0,128+FRAME_LR_SAVE(r1)
   addi r1,r1,128
@@ -403,6 +412,10 @@ L(has_no_vec):
   lfd  fp1,(SIGCONTEXT_FP_REGS+(PT_R1*8))(r31)
   lfd  fp0,(SIGCONTEXT_FP_REGS+(PT_R0*8))(r31)
 
+  /* End FDE now, because the unwind info would be wrong while
+     we're reloading registers to switch to the new context.  */
+  cfi_endproc
+
   ld   r0,(SIGCONTEXT_GP_REGS+(PT_LNK*8))(r31)
   ld   r1,(SIGCONTEXT_GP_REGS+(PT_R1*8))(r31)
   mtlr r0
@@ -451,6 +464,11 @@ L(has_no_vec):
   ld   r31,(SIGCONTEXT_GP_REGS+(PT_R31*8))(r31)
   bctr
 
+  /* Re-establish FDE for the rest of the actual setcontext routine.  */
+  cfi_startproc
+  cfi_offset (lr, FRAME_LR_SAVE)
+  cfi_adjust_cfa_offset (128)
+
 L(error_exit):
   ld   r0,128+FRAME_LR_SAVE(r1)
   addi r1,r1,128

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f3f9eb461294f1da1f2aa7c39c77e563caa74007

commit f3f9eb461294f1da1f2aa7c39c77e563caa74007
Author: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Date:   Fri Nov 15 11:47:44 2013 -0600

    PowerPC64: Add __private_ss field to TCB header
    
    The TCB header on Intel contains a field __private_ss that is used
    to efficiently implement the -fsplit-stack GCC feature.
    
    In order to prepare for a possible future implementation of that
    feature on powerpc64, we'd like to reserve a similar field in
    the TCB header as well.  (It would be good if this went in with
    or before the ELFv2 patches to ensure that this field will be
    available always in the ELFv2 environment.)
    
    The field needs to be added at the front of tcbhead_t structure
    to avoid changing the ABI; see the recent discussion when adding
    the EBB fields.

diff --git a/nptl/ChangeLog b/nptl/ChangeLog
index 2d78490..3ac857a 100644
--- a/nptl/ChangeLog
+++ b/nptl/ChangeLog
@@ -1,3 +1,7 @@
+2013-11-12  Ulrich Weigand  <Ulrich.Weigand@de.ibm.com>
+
+	* sysdeps/powerpc/tls.h (tcbhead_t): Add __private_ss field.
+
 2013-10-03  Siddhesh Poyarekar  <siddhesh@redhat.com>
 
 	[BZ #15996]
diff --git a/nptl/sysdeps/powerpc/tls.h b/nptl/sysdeps/powerpc/tls.h
index 8e0ada6..c21c027 100644
--- a/nptl/sysdeps/powerpc/tls.h
+++ b/nptl/sysdeps/powerpc/tls.h
@@ -61,6 +61,8 @@ typedef union dtv
    are private.  */
 typedef struct
 {
+  /* GCC split stack support.  */
+  void *__private_ss;
   /* Reservation for the Event-Based Branching ABI.  */
   uintptr_t ebb_handler;
   uintptr_t ebb_ctx_pointer;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=718786a12ee3f566dc1c0898f82d843ef4145c8e

commit 718786a12ee3f566dc1c0898f82d843ef4145c8e
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Thu Nov 7 05:34:22 2013 -0600

    PowerPC: Fix vDSO missing OPD entries
    
    This patch fixes vDSO symbols used directly in IFUNC resolvers where
    they do not have an associated OPD entry, leading to undefined behavior
    in some cases.  It adds an artificial static OPD entry for such cases
    and sets its TOC to a non-zero value to avoid triggering lazy resolution.

diff --git a/ChangeLog b/ChangeLog
index d5f760b..4ab2fc4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2013-11-08  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h (VDSO_IFUNC_RET):
+	Add artificial OPD entry for vDSO symbol for PPC64.
+	* sysdeps/unix/sysv/linux/powerpc/gettimeofday.c: Adjust includes.
+	* sysdeps/unix/sysv/linux/powerpc/time.c: Likewise.
+
 2013-09-25  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/stackguard-macros.h (POINTER_CHK_GUARD:
diff --git a/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h b/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
index ba54de4..d189169 100644
--- a/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
+++ b/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
@@ -41,12 +41,32 @@ extern void *__vdso_sigtramp32;
 extern void *__vdso_sigtramp_rt32;
 #endif
 
-/* This macro is needed for PPC64 to return a skeleton OPD entry of a vDSO
-   symbol.  This works because _dl_vdso_vsym always return the function
-   address, and no vDSO symbols use the TOC or chain pointers from the OPD
-   so we can allow them to be garbage.  */
 #if defined(__PPC64__) || defined(__powerpc64__)
-#define VDSO_IFUNC_RET(value)  ((void *) &(value))
+/* The correct solution is for _dl_vdso_vsym to return the address of the OPD
+   for the kernel VDSO function.  That address would then be stored in the
+   __vdso_* variables and returned as the result of the IFUNC resolver function.
+   Yet, the kernel does not contain any OPD entries for the VDSO functions
+   (incomplete implementation).  However, PLT relocations for IFUNCs still expect
+   the address of an OPD to be returned from the IFUNC resolver function (since
+   PLT entries on PPC64 are just copies of OPDs).  The solution for now is to
+   create an artificial static OPD for each VDSO function returned by a resolver
+   function.  The TOC value is set to a non-zero value to avoid triggering lazy
+   symbol resolution via .glink0/.plt0 for a zero TOC (requires thread-safe PLT
+   sequences) when the dynamic linker isn't prepared for it e.g. RTLD_NOW.  None
+   of the kernel VDSO routines use the TOC or AUX values so any non-zero value
+   will work.  Note that function pointer comparisons will not use this artificial
+   static OPD since those are resolved via ADDR64 relocations and will point at
+   the non-IFUNC default OPD for the symbol.  Lastly, because the IFUNC relocations
+   are processed immediately at startup the resolver functions and this code need
+   not be thread-safe, but if the caller writes to a PLT slot it must do so in a
+   thread-safe manner with all the required barriers.  */
+#define VDSO_IFUNC_RET(value)                            \
+  ({                                                     \
+    static Elf64_FuncDesc vdso_opd = { .fd_toc = ~0x0 }; \
+    vdso_opd.fd_func = (Elf64_Addr)value;                \
+    &vdso_opd;                                           \
+  })
+
 #else
 #define VDSO_IFUNC_RET(value)  ((void *) (value))
 #endif
diff --git a/sysdeps/unix/sysv/linux/powerpc/gettimeofday.c b/sysdeps/unix/sysv/linux/powerpc/gettimeofday.c
index 6506d75..48c3f84 100644
--- a/sysdeps/unix/sysv/linux/powerpc/gettimeofday.c
+++ b/sysdeps/unix/sysv/linux/powerpc/gettimeofday.c
@@ -22,6 +22,7 @@
 
 # include <dl-vdso.h>
 # include <bits/libc-vdso.h>
+# include <dl-machine.h>
 
 void *gettimeofday_ifunc (void) __asm__ ("__gettimeofday");
 
diff --git a/sysdeps/unix/sysv/linux/powerpc/time.c b/sysdeps/unix/sysv/linux/powerpc/time.c
index 66b4eb3..2d77ece 100644
--- a/sysdeps/unix/sysv/linux/powerpc/time.c
+++ b/sysdeps/unix/sysv/linux/powerpc/time.c
@@ -20,7 +20,9 @@
 
 # include <time.h>
 # include <sysdep.h>
+# include <dl-vdso.h>
 # include <bits/libc-vdso.h>
+# include <dl-machine.h>
 
 void *time_ifunc (void) asm ("time");
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d128971aa2bc7d7a0b7aaed8c3f546d1d5cc858b

commit d128971aa2bc7d7a0b7aaed8c3f546d1d5cc858b
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Wed Sep 25 13:43:04 2013 -0500

    PowerPC: Fix POINTER_CHK_GUARD thread register for PPC64

diff --git a/ChangeLog b/ChangeLog
index 9698a1d..d5f760b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2013-09-25  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/stackguard-macros.h (POINTER_CHK_GUARD:
+	Fix thread ID register.
+
 2013-09-11  Will Newton  <will.newton@linaro.org>
 
 	[BZ #15857]
diff --git a/sysdeps/powerpc/powerpc64/stackguard-macros.h b/sysdeps/powerpc/powerpc64/stackguard-macros.h
index 4620f96..e80a683 100644
--- a/sysdeps/powerpc/powerpc64/stackguard-macros.h
+++ b/sysdeps/powerpc/powerpc64/stackguard-macros.h
@@ -6,7 +6,7 @@
 #define POINTER_CHK_GUARD \
   ({												\
      uintptr_t x;										\
-     asm ("ld %0,%1(2)"										\
+     asm ("ld %0,%1(13)"										\
 	  : "=r" (x)										\
 	  : "i" (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))	\
          );											\

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6e610d1dd8a23ed21352e1f3b366becee0ec6fc2

commit 6e610d1dd8a23ed21352e1f3b366becee0ec6fc2
Author: Will Newton <will.newton@linaro.org>
Date:   Fri Sep 13 09:26:02 2013 +0100

    Add CVE-2013-4332 to NEWS.

diff --git a/NEWS b/NEWS
index d97a4ac..a816bc1 100644
--- a/NEWS
+++ b/NEWS
@@ -45,6 +45,11 @@ Version 2.18
   15655, 15666, 15667, 15674, 15711, 15755, 15759, 15797, 15892, 15893,
   15895.
 
+* CVE-2013-4332 The pvalloc, valloc, memalign, posix_memalign and
+  aligned_alloc functions could allocate too few bytes or corrupt the
+  heap when passed very large allocation size values (Bugzilla #15855,
+  #15856, #15857).
+
 * CVE-2013-4788 The pointer guard used for pointer mangling was not
   initialized for static applications resulting in the security feature
   being disabled. The pointer guard is now correctly initialized to a

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=66a49fcc5e0cd5d652347ee5983c0bbd00fdeee9

commit 66a49fcc5e0cd5d652347ee5983c0bbd00fdeee9
Author: Will Newton <will.newton@linaro.org>
Date:   Fri Aug 16 12:54:29 2013 +0100

    malloc: Check for integer overflow in memalign.
    
    A large bytes parameter to memalign could cause an integer overflow
    and corrupt allocator internals. Check the overflow does not occur
    before continuing with the allocation.
    
    ChangeLog:
    
    2013-09-11  Will Newton  <will.newton@linaro.org>
    
    	[BZ #15857]
    	* malloc/malloc.c (__libc_memalign): Check the value of bytes
    	does not overflow.

diff --git a/ChangeLog b/ChangeLog
index 4ec9ada..9698a1d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2013-09-11  Will Newton  <will.newton@linaro.org>
 
+	[BZ #15857]
+	* malloc/malloc.c (__libc_memalign): Check the value of bytes
+	does not overflow.
+
+2013-09-11  Will Newton  <will.newton@linaro.org>
+
 	[BZ #15856]
 	* malloc/malloc.c (__libc_valloc): Check the value of bytes
 	does not overflow.
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 31e2dfa..ebbe86d 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3015,6 +3015,13 @@ __libc_memalign(size_t alignment, size_t bytes)
   /* Otherwise, ensure that it is at least a minimum chunk size */
   if (alignment <  MINSIZE) alignment = MINSIZE;
 
+  /* Check for overflow.  */
+  if (bytes > SIZE_MAX - alignment - MINSIZE)
+    {
+      __set_errno (ENOMEM);
+      return 0;
+    }
+
   arena_get(ar_ptr, bytes + alignment + MINSIZE);
   if(!ar_ptr)
     return 0;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ed595041168e6353afeb6157181ef9d5e047bb11

commit ed595041168e6353afeb6157181ef9d5e047bb11
Author: Will Newton <will.newton@linaro.org>
Date:   Fri Aug 16 11:59:37 2013 +0100

    malloc: Check for integer overflow in valloc.
    
    A large bytes parameter to valloc could cause an integer overflow
    and corrupt allocator internals. Check the overflow does not occur
    before continuing with the allocation.
    
    ChangeLog:
    
    2013-09-11  Will Newton  <will.newton@linaro.org>
    
    	[BZ #15856]
    	* malloc/malloc.c (__libc_valloc): Check the value of bytes
    	does not overflow.

diff --git a/ChangeLog b/ChangeLog
index e35032f..4ec9ada 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2013-09-11  Will Newton  <will.newton@linaro.org>
 
+	[BZ #15856]
+	* malloc/malloc.c (__libc_valloc): Check the value of bytes
+	does not overflow.
+
+2013-09-11  Will Newton  <will.newton@linaro.org>
+
 	[BZ #15855]
 	* malloc/malloc.c (__libc_pvalloc): Check the value of bytes
 	does not overflow.
diff --git a/malloc/malloc.c b/malloc/malloc.c
index bcc08c4..31e2dfa 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3046,6 +3046,13 @@ __libc_valloc(size_t bytes)
 
   size_t pagesz = GLRO(dl_pagesize);
 
+  /* Check for overflow.  */
+  if (bytes > SIZE_MAX - pagesz - MINSIZE)
+    {
+      __set_errno (ENOMEM);
+      return 0;
+    }
+
   void *(*hook) (size_t, size_t, const void *) =
     force_reg (__memalign_hook);
   if (__builtin_expect (hook != NULL, 0))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b0f1586fb49af2bb472ef07bce0cc558800ba024

commit b0f1586fb49af2bb472ef07bce0cc558800ba024
Author: Will Newton <will.newton@linaro.org>
Date:   Mon Aug 12 15:08:02 2013 +0100

    malloc: Check for integer overflow in pvalloc.
    
    A large bytes parameter to pvalloc could cause an integer overflow
    and corrupt allocator internals. Check the overflow does not occur
    before continuing with the allocation.
    
    ChangeLog:
    
    2013-09-11  Will Newton  <will.newton@linaro.org>
    
    	[BZ #15855]
    	* malloc/malloc.c (__libc_pvalloc): Check the value of bytes
    	does not overflow.

diff --git a/ChangeLog b/ChangeLog
index 52346e0..e35032f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2013-09-11  Will Newton  <will.newton@linaro.org>
+
+	[BZ #15855]
+	* malloc/malloc.c (__libc_pvalloc): Check the value of bytes
+	does not overflow.
+
 2013-09-23  Carlos O'Donell  <carlos@redhat.com>
 
 	[BZ #15754]
diff --git a/malloc/malloc.c b/malloc/malloc.c
index be472b2..bcc08c4 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3082,6 +3082,13 @@ __libc_pvalloc(size_t bytes)
   size_t page_mask = GLRO(dl_pagesize) - 1;
   size_t rounded_bytes = (bytes + page_mask) & ~(page_mask);
 
+  /* Check for overflow.  */
+  if (bytes > SIZE_MAX - 2*pagesz - MINSIZE)
+    {
+      __set_errno (ENOMEM);
+      return 0;
+    }
+
   void *(*hook) (size_t, size_t, const void *) =
     force_reg (__memalign_hook);
   if (__builtin_expect (hook != NULL, 0))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=19a903fb37a199ee7d11f7cff9b7fd90c8b67e32

commit 19a903fb37a199ee7d11f7cff9b7fd90c8b67e32
Author: Carlos O'Donell <carlos@redhat.com>
Date:   Mon Sep 23 00:52:09 2013 -0400

    BZ #15754: CVE-2013-4788
    
    The pointer guard used for pointer mangling was not initialized for
    static applications resulting in the security feature being disabled.
    The pointer guard is now correctly initialized to a random value for
    static applications. Existing static applications need to be
    recompiled to take advantage of the fix.
    
    The tests tst-ptrguard1-static and tst-ptrguard1 add regression
    coverage to ensure the pointer guards are sufficiently random
    and initialized to a default value.

diff --git a/ChangeLog b/ChangeLog
index 2a11ed8..52346e0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2013-09-23  Carlos O'Donell  <carlos@redhat.com>
+
+	[BZ #15754]
+	* elf/Makefile (tests): Add tst-ptrguard1.
+	(tests-static): Add tst-ptrguard1-static.
+	(tst-ptrguard1-ARGS): Define.
+	(tst-ptrguard1-static-ARGS): Define.
+	* elf/tst-ptrguard1.c: New file.
+	* elf/tst-ptrguard1-static.c: New file.
+	* sysdeps/x86_64/stackguard-macros.h: Define POINTER_CHK_GUARD.
+	* sysdeps/i386/stackguard-macros.h: Likewise.
+	* sysdeps/powerpc/powerpc32/stackguard-macros.h: Likewise.
+	* sysdeps/powerpc/powerpc64/stackguard-macros.h: Likewise.
+	* sysdeps/s390/s390-32/stackguard-macros.h: Likewise.
+	* sysdeps/s390/s390-64/stackguard-macros.h: Likewise.
+	* sysdeps/sparc/sparc32/stackguard-macros.h: Likewise.
+	* sysdeps/sparc/sparc64/stackguard-macros.h: Likewise.
+
 2013-09-23  Siddhesh Poyarekar  <siddhesh@redhat.com>
 
 	[BZ #14547]
diff --git a/NEWS b/NEWS
index e83c78c..d97a4ac 100644
--- a/NEWS
+++ b/NEWS
@@ -45,6 +45,12 @@ Version 2.18
   15655, 15666, 15667, 15674, 15711, 15755, 15759, 15797, 15892, 15893,
   15895.
 
+* CVE-2013-4788 The pointer guard used for pointer mangling was not
+  initialized for static applications resulting in the security feature
+  being disabled. The pointer guard is now correctly initialized to a
+  random value for static applications. Existing static applications need
+  to be recompiled to take advantage of the fix (bug 15754).
+
 * CVE-2012-4412 The strcoll implementation caches indices and rules for
   large collation sequences to optimize multiple passes.  This cache
   computation may overflow for large collation sequences and may cause a
diff --git a/csu/libc-start.c b/csu/libc-start.c
index e5da3ef..c898d06 100644
--- a/csu/libc-start.c
+++ b/csu/libc-start.c
@@ -37,6 +37,12 @@ extern void __pthread_initialize_minimal (void);
    in thread local area.  */
 uintptr_t __stack_chk_guard attribute_relro;
 # endif
+# ifndef  THREAD_SET_POINTER_GUARD
+/* Only exported for architectures that don't store the pointer guard
+   value in thread local area.  */
+uintptr_t __pointer_chk_guard_local
+	attribute_relro attribute_hidden __attribute__ ((nocommon));
+# endif
 #endif
 
 #ifdef HAVE_PTR_NTHREADS
@@ -195,6 +201,16 @@ LIBC_START_MAIN (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL),
 # else
   __stack_chk_guard = stack_chk_guard;
 # endif
+
+  /* Set up the pointer guard value.  */
+  uintptr_t pointer_chk_guard = _dl_setup_pointer_guard (_dl_random,
+							 stack_chk_guard);
+# ifdef THREAD_SET_POINTER_GUARD
+  THREAD_SET_POINTER_GUARD (pointer_chk_guard);
+# else
+  __pointer_chk_guard_local = pointer_chk_guard;
+# endif
+
 #endif
 
   /* Register the destructor of the dynamic linker if there is any.  */
diff --git a/elf/Makefile b/elf/Makefile
index 3b58649..98834f4 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -121,7 +121,8 @@ endif
 tests = tst-tls1 tst-tls2 tst-tls9 tst-leaks1 \
 	tst-array1 tst-array2 tst-array3 tst-array4 tst-array5
 tests-static = tst-tls1-static tst-tls2-static tst-stackguard1-static \
-	       tst-leaks1-static tst-array1-static tst-array5-static
+	       tst-leaks1-static tst-array1-static tst-array5-static \
+	       tst-ptrguard1-static
 ifeq (yes,$(build-shared))
 tests-static += tst-tls9-static
 tst-tls9-static-ENV = \
@@ -145,7 +146,8 @@ tests += loadtest restest1 preloadtest loadfail multiload origtest resolvfail \
 	 tst-audit1 tst-audit2 tst-audit8 \
 	 tst-stackguard1 tst-addr1 tst-thrlock \
 	 tst-unique1 tst-unique2 tst-unique3 tst-unique4 \
-	 tst-initorder tst-initorder2 tst-relsort1 tst-null-argv
+	 tst-initorder tst-initorder2 tst-relsort1 tst-null-argv \
+	 tst-ptrguard1
 #	 reldep9
 test-srcs = tst-pathopt
 selinux-enabled := $(shell cat /selinux/enforce 2> /dev/null)
@@ -1016,6 +1018,9 @@ LDFLAGS-order2mod2.so = $(no-as-needed)
 tst-stackguard1-ARGS = --command "$(host-test-program-cmd) --child"
 tst-stackguard1-static-ARGS = --command "$(objpfx)tst-stackguard1-static --child"
 
+tst-ptrguard1-ARGS = --command "$(host-test-program-cmd) --child"
+tst-ptrguard1-static-ARGS = --command "$(objpfx)tst-ptrguard1-static --child"
+
 $(objpfx)tst-leaks1: $(libdl)
 $(objpfx)tst-leaks1-mem: $(objpfx)tst-leaks1.out
 	$(common-objpfx)malloc/mtrace $(objpfx)tst-leaks1.mtrace > $@
diff --git a/elf/tst-ptrguard1-static.c b/elf/tst-ptrguard1-static.c
new file mode 100644
index 0000000..7aff3b7
--- /dev/null
+++ b/elf/tst-ptrguard1-static.c
@@ -0,0 +1 @@
+#include "tst-ptrguard1.c"
diff --git a/elf/tst-ptrguard1.c b/elf/tst-ptrguard1.c
new file mode 100644
index 0000000..c344a04
--- /dev/null
+++ b/elf/tst-ptrguard1.c
@@ -0,0 +1,202 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <stackguard-macros.h>
+#include <tls.h>
+#include <unistd.h>
+
+#ifndef POINTER_CHK_GUARD
+extern uintptr_t __pointer_chk_guard;
+# define POINTER_CHK_GUARD __pointer_chk_guard
+#endif
+
+static const char *command;
+static bool child;
+static uintptr_t ptr_chk_guard_copy;
+static bool ptr_chk_guard_copy_set;
+static int fds[2];
+
+static void __attribute__ ((constructor))
+con (void)
+{
+  ptr_chk_guard_copy = POINTER_CHK_GUARD;
+  ptr_chk_guard_copy_set = true;
+}
+
+static int
+uintptr_t_cmp (const void *a, const void *b)
+{
+  if (*(uintptr_t *) a < *(uintptr_t *) b)
+    return 1;
+  if (*(uintptr_t *) a > *(uintptr_t *) b)
+    return -1;
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  if (!ptr_chk_guard_copy_set)
+    {
+      puts ("constructor has not been run");
+      return 1;
+    }
+
+  if (ptr_chk_guard_copy != POINTER_CHK_GUARD)
+    {
+      puts ("POINTER_CHK_GUARD changed between constructor and do_test");
+      return 1;
+    }
+
+  if (child)
+    {
+      write (2, &ptr_chk_guard_copy, sizeof (ptr_chk_guard_copy));
+      return 0;
+    }
+
+  if (command == NULL)
+    {
+      puts ("missing --command or --child argument");
+      return 1;
+    }
+
+#define N 16
+  uintptr_t child_ptr_chk_guards[N + 1];
+  child_ptr_chk_guards[N] = ptr_chk_guard_copy;
+  int i;
+  for (i = 0; i < N; ++i)
+    {
+      if (pipe (fds) < 0)
+	{
+	  printf ("couldn't create pipe: %m\n");
+	  return 1;
+	}
+
+      pid_t pid = fork ();
+      if (pid < 0)
+	{
+	  printf ("fork failed: %m\n");
+	  return 1;
+	}
+
+      if (!pid)
+	{
+	  if (ptr_chk_guard_copy != POINTER_CHK_GUARD)
+	    {
+	      puts ("POINTER_CHK_GUARD changed after fork");
+	      exit (1);
+	    }
+
+	  close (fds[0]);
+	  close (2);
+	  dup2 (fds[1], 2);
+	  close (fds[1]);
+
+	  system (command);
+	  exit (0);
+	}
+
+      close (fds[1]);
+
+      if (TEMP_FAILURE_RETRY (read (fds[0], &child_ptr_chk_guards[i],
+				    sizeof (uintptr_t))) != sizeof (uintptr_t))
+	{
+	  puts ("could not read ptr_chk_guard value from child");
+	  return 1;
+	}
+
+      close (fds[0]);
+
+      pid_t termpid;
+      int status;
+      termpid = TEMP_FAILURE_RETRY (waitpid (pid, &status, 0));
+      if (termpid == -1)
+	{
+	  printf ("waitpid failed: %m\n");
+	  return 1;
+	}
+      else if (termpid != pid)
+	{
+	  printf ("waitpid returned %ld != %ld\n",
+		  (long int) termpid, (long int) pid);
+	  return 1;
+	}
+      else if (!WIFEXITED (status) || WEXITSTATUS (status))
+	{
+	  puts ("child hasn't exited with exit status 0");
+	  return 1;
+	}
+    }
+
+  qsort (child_ptr_chk_guards, N + 1, sizeof (uintptr_t), uintptr_t_cmp);
+
+  /* The default pointer guard is the same as the default stack guard.
+     They are only set to default if dl_random is NULL.  */
+  uintptr_t default_guard = 0;
+  unsigned char *p = (unsigned char *) &default_guard;
+  p[sizeof (uintptr_t) - 1] = 255;
+  p[sizeof (uintptr_t) - 2] = '\n';
+  p[0] = 0;
+
+  /* Test if the pointer guard canaries are either randomized,
+     or equal to the default pointer guard value.
+     Even with randomized pointer guards it might happen
+     that the random number generator generates the same
+     values, but if that happens in more than half from
+     the 16 runs, something is very wrong.  */
+  int ndifferences = 0;
+  int ndefaults = 0;
+  for (i = 0; i < N; ++i)
+    {
+      if (child_ptr_chk_guards[i] != child_ptr_chk_guards[i+1])
+	ndifferences++;
+      else if (child_ptr_chk_guards[i] == default_guard)
+	ndefaults++;
+    }
+
+  printf ("differences %d defaults %d\n", ndifferences, ndefaults);
+
+  if (ndifferences < N / 2 && ndefaults < N / 2)
+    {
+      puts ("pointer guard values are not randomized enough");
+      puts ("nor equal to the default value");
+      return 1;
+    }
+
+  return 0;
+}
+
+#define OPT_COMMAND	10000
+#define OPT_CHILD	10001
+#define CMDLINE_OPTIONS	\
+  { "command", required_argument, NULL, OPT_COMMAND },  \
+  { "child", no_argument, NULL, OPT_CHILD },
+#define CMDLINE_PROCESS	\
+  case OPT_COMMAND:	\
+    command = optarg;	\
+    break;		\
+  case OPT_CHILD:	\
+    child = true;	\
+    break;
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/ports/ChangeLog.ia64 b/ports/ChangeLog.ia64
index 92b81cb..f61f37f 100644
--- a/ports/ChangeLog.ia64
+++ b/ports/ChangeLog.ia64
@@ -1,3 +1,8 @@
+2013-09-22  Carlos O'Donell  <carlos@redhat.com>
+
+	[BZ #15754]
+	* sysdeps/ia64/stackguard-macros.h: Define POINTER_CHK_GUARD.
+
 2013-07-04  Andreas Jaeger  <aj@suse.de>
 
 	* sysdeps/unix/sysv/linux/ia64/sys/ptrace.h (PTRACE_LISTEN):
diff --git a/ports/ChangeLog.tile b/ports/ChangeLog.tile
index a2ec5e1..972bd81 100644
--- a/ports/ChangeLog.tile
+++ b/ports/ChangeLog.tile
@@ -1,3 +1,8 @@
+2013-09-22  Carlos O'Donell  <carlos@redhat.com>
+
+	[BZ #15754]
+	* sysdeps/tile/stackguard-macros.h: Define POINTER_CHK_GUARD.
+
 2013-07-22  Chris Metcalf  <cmetcalf@tilera.com>
 
 	[BZ #15759]
diff --git a/ports/sysdeps/ia64/stackguard-macros.h b/ports/sysdeps/ia64/stackguard-macros.h
index dc683c2..3907293 100644
--- a/ports/sysdeps/ia64/stackguard-macros.h
+++ b/ports/sysdeps/ia64/stackguard-macros.h
@@ -2,3 +2,6 @@
 
 #define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("adds %0 = -8, r13;; ld8 %0 = [%0]" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+  ({ uintptr_t x; asm ("adds %0 = -16, r13;; ld8 %0 = [%0]" : "=r" (x)); x; })
diff --git a/ports/sysdeps/tile/stackguard-macros.h b/ports/sysdeps/tile/stackguard-macros.h
index 589ea2b..f2e041b 100644
--- a/ports/sysdeps/tile/stackguard-macros.h
+++ b/ports/sysdeps/tile/stackguard-macros.h
@@ -4,11 +4,17 @@
 # if __WORDSIZE == 64
 #  define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("addi %0, tp, -16; ld %0, %0" : "=r" (x)); x; })
+#  define POINTER_CHK_GUARD \
+  ({ uintptr_t x; asm ("addi %0, tp, -24; ld %0, %0" : "=r" (x)); x; })
 # else
 #  define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("addi %0, tp, -8; ld4s %0, %0" : "=r" (x)); x; })
+#  define POINTER_CHK_GUARD \
+  ({ uintptr_t x; asm ("addi %0, tp, -12; ld4s %0, %0" : "=r" (x)); x; })
 # endif
 #else
 # define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("addi %0, tp, -8; lw %0, %0" : "=r" (x)); x; })
+# define POINTER_CHK_GUARD \
+  ({ uintptr_t x; asm ("addi %0, tp, -12; lw %0, %0" : "=r" (x)); x; })
 #endif
diff --git a/sysdeps/generic/stackguard-macros.h b/sysdeps/generic/stackguard-macros.h
index ababf65..4fa3d96 100644
--- a/sysdeps/generic/stackguard-macros.h
+++ b/sysdeps/generic/stackguard-macros.h
@@ -2,3 +2,6 @@
 
 extern uintptr_t __stack_chk_guard;
 #define STACK_CHK_GUARD __stack_chk_guard
+
+extern uintptr_t __pointer_chk_guard_local;
+#define POINTER_CHK_GUARD __pointer_chk_guard_local
diff --git a/sysdeps/i386/stackguard-macros.h b/sysdeps/i386/stackguard-macros.h
index 8c31e19..0397629 100644
--- a/sysdeps/i386/stackguard-macros.h
+++ b/sysdeps/i386/stackguard-macros.h
@@ -2,3 +2,11 @@
 
 #define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("movl %%gs:0x14, %0" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+  ({							\
+     uintptr_t x;					\
+     asm ("movl %%gs:%c1, %0" : "=r" (x)		\
+	  : "i" (offsetof (tcbhead_t, pointer_guard)));	\
+     x;							\
+   })
diff --git a/sysdeps/powerpc/powerpc32/stackguard-macros.h b/sysdeps/powerpc/powerpc32/stackguard-macros.h
index 839f6a4..b3d0af8 100644
--- a/sysdeps/powerpc/powerpc32/stackguard-macros.h
+++ b/sysdeps/powerpc/powerpc32/stackguard-macros.h
@@ -2,3 +2,13 @@
 
 #define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("lwz %0,-28680(2)" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+  ({												\
+     uintptr_t x;										\
+     asm ("lwz %0,%1(2)"									\
+	  : "=r" (x)										\
+	  : "i" (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))	\
+         );											\
+     x;												\
+   })
diff --git a/sysdeps/powerpc/powerpc64/stackguard-macros.h b/sysdeps/powerpc/powerpc64/stackguard-macros.h
index 9da879c..4620f96 100644
--- a/sysdeps/powerpc/powerpc64/stackguard-macros.h
+++ b/sysdeps/powerpc/powerpc64/stackguard-macros.h
@@ -2,3 +2,13 @@
 
 #define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("ld %0,-28688(13)" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+  ({												\
+     uintptr_t x;										\
+     asm ("ld %0,%1(2)"										\
+	  : "=r" (x)										\
+	  : "i" (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))	\
+         );											\
+     x;												\
+   })
diff --git a/sysdeps/s390/s390-32/stackguard-macros.h b/sysdeps/s390/s390-32/stackguard-macros.h
index b74c579..449e8d4 100644
--- a/sysdeps/s390/s390-32/stackguard-macros.h
+++ b/sysdeps/s390/s390-32/stackguard-macros.h
@@ -2,3 +2,14 @@
 
 #define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("ear %0,%%a0; l %0,0x14(%0)" : "=a" (x)); x; })
+
+/* On s390/s390x there is no unique pointer guard, instead we use the
+   same value as the stack guard.  */
+#define POINTER_CHK_GUARD \
+  ({							\
+     uintptr_t x;					\
+     asm ("ear %0,%%a0; l %0,%1(%0)"			\
+	  : "=a" (x)					\
+	  : "i" (offsetof (tcbhead_t, stack_guard)));	\
+     x;							\
+   })
diff --git a/sysdeps/s390/s390-64/stackguard-macros.h b/sysdeps/s390/s390-64/stackguard-macros.h
index 0cebb5f..c8270fb 100644
--- a/sysdeps/s390/s390-64/stackguard-macros.h
+++ b/sysdeps/s390/s390-64/stackguard-macros.h
@@ -2,3 +2,17 @@
 
 #define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("ear %0,%%a0; sllg %0,%0,32; ear %0,%%a1; lg %0,0x28(%0)" : "=a" (x)); x; })
+
+/* On s390/s390x there is no unique pointer guard, instead we use the
+   same value as the stack guard.  */
+#define POINTER_CHK_GUARD \
+  ({							\
+     uintptr_t x;					\
+     asm ("ear %0,%%a0;"				\
+	  "sllg %0,%0,32;"				\
+	  "ear %0,%%a1;"				\
+	  "lg %0,%1(%0)"				\
+	 : "=a" (x)					\
+	 : "i" (offsetof (tcbhead_t, stack_guard)));	\
+     x;							\
+   })
diff --git a/sysdeps/sparc/sparc32/stackguard-macros.h b/sysdeps/sparc/sparc32/stackguard-macros.h
index c0b02b0..1eef0f1 100644
--- a/sysdeps/sparc/sparc32/stackguard-macros.h
+++ b/sysdeps/sparc/sparc32/stackguard-macros.h
@@ -2,3 +2,6 @@
 
 #define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("ld [%%g7+0x14], %0" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+  ({ uintptr_t x; asm ("ld [%%g7+0x18], %0" : "=r" (x)); x; })
diff --git a/sysdeps/sparc/sparc64/stackguard-macros.h b/sysdeps/sparc/sparc64/stackguard-macros.h
index 80f0635..cc0c12c 100644
--- a/sysdeps/sparc/sparc64/stackguard-macros.h
+++ b/sysdeps/sparc/sparc64/stackguard-macros.h
@@ -2,3 +2,6 @@
 
 #define STACK_CHK_GUARD \
   ({ uintptr_t x; asm ("ldx [%%g7+0x28], %0" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+  ({ uintptr_t x; asm ("ldx [%%g7+0x30], %0" : "=r" (x)); x; })
diff --git a/sysdeps/x86_64/stackguard-macros.h b/sysdeps/x86_64/stackguard-macros.h
index d7fedb3..1948800 100644
--- a/sysdeps/x86_64/stackguard-macros.h
+++ b/sysdeps/x86_64/stackguard-macros.h
@@ -4,3 +4,8 @@
   ({ uintptr_t x;						\
      asm ("mov %%fs:%c1, %0" : "=r" (x)				\
 	  : "i" (offsetof (tcbhead_t, stack_guard))); x; })
+
+#define POINTER_CHK_GUARD \
+  ({ uintptr_t x;						\
+     asm ("mov %%fs:%c1, %0" : "=r" (x)				\
+	  : "i" (offsetof (tcbhead_t, pointer_guard))); x; })

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4415a70617fdc4a47a6302f69b943c9c51ae8cac

commit 4415a70617fdc4a47a6302f69b943c9c51ae8cac
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Date:   Mon Sep 23 11:24:30 2013 +0530

    Check for integer overflow in cache size computation in strcoll
    
    strcoll is implemented using a cache for indices and weights of
    collation sequences in the strings so that subsequent passes do not
    have to search through collation data again.  For very large string
    inputs, the cache size computation could overflow.  In such a case,
    use the fallback function that does not cache indices and weights of
    collation sequences.
    
    Fixes CVE-2012-4412.

diff --git a/ChangeLog b/ChangeLog
index 6886e5f..2a11ed8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,12 @@
 2013-09-23  Siddhesh Poyarekar  <siddhesh@redhat.com>
 
 	[BZ #14547]
+	* string/tst-strcoll-overflow.c: New test case.
+	* string/Makefile (xtests): Add tst-strcoll-overflow.
+	* string/strcoll_l.c (STRCOLL): Skip allocating memory for
+	cache if string sizes may cause integer overflow.
+
+	[BZ #14547]
 	* string/strcoll_l.c (coll_seq): New members rule, idx,
 	save_idx and back_us.
 	(get_next_seq_nocache): New function.
diff --git a/NEWS b/NEWS
index aec8b00..e83c78c 100644
--- a/NEWS
+++ b/NEWS
@@ -45,6 +45,12 @@ Version 2.18
   15655, 15666, 15667, 15674, 15711, 15755, 15759, 15797, 15892, 15893,
   15895.
 
+* CVE-2012-4412 The strcoll implementation caches indices and rules for
+  large collation sequences to optimize multiple passes.  This cache
+  computation may overflow for large collation sequences and may cause a
+  stack or buffer overflow.  This is now fixed to use a slower algorithm
+  which does not use a cache if there is an integer overflow.
+
 * CVE-2013-2207 Incorrectly granting access to another user's pseudo-terminal
   has been fixed by disabling the use of pt_chown (Bugzilla #15755).
   Distributions can re-enable building and using pt_chown via the new configure
diff --git a/string/Makefile b/string/Makefile
index 72d3e29..17f9d68 100644
--- a/string/Makefile
+++ b/string/Makefile
@@ -57,6 +57,8 @@ tests		:= tester inl-tester noinl-tester testcopy test-ffs	\
 tests-ifunc := $(strop-tests:%=test-%-ifunc)
 tests += $(tests-ifunc)
 
+xtests = tst-strcoll-overflow
+
 include ../Rules
 
 tester-ENV = LANGUAGE=C
diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index eb042ff..4ee101a 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -524,7 +524,15 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   memset (&seq1, 0, sizeof (seq1));
   seq2 = seq1;
 
-  if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1)))
+  size_t size_max = SIZE_MAX / (sizeof (int32_t) + 1);
+
+  if (MIN (s1len, s2len) > size_max
+      || MAX (s1len, s2len) > size_max - MIN (s1len, s2len))
+    {
+      /* If the strings are long enough to cause overflow in the size request,
+         then skip the allocation and proceed with the non-cached routines.  */
+    }
+  else if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1)))
     {
       seq1.idxarr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
 
diff --git a/string/tst-strcoll-overflow.c b/string/tst-strcoll-overflow.c
new file mode 100644
index 0000000..bb665ac
--- /dev/null
+++ b/string/tst-strcoll-overflow.c
@@ -0,0 +1,61 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <locale.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Verify that strcoll does not crash for large strings for which it cannot
+   cache weight lookup results.  The size is large enough to cause integer
+   overflows on 32-bit as well as buffer overflows on 64-bit.  The test should
+   work reasonably reliably when overcommit is disabled, but it obviously
+   depends on how much memory the system has.  There's a limitation to this
+   test in that it does not run to completion.  Actually collating such a
+   large string can take days and we can't have xcheck running that long.  For
+   that reason, we run the test for about 5 minutes and then assume that
+   everything is fine if there are no crashes.  */
+#define SIZE 0x40000000ul
+
+int
+do_test (void)
+{
+  if (setlocale (LC_COLLATE, "en_GB.UTF-8") == NULL)
+    {
+      puts ("setlocale failed, cannot test for overflow");
+      return 0;
+    }
+
+  char *p = malloc (SIZE);
+
+  if (p == NULL)
+    {
+      puts ("could not allocate memory");
+      return 1;
+    }
+
+  memset (p, 'x', SIZE - 1);
+  p[SIZE - 1] = 0;
+  printf ("%d\n", strcoll (p, p));
+  return 0;
+}
+
+#define TIMEOUT 300
+#define EXPECTED_SIGNAL SIGALRM
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5326e2c77bbd559fec9dc50e6d250eb0ce6a8d8c

commit 5326e2c77bbd559fec9dc50e6d250eb0ce6a8d8c
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Date:   Mon Sep 23 11:20:02 2013 +0530

    Fall back to non-cached sequence traversal and comparison on malloc fail
    
    strcoll currently falls back to alloca if malloc fails, resulting in a
    possible stack overflow.  This patch implements sequence traversal and
    comparison without caching indices and rules.
    
    Fixes CVE-2012-4424.

diff --git a/ChangeLog b/ChangeLog
index 2394806..6886e5f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,12 @@
-2013-08-20  Siddhesh Poyarekar  <siddhesh@redhat.com>
+2013-09-23  Siddhesh Poyarekar  <siddhesh@redhat.com>
+
+	[BZ #14547]
+	* string/strcoll_l.c (coll_seq): New members rule, idx,
+	save_idx and back_us.
+	(get_next_seq_nocache): New function.
+	(do_compare_nocache): New function.
+	(STRCOLL): Use get_next_seq_nocache and do_compare_nocache
+	when malloc fails.
 
 	* string/strcoll_l.c (coll_seq): New structure.
 	(get_next_seq_cached): New function.
diff --git a/NEWS b/NEWS
index 8b228b0..aec8b00 100644
--- a/NEWS
+++ b/NEWS
@@ -9,8 +9,8 @@ Version 2.18.1
 
 * The following bugs are resolved with this release:
 
-  14155, 14699, 15532, 15427, 15522, 15797, 15892, 15895, 15909, 15917,
-  15996, 16072, 16150.
+  14155, 14547, 14699, 15532, 15427, 15522, 15797, 15892, 15895, 15909,
+  15917, 15996, 16072, 16150.
 
 * CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
   to the d_name member of struct dirent, or omit the terminating NUL
@@ -18,6 +18,12 @@ Version 2.18.1
 
 * CVE-2013-4458 Stack overflow in getaddrinfo with large number of results
   for AF_INET6 has been fixed (Bugzilla #16072).
+
+* CVE-2012-4424 The strcoll implementation uses malloc to cache indices and
+  rules for large collation sequences to optimize multiple passes and falls
+  back to alloca if malloc fails, resulting in a possible stack overflow.
+  The implementation now falls back to an uncached collation sequence lookup
+  if malloc fails.
 
 Version 2.18
 
diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index 50ed84d..eb042ff 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -45,7 +45,7 @@
 typedef struct
 {
   int len;			/* Length of the current sequence.  */
-  int val;			/* Position of the sequence relative to the
+  size_t val;			/* Position of the sequence relative to the
 				   previous non-ignored sequence.  */
   size_t idxnow;		/* Current index in sequences.  */
   size_t idxmax;		/* Maximum index in sequences.  */
@@ -55,6 +55,12 @@ typedef struct
   const USTRING_TYPE *us;	/* The string.  */
   int32_t *idxarr;		/* Array to cache weight indices.  */
   unsigned char *rulearr;	/* Array to cache rules.  */
+  unsigned char rule;		/* Saved rule for the first sequence.  */
+  int32_t idx;			/* Index to weight of the current sequence.  */
+  int32_t save_idx;		/* Save looked up index of a forward
+				   sequence after the last backward
+				   sequence.  */
+  const USTRING_TYPE *back_us;	/* Beginning of the backward sequence.  */
 } coll_seq;
 
 /* Get next sequence.  The weight indices are cached, so we don't need to
@@ -64,7 +70,7 @@ get_next_seq_cached (coll_seq *seq, int nrules, int pass,
 		     const unsigned char *rulesets,
 		     const USTRING_TYPE *weights)
 {
-  int val = seq->val = 0;
+  size_t val = seq->val = 0;
   int len = seq->len;
   size_t backw_stop = seq->backw_stop;
   size_t backw = seq->backw;
@@ -146,7 +152,7 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
 	      const USTRING_TYPE *extra, const int32_t *indirect)
 {
 #include WEIGHT_H
-  int val = seq->val = 0;
+  size_t val = seq->val = 0;
   int len = seq->len;
   size_t backw_stop = seq->backw_stop;
   size_t backw = seq->backw;
@@ -162,7 +168,7 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
       ++val;
       if (backw_stop != ~0ul)
 	{
-	  /* The is something pushed.  */
+	  /* There is something pushed.  */
 	  if (backw == backw_stop)
 	    {
 	      /* The last pushed character was handled.  Continue
@@ -227,15 +233,199 @@ get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
   seq->us = us;
 }
 
-/* Compare two sequences.  */
+/* Get next sequence.  Traverse the string as required.  This function does not
+   set or use any index or rule cache.  */
+static void
+get_next_seq_nocache (coll_seq *seq, int nrules, const unsigned char *rulesets,
+		      const USTRING_TYPE *weights, const int32_t *table,
+		      const USTRING_TYPE *extra, const int32_t *indirect,
+		      int pass)
+{
+#include WEIGHT_H
+  size_t val = seq->val = 0;
+  int len = seq->len;
+  size_t backw_stop = seq->backw_stop;
+  size_t backw = seq->backw;
+  size_t idxcnt = seq->idxcnt;
+  size_t idxmax = seq->idxmax;
+  int32_t idx = seq->idx;
+  const USTRING_TYPE *us = seq->us;
+
+  while (len == 0)
+    {
+      ++val;
+      if (backw_stop != ~0ul)
+	{
+	  /* There is something pushed.  */
+	  if (backw == backw_stop)
+	    {
+	      /* The last pushed character was handled.  Continue
+		 with forward characters.  */
+	      if (idxcnt < idxmax)
+		{
+		  idx = seq->save_idx;
+		  backw_stop = ~0ul;
+		}
+	      else
+		{
+		  /* Nothing anymore.  The backward sequence ended with
+		     the last sequence in the string.  Note that len is
+		     still zero.  */
+		  idx = 0;
+		  break;
+	        }
+	    }
+	  else
+	    {
+	      /* XXX Traverse BACKW sequences from the beginning of
+		 BACKW_STOP to get the next sequence.  Is there a quicker way
+	         to do this?  */
+	      size_t i = backw_stop;
+	      us = seq->back_us;
+	      while (i < backw)
+		{
+		  int32_t tmp = findidx (&us, -1);
+		  idx = tmp & 0xffffff;
+		  i++;
+		}
+	      --backw;
+	      us = seq->us;
+	    }
+	}
+      else
+	{
+	  backw_stop = idxmax;
+	  int32_t prev_idx = idx;
+
+	  while (*us != L('\0'))
+	    {
+	      int32_t tmp = findidx (&us, -1);
+	      unsigned char rule = tmp >> 24;
+	      prev_idx = idx;
+	      idx = tmp & 0xffffff;
+	      idxcnt = idxmax++;
+
+	      /* Save the rule for the first sequence.  */
+	      if (__glibc_unlikely (idxcnt == 0))
+	        seq->rule = rule;
+
+	      if ((rulesets[rule * nrules + pass]
+		   & sort_backward) == 0)
+		/* No more backward characters to push.  */
+		break;
+	      ++idxcnt;
+	    }
+
+	  if (backw_stop >= idxcnt)
+	    {
+	      /* No sequence at all or just one.  */
+	      if (idxcnt == idxmax || backw_stop > idxcnt)
+		/* Note that len is still zero.  */
+		break;
+
+	      backw_stop = ~0ul;
+	    }
+	  else
+	    {
+	      /* We pushed backward sequences.  If the stream ended with the
+		 backward sequence, then we process the last sequence we
+		 found.  Otherwise we process the sequence before the last
+		 one since the last one was a forward sequence.  */
+	      seq->back_us = seq->us;
+	      seq->us = us;
+	      backw = idxcnt;
+	      if (idxmax > idxcnt)
+		{
+		  backw--;
+		  seq->save_idx = idx;
+		  idx = prev_idx;
+		}
+	      if (backw > backw_stop)
+		backw--;
+	    }
+	}
+
+      len = weights[idx++];
+      /* Skip over indices of previous levels.  */
+      for (int i = 0; i < pass; i++)
+	{
+	  idx += len;
+	  len = weights[idx];
+	  idx++;
+	}
+    }
+
+  /* Update the structure.  */
+  seq->val = val;
+  seq->len = len;
+  seq->backw_stop = backw_stop;
+  seq->backw = backw;
+  seq->idxcnt = idxcnt;
+  seq->idxmax = idxmax;
+  seq->us = us;
+  seq->idx = idx;
+}
+
+/* Compare two sequences.  This version does not use the index and rules
+   cache.  */
+static int
+do_compare_nocache (coll_seq *seq1, coll_seq *seq2, int position,
+		    const USTRING_TYPE *weights)
+{
+  int seq1len = seq1->len;
+  int seq2len = seq2->len;
+  size_t val1 = seq1->val;
+  size_t val2 = seq2->val;
+  int idx1 = seq1->idx;
+  int idx2 = seq2->idx;
+  int result = 0;
+
+  /* Test for position if necessary.  */
+  if (position && val1 != val2)
+    {
+      result = val1 > val2 ? 1 : -1;
+      goto out;
+    }
+
+  /* Compare the two sequences.  */
+  do
+    {
+      if (weights[idx1] != weights[idx2])
+	{
+	  /* The sequences differ.  */
+	  result = weights[idx1] - weights[idx2];
+	  goto out;
+	}
+
+      /* Increment the offsets.  */
+      ++idx1;
+      ++idx2;
+
+      --seq1len;
+      --seq2len;
+    }
+  while (seq1len > 0 && seq2len > 0);
+
+  if (position && seq1len != seq2len)
+    result = seq1len - seq2len;
+
+out:
+  seq1->len = seq1len;
+  seq2->len = seq2len;
+  seq1->idx = idx1;
+  seq2->idx = idx2;
+  return result;
+}
+
+/* Compare two sequences using the index cache.  */
 static int
 do_compare (coll_seq *seq1, coll_seq *seq2, int position,
 	    const USTRING_TYPE *weights)
 {
   int seq1len = seq1->len;
   int seq2len = seq2->len;
-  int val1 = seq1->val;
-  int val2 = seq2->val;
+  size_t val1 = seq1->val;
+  size_t val2 = seq2->val;
   int32_t *idx1arr = seq1->idxarr;
   int32_t *idx2arr = seq2->idxarr;
   int idx1now = seq1->idxnow;
@@ -245,7 +435,7 @@ do_compare (coll_seq *seq1, coll_seq *seq2, int position,
   /* Test for position if necessary.  */
   if (position && val1 != val2)
     {
-      result = val1 - val2;
+      result = val1 > val2 ? 1 : -1;
       goto out;
     }
 
@@ -334,57 +524,62 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
   memset (&seq1, 0, sizeof (seq1));
   seq2 = seq1;
 
-  /* We need the elements of the strings as unsigned values since they
-     are used as indices.  */
-  seq1.us = (const USTRING_TYPE *) s1;
-  seq2.us = (const USTRING_TYPE *) s2;
-
   if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1)))
     {
       seq1.idxarr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
-      seq2.idxarr = &seq1.idxarr[s1len];
-      seq1.rulearr = (unsigned char *) &seq2.idxarr[s2len];
-      seq2.rulearr = &seq1.rulearr[s1len];
-
-      if (seq1.idxarr == NULL)
-	/* No memory.  Well, go with the stack then.
-
-	   XXX Once this implementation is stable we will handle this
-	   differently.  Instead of precomputing the indices we will
-	   do this in time.  This means, though, that this happens for
-	   every pass again.  */
-	goto try_stack;
-      use_malloc = true;
+
+      /* If we failed to allocate memory, we leave everything as NULL so that
+	 we use the nocache version of traversal and comparison functions.  */
+      if (seq1.idxarr != NULL)
+	{
+	  seq2.idxarr = &seq1.idxarr[s1len];
+	  seq1.rulearr = (unsigned char *) &seq2.idxarr[s2len];
+	  seq2.rulearr = &seq1.rulearr[s1len];
+	  use_malloc = true;
+	}
     }
   else
     {
-    try_stack:
       seq1.idxarr = (int32_t *) alloca (s1len * sizeof (int32_t));
       seq2.idxarr = (int32_t *) alloca (s2len * sizeof (int32_t));
       seq1.rulearr = (unsigned char *) alloca (s1len);
       seq2.rulearr = (unsigned char *) alloca (s2len);
     }
 
-  seq1.rulearr[0] = 0;
+  int rule = 0;
 
   /* Cache values in the first pass and if needed, use them in subsequent
      passes.  */
   for (int pass = 0; pass < nrules; ++pass)
     {
       seq1.idxcnt = 0;
+      seq1.idx = 0;
+      seq2.idx = 0;
       seq1.backw_stop = ~0ul;
       seq1.backw = ~0ul;
       seq2.idxcnt = 0;
       seq2.backw_stop = ~0ul;
       seq2.backw = ~0ul;
 
+      /* We need the elements of the strings as unsigned values since they
+	 are used as indices.  */
+      seq1.us = (const USTRING_TYPE *) s1;
+      seq2.us = (const USTRING_TYPE *) s2;
+
       /* We assume that if a rule has defined `position' in one section
 	 this is true for all of them.  */
-      int position = rulesets[seq1.rulearr[0] * nrules + pass] & sort_position;
+      int position = rulesets[rule * nrules + pass] & sort_position;
 
       while (1)
 	{
-	  if (pass == 0)
+	  if (__glibc_unlikely (seq1.idxarr == NULL))
+	    {
+	      get_next_seq_nocache (&seq1, nrules, rulesets, weights, table,
+				    extra, indirect, pass);
+	      get_next_seq_nocache (&seq2, nrules, rulesets, weights, table,
+				    extra, indirect, pass);
+	    }
+	  else if (pass == 0)
 	    {
 	      get_next_seq (&seq1, nrules, rulesets, weights, table, extra,
 			    indirect);
@@ -411,10 +606,18 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 	      goto free_and_return;
 	    }
 
-	  result = do_compare (&seq1, &seq2, position, weights);
+	  if (__glibc_unlikely (seq1.idxarr == NULL))
+	    result = do_compare_nocache (&seq1, &seq2, position, weights);
+	  else
+	    result = do_compare (&seq1, &seq2, position, weights);
 	  if (result != 0)
 	    goto free_and_return;
 	}
+
+      if (__glibc_likely (seq1.rulearr != NULL))
+	rule = seq1.rulearr[0];
+      else
+	rule = seq1.rule;
     }
 
   /* Free the memory if needed.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=eb5e4ca15296911a3052268da127ac5fe9cb8a5c

commit eb5e4ca15296911a3052268da127ac5fe9cb8a5c
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Date:   Tue Aug 20 08:40:05 2013 +0530

    Simplify strcoll implementation
    
    Break up strcoll into simpler functions so that the logic is easier to
    follow and maintain.

diff --git a/ChangeLog b/ChangeLog
index 9124a19..2394806 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2013-08-20  Siddhesh Poyarekar  <siddhesh@redhat.com>
+
+	* string/strcoll_l.c (coll_seq): New structure.
+	(get_next_seq_cached): New function.
+	(get_next_seq): New function.
+	(do_compare): New function.
+	(STRCOLL): Use GNU style definition.  Simplify implementation
+	by using get_next_seq, get_next_seq_cached and do_compare.
+
 2013-10-31  Andreas Schwab  <schwab@suse.de>
 
 	[BZ# 15917]
diff --git a/string/strcoll_l.c b/string/strcoll_l.c
index ecda08f..50ed84d 100644
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -41,11 +41,244 @@
 
 #include "../locale/localeinfo.h"
 
+/* Track status while looking for sequences in a string.  */
+typedef struct
+{
+  int len;			/* Length of the current sequence.  */
+  int val;			/* Position of the sequence relative to the
+				   previous non-ignored sequence.  */
+  size_t idxnow;		/* Current index in sequences.  */
+  size_t idxmax;		/* Maximum index in sequences.  */
+  size_t idxcnt;		/* Current count of indices.  */
+  size_t backw;			/* Current Backward sequence index.  */
+  size_t backw_stop;		/* Index where the backward sequences stop.  */
+  const USTRING_TYPE *us;	/* The string.  */
+  int32_t *idxarr;		/* Array to cache weight indices.  */
+  unsigned char *rulearr;	/* Array to cache rules.  */
+} coll_seq;
+
+/* Get next sequence.  The weight indices are cached, so we don't need to
+   traverse the string.  */
+static void
+get_next_seq_cached (coll_seq *seq, int nrules, int pass,
+		     const unsigned char *rulesets,
+		     const USTRING_TYPE *weights)
+{
+  int val = seq->val = 0;
+  int len = seq->len;
+  size_t backw_stop = seq->backw_stop;
+  size_t backw = seq->backw;
+  size_t idxcnt = seq->idxcnt;
+  size_t idxmax = seq->idxmax;
+  size_t idxnow = seq->idxnow;
+  unsigned char *rulearr = seq->rulearr;
+  int32_t *idxarr = seq->idxarr;
+
+  while (len == 0)
+    {
+      ++val;
+      if (backw_stop != ~0ul)
+	{
+	  /* There is something pushed.  */
+	  if (backw == backw_stop)
+	    {
+	      /* The last pushed character was handled.  Continue
+		 with forward characters.  */
+	      if (idxcnt < idxmax)
+		{
+		  idxnow = idxcnt;
+		  backw_stop = ~0ul;
+		}
+	      else
+		{
+		  /* Nothing any more.  The backward sequence
+		     ended with the last sequence in the string.  */
+		  idxnow = ~0ul;
+		  break;
+		}
+	    }
+	  else
+	    idxnow = --backw;
+	}
+      else
+	{
+	  backw_stop = idxcnt;
+
+	  while (idxcnt < idxmax)
+	    {
+	      if ((rulesets[rulearr[idxcnt] * nrules + pass]
+		   & sort_backward) == 0)
+		/* No more backward characters to push.  */
+		break;
+	      ++idxcnt;
+	    }
+
+	  if (backw_stop == idxcnt)
+	    {
+	      /* No sequence at all or just one.  */
+	      if (idxcnt == idxmax)
+		/* Note that LEN is still zero.  */
+		break;
+
+	      backw_stop = ~0ul;
+	      idxnow = idxcnt++;
+	    }
+	  else
+	    /* We pushed backward sequences.  */
+	    idxnow = backw = idxcnt - 1;
+	}
+      len = weights[idxarr[idxnow]++];
+    }
+
+  /* Update the structure.  */
+  seq->val = val;
+  seq->len = len;
+  seq->backw_stop = backw_stop;
+  seq->backw = backw;
+  seq->idxcnt = idxcnt;
+  seq->idxnow = idxnow;
+}
+
+/* Get next sequence.  Traverse the string as required.  */
+static void
+get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
+	      const USTRING_TYPE *weights, const int32_t *table,
+	      const USTRING_TYPE *extra, const int32_t *indirect)
+{
+#include WEIGHT_H
+  int val = seq->val = 0;
+  int len = seq->len;
+  size_t backw_stop = seq->backw_stop;
+  size_t backw = seq->backw;
+  size_t idxcnt = seq->idxcnt;
+  size_t idxmax = seq->idxmax;
+  size_t idxnow = seq->idxnow;
+  unsigned char *rulearr = seq->rulearr;
+  int32_t *idxarr = seq->idxarr;
+  const USTRING_TYPE *us = seq->us;
+
+  while (len == 0)
+    {
+      ++val;
+      if (backw_stop != ~0ul)
+	{
+	  /* The is something pushed.  */
+	  if (backw == backw_stop)
+	    {
+	      /* The last pushed character was handled.  Continue
+		 with forward characters.  */
+	      if (idxcnt < idxmax)
+		{
+		  idxnow = idxcnt;
+		  backw_stop = ~0ul;
+		}
+	      else
+		/* Nothing any more.  The backward sequence ended with
+		   the last sequence in the string.  Note that LEN
+		   is still zero.  */
+		break;
+	    }
+	  else
+	    idxnow = --backw;
+	}
+      else
+	{
+	  backw_stop = idxmax;
+
+	  while (*us != L('\0'))
+	    {
+	      int32_t tmp = findidx (&us, -1);
+	      rulearr[idxmax] = tmp >> 24;
+	      idxarr[idxmax] = tmp & 0xffffff;
+	      idxcnt = idxmax++;
+
+	      if ((rulesets[rulearr[idxcnt] * nrules]
+		   & sort_backward) == 0)
+		/* No more backward characters to push.  */
+		break;
+	      ++idxcnt;
+	    }
+
+	  if (backw_stop >= idxcnt)
+	    {
+	      /* No sequence at all or just one.  */
+	      if (idxcnt == idxmax || backw_stop > idxcnt)
+		/* Note that LEN is still zero.  */
+		break;
+
+	      backw_stop = ~0ul;
+	      idxnow = idxcnt;
+	    }
+	  else
+	    /* We pushed backward sequences.  */
+	    idxnow = backw = idxcnt - 1;
+	}
+      len = weights[idxarr[idxnow]++];
+    }
+
+  /* Update the structure.  */
+  seq->val = val;
+  seq->len = len;
+  seq->backw_stop = backw_stop;
+  seq->backw = backw;
+  seq->idxcnt = idxcnt;
+  seq->idxmax = idxmax;
+  seq->idxnow = idxnow;
+  seq->us = us;
+}
+
+/* Compare two sequences.  */
+static int
+do_compare (coll_seq *seq1, coll_seq *seq2, int position,
+	    const USTRING_TYPE *weights)
+{
+  int seq1len = seq1->len;
+  int seq2len = seq2->len;
+  int val1 = seq1->val;
+  int val2 = seq2->val;
+  int32_t *idx1arr = seq1->idxarr;
+  int32_t *idx2arr = seq2->idxarr;
+  int idx1now = seq1->idxnow;
+  int idx2now = seq2->idxnow;
+  int result = 0;
+
+  /* Test for position if necessary.  */
+  if (position && val1 != val2)
+    {
+      result = val1 - val2;
+      goto out;
+    }
+
+  /* Compare the two sequences.  */
+  do
+    {
+      if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
+	{
+	  /* The sequences differ.  */
+	  result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
+	  goto out;
+	}
+
+      /* Increment the offsets.  */
+      ++idx1arr[idx1now];
+      ++idx2arr[idx2now];
+
+      --seq1len;
+      --seq2len;
+    }
+  while (seq1len > 0 && seq2len > 0);
+
+  if (position && seq1len != seq2len)
+    result = seq1len - seq2len;
+
+out:
+  seq1->len = seq1len;
+  seq2->len = seq2len;
+  return result;
+}
+
 int
-STRCOLL (s1, s2, l)
-     const STRING_TYPE *s1;
-     const STRING_TYPE *s2;
-     __locale_t l;
+STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
 {
   struct __locale_data *current = l->__locales[LC_COLLATE];
   uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
@@ -56,34 +289,6 @@ STRCOLL (s1, s2, l)
   const USTRING_TYPE *weights;
   const USTRING_TYPE *extra;
   const int32_t *indirect;
-  uint_fast32_t pass;
-  int result = 0;
-  const USTRING_TYPE *us1;
-  const USTRING_TYPE *us2;
-  size_t s1len;
-  size_t s2len;
-  int32_t *idx1arr;
-  int32_t *idx2arr;
-  unsigned char *rule1arr;
-  unsigned char *rule2arr;
-  size_t idx1max;
-  size_t idx2max;
-  size_t idx1cnt;
-  size_t idx2cnt;
-  size_t idx1now;
-  size_t idx2now;
-  size_t backw1_stop;
-  size_t backw2_stop;
-  size_t backw1;
-  size_t backw2;
-  int val1;
-  int val2;
-  int position;
-  int seq1len;
-  int seq2len;
-  int use_malloc;
-
-#include WEIGHT_H
 
   if (nrules == 0)
     return STRCMP (s1, s2);
@@ -98,7 +303,6 @@ STRCOLL (s1, s2, l)
     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
   indirect = (const int32_t *)
     current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
-  use_malloc = 0;
 
   assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
   assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
@@ -106,18 +310,13 @@ STRCOLL (s1, s2, l)
   assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
 
   /* We need this a few times.  */
-  s1len = STRLEN (s1);
-  s2len = STRLEN (s2);
+  size_t s1len = STRLEN (s1);
+  size_t s2len = STRLEN (s2);
 
   /* Catch empty strings.  */
-  if (__builtin_expect (s1len == 0, 0) || __builtin_expect (s2len == 0, 0))
+  if (__glibc_unlikely (s1len == 0) || __glibc_unlikely (s2len == 0))
     return (s1len != 0) - (s2len != 0);
 
-  /* We need the elements of the strings as unsigned values since they
-     are used as indeces.  */
-  us1 = (const USTRING_TYPE *) s1;
-  us2 = (const USTRING_TYPE *) s2;
-
   /* Perform the first pass over the string and while doing this find
      and store the weights for each character.  Since we want this to
      be as fast as possible we are using `alloca' to store the temporary
@@ -127,411 +326,101 @@ STRCOLL (s1, s2, l)
 
      Please note that the localedef programs makes sure that `position'
      is not used at the first level.  */
+
+  coll_seq seq1, seq2;
+  bool use_malloc = false;
+  int result = 0;
+
+  memset (&seq1, 0, sizeof (seq1));
+  seq2 = seq1;
+
+  /* We need the elements of the strings as unsigned values since they
+     are used as indices.  */
+  seq1.us = (const USTRING_TYPE *) s1;
+  seq2.us = (const USTRING_TYPE *) s2;
+
   if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1)))
     {
-      idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
-      idx2arr = &idx1arr[s1len];
-      rule1arr = (unsigned char *) &idx2arr[s2len];
-      rule2arr = &rule1arr[s1len];
+      seq1.idxarr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
+      seq2.idxarr = &seq1.idxarr[s1len];
+      seq1.rulearr = (unsigned char *) &seq2.idxarr[s2len];
+      seq2.rulearr = &seq1.rulearr[s1len];
 
-      if (idx1arr == NULL)
+      if (seq1.idxarr == NULL)
 	/* No memory.  Well, go with the stack then.
 
 	   XXX Once this implementation is stable we will handle this
-	   differently.  Instead of precomputing the indeces we will
+	   differently.  Instead of precomputing the indices we will
 	   do this in time.  This means, though, that this happens for
 	   every pass again.  */
 	goto try_stack;
-      use_malloc = 1;
+      use_malloc = true;
     }
   else
     {
     try_stack:
-      idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
-      idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
-      rule1arr = (unsigned char *) alloca (s1len);
-      rule2arr = (unsigned char *) alloca (s2len);
+      seq1.idxarr = (int32_t *) alloca (s1len * sizeof (int32_t));
+      seq2.idxarr = (int32_t *) alloca (s2len * sizeof (int32_t));
+      seq1.rulearr = (unsigned char *) alloca (s1len);
+      seq2.rulearr = (unsigned char *) alloca (s2len);
     }
 
-  idx1cnt = 0;
-  idx2cnt = 0;
-  idx1max = 0;
-  idx2max = 0;
-  idx1now = 0;
-  idx2now = 0;
-  backw1_stop = ~0ul;
-  backw2_stop = ~0ul;
-  backw1 = ~0ul;
-  backw2 = ~0ul;
-  seq1len = 0;
-  seq2len = 0;
-  position = rulesets[0] & sort_position;
-  while (1)
-    {
-      val1 = 0;
-      val2 = 0;
-
-      /* Get the next non-IGNOREd element for string `s1'.  */
-      if (seq1len == 0)
-	do
-	  {
-	    ++val1;
-
-	    if (backw1_stop != ~0ul)
-	      {
-		/* The is something pushed.  */
-		if (backw1 == backw1_stop)
-		  {
-		    /* The last pushed character was handled.  Continue
-		       with forward characters.  */
-		    if (idx1cnt < idx1max)
-		      {
-			idx1now = idx1cnt;
-			backw1_stop = ~0ul;
-		      }
-		    else
-		      /* Nothing anymore.  The backward sequence ended with
-			 the last sequence in the string.  Note that seq1len
-			 is still zero.  */
-		      break;
-		  }
-		else
-		  idx1now = --backw1;
-	      }
-	    else
-	      {
-		backw1_stop = idx1max;
-
-		while (*us1 != L('\0'))
-		  {
-		    int32_t tmp = findidx (&us1, -1);
-		    rule1arr[idx1max] = tmp >> 24;
-		    idx1arr[idx1max] = tmp & 0xffffff;
-		    idx1cnt = idx1max++;
-
-		    if ((rulesets[rule1arr[idx1cnt] * nrules]
-			 & sort_backward) == 0)
-		      /* No more backward characters to push.  */
-		      break;
-		    ++idx1cnt;
-		  }
-
-		if (backw1_stop >= idx1cnt)
-		  {
-		    /* No sequence at all or just one.  */
-		    if (idx1cnt == idx1max || backw1_stop > idx1cnt)
-		      /* Note that seq1len is still zero.  */
-		      break;
-
-		    backw1_stop = ~0ul;
-		    idx1now = idx1cnt;
-		  }
-		else
-		  /* We pushed backward sequences.  */
-		  idx1now = backw1 = idx1cnt - 1;
-	      }
-	  }
-	while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
-
-      /* And the same for string `s2'.  */
-      if (seq2len == 0)
-	do
-	  {
-	    ++val2;
-
-	    if (backw2_stop != ~0ul)
-	      {
-		/* The is something pushed.  */
-		if (backw2 == backw2_stop)
-		  {
-		    /* The last pushed character was handled.  Continue
-		       with forward characters.  */
-		    if (idx2cnt < idx2max)
-		      {
-			idx2now = idx2cnt;
-			backw2_stop = ~0ul;
-		      }
-		    else
-		      /* Nothing anymore.  The backward sequence ended with
-			 the last sequence in the string.  Note that seq2len
-			 is still zero.  */
-		      break;
-		  }
-		else
-		  idx2now = --backw2;
-	      }
-	    else
-	      {
-		backw2_stop = idx2max;
-
-		while (*us2 != L('\0'))
-		  {
-		    int32_t tmp = findidx (&us2, -1);
-		    rule2arr[idx2max] = tmp >> 24;
-		    idx2arr[idx2max] = tmp & 0xffffff;
-		    idx2cnt = idx2max++;
-
-		    if ((rulesets[rule2arr[idx2cnt] * nrules]
-			 & sort_backward) == 0)
-		      /* No more backward characters to push.  */
-		      break;
-		    ++idx2cnt;
-		  }
-
-		if (backw2_stop >= idx2cnt)
-		  {
-		    /* No sequence at all or just one.  */
-		    if (idx2cnt == idx2max || backw2_stop > idx2cnt)
-		      /* Note that seq1len is still zero.  */
-		      break;
-
-		    backw2_stop = ~0ul;
-		    idx2now = idx2cnt;
-		  }
-		else
-		  /* We pushed backward sequences.  */
-		  idx2now = backw2 = idx2cnt - 1;
-	      }
-	  }
-	while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
-
-      /* See whether any or both strings are empty.  */
-      if (seq1len == 0 || seq2len == 0)
-	{
-	  if (seq1len == seq2len)
-	    /* Both ended.  So far so good, both strings are equal at the
-	       first level.  */
-	    break;
-
-	  /* This means one string is shorter than the other.  Find out
-	     which one and return an appropriate value.  */
-	  result = seq1len == 0 ? -1 : 1;
-	  goto free_and_return;
-	}
-
-      /* Test for position if necessary.  */
-      if (position && val1 != val2)
-	{
-	  result = val1 - val2;
-	  goto free_and_return;
-	}
-
-      /* Compare the two sequences.  */
-      do
-	{
-	  if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
-	    {
-	      /* The sequences differ.  */
-	      result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
-	      goto free_and_return;
-	    }
-
-	  /* Increment the offsets.  */
-	  ++idx1arr[idx1now];
-	  ++idx2arr[idx2now];
+  seq1.rulearr[0] = 0;
 
-	  --seq1len;
-	  --seq2len;
-	}
-      while (seq1len > 0 && seq2len > 0);
-
-      if (position && seq1len != seq2len)
-	{
-	  result = seq1len - seq2len;
-	  goto free_and_return;
-	}
-    }
-
-  /* Now the remaining passes over the weights.  We now use the
-     indeces we found before.  */
-  for (pass = 1; pass < nrules; ++pass)
+  /* Cache values in the first pass and if needed, use them in subsequent
+     passes.  */
+  for (int pass = 0; pass < nrules; ++pass)
     {
+      seq1.idxcnt = 0;
+      seq1.backw_stop = ~0ul;
+      seq1.backw = ~0ul;
+      seq2.idxcnt = 0;
+      seq2.backw_stop = ~0ul;
+      seq2.backw = ~0ul;
+
       /* We assume that if a rule has defined `position' in one section
 	 this is true for all of them.  */
-      idx1cnt = 0;
-      idx2cnt = 0;
-      backw1_stop = ~0ul;
-      backw2_stop = ~0ul;
-      backw1 = ~0ul;
-      backw2 = ~0ul;
-      position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
+      int position = rulesets[seq1.rulearr[0] * nrules + pass] & sort_position;
 
       while (1)
 	{
-	  val1 = 0;
-	  val2 = 0;
-
-	  /* Get the next non-IGNOREd element for string `s1'.  */
-	  if (seq1len == 0)
-	    do
-	      {
-		++val1;
-
-		if (backw1_stop != ~0ul)
-		  {
-		    /* The is something pushed.  */
-		    if (backw1 == backw1_stop)
-		      {
-			/* The last pushed character was handled.  Continue
-			   with forward characters.  */
-			if (idx1cnt < idx1max)
-			  {
-			    idx1now = idx1cnt;
-			    backw1_stop = ~0ul;
-			  }
-			else
-			  {
-			    /* Nothing anymore.  The backward sequence
-			       ended with the last sequence in the string.  */
-			    idx1now = ~0ul;
-			    break;
-			  }
-		      }
-		    else
-		      idx1now = --backw1;
-		  }
-		else
-		  {
-		    backw1_stop = idx1cnt;
-
-		    while (idx1cnt < idx1max)
-		      {
-			if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
-			     & sort_backward) == 0)
-			  /* No more backward characters to push.  */
-			  break;
-			++idx1cnt;
-		      }
-
-		    if (backw1_stop == idx1cnt)
-		      {
-			/* No sequence at all or just one.  */
-			if (idx1cnt == idx1max)
-			  /* Note that seq1len is still zero.  */
-			  break;
-
-			backw1_stop = ~0ul;
-			idx1now = idx1cnt++;
-		      }
-		    else
-		      /* We pushed backward sequences.  */
-		      idx1now = backw1 = idx1cnt - 1;
-		  }
-	      }
-	    while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
-
-	  /* And the same for string `s2'.  */
-	  if (seq2len == 0)
-	    do
-	      {
-		++val2;
-
-		if (backw2_stop != ~0ul)
-		  {
-		    /* The is something pushed.  */
-		    if (backw2 == backw2_stop)
-		      {
-			/* The last pushed character was handled.  Continue
-			   with forward characters.  */
-			if (idx2cnt < idx2max)
-			  {
-			    idx2now = idx2cnt;
-			    backw2_stop = ~0ul;
-			  }
-			else
-			  {
-			    /* Nothing anymore.  The backward sequence
-			       ended with the last sequence in the string.  */
-			    idx2now = ~0ul;
-			    break;
-			  }
-		      }
-		    else
-		      idx2now = --backw2;
-		  }
-		else
-		  {
-		    backw2_stop = idx2cnt;
-
-		    while (idx2cnt < idx2max)
-		      {
-			if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
-			     & sort_backward) == 0)
-			  /* No more backward characters to push.  */
-			  break;
-			++idx2cnt;
-		      }
-
-		    if (backw2_stop == idx2cnt)
-		      {
-			/* No sequence at all or just one.  */
-			if (idx2cnt == idx2max)
-			  /* Note that seq2len is still zero.  */
-			  break;
-
-			backw2_stop = ~0ul;
-			idx2now = idx2cnt++;
-		      }
-		    else
-		      /* We pushed backward sequences.  */
-		      idx2now = backw2 = idx2cnt - 1;
-		  }
-	      }
-	    while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
+	  if (pass == 0)
+	    {
+	      get_next_seq (&seq1, nrules, rulesets, weights, table, extra,
+			    indirect);
+	      get_next_seq (&seq2, nrules, rulesets, weights, table, extra,
+			    indirect);
+	    }
+	  else
+	    {
+	      get_next_seq_cached (&seq1, nrules, pass, rulesets, weights);
+	      get_next_seq_cached (&seq2, nrules, pass, rulesets, weights);
+	    }
 
 	  /* See whether any or both strings are empty.  */
-	  if (seq1len == 0 || seq2len == 0)
+	  if (seq1.len == 0 || seq2.len == 0)
 	    {
-	      if (seq1len == seq2len)
+	      if (seq1.len == seq2.len)
 		/* Both ended.  So far so good, both strings are equal
 		   at this level.  */
 		break;
 
 	      /* This means one string is shorter than the other.  Find out
 		 which one and return an appropriate value.  */
-	      result = seq1len == 0 ? -1 : 1;
+	      result = seq1.len == 0 ? -1 : 1;
 	      goto free_and_return;
 	    }
 
-	  /* Test for position if necessary.  */
-	  if (position && val1 != val2)
-	    {
-	      result = val1 - val2;
-	      goto free_and_return;
-	    }
-
-	  /* Compare the two sequences.  */
-	  do
-	    {
-	      if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
-		{
-		  /* The sequences differ.  */
-		  result = (weights[idx1arr[idx1now]]
-			    - weights[idx2arr[idx2now]]);
-		  goto free_and_return;
-		}
-
-	      /* Increment the offsets.  */
-	      ++idx1arr[idx1now];
-	      ++idx2arr[idx2now];
-
-	      --seq1len;
-	      --seq2len;
-	    }
-	  while (seq1len > 0 && seq2len > 0);
-
-	  if (position && seq1len != seq2len)
-	    {
-	      result = seq1len - seq2len;
-	      goto free_and_return;
-	    }
+	  result = do_compare (&seq1, &seq2, position, weights);
+	  if (result != 0)
+	    goto free_and_return;
 	}
     }
 
   /* Free the memory if needed.  */
  free_and_return:
   if (use_malloc)
-    free (idx1arr);
+    free (seq1.idxarr);
 
   return result;
 }

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=92f45f230c8abdf91195f2f7e4f003754755a0c6

commit 92f45f230c8abdf91195f2f7e4f003754755a0c6
Author: Andreas Schwab <schwab@suse.de>
Date:   Thu Oct 31 12:51:03 2013 +0100

    Fix parsing of 0e+0 as float

diff --git a/ChangeLog b/ChangeLog
index 2404b75..9124a19 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2013-10-31  Andreas Schwab  <schwab@suse.de>
+
+	[BZ #15917]
+	* stdio-common/vfscanf.c (_IO_vfwscanf): Handle leading '0' not
+	followed by 'x' as part of digit sequence.
+	* stdio-common/tst-sscanf.c (double_tests2): New tests.
+
 2013-10-04  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/strcpy.S (strcpy): Add word load/store
diff --git a/NEWS b/NEWS
index c70f333..8b228b0 100644
--- a/NEWS
+++ b/NEWS
@@ -9,8 +9,8 @@ Version 2.18.1
 
 * The following bugs are resolved with this release:
 
-  14155, 14699, 15532, 15427, 15522, 15797, 15892, 15895, 15909, 15996,
-  16072, 16150.
+  14155, 14699, 15532, 15427, 15522, 15797, 15892, 15895, 15909, 15917,
+  15996, 16072, 16150.
 
 * CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
   to the d_name member of struct dirent, or omit the terminating NUL
diff --git a/stdio-common/tst-sscanf.c b/stdio-common/tst-sscanf.c
index 1edb227..3c34f58 100644
--- a/stdio-common/tst-sscanf.c
+++ b/stdio-common/tst-sscanf.c
@@ -109,6 +109,19 @@ struct test double_tests[] =
   { L("-inf"), L("%g"), 1 }
 };
 
+struct test2
+{
+  const CHAR *str;
+  const CHAR *fmt;
+  int retval;
+  char residual;
+} double_tests2[] =
+{
+  { L("0e+0"), L("%g%c"), 1, 0 },
+  { L("0xe+0"), L("%g%c"), 2, '+' },
+  { L("0x.e+0"), L("%g%c"), 2, '+' },
+};
+
 int
 main (void)
 {
@@ -196,5 +209,26 @@ main (void)
 	}
     }
 
+  for (i = 0; i < sizeof (double_tests2) / sizeof (double_tests2[0]); ++i)
+    {
+      double dummy;
+      int ret;
+      char c = 0;
+
+      if ((ret = SSCANF (double_tests2[i].str, double_tests2[i].fmt,
+			 &dummy, &c)) != double_tests2[i].retval)
+	{
+	  printf ("double_tests2[%d] returned %d != %d\n",
+		  i, ret, double_tests2[i].retval);
+	  result = 1;
+	}
+      else if (ret == 2 && c != double_tests2[i].residual)
+	{
+	  printf ("double_tests2[%d] stopped at '%c' != '%c'\n",
+		  i, c, double_tests2[i].residual);
+	  result = 1;
+	}
+    }
+
   return result;
 }
diff --git a/stdio-common/vfscanf.c b/stdio-common/vfscanf.c
index 3430567..093c8b0 100644
--- a/stdio-common/vfscanf.c
+++ b/stdio-common/vfscanf.c
@@ -1966,6 +1966,8 @@ _IO_vfscanf_internal (_IO_FILE *s, const char *format, _IO_va_list argptr,
 		  if (width > 0)
 		    --width;
 		}
+	      else
+		got_digit = 1;
 	    }
 
 	  while (1)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2849f111a87a5e9c715fb8379163869781dc9ad7

commit 2849f111a87a5e9c715fb8379163869781dc9ad7
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Thu Sep 26 09:29:19 2013 -0500

    PowerPC: strcpy/stpcpy optimization for PPC64/POWER7
    
    This patch intends to unify both strcpy and stpcpy implementations
    for PPC64 and PPC64/POWER7. The idea of the default powerpc64
    implementation is to provide both doubleword and word aligned memory
    access.
    
    For PPC64/POWER7 it also provides doubleword and word memory access,
    removes the branch hints, uses the cmpb instruction to compare
    doubleword/words, and adds an optimization for inputs of same alignment.

diff --git a/ChangeLog b/ChangeLog
index 0afcd3f..2404b75 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2013-10-04  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/strcpy.S (strcpy): Add word load/store
+	to provide a boost for large inputs with word alignment.
+	* sysdeps/powerpc/powerpc64/stpcpy.S (__stpcpy): Rewrite
+	implementation based on optimized PPC64 strcpy.
+	* sysdeps/powerpc/powerpc64/power7/strcpy.S: New file: optimized
+	strcpy for PPC64/POWER7 based on both doubleword and word load/store.
+	* sysdeps/powerpc/powerpc64/power7/stpcpy.S: New file: optimized
+	stpcpy for PPC64/POWER7 based on PPC64/POWER7 strcpy.
+
 2013-10-25  Siddhesh Poyarekar  <siddhesh@redhat.com>
 
 <<<<<<< HEAD
diff --git a/sysdeps/powerpc/powerpc64/power7/stpcpy.S b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
new file mode 100644
index 0000000..727dd06
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/stpcpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpcpy implementation for PowerPC64/POWER7.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/power7/strcpy.S>
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
new file mode 100644
index 0000000..5c341a1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
@@ -0,0 +1,274 @@
+/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER7.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+   or
+
+   char * [r3] stpcpy (char *dest [r3], const char *src [r4])
+
+   if USE_AS_STPCPY is defined. It tries to use aligned memory accesses
+   when possible using the following algorithm:
+
+   if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
+     goto aligned_doubleword_copy;
+   if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
+     goto aligned_word_copy;
+   if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
+     goto same_alignment;
+   goto unaligned;
+
+   The aligned comparisons are made using cmpb instructions.  */
+
+#ifdef USE_AS_STPCPY
+# define FUNC_NAME __stpcpy
+#else
+# define FUNC_NAME strcpy
+#endif
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+	CALL_MCOUNT 2
+
+#define rTMP	r0
+#ifdef USE_AS_STPCPY
+#define rRTN	r3	/* pointer to previous word/doubleword in dest */
+#else
+#define rRTN	r12	/* pointer to previous word/doubleword in dest */
+#endif
+#define rSRC	r4	/* pointer to previous word/doubleword in src */
+#define rMASK	r5	/* mask 0xffffffff | 0xffffffffffffffff */
+#define rWORD	r6	/* current word from src */
+#define rALT	r7	/* alternate word from src */
+#define rRTNAL	r8	/* alignment of return pointer */
+#define rSRCAL	r9	/* alignment of source pointer */
+#define rALCNT	r10	/* bytes to read to reach 8 bytes alignment */
+#define rSUBAL	r11	/* doubleword minus unaligned displacement */
+
+#ifndef USE_AS_STPCPY
+/* Save the dst pointer to use as return value.  */
+	mr	rRTN, r3
+#endif
+	or	rTMP, rSRC, rRTN
+	clrldi.	rTMP, rTMP, 61
+	bne	L(check_word_alignment)
+	b	L(aligned_doubleword_copy)
+
+L(same_alignment):
+/* Src and dst with same alignment: align both to doubleword.  */
+	mr	rALCNT, rRTN
+	lbz	rWORD, 0(rSRC)
+	subfic	rSUBAL, rRTNAL, 8
+	addi	rRTN, rRTN, 1
+	addi	rSRC, rSRC, 1
+	cmpdi	cr7, rWORD, 0
+	stb	rWORD, 0(rALCNT)
+	beq	cr7, L(s2)
+
+	add	rALCNT, rALCNT, rSUBAL
+	subf	rALCNT, rRTN, rALCNT
+	addi	rALCNT, rALCNT, 1
+	mtctr	rALCNT
+	b	L(s1)
+
+	.align 4
+L(s0):
+	addi	rSRC, rSRC, 1
+	lbz	rWORD, -1(rSRC)
+	cmpdi	cr7, rWORD, 0
+	stb	rWORD, -1(rALCNT)
+	beqlr	cr7
+	mr	rRTN, rALCNT
+L(s1):
+	addi	rALCNT, rRTN,1
+	bdnz	L(s0)
+	b L(aligned_doubleword_copy)
+	.align 4
+L(s2):
+	mr	rRTN, rALCNT
+	blr
+
+/* For doubleword aligned memory, operate using doubleword load and stores.  */
+	.align 4
+L(aligned_doubleword_copy):
+	li	rMASK, 0
+	addi	rRTN, rRTN, -8
+	ld	rWORD, 0(rSRC)
+	b	L(g2)
+
+	.align 4
+L(g0):	ldu	rALT, 8(rSRC)
+	stdu	rWORD, 8(rRTN)
+	cmpb	rTMP, rALT, rMASK
+	cmpdi	rTMP, 0
+	bne	L(g1)
+	ldu	rWORD, 8(rSRC)
+	stdu	rALT, 8(rRTN)
+L(g2):	cmpb	rTMP, rWORD, rMASK
+	cmpdi	rTMP, 0		/* If rTMP is 0, no null's have been found.  */
+	beq	L(g0)
+
+	mr	rALT, rWORD
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+	extrdi.	rTMP, rALT, 8, 56
+	stbu	rALT, 8(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 48
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 40
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 32
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 24
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 16
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 8
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	extrdi	rTMP, rALT, 8, 0
+	stbu	rTMP, 1(rRTN)
+#else
+	extrdi.	rTMP, rALT, 8, 0
+	stbu	rTMP, 8(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 8
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 16
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 24
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 32
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 40
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	extrdi.	rTMP, rALT, 8, 48
+	stbu	rTMP, 1(rRTN)
+	beqlr
+	stbu	rALT, 1(rRTN)
+#endif
+	blr
+
+L(check_word_alignment):
+	clrldi. rTMP, rTMP, 62
+	beq	L(aligned_word_copy)
+	rldicl	rRTNAL, rRTN, 0, 61
+	rldicl	rSRCAL, rSRC, 0, 61
+	cmpld	cr7, rSRCAL, rRTNAL
+	beq	cr7, L(same_alignment)
+	b	L(unaligned)
+
+/* For word aligned memory, operate using word load and stores.  */
+	.align	4
+L(aligned_word_copy):
+	li	rMASK, 0
+	addi	rRTN, rRTN, -4
+	lwz	rWORD, 0(rSRC)
+	b	L(g5)
+
+	.align	4
+L(g3):	lwzu	rALT, 4(rSRC)
+	stwu	rWORD, 4(rRTN)
+	cmpb	rTMP, rALT, rMASK
+	cmpwi	rTMP, 0
+	bne	L(g4)
+	lwzu	rWORD, 4(rSRC)
+	stwu	rALT, 4(rRTN)
+L(g5):	cmpb	rTMP, rWORD, rMASK
+	cmpwi	rTMP, 0		/* If rTMP is 0, no null in word.  */
+	beq	L(g3)
+
+	mr      rALT, rWORD
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g4):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stbu	rALT, 4(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 1(rRTN)
+#else
+	rlwinm. rTMP, rALT, 8, 24, 31
+	stbu    rTMP, 4(rRTN)
+	beqlr
+	rlwinm. rTMP, rALT, 16, 24, 31
+	stbu    rTMP, 1(rRTN)
+	beqlr
+	rlwinm. rTMP, rALT, 24, 24, 31
+	stbu    rTMP, 1(rRTN)
+	beqlr
+	stbu    rALT, 1(rRTN)
+#endif
+	blr
+
+/* Oh well.  In this case, we just do a byte-by-byte copy.  */
+	.align	4
+L(unaligned):
+	lbz	rWORD, 0(rSRC)
+	addi	rRTN, rRTN, -1
+	cmpdi	rWORD, 0
+	beq	L(u2)
+
+	.align 	5
+L(u0):	lbzu	rALT, 1(rSRC)
+	stbu	rWORD, 1(rRTN)
+	cmpdi	rALT, 0
+	beq	L(u1)
+	lbzu	rWORD, 1(rSRC)
+	stbu	rALT, 1(rRTN)
+	cmpdi	rWORD, 0
+	beq	L(u2)
+	lbzu	rALT, 1(rSRC)
+	stbu	rWORD, 1(rRTN)
+	cmpdi	rALT, 0
+	beq	L(u1)
+	lbzu	rWORD, 1(rSRC)
+	stbu	rALT, 1(rRTN)
+	cmpdi	rWORD, 0
+	bne	L(u0)
+L(u2):	stbu	rWORD, 1(rRTN)
+	blr
+L(u1):	stbu	rALT, 1(rRTN)
+	blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/stpcpy.S b/sysdeps/powerpc/powerpc64/stpcpy.S
index c0b3972..09aa3be 100644
--- a/sysdeps/powerpc/powerpc64/stpcpy.S
+++ b/sysdeps/powerpc/powerpc64/stpcpy.S
@@ -16,103 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
-/* See strlen.s for comments on how the end-of-string testing works.  */
-
-/* char * [r3] stpcpy (char *dest [r3], const char *src [r4])  */
-
-EALIGN (__stpcpy, 4, 0)
-	CALL_MCOUNT 2
-
-#define rTMP	r0
-#define rRTN	r3
-#define rDEST	r3		/* pointer to previous word in dest */
-#define rSRC	r4		/* pointer to previous word in src */
-#define rWORD	r6		/* current word from src */
-#define rFEFE	r7		/* 0xfefefeff */
-#define r7F7F	r8		/* 0x7f7f7f7f */
-#define rNEG	r9		/* ~(word in src | 0x7f7f7f7f) */
-#define rALT	r10		/* alternate word from src */
-
-	or	rTMP, rSRC, rDEST
-	clrldi.	rTMP, rTMP, 62
-	addi	rDEST, rDEST, -4
-	bne	L(unaligned)
-
-	lis	rFEFE, -0x101
-	lis	r7F7F, 0x7f7f
-	lwz	rWORD, 0(rSRC)
-	addi	rFEFE, rFEFE, -0x101
-	addi	r7F7F, r7F7F, 0x7f7f
-	b	L(g2)
-
-L(g0):	lwzu	rALT, 4(rSRC)
-	stwu	rWORD, 4(rDEST)
-	add	rTMP, rFEFE, rALT
-	nor	rNEG, r7F7F, rALT
-	and.	rTMP, rTMP, rNEG
-	bne-	L(g1)
-	lwzu	rWORD, 4(rSRC)
-	stwu	rALT, 4(rDEST)
-L(g2):	add	rTMP, rFEFE, rWORD
-	nor	rNEG, r7F7F, rWORD
-	and.	rTMP, rTMP, rNEG
-	beq+	L(g0)
-
-	mr	rALT, rWORD
-/* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g1):
-#ifdef __LITTLE_ENDIAN__
-	rlwinm.	rTMP, rALT, 0, 24, 31
-	stbu	rALT, 4(rDEST)
-	beqlr-
-	rlwinm.	rTMP, rALT, 24, 24, 31
-	stbu	rTMP, 1(rDEST)
-	beqlr-
-	rlwinm.	rTMP, rALT, 16, 24, 31
-	stbu	rTMP, 1(rDEST)
-	beqlr-
-	rlwinm	rTMP, rALT, 8, 24, 31
-	stbu	rTMP, 1(rDEST)
-	blr
-#else
-	rlwinm.	rTMP, rALT, 8, 24, 31
-	stbu	rTMP, 4(rDEST)
-	beqlr-
-	rlwinm.	rTMP, rALT, 16, 24, 31
-	stbu	rTMP, 1(rDEST)
-	beqlr-
-	rlwinm.	rTMP, rALT, 24, 24, 31
-	stbu	rTMP, 1(rDEST)
-	beqlr-
-	stbu	rALT, 1(rDEST)
-	blr
-#endif
-
-/* Oh well.  In this case, we just do a byte-by-byte copy.  */
-	.align 4
-	nop
-L(unaligned):
-	lbz	rWORD, 0(rSRC)
-	addi	rDEST, rDEST, 3
-	cmpwi	rWORD, 0
-	beq-	L(u2)
-
-L(u0):	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rDEST)
-	cmpwi	rALT, 0
-	beq-	L(u1)
-	nop		/* Let 601 load start of loop.  */
-	lbzu	rWORD, 1(rSRC)
-	stbu	rALT, 1(rDEST)
-	cmpwi	rWORD, 0
-	bne+	L(u0)
-L(u2):	stbu	rWORD, 1(rDEST)
-	blr
-L(u1):	stbu	rALT, 1(rDEST)
-	blr
-END (__stpcpy)
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/strcpy.S>
 
 weak_alias (__stpcpy, stpcpy)
 libc_hidden_def (__stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/strcpy.S b/sysdeps/powerpc/powerpc64/strcpy.S
index a7fd85b..793325d 100644
--- a/sysdeps/powerpc/powerpc64/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/strcpy.S
@@ -22,25 +22,38 @@
 
 /* char * [r3] strcpy (char *dest [r3], const char *src [r4])  */
 
-EALIGN (strcpy, 4, 0)
+#ifdef USE_AS_STPCPY
+# define FUNC_NAME __stpcpy
+#else
+# define FUNC_NAME strcpy
+#endif
+
+EALIGN (FUNC_NAME, 4, 0)
 	CALL_MCOUNT 2
 
 #define rTMP	r0
-#define rRTN	r3	/* incoming DEST arg preserved as result */
-#define rSRC	r4	/* pointer to previous word in src */
-#define rDEST	r5	/* pointer to previous word in dest */
+#ifdef USE_AS_STPCPY
+#define rRTN    r3      /* pointer to previous word/doubleword in dest */
+#else
+#define rRTN    r12     /* pointer to previous word/doubleword in dest */
+#endif
+#define rSRC	r4	/* pointer to previous word/doubleword in src */
 #define rWORD	r6	/* current word from src */
-#define rFEFE	r7	/* constant 0xfefefefefefefeff (-0x0101010101010101) */
-#define r7F7F	r8	/* constant 0x7f7f7f7f7f7f7f7f */
-#define rNEG	r9	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
+#define rFEFE	r7	/* constant 0xfefefeff | 0xfefefefefefefeff */
+#define r7F7F	r8	/* constant 0x7f7f7f7f | 0x7f7f7f7f7f7f7f7f */
+#define rNEG	r9	/* ~(word in s1 | r7F7F) */
 #define rALT	r10	/* alternate word from src */
 
-	dcbt	0,rSRC
+#ifndef USE_AS_STPCPY
+/* Save the dst pointer to use as return value.  */
+	mr      rRTN, r3
+#endif
 	or	rTMP, rSRC, rRTN
 	clrldi.	rTMP, rTMP, 61
-	addi	rDEST, rRTN, -8
-	dcbtst	0,rRTN
-	bne	L(unaligned)
+	bne	L(check_word_alignment)
+
+/* For doubleword aligned memory, operate using doubleword load and stores.  */
+	addi	rRTN, rRTN, -8
 
 	lis	rFEFE, -0x101
 	lis	r7F7F, 0x7f7f
@@ -53,13 +66,13 @@ EALIGN (strcpy, 4, 0)
 	b	L(g2)
 
 L(g0):	ldu	rALT, 8(rSRC)
-	stdu	rWORD, 8(rDEST)
+	stdu	rWORD, 8(rRTN)
 	add	rTMP, rFEFE, rALT
 	nor	rNEG, r7F7F, rALT
 	and.	rTMP, rTMP, rNEG
 	bne-	L(g1)
 	ldu	rWORD, 8(rSRC)
-	stdu	rALT, 8(rDEST)
+	stdu	rALT, 8(rRTN)
 L(g2):	add	rTMP, rFEFE, rWORD
 	nor	rNEG, r7F7F, rWORD
 	and.	rTMP, rTMP, rNEG
@@ -70,77 +83,134 @@ L(g2):	add	rTMP, rFEFE, rWORD
 L(g1):
 #ifdef __LITTLE_ENDIAN__
 	extrdi.	rTMP, rALT, 8, 56
-	stb	rALT, 8(rDEST)
+	stbu	rALT, 8(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 48
-	stb	rTMP, 9(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 40
-	stb	rTMP, 10(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 32
-	stb	rTMP, 11(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 24
-	stb	rTMP, 12(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 16
-	stb	rTMP, 13(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 8
-	stb	rTMP, 14(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi	rTMP, rALT, 8, 0
-	stb	rTMP, 15(rDEST)
-	blr
+	stbu	rTMP, 1(rRTN)
 #else
 	extrdi.	rTMP, rALT, 8, 0
-	stb	rTMP, 8(rDEST)
+	stbu	rTMP, 8(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 8
-	stb	rTMP, 9(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 16
-	stb	rTMP, 10(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 24
-	stb	rTMP, 11(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 32
-	stb	rTMP, 12(rDEST)
-	beqlr-
+	stbu	rTMP, 1(rRTN)
+	beqlr
 	extrdi.	rTMP, rALT, 8, 40
-	stb	rTMP, 13(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
 	extrdi.	rTMP, rALT, 8, 48
-	stb	rTMP, 14(rDEST)
+	stbu	rTMP, 1(rRTN)
 	beqlr-
-	stb	rALT, 15(rDEST)
+	stbu	rALT, 1(rRTN)
+#endif
 	blr
+
+L(check_word_alignment):
+	clrldi. rTMP, rTMP, 62
+	bne     L(unaligned)
+
+/* For word aligned memory, operate using word load and stores.  */
+	addi	rRTN, rRTN, -4
+
+	lis	rFEFE, -0x101
+	lis	r7F7F, 0x7f7f
+	lwz	rWORD, 0(rSRC)
+	addi	rFEFE, rFEFE, -0x101
+	addi	r7F7F, r7F7F, 0x7f7f
+	b	L(g5)
+
+L(g3):	lwzu	rALT, 4(rSRC)
+	stwu	rWORD, 4(rRTN)
+	add	rTMP, rFEFE, rALT
+	nor	rNEG, r7F7F, rALT
+	and.	rTMP, rTMP, rNEG
+	bne-	L(g4)
+	lwzu	rWORD, 4(rSRC)
+	stwu	rALT, 4(rRTN)
+L(g5):	add	rTMP, rFEFE, rWORD
+	nor	rNEG, r7F7F, rWORD
+	and.	rTMP, rTMP, rNEG
+	beq+	L(g3)
+
+	mr	rALT, rWORD
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(g4):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stbu	rALT, 4(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 1(rRTN)
+#else
+	rlwinm.	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 4(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rRTN)
+	beqlr-
+	stbu	rALT, 1(rRTN)
 #endif
+	blr
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4
 	nop
 L(unaligned):
 	lbz	rWORD, 0(rSRC)
-	addi	rDEST, rRTN, -1
+	addi	rRTN, rRTN, -1
 	cmpwi	rWORD, 0
 	beq-	L(u2)
 
 L(u0):	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rDEST)
+	stbu	rWORD, 1(rRTN)
 	cmpwi	rALT, 0
 	beq-	L(u1)
 	nop		/* Let 601 load start of loop.  */
 	lbzu	rWORD, 1(rSRC)
-	stbu	rALT, 1(rDEST)
+	stbu	rALT, 1(rRTN)
 	cmpwi	rWORD, 0
 	bne+	L(u0)
-L(u2):	stb	rWORD, 1(rDEST)
+L(u2):	stbu	rWORD, 1(rRTN)
 	blr
-L(u1):	stb	rALT, 1(rDEST)
+L(u1):	stbu	rALT, 1(rRTN)
 	blr
+END (FUNC_NAME)
 
-END (strcpy)
+#ifndef USE_AS_STPCPY
 libc_hidden_builtin_def (strcpy)
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6f4b109567445b1ed70300bb0cb1752cb9998ec9

commit 6f4b109567445b1ed70300bb0cb1752cb9998ec9
Author: Siddhesh Poyarekar <siddhesh@redhat.com>
Date:   Fri Oct 25 10:22:12 2013 +0530

    Fix stack overflow due to large AF_INET6 requests
    
    Resolves #16072 (CVE-2013-4458).
    
    This patch fixes another stack overflow in getaddrinfo when it is
    called with AF_INET6.  The AF_UNSPEC case was fixed as CVE-2013-1914,
    but the AF_INET6 case went undetected back then.

diff --git a/ChangeLog b/ChangeLog
index b970fe1..0afcd3f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,6 @@
-2013-10-10  Joseph Myers  <joseph@codsourcery.com>
+2013-10-25  Siddhesh Poyarekar  <siddhesh@redhat.com>
 
+<<<<<<< HEAD
 	* sysdeps/ieee754/ldbl-128ibm/e_acosl.c (__ieee754_acosl): Check
 	for NaNs before doing comparisons on argument.
 	* sysdeps/ieee754/ldbl-128ibm/e_asinl.c (__ieee754_asinl):
@@ -490,8 +491,12 @@
 
 	* sysdeps/unix/sysv/linux/tst-fanotify.c: New test.
 	* sysdeps/unix/sysv/linux/Makefile (tests): Add tst-fanotify.
+=======
+	[BZ #16072]
+	* sysdeps/posix/getaddrinfo.c (gethosts): Allocate tmpbuf on
+	heap for large requests.
+>>>>>>> 6f95434... Fix stack overflow due to large AF_INET6 requests
 
->>>>>>> ffa3cd7... Fix lgammaf spurious underflow (bug 15427).
 2013-09-02  Joseph Myers  <joseph@codesourcery.com>
 
 	[BZ #14155]
diff --git a/NEWS b/NEWS
index b25af8e..c70f333 100644
--- a/NEWS
+++ b/NEWS
@@ -10,11 +10,14 @@ Version 2.18.1
 * The following bugs are resolved with this release:
 
   14155, 14699, 15532, 15427, 15522, 15797, 15892, 15895, 15909, 15996,
-  16150.
+  16072, 16150.
 
 * CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
   to the d_name member of struct dirent, or omit the terminating NUL
   character.  (Bugzilla #14699).
+
+* CVE-2013-4458 Stack overflow in getaddrinfo with large number of results
+  for AF_INET6 has been fixed (Bugzilla #16072).
 
 Version 2.18
 
diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
index 7bb3ded..2e97255 100644
--- a/sysdeps/posix/getaddrinfo.c
+++ b/sysdeps/posix/getaddrinfo.c
@@ -197,7 +197,22 @@ gaih_inet_serv (const char *servicename, const struct gaih_typeproto *tp,
 				&rc, &herrno, NULL, &localcanon));	      \
     if (rc != ERANGE || herrno != NETDB_INTERNAL)			      \
       break;								      \
-    tmpbuf = extend_alloca (tmpbuf, tmpbuflen, 2 * tmpbuflen);		      \
+    if (!malloc_tmpbuf && __libc_use_alloca (alloca_used + 2 * tmpbuflen))    \
+      tmpbuf = extend_alloca_account (tmpbuf, tmpbuflen, 2 * tmpbuflen,	      \
+				      alloca_used);			      \
+    else								      \
+      {									      \
+	char *newp = realloc (malloc_tmpbuf ? tmpbuf : NULL,		      \
+			      2 * tmpbuflen);				      \
+	if (newp == NULL)						      \
+	  {								      \
+	    result = -EAI_MEMORY;					      \
+	    goto free_and_return;					      \
+	  }								      \
+	tmpbuf = newp;							      \
+	malloc_tmpbuf = true;						      \
+	tmpbuflen = 2 * tmpbuflen;					      \
+      }									      \
   }									      \
   if (status == NSS_STATUS_SUCCESS && rc == 0)				      \
     h = &th;								      \
@@ -209,7 +224,8 @@ gaih_inet_serv (const char *servicename, const struct gaih_typeproto *tp,
 	{								      \
 	  __set_h_errno (herrno);					      \
 	  _res.options |= old_res_options & RES_USE_INET6;		      \
-	  return -EAI_SYSTEM;						      \
+	  result = -EAI_SYSTEM;						      \
+	  goto free_and_return;						      \
 	}								      \
       if (herrno == TRY_AGAIN)						      \
 	no_data = EAI_AGAIN;						      \

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=52aafee302d34ddd2afb6bea31d838e61555e154

commit 52aafee302d34ddd2afb6bea31d838e61555e154
Author: Joseph Myers <joseph@codesourcery.com>
Date:   Thu Oct 10 19:11:30 2013 +0000

    Avoid ordered comparisons of NaNs in ldbl-128ibm acosl and asinl.

diff --git a/ChangeLog b/ChangeLog
index bf099c9..b970fe1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2013-10-10  Joseph Myers  <joseph@codsourcery.com>
+
+	* sysdeps/ieee754/ldbl-128ibm/e_acosl.c (__ieee754_acosl): Check
+	for NaNs before doing comparisons on argument.
+	* sysdeps/ieee754/ldbl-128ibm/e_asinl.c (__ieee754_asinl):
+	Likewise.
+
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
 	* sysdeps/powerpc/powerpc32/dl-machine.c (__process_machine_rela):
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_acosl.c b/sysdeps/ieee754/ldbl-128ibm/e_acosl.c
index 8663993..2cb2882 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_acosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_acosl.c
@@ -153,6 +153,8 @@ __ieee754_acosl (long double x)
 {
   long double a, z, r, w, p, q, s, t, f2;
 
+  if (__glibc_unlikely (__isnanl (x)))
+    return x + x;
   a = __builtin_fabsl (x);
   if (a == 1.0L)
     {
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_asinl.c b/sysdeps/ieee754/ldbl-128ibm/e_asinl.c
index 99a5b85..dece118 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_asinl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_asinl.c
@@ -134,6 +134,8 @@ __ieee754_asinl (long double x)
   long double a, t, w, p, q, c, r, s;
   int flag;
 
+  if (__glibc_unlikely (__isnanl (x)))
+    return x + x;
   flag = 0;
   a = __builtin_fabsl (x);
   if (a == 1.0L)	/* |x|>= 1 */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6c02f0569e758cd1973797c4a46b8c74425e3b72

commit 6c02f0569e758cd1973797c4a46b8c74425e3b72
Author: Alan Modra <amodra@gmail.com>
Date:   Fri Oct 4 12:48:51 2013 +0930

    Use stdint.h types in union unaligned.
    
    	* sysdeps/powerpc/powerpc32/dl-machine.c (__process_machine_rela):
    	Use stdint types rather than __attribute__((mode())).
    	* sysdeps/powerpc/powerpc64/dl-machine.h (elf_machine_rela): Likewise.

diff --git a/ChangeLog b/ChangeLog
index a961126..bf099c9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,12 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
 	* sysdeps/powerpc/powerpc32/dl-machine.c (__process_machine_rela):
+	Use stdint types in rather than __attribute__((mode())).
+	* sysdeps/powerpc/powerpc64/dl-machine.h (elf_machine_rela): Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/powerpc/powerpc32/dl-machine.c (__process_machine_rela):
 	Correct handling of unaligned relocs for little-endian.
 	* sysdeps/powerpc/powerpc64/dl-machine.h (elf_machine_rela): Likewise.
 
diff --git a/sysdeps/powerpc/powerpc32/dl-machine.c b/sysdeps/powerpc/powerpc32/dl-machine.c
index ec64951..df8c14e 100644
--- a/sysdeps/powerpc/powerpc32/dl-machine.c
+++ b/sysdeps/powerpc/powerpc32/dl-machine.c
@@ -425,8 +425,8 @@ __process_machine_rela (struct link_map *map,
 {
   union unaligned
     {
-      unsigned u2 __attribute__ ((mode (HI)));
-      unsigned u4 __attribute__ ((mode (SI)));
+      uint16_t u2;
+      uint32_t u4;
     } __attribute__((__packed__));
 
   switch (rinfo)
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index b69a1ce..18cf157 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -563,10 +563,10 @@ elf_machine_rela (struct link_map *map,
   const Elf64_Sym *const refsym = sym;
   union unaligned
     {
-      unsigned u2 __attribute__ ((mode (HI)));
-      unsigned u4 __attribute__ ((mode (SI)));
-      unsigned u8 __attribute__ ((mode (DI)));
-    } __attribute__((__packed__));
+      uint16_t u2;
+      uint32_t u4;
+      uint64_t u8;
+    } __attribute__ ((__packed__));
 
   if (r_type == R_PPC64_RELATIVE)
     {

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=723630a980657c4c88193448f7b4b6bf09128fda

commit 723630a980657c4c88193448f7b4b6bf09128fda
Author: Alan Modra <amodra@gmail.com>
Date:   Thu Oct 3 13:51:52 2013 +0930

    Correct little-endian relocation of UADDR64,32,16.
    
    	* sysdeps/powerpc/powerpc32/dl-machine.c (__process_machine_rela):
    	Correct handling of unaligned relocs for little-endian.
    	* sysdeps/powerpc/powerpc64/dl-machine.h (elf_machine_rela): Likewise.

diff --git a/ChangeLog b/ChangeLog
index 0c727fa..a961126 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc32/dl-machine.c (__process_machine_rela):
+	Correct handling of unaligned relocs for little-endian.
+	* sysdeps/powerpc/powerpc64/dl-machine.h (elf_machine_rela): Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* configure.in: Map powerpc64le and powerpcle to base_machine/machine.
 	* configure: Regenerate.
 	* nptl/shlib-versions: Powerpc*le starts at 2.18.
diff --git a/sysdeps/powerpc/powerpc32/dl-machine.c b/sysdeps/powerpc/powerpc32/dl-machine.c
index 188f72c..ec64951 100644
--- a/sysdeps/powerpc/powerpc32/dl-machine.c
+++ b/sysdeps/powerpc/powerpc32/dl-machine.c
@@ -423,6 +423,12 @@ __process_machine_rela (struct link_map *map,
 			Elf32_Addr const finaladdr,
 			int rinfo)
 {
+  union unaligned
+    {
+      unsigned u2 __attribute__ ((mode (HI)));
+      unsigned u4 __attribute__ ((mode (SI)));
+    } __attribute__((__packed__));
+
   switch (rinfo)
     {
     case R_PPC_NONE:
@@ -439,10 +445,7 @@ __process_machine_rela (struct link_map *map,
       return;
 
     case R_PPC_UADDR32:
-      ((char *) reloc_addr)[0] = finaladdr >> 24;
-      ((char *) reloc_addr)[1] = finaladdr >> 16;
-      ((char *) reloc_addr)[2] = finaladdr >> 8;
-      ((char *) reloc_addr)[3] = finaladdr;
+      ((union unaligned *) reloc_addr)->u4 = finaladdr;
       break;
 
     case R_PPC_ADDR24:
@@ -460,8 +463,7 @@ __process_machine_rela (struct link_map *map,
     case R_PPC_UADDR16:
       if (__builtin_expect (finaladdr > 0x7fff && finaladdr < 0xffff8000, 0))
 	_dl_reloc_overflow (map,  "R_PPC_UADDR16", reloc_addr, refsym);
-      ((char *) reloc_addr)[0] = finaladdr >> 8;
-      ((char *) reloc_addr)[1] = finaladdr;
+      ((union unaligned *) reloc_addr)->u2 = finaladdr;
       break;
 
     case R_PPC_ADDR16_LO:
diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
index 059fdaf..b69a1ce 100644
--- a/sysdeps/powerpc/powerpc64/dl-machine.h
+++ b/sysdeps/powerpc/powerpc64/dl-machine.h
@@ -561,6 +561,12 @@ elf_machine_rela (struct link_map *map,
   Elf64_Addr *const reloc_addr = reloc_addr_arg;
   const int r_type = ELF64_R_TYPE (reloc->r_info);
   const Elf64_Sym *const refsym = sym;
+  union unaligned
+    {
+      unsigned u2 __attribute__ ((mode (HI)));
+      unsigned u4 __attribute__ ((mode (SI)));
+      unsigned u8 __attribute__ ((mode (DI)));
+    } __attribute__((__packed__));
 
   if (r_type == R_PPC64_RELATIVE)
     {
@@ -741,23 +747,11 @@ elf_machine_rela (struct link_map *map,
       return;
 
     case R_PPC64_UADDR64:
-      /* We are big-endian.  */
-      ((char *) reloc_addr_arg)[0] = (value >> 56) & 0xff;
-      ((char *) reloc_addr_arg)[1] = (value >> 48) & 0xff;
-      ((char *) reloc_addr_arg)[2] = (value >> 40) & 0xff;
-      ((char *) reloc_addr_arg)[3] = (value >> 32) & 0xff;
-      ((char *) reloc_addr_arg)[4] = (value >> 24) & 0xff;
-      ((char *) reloc_addr_arg)[5] = (value >> 16) & 0xff;
-      ((char *) reloc_addr_arg)[6] = (value >> 8) & 0xff;
-      ((char *) reloc_addr_arg)[7] = (value >> 0) & 0xff;
+      ((union unaligned *) reloc_addr)->u8 = value;
       return;
 
     case R_PPC64_UADDR32:
-      /* We are big-endian.  */
-      ((char *) reloc_addr_arg)[0] = (value >> 24) & 0xff;
-      ((char *) reloc_addr_arg)[1] = (value >> 16) & 0xff;
-      ((char *) reloc_addr_arg)[2] = (value >> 8) & 0xff;
-      ((char *) reloc_addr_arg)[3] = (value >> 0) & 0xff;
+      ((union unaligned *) reloc_addr)->u4 = value;
       return;
 
     case R_PPC64_ADDR32:
@@ -781,10 +775,8 @@ elf_machine_rela (struct link_map *map,
     case R_PPC64_UADDR16:
       if (dont_expect ((value + 0x8000) >= 0x10000))
 	_dl_reloc_overflow (map, "R_PPC64_UADDR16", reloc_addr, refsym);
-      /* We are big-endian.  */
-      ((char *) reloc_addr_arg)[0] = (value >> 8) & 0xff;
-      ((char *) reloc_addr_arg)[1] = (value >> 0) & 0xff;
-      break;
+      ((union unaligned *) reloc_addr)->u2 = value;
+      return;
 
     case R_PPC64_ADDR16_DS:
       if (dont_expect ((value + 0x8000) >= 0x10000 || (value & 3) != 0))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cde0ef248c7566567e7461e58e0b91707fe2330c

commit cde0ef248c7566567e7461e58e0b91707fe2330c
Author: Alan Modra <amodra@gmail.com>
Date:   Thu Oct 3 14:03:03 2013 +0930

    PowerPC LE configury
    http://sourceware.org/ml/libc-alpha/2013-08/msg00096.html
    
    This adds the basic configury bits for powerpc64le and powerpcle.
    
    	* configure.in: Map powerpc64le and powerpcle to base_machine/machine.
    	* configure: Regenerate.
    	* nptl/shlib-versions: Powerpc*le starts at 2.18.
    	* shlib-versions: Likewise.

diff --git a/ChangeLog b/ChangeLog
index f1aaebf..0c727fa 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* configure.in: Map powerpc64le and powerpcle to base_machine/machine.
+	* configure: Regenerate.
+	* nptl/shlib-versions: Powerpc*le starts at 2.18.
+	* shlib-versions: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* string/tester.c (test_memrchr): Increment reported test cycle.
 
 2013-10-04  Alan Modra  <amodra@gmail.com>
diff --git a/configure b/configure
index 1ee4c42..e8a581c 100755
--- a/configure
+++ b/configure
@@ -3969,8 +3969,8 @@ fi
 # base_machine, we don't change it.
 test -n "$base_machine" || case "$machine" in
 i[4567]86)	base_machine=i386 machine=i386/$machine ;;
-powerpc)	base_machine=powerpc machine=powerpc/powerpc32 ;;
-powerpc64)	base_machine=powerpc machine=powerpc/powerpc64 ;;
+powerpc64*)	base_machine=powerpc machine=powerpc/powerpc64 ;;
+powerpc*)	base_machine=powerpc machine=powerpc/powerpc32 ;;
 s390)           base_machine=s390 machine=s390/s390-32 ;;
 s390x)          base_machine=s390 machine=s390/s390-64 ;;
 sh3*)		base_machine=sh machine=sh/sh3 ;;
diff --git a/configure.in b/configure.in
index 769e8ef..848870b 100644
--- a/configure.in
+++ b/configure.in
@@ -587,8 +587,8 @@ changequote(,)dnl
 # base_machine, we don't change it.
 test -n "$base_machine" || case "$machine" in
 i[4567]86)	base_machine=i386 machine=i386/$machine ;;
-powerpc)	base_machine=powerpc machine=powerpc/powerpc32 ;;
-powerpc64)	base_machine=powerpc machine=powerpc/powerpc64 ;;
+powerpc64*)	base_machine=powerpc machine=powerpc/powerpc64 ;;
+powerpc*)	base_machine=powerpc machine=powerpc/powerpc32 ;;
 s390)           base_machine=s390 machine=s390/s390-32 ;;
 s390x)          base_machine=s390 machine=s390/s390-64 ;;
 sh3*)		base_machine=sh machine=sh/sh3 ;;
diff --git a/nptl/shlib-versions b/nptl/shlib-versions
index e49e7ca..495b240 100644
--- a/nptl/shlib-versions
+++ b/nptl/shlib-versions
@@ -2,4 +2,5 @@ sparc64.*-.*-linux.*	libpthread=0		GLIBC_2.2
 sh.*-.*-linux.*		libpthread=0		GLIBC_2.2
 s390x-.*-linux.*	libpthread=0		GLIBC_2.2
 powerpc64-.*-linux.*	libpthread=0		GLIBC_2.3
+powerpc.*le-.*-linux.*	libpthread=0		GLIBC_2.18
 .*-.*-linux.*		libpthread=0
diff --git a/shlib-versions b/shlib-versions
index 9344590..51f5327 100644
--- a/shlib-versions
+++ b/shlib-versions
@@ -23,6 +23,7 @@
 
 s390x-.*-linux.*        DEFAULT			GLIBC_2.2
 powerpc64-.*-linux.*	DEFAULT			GLIBC_2.3
+powerpc.*le-.*-linux.*	DEFAULT			GLIBC_2.18
 .*-.*-gnu-gnu.*		DEFAULT			GLIBC_2.2.6
 
 # Configuration		ABI			Identifier for ABI data files

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=126904dbbf11a7d6692a7286414ab93d72fe16cd

commit 126904dbbf11a7d6692a7286414ab93d72fe16cd
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:38:26 2013 +0930

    string/tester memrchr test
    http://sourceware.org/ml/libc-alpha/2013-08/msg00095.html
    
    I found this useful at one stage when I was seeing a huge number of
    memrchr failures, all reported as test number 10.
    
    	* string/tester.c (test_memrchr): Increment reported test cycle.

diff --git a/ChangeLog b/ChangeLog
index eae9d39..f1aaebf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* string/tester.c (test_memrchr): Increment reported test cycle.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* string/test-memcpy.c (do_one_test): When reporting errors, print
 	string address and don't overrun end of string.
 
diff --git a/string/tester.c b/string/tester.c
index 2fe520c..f1ea438 100644
--- a/string/tester.c
+++ b/string/tester.c
@@ -704,7 +704,7 @@ test_memrchr (void)
      more than 128 byte chunks: */
   {
     char buf[128 + sizeof(long)];
-    long align, len, i, pos;
+    long align, len, i, pos, n = 9;
 
     for (align = 0; align < (long) sizeof(long); ++align) {
       for (len = 0; len < (long) (sizeof(buf) - align); ++len) {
@@ -715,9 +715,9 @@ test_memrchr (void)
 #if 0
 	  printf("align %d, len %d, pos %d\n", align, len, pos);
 #endif
-	  check(memrchr(buf + align, 'x', len) == buf + align + pos, 9);
+	  check(memrchr(buf + align, 'x', len) == buf + align + pos, n++);
 	  check(memrchr(buf + align + pos + 1, 'x', len - (pos + 1)) == NULL,
-		10);
+		n++);
 	  buf[align + pos] = '-';
 	}
       }

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b921c5bc50a863b130fda0c9e7d5f50e6a4783ee

commit b921c5bc50a863b130fda0c9e7d5f50e6a4783ee
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:37:58 2013 +0930

    string/test-memcpy error reporting
    http://sourceware.org/ml/libc-alpha/2013-08/msg00094.html
    
    Using plain %s here runs the risk of segfaulting when displaying the
    string.  src and dst aren't zero terminated strings.
    
    	* string/test-memcpy.c (do_one_test): When reporting errors, print
    	string address and don't overrun end of string.

diff --git a/ChangeLog b/ChangeLog
index 888c7d9..eae9d39 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* string/test-memcpy.c (do_one_test): When reporting errors, print
+	string address and don't overrun end of string.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc64/power7/memchr.S: Replace rlwimi with
 	insrdi.  Make better use of reg selection to speed exit slightly.
 	Schedule entry path a little better.  Remove useless "are we done"
diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index d121ef0..b7ebe5f 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -63,8 +63,8 @@ do_one_test (impl_t *impl, char *dst, const char *src,
 
   if (memcmp (dst, src, len) != 0)
     {
-      error (0, 0, "Wrong result in function %s dst \"%s\" src \"%s\"",
-	     impl->name, dst, src);
+      error (0, 0, "Wrong result in function %s dst %p \"%.*s\" src %p \"%.*s\" len %zu",
+	     impl->name, dst, (int) len, dst, src, (int) len, src, len);
       ret = 1;
       return;
     }

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=01f25c0e1c5f1f49df18c5f3f67c873ce1739481

commit 01f25c0e1c5f1f49df18c5f3f67c873ce1739481
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:48:36 2013 +0930

    PowerPC LE memchr and memrchr
    http://sourceware.org/ml/libc-alpha/2013-08/msg00105.html
    
    Like strnlen, memchr and memrchr had a number of defects fixed by this
    patch as well as adding little-endian support.  The first one I
    noticed was that the entry to the main loop needlessly checked for
    "are we done yet?" when we know the size is large enough that we can't
    be done.  The second defect I noticed was that the main loop count was
    wrong, which in turn meant that the small loop needed to handle an
    extra word.  Thirdly, there is nothing to say that the string can't
    wrap around zero, except of course that we'd normally hit a segfault
    on trying to read from address zero.  Fixing that simplified a number
    of places:
    
    -	/* Are we done already?  */
    -	addi    r9,r8,8
    -	cmpld	r9,r7
    -	bge	L(null)
    
    becomes
    
    +	cmpld	r8,r7
    +	beqlr
    
    However, the exit gets an extra test because I first test for being on
    the last word and then, if so, whether the byte offset is less than the
    end.  Overall, the change is a win.
    
    Lastly, memrchr used the wrong cache hint.
    
    	* sysdeps/powerpc/powerpc64/power7/memchr.S: Replace rlwimi with
    	insrdi.  Make better use of reg selection to speed exit slightly.
    	Schedule entry path a little better.  Remove useless "are we done"
    	checks on entry to main loop.  Handle wrapping around zero address.
    	Correct main loop count.  Handle single left-over word from main
    	loop inline rather than by using loop_small.  Remove extra word
    	case in loop_small caused by wrong loop count.  Add little-endian
    	support.
    	* sysdeps/powerpc/powerpc32/power7/memchr.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power7/memrchr.S: Likewise.  Use proper
    	cache hint.
    	* sysdeps/powerpc/powerpc32/power7/memrchr.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power7/rawmemchr.S: Add little-endian
    	support.  Avoid rlwimi.
    	* sysdeps/powerpc/powerpc32/power7/rawmemchr.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 99d2b70..888c7d9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,23 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc64/power7/memchr.S: Replace rlwimi with
+	insrdi.  Make better use of reg selection to speed exit slightly.
+	Schedule entry path a little better.  Remove useless "are we done"
+	checks on entry to main loop.  Handle wrapping around zero address.
+	Correct main loop count.  Handle single left-over word from main
+	loop inline rather than by using loop_small.  Remove extra word
+	case in loop_small caused by wrong loop count.  Add little-endian
+	support.
+	* sysdeps/powerpc/powerpc32/power7/memchr.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/memrchr.S: Likewise.  Use proper
+	cache hint.
+	* sysdeps/powerpc/powerpc32/power7/memrchr.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/rawmemchr.S: Add little-endian
+	support.  Avoid rlwimi.
+	* sysdeps/powerpc/powerpc32/power7/rawmemchr.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc64/memset.S: Replace rlwimi with
         insrdi.  Formatting.
 	* sysdeps/powerpc/powerpc64/power4/memset.S: Likewise.
diff --git a/sysdeps/powerpc/powerpc32/power7/memchr.S b/sysdeps/powerpc/powerpc32/power7/memchr.S
index 369e5e0..85754f3 100644
--- a/sysdeps/powerpc/powerpc32/power7/memchr.S
+++ b/sysdeps/powerpc/powerpc32/power7/memchr.S
@@ -25,107 +25,111 @@ ENTRY (__memchr)
 	CALL_MCOUNT
 	dcbt	0,r3
 	clrrwi  r8,r3,2
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
 	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	insrdi	r4,r4,16,32
 	cmplwi	r5,16
+	li	r9, -1
+	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
+	addi	r7,r7,-1
+#ifdef __LITTLE_ENDIAN__
+	slw	r9,r9,r6
+#else
+	srw	r9,r9,r6
+#endif
 	ble	L(small_range)
 
-	cmplw	cr7,r3,r7     /* Compare the starting address (r3) with the
-				 ending address (r7).  If (r3 >= r7), the size
-				 passed in is zero or negative.  */
-	ble	cr7,L(proceed)
-
-	li	r7,-1	      /* Artificially set our ending address (r7)
-				 such that we will exit early. */
-L(proceed):
-	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
-	cmpli	cr6,r6,0      /* cr6 == Do we have padding?  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTEs in WORD1.  */
-	beq	cr6,L(proceed_no_padding)
-	slw	r10,r10,r6
-	srw	r10,r10,r6
-L(proceed_no_padding):
-	cmplwi	cr7,r10,0     /* If r10 == 0, no BYTEs have been found.  */
+	cmpb	r3,r12,r4     /* Check for BYTEs in WORD1.  */
+	and	r3,r3,r9
+	clrlwi	r5,r7,30      /* Byte count - 1 in last word.  */
+	clrrwi	r7,r7,2       /* Address of last word.  */
+	cmplwi	cr7,r3,0      /* If r3 == 0, no BYTEs have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	bge	cr6,L(null)
-
 	mtcrf   0x01,r8
 	/* Are we now aligned to a doubleword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
-
 	bt	29,L(loop_setup)
 
 	/* Handle WORD2 of pair.  */
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	cmplwi	cr7,r10,0
+	cmpb	r3,r12,r4
+	cmplwi	cr7,r3,0
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	bge	cr6,L(null)
-
 L(loop_setup):
-	sub	r5,r7,r9
-	srwi	r6,r5,3	      /* Number of loop iterations.  */
+	/* The last word we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the word at
+	   (s + size - 1) & ~3, or r7.  The first word read is at
+	   r8 + 4, we read 2 * cnt words, so the last word read will
+	   be at r8 + 4 + 8 * cnt - 4.  Solving for cnt gives
+	   cnt = (r7 - r8) / 8  */
+	sub	r6,r7,r8
+	srwi	r6,r6,3	      /* Number of loop iterations.  */
 	mtctr	r6            /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for BYTE backwards in the string.  Since
-	   it's a small loop (< 8 instructions), align it to 32-bytes.  */
-	.p2align  5
+
+	/* Main loop to look for BYTE in the string.  Since
+	   it's a small loop (8 instructions), align it to 32-bytes.  */
+	.align	5
 L(loop):
 	/* Load two words, compare and merge in a
 	   single register for speed.  This is an attempt
 	   to speed up the byte-checking process for bigger strings.  */
 	lwz	r12,4(r8)
 	lwzu	r11,8(r8)
-	cmpb	r10,r12,r4
+	cmpb	r3,r12,r4
 	cmpb	r9,r11,r4
-	or	r5,r9,r10     /* Merge everything in one word.  */
-	cmplwi	cr7,r5,0
+	or	r6,r9,r3      /* Merge everything in one word.  */
+	cmplwi	cr7,r6,0
 	bne	cr7,L(found)
 	bdnz	L(loop)
 
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for BYTE in the whole range.  */
-	subi	r11,r7,4
-	cmplw	cr6,r8,r11
-	blt	cr6,L(loop_small)
-	b	L(null)
+	/* We may have one more dword to read.  */
+	cmplw	r8,r7
+	beqlr
+
+	lwzu	r12,4(r8)
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	bne	cr6,L(done)
+	blr
 
+	.align	4
+L(found):
 	/* OK, one (or both) of the words contains BYTE.  Check
 	   the first word and decrement the address in case the first
 	   word really contains BYTE.  */
-	.align	4
-L(found):
-	cmplwi	cr6,r10,0
+	cmplwi	cr6,r3,0
 	addi	r8,r8,-4
 	bne	cr6,L(done)
 
 	/* BYTE must be in the second word.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
+	   again and move the result of cmpb to r3 so we can calculate the
 	   pointer.  */
 
-	mr	r10,r9
+	mr	r3,r9
 	addi	r8,r8,4
 
-	/* r10 has the output of the cmpb instruction, that is, it contains
+	/* r3 has the output of the cmpb instruction, that is, it contains
 	   0xff in the same position as BYTE in the original
 	   word from the string.  Use that to calculate the pointer.
 	   We need to make sure BYTE is *before* the end of the range.  */
 L(done):
-	cntlzw	r0,r10	      /* Count leading zeroes before the match.  */
-	srwi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r3,-1
+	andc    r0,r0,r3
+	popcntw	r0,r0	      /* Count trailing zeros.  */
+#else
+	cntlzw	r0,r3	      /* Count leading zeros before the match.  */
+#endif
+	cmplw	r8,r7         /* Are we on the last word?  */
+	srwi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
 	add	r3,r8,r0
-	cmplw	r3,r7
-	bge	L(null)
+	cmplw	cr7,r0,r5     /* If on the last dword, check byte offset.  */
+	bnelr
+	blelr	cr7
+	li	r3,0
 	blr
 
 	.align	4
@@ -137,67 +141,42 @@ L(null):
 	.align	4
 L(small_range):
 	cmplwi	r5,0
-	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
-	beq	L(null)       /* This branch is for the cmplwi r5,0 above */
+	beq	L(null)
 	lwz	r12,0(r8)     /* Load word from memory.  */
-	cmplwi	cr6,r6,0      /* cr6 == Do we have padding?  */
-	cmpb	r10,r12,r4    /* Check for BYTE in DWORD1.  */
-	beq	cr6,L(small_no_padding)
-	slw	r10,r10,r6
-	srw	r10,r10,r6
-L(small_no_padding):
-	cmplwi	cr7,r10,0
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmplwi	cr7,r3,0
+	clrlwi	r5,r7,30      /* Byte count - 1 in last word.  */
+	clrrwi	r7,r7,2       /* Address of last word.  */
+	cmplw	r8,r7         /* Are we done already?  */
 	bne	cr7,L(done)
+	beqlr
 
-	/* Are we done already?  */
-	addi    r9,r8,4
-	cmplw	r9,r7
-	bge	L(null)
-
-L(loop_small):                /* loop_small has been unrolled.  */
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	cmplw	r9,r7
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	cmplw	r8,r7
 	bne	cr6,L(done)
-	bge	L(null)
+	beqlr
 
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	cmplw	r9,r7
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	cmplw	r8,r7
 	bne	cr6,L(done)
-	bge	L(null)
+	beqlr
 
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	cmplw	r9,r7
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
+	cmplw	r8,r7
 	bne	cr6,L(done)
-	bge	L(null)
+	beqlr
 
 	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	cmplw	r9,r7
+	cmpb	r3,r12,r4
+	cmplwi	cr6,r3,0
 	bne	cr6,L(done)
-	bge	L(null)
-
-	/* For most cases we will never get here.  Under some combinations of
-	   padding + length there is a leftover word that still needs to be
-	   checked.  */
-	lwzu	r12,4(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,4
-	cmplwi	cr6,r10,0
-	bne	cr6,L(done)
-
-	/* save a branch and exit directly */
-	li	r3,0
 	blr
 
 END (__memchr)
diff --git a/sysdeps/powerpc/powerpc32/power7/memrchr.S b/sysdeps/powerpc/powerpc32/power7/memrchr.S
index defd832..9601aa7 100644
--- a/sysdeps/powerpc/powerpc32/power7/memrchr.S
+++ b/sysdeps/powerpc/powerpc32/power7/memrchr.S
@@ -23,117 +23,131 @@
 	.machine  power7
 ENTRY (__memrchr)
 	CALL_MCOUNT
-	dcbt	0,r3
-	mr	r7,r3
-	add	r3,r7,r5      /* Calculate the last acceptable address.  */
-	cmplw	cr7,r3,r7     /* Is the address equal or less than r3?  */
+	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	neg	r0,r7
+	addi	r7,r7,-1
+	mr	r10,r3
+	clrrwi	r6,r7,7
+	li	r9,3<<5
+	dcbt	r9,r6,16      /* Stream hint, decreasing addresses.  */
 
 	/* Replicate BYTE to word.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
-	bge	cr7,L(proceed)
-
-	li	r3,-1	      /* Make r11 the biggest if r4 <= 0.  */
-L(proceed):
+	rldimi	r4,r4,8,48
+	rldimi	r4,r4,16,32
 	li	r6,-4
-	addi	r9,r3,-1
-	clrrwi  r8,r9,2
-	addi	r8,r8,4
-	neg	r0,r3
+	li	r9,-1
 	rlwinm	r0,r0,3,27,28 /* Calculate padding.  */
-
+	clrrwi	r8,r7,2
+	srw	r9,r9,r0
 	cmplwi	r5,16
+	clrrwi	r0,r10,2
 	ble	L(small_range)
 
-	lwbrx	r12,r8,r6     /* Load reversed word from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTE in WORD1.  */
-	slw	r10,r10,r0
-	srw	r10,r10,r0
-	cmplwi	cr7,r10,0     /* If r10 == 0, no BYTEs have been found.  */
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8      /* Load reversed word from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in WORD1.  */
+	and	r3,r3,r9
+	cmplwi	cr7,r3,0      /* If r3 == 0, no BYTEs have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,-4
-	cmplw	cr6,r9,r7
-	ble	cr6,L(null)
-
 	mtcrf   0x01,r8
 	/* Are we now aligned to a doubleword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
-	mr	r8,r9
-	bt	29,L(loop_setup)
+	bf	29,L(loop_setup)
 
 	/* Handle WORD2 of pair.  */
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,r8,r6
+#else
 	lwbrx	r12,r8,r6
-	cmpb	r10,r12,r4
-	cmplwi	cr7,r10,0
-	bne	cr7,L(done)
-
-	/* Are we done already?  */
+#endif
 	addi	r8,r8,-4
-	cmplw	cr6,r8,r7
-	ble	cr6,L(null)
+	cmpb	r3,r12,r4
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
 
 L(loop_setup):
-	li	r0,-8
-	sub	r5,r8,r7
-	srwi	r9,r5,3	      /* Number of loop iterations.  */
+	/* The last word we want to read in the loop below is the one
+	   containing the first byte of the string, ie. the word at
+	   s & ~3, or r0.  The first word read is at r8 - 4, we
+	   read 2 * cnt words, so the last word read will be at
+	   r8 - 4 - 8 * cnt + 4.  Solving for cnt gives
+	   cnt = (r8 - r0) / 8  */
+	sub	r5,r8,r0
+	addi	r8,r8,-4
+	srwi	r9,r5,3       /* Number of loop iterations.  */
 	mtctr	r9	      /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for BYTE backwards in the string.  Since it's a
-	   small loop (< 8 instructions), align it to 32-bytes.  */
-	.p2align  5
+
+	/* Main loop to look for BYTE backwards in the string.
+	   FIXME: Investigate whether 32 byte align helps with this
+	   9 instruction loop.  */
+	.align	5
 L(loop):
 	/* Load two words, compare and merge in a
 	   single register for speed.  This is an attempt
 	   to speed up the byte-checking process for bigger strings.  */
 
-	lwbrx	r12,r8,r6
-	lwbrx	r11,r8,r0
-	addi	r8,r8,-4
-	cmpb	r10,r12,r4
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+	lwzx	r11,r8,r6
+#else
+	lwbrx	r12,0,r8
+	lwbrx	r11,r8,r6
+#endif
+	cmpb	r3,r12,r4
 	cmpb	r9,r11,r4
-	or	r5,r9,r10     /* Merge everything in one word.  */
+	or	r5,r9,r3      /* Merge everything in one word.  */
 	cmplwi	cr7,r5,0
 	bne	cr7,L(found)
-	addi	r8,r8,-4
+	addi	r8,r8,-8
 	bdnz	L(loop)
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for BYTE in the whole range.  Just return
-	   the original range.  */
-	addi	r8,r8,4
-	cmplw	cr6,r8,r7
-	bgt	cr6,L(loop_small)
-	b	L(null)
 
-	/* OK, one (or both) of the words contains BYTE.  Check
-	   the first word and decrement the address in case the first
-	   word really contains BYTE.  */
+	/* We may have one more word to read.  */
+	cmplw	r8,r0
+	bnelr
+
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
+	blr
+
 	.align	4
 L(found):
-	cmplwi	cr6,r10,0
-	addi	r8,r8,4
+	/* OK, one (or both) of the words contains BYTE.  Check
+	   the first word.  */
+	cmplwi	cr6,r3,0
 	bne	cr6,L(done)
 
 	/* BYTE must be in the second word.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
+	   again and move the result of cmpb to r3 so we can calculate the
 	   pointer.  */
 
-	mr	r10,r9
+	mr	r3,r9
 	addi	r8,r8,-4
 
-	/* r10 has the output of the cmpb instruction, that is, it contains
+	/* r3 has the output of the cmpb instruction, that is, it contains
 	   0xff in the same position as BYTE in the original
 	   word from the string.  Use that to calculate the pointer.
 	   We need to make sure BYTE is *before* the end of the
 	   range.  */
 L(done):
-	cntlzw	r0,r10	      /* Count leading zeroes before the match.  */
-	srwi	r6,r0,3	      /* Convert leading zeroes to bytes.  */
-	addi	r0,r6,1
+	cntlzw	r9,r3	      /* Count leading zeros before the match.  */
+	cmplw	r8,r0         /* Are we on the last word?  */
+	srwi	r6,r9,3	      /* Convert leading zeros to bytes.  */
+	addi	r0,r6,-3
 	sub	r3,r8,r0
-	cmplw	r3,r7
-	blt	L(null)
+	cmplw	cr7,r3,r10
+	bnelr
+	bgelr	cr7
+	li	r3,0
 	blr
 
 	.align	4
@@ -147,28 +161,35 @@ L(small_range):
 	cmplwi	r5,0
 	beq	L(null)
 
-	lwbrx	r12,r8,r6     /* Load reversed word from memory.  */
-	cmpb	r10,r12,r4    /* Check for null bytes in WORD1.  */
-	slw	r10,r10,r0
-	srw	r10,r10,r0
-	cmplwi	cr7,r10,0
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8      /* Load reversed word from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in WORD1.  */
+	and	r3,r3,r9
+	cmplwi	cr7,r3,0
 	bne	cr7,L(done)
 
+	/* Are we done already?  */
+	cmplw	r8,r0
 	addi	r8,r8,-4
-	cmplw	r8,r7
-	ble	L(null)
-	b	L(loop_small)
+	beqlr
 
-	.p2align  5
+	.align	5
 L(loop_small):
-	lwbrx	r12,r8,r6
-	cmpb	r10,r12,r4
-	cmplwi	cr6,r10,0
-	bne	cr6,L(done)
+#ifdef __LITTLE_ENDIAN__
+	lwzx	r12,0,r8
+#else
+	lwbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmplw	r8,r0
+	cmplwi	cr7,r3,0
+	bne	cr7,L(done)
 	addi	r8,r8,-4
-	cmplw	r8,r7
-	ble	L(null)
-	b	L(loop_small)
+	bne	L(loop_small)
+	blr
 
 END (__memrchr)
 weak_alias (__memrchr, memrchr)
diff --git a/sysdeps/powerpc/powerpc32/power7/rawmemchr.S b/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
index a80c74a..c2d8c4b 100644
--- a/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
+++ b/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
@@ -27,16 +27,21 @@ ENTRY (__rawmemchr)
 	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
 
 	/* Replicate byte to word.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	rldimi	r4,r4,8,48
+	rldimi	r4,r4,16,32
 
 	/* Now r4 has a word of c bytes.  */
 
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
 	cmpb	r5,r12,r4     /* Compare each byte against c byte.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r5,r5,r6
+	slw	r5,r5,r6
+#else
 	slw	r5,r5,r6      /* Move left to discard ignored bits.  */
 	srw	r5,r5,r6      /* Bring the bits back as zeros.  */
+#endif
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c bytes have been found.  */
 	bne	cr7,L(done)
 
@@ -90,8 +95,14 @@ L(loop):
 	   word from the string.  Use that fact to find out what is
 	   the position of the byte inside the string.  */
 L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntw	r0,r0
+#else
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
-	srwi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+#endif
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching char.  */
 	blr
 END (__rawmemchr)
diff --git a/sysdeps/powerpc/powerpc64/power7/memchr.S b/sysdeps/powerpc/powerpc64/power7/memchr.S
index 3416897..5076dd0 100644
--- a/sysdeps/powerpc/powerpc64/power7/memchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/memchr.S
@@ -25,109 +25,112 @@ ENTRY (__memchr)
 	CALL_MCOUNT 2
 	dcbt	0,r3
 	clrrdi  r8,r3,3
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
 	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	insrdi	r4,r4,16,32
 	cmpldi	r5,32
+	li	r9, -1
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
 	insrdi  r4,r4,32,0
+	addi	r7,r7,-1
+#ifdef __LITTLE_ENDIAN__
+	sld	r9,r9,r6
+#else
+	srd	r9,r9,r6
+#endif
 	ble	L(small_range)
 
-	cmpld	cr7,r3,r7     /* Compare the starting address (r3) with the
-				 ending address (r7).  If (r3 >= r7),
-				 the size passed in was zero or negative.  */
-	ble	cr7,L(proceed)
-
-	li	r7,-1         /* Artificially set our ending address (r7)
-				 such that we will exit early.  */
-
-L(proceed):
-	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
-	cmpldi	cr6,r6,0      /* cr6 == Do we have padding?  */
 	ld	r12,0(r8)     /* Load doubleword from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTEs in DWORD1.  */
-	beq	cr6,L(proceed_no_padding)
-	sld	r10,r10,r6
-	srd	r10,r10,r6
-L(proceed_no_padding):
-	cmpldi	cr7,r10,0     /* Does r10 indicate we got a hit?  */
+	cmpb	r3,r12,r4     /* Check for BYTEs in DWORD1.  */
+	and	r3,r3,r9
+	clrldi	r5,r7,61      /* Byte count - 1 in last dword.  */
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
+	cmpldi	cr7,r3,0      /* Does r3 indicate we got a hit?  */
 	bne	cr7,L(done)
 
-	/* See if we are at the last acceptable address yet.  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	bge	cr6,L(null)
-
 	mtcrf   0x01,r8
 	/* Are we now aligned to a quadword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
-
 	bt	28,L(loop_setup)
 
 	/* Handle DWORD2 of pair.  */
 	ldu	r12,8(r8)
-	cmpb	r10,r12,r4
-	cmpldi	cr7,r10,0
+	cmpb	r3,r12,r4
+	cmpldi	cr7,r3,0
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	bge	cr6,L(null)
-
 L(loop_setup):
-	sub	r5,r7,r9
-	srdi	r6,r5,4	      /* Number of loop iterations.  */
+	/* The last dword we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the dword at
+	   (s + size - 1) & ~7, or r7.  The first dword read is at
+	   r8 + 8, we read 2 * cnt dwords, so the last dword read will
+	   be at r8 + 8 + 16 * cnt - 8.  Solving for cnt gives
+	   cnt = (r7 - r8) / 16  */
+	sub	r6,r7,r8
+	srdi	r6,r6,4	      /* Number of loop iterations.  */
 	mtctr	r6            /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for BYTE backwards in the string.  Since
-	   it's a small loop (< 8 instructions), align it to 32-bytes.  */
-	.p2align  5
+
+	/* Main loop to look for BYTE in the string.  Since
+	   it's a small loop (8 instructions), align it to 32-bytes.  */
+	.align	5
 L(loop):
 	/* Load two doublewords, compare and merge in a
 	   single register for speed.  This is an attempt
 	   to speed up the byte-checking process for bigger strings.  */
 	ld	r12,8(r8)
 	ldu	r11,16(r8)
-	cmpb	r10,r12,r4
+	cmpb	r3,r12,r4
 	cmpb	r9,r11,r4
-	or	r5,r9,r10     /* Merge everything in one doubleword.  */
-	cmpldi	cr7,r5,0
+	or	r6,r9,r3      /* Merge everything in one doubleword.  */
+	cmpldi	cr7,r6,0
 	bne	cr7,L(found)
 	bdnz	L(loop)
 
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for BYTE in the whole range.  */
-	subi	r11,r7,8
-	cmpld	cr6,r8,r11
-	blt	cr6,L(loop_small)
-	b	L(null)
+	/* We may have one more dword to read.  */
+	cmpld	r8,r7
+	beqlr
 
+	ldu	r12,8(r8)
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	bne	cr6,L(done)
+	blr
+
+	.align	4
+L(found):
 	/* OK, one (or both) of the doublewords contains BYTE.  Check
 	   the first doubleword and decrement the address in case the first
 	   doubleword really contains BYTE.  */
-	.align	4
-L(found):
-	cmpldi	cr6,r10,0
+	cmpldi	cr6,r3,0
 	addi	r8,r8,-8
 	bne	cr6,L(done)
 
 	/* BYTE must be in the second doubleword.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
+	   again and move the result of cmpb to r3 so we can calculate the
 	   pointer.  */
 
-	mr	r10,r9
+	mr	r3,r9
 	addi	r8,r8,8
 
-	/* r10 has the output of the cmpb instruction, that is, it contains
+	/* r3 has the output of the cmpb instruction, that is, it contains
 	   0xff in the same position as BYTE in the original
 	   doubleword from the string.  Use that to calculate the pointer.
 	   We need to make sure BYTE is *before* the end of the range.  */
 L(done):
-	cntlzd	r0,r10	      /* Count leading zeroes before the match.  */
-	srdi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r3,-1
+	andc    r0,r0,r3
+	popcntd	r0,r0	      /* Count trailing zeros.  */
+#else
+	cntlzd	r0,r3	      /* Count leading zeros before the match.  */
+#endif
+	cmpld	r8,r7         /* Are we on the last dword?  */
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
 	add	r3,r8,r0
-	cmpld	r3,r7
-	bge	L(null)
+	cmpld	cr7,r0,r5     /* If on the last dword, check byte offset.  */
+	bnelr
+	blelr	cr7
+	li	r3,0
 	blr
 
 	.align	4
@@ -139,67 +142,44 @@ L(null):
 	.align	4
 L(small_range):
 	cmpldi	r5,0
-	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
-	beq	L(null)       /* This branch is for the cmpldi r5,0 above.  */
+	beq	L(null)
 	ld	r12,0(r8)     /* Load word from memory.  */
-	cmpldi	cr6,r6,0      /* cr6 == Do we have padding?  */
-	cmpb	r10,r12,r4    /* Check for BYTE in DWORD1.  */
-			      /* If no padding, skip the shifts.  */
-	beq	cr6,L(small_no_padding)
-	sld	r10,r10,r6
-	srd	r10,r10,r6
-L(small_no_padding):
-	cmpldi	cr7,r10,0
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmpldi	cr7,r3,0
+	clrldi	r5,r7,61      /* Byte count - 1 in last dword.  */
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
+	cmpld	r8,r7         /* Are we done already?  */
 	bne	cr7,L(done)
+	beqlr
 
-	/* Are we done already?  */
-	addi    r9,r8,8
-	cmpld	r9,r7
-	bge	L(null)
-	/* If we're not done, drop through into loop_small.  */
-
-L(loop_small):                /* loop_small has been unrolled.  */
 	ldu	r12,8(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,8
-	cmpldi	cr6,r10,0
-	cmpld	r9,r7
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	cmpld	r8,r7
 	bne	cr6,L(done)   /* Found something.  */
-	bge	L(null)       /* Hit end of string (length).  */
+	beqlr		      /* Hit end of string (length).  */
 
 	ldu	r12,8(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,8
-	cmpldi	cr6,r10,0
-	cmpld	r9,r7
-	bne	cr6,L(done)   /* Found something.  */
-	bge	L(null)
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	cmpld	r8,r7
+	bne	cr6,L(done)
+	beqlr
 
 	ldu	r12,8(r8)
-	subi	r11,r7,8
-	cmpb	r10,r12,r4
-	cmpldi	cr6,r10,0
-	ori	r2,r2,0       /* Force a dispatch group.  */
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	cmpld	r8,r7
 	bne	cr6,L(done)
+	beqlr
 
-	cmpld	r8,r11        /* At end of range?  */
-	bge	L(null)
-
-	/* For most cases we will never get here.  Under some combinations of
-	   padding + length there is a leftover double that still needs to be
-	   checked.  */
 	ldu	r12,8(r8)
-	cmpb	r10,r12,r4
-	addi	r9,r8,8
-	cmpldi	cr6,r10,0
-	cmpld	r9,r7
-	bne	cr6,L(done)   /* Found something.  */
-
-	/* Save a branch and exit directly.  */
-	li	r3,0
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	bne	cr6,L(done)
 	blr
 
-
 END (__memchr)
 weak_alias (__memchr, memchr)
 libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/powerpc/powerpc64/power7/memrchr.S b/sysdeps/powerpc/powerpc64/power7/memrchr.S
index c499952..a9e86cb 100644
--- a/sysdeps/powerpc/powerpc64/power7/memrchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/memrchr.S
@@ -23,118 +23,132 @@
 	.machine  power7
 ENTRY (__memrchr)
 	CALL_MCOUNT
-	dcbt	0,r3
-	mr	r7,r3
-	add	r3,r7,r5      /* Calculate the last acceptable address.  */
-	cmpld	cr7,r3,r7     /* Is the address equal or less than r3?  */
+	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	neg	r0,r7
+	addi	r7,r7,-1
+	mr	r10,r3
+	clrrdi	r6,r7,7
+	li	r9,3<<5
+	dcbt	r9,r6,16      /* Stream hint, decreasing addresses.  */
 
 	/* Replicate BYTE to doubleword.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 	insrdi  r4,r4,32,0
-	bge	cr7,L(proceed)
-
-	li	r3,-1	      /* Make r11 the biggest if r4 <= 0.  */
-L(proceed):
 	li	r6,-8
-	addi	r9,r3,-1
-	clrrdi  r8,r9,3
-	addi	r8,r8,8
-	neg	r0,r3
+	li	r9,-1
 	rlwinm	r0,r0,3,26,28 /* Calculate padding.  */
-
+	clrrdi	r8,r7,3
+	srd	r9,r9,r0
 	cmpldi	r5,32
+	clrrdi	r0,r10,3
 	ble	L(small_range)
 
-	ldbrx	r12,r8,r6     /* Load reversed doubleword from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTE in DWORD1.  */
-	sld	r10,r10,r0
-	srd	r10,r10,r0
-	cmpldi	cr7,r10,0     /* If r10 == 0, no BYTEs have been found.  */
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8      /* Load reversed doubleword from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmpldi	cr7,r3,0      /* If r3 == 0, no BYTEs have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,-8
-	cmpld	cr6,r9,r7
-	ble	cr6,L(null)
-
 	mtcrf   0x01,r8
-	/* Are we now aligned to a doubleword boundary?  If so, skip to
+	/* Are we now aligned to a quadword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
-	mr	r8,r9
-	bt	28,L(loop_setup)
+	bf	28,L(loop_setup)
 
 	/* Handle DWORD2 of pair.  */
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,r8,r6
+#else
 	ldbrx	r12,r8,r6
-	cmpb	r10,r12,r4
-	cmpldi	cr7,r10,0
-	bne	cr7,L(done)
-
-	/* Are we done already.  */
+#endif
 	addi	r8,r8,-8
-	cmpld	cr6,r8,r7
-	ble	cr6,L(null)
+	cmpb	r3,r12,r4
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
 
 L(loop_setup):
-	li	r0,-16
-	sub	r5,r8,r7
-	srdi	r9,r5,4	      /* Number of loop iterations.  */
+	/* The last dword we want to read in the loop below is the one
+	   containing the first byte of the string, ie. the dword at
+	   s & ~7, or r0.  The first dword read is at r8 - 8, we
+	   read 2 * cnt dwords, so the last dword read will be at
+	   r8 - 8 - 16 * cnt + 8.  Solving for cnt gives
+	   cnt = (r8 - r0) / 16  */
+	sub	r5,r8,r0
+	addi	r8,r8,-8
+	srdi	r9,r5,4       /* Number of loop iterations.  */
 	mtctr	r9	      /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for BYTE backwards in the string.  Since it's a
-	   small loop (< 8 instructions), align it to 32-bytes.  */
-	.p2align  5
+
+	/* Main loop to look for BYTE backwards in the string.
+	   FIXME: Investigate whether 32 byte align helps with this
+	   9 instruction loop.  */
+	.align	5
 L(loop):
 	/* Load two doublewords, compare and merge in a
 	   single register for speed.  This is an attempt
 	   to speed up the byte-checking process for bigger strings.  */
 
-	ldbrx	r12,r8,r6
-	ldbrx	r11,r8,r0
-	addi	r8,r8,-8
-	cmpb	r10,r12,r4
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+	ldx	r11,r8,r6
+#else
+	ldbrx	r12,0,r8
+	ldbrx	r11,r8,r6
+#endif
+	cmpb	r3,r12,r4
 	cmpb	r9,r11,r4
-	or	r5,r9,r10     /* Merge everything in one doubleword.  */
+	or	r5,r9,r3      /* Merge everything in one doubleword.  */
 	cmpldi	cr7,r5,0
 	bne	cr7,L(found)
-	addi	r8,r8,-8
+	addi	r8,r8,-16
 	bdnz	L(loop)
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for BYTE in the whole range.  Just return
-	   the original range.  */
-	addi	r8,r8,8
-	cmpld	cr6,r8,r7
-	bgt	cr6,L(loop_small)
-	b	L(null)
-
-	/* OK, one (or both) of the words contains BYTE.  Check
-	   the first word and decrement the address in case the first
-	   word really contains BYTE.  */
+
+	/* We may have one more word to read.  */
+	cmpld	r8,r0
+	bnelr
+
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
+	blr
+
 	.align	4
 L(found):
-	cmpldi	cr6,r10,0
-	addi	r8,r8,8
+	/* OK, one (or both) of the dwords contains BYTE.  Check
+	   the first dword.  */
+	cmpldi	cr6,r3,0
 	bne	cr6,L(done)
 
 	/* BYTE must be in the second word.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
+	   again and move the result of cmpb to r3 so we can calculate the
 	   pointer.  */
 
-	mr	r10,r9
+	mr	r3,r9
 	addi	r8,r8,-8
 
-	/* r10 has the output of the cmpb instruction, that is, it contains
-	   0xff in the same position as the BYTE in the original
+	/* r3 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as BYTE in the original
 	   word from the string.  Use that to calculate the pointer.
 	   We need to make sure BYTE is *before* the end of the
 	   range.  */
 L(done):
-	cntlzd	r0,r10	      /* Count leading zeroes before the match.  */
-	srdi	r6,r0,3	      /* Convert leading zeroes to bytes.  */
-	addi	r0,r6,1
+	cntlzd	r9,r3	      /* Count leading zeros before the match.  */
+	cmpld	r8,r0         /* Are we on the last word?  */
+	srdi	r6,r9,3	      /* Convert leading zeros to bytes.  */
+	addi	r0,r6,-7
 	sub	r3,r8,r0
-	cmpld	r3,r7
-	blt	L(null)
+	cmpld	cr7,r3,r10
+	bnelr
+	bgelr	cr7
+	li	r3,0
 	blr
 
 	.align	4
@@ -148,29 +162,35 @@ L(small_range):
 	cmpldi	r5,0
 	beq	L(null)
 
-	ldbrx	r12,r8,r6     /* Load reversed doubleword from memory.  */
-	cmpb	r10,r12,r4    /* Check for BYTE in DWORD1.  */
-	sld	r10,r10,r0
-	srd	r10,r10,r0
-	cmpldi	cr7,r10,0
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8      /* Load reversed doubleword from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmpldi	cr7,r3,0
 	bne	cr7,L(done)
 
 	/* Are we done already?  */
+	cmpld	r8,r0
 	addi	r8,r8,-8
-	cmpld	r8,r7
-	ble	L(null)
-	b	L(loop_small)
+	beqlr
 
-	.p2align  5
+	.align	5
 L(loop_small):
-	ldbrx	r12,r8,r6
-	cmpb	r10,r12,r4
-	cmpldi	cr6,r10,0
-	bne	cr6,L(done)
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmpld	r8,r0
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
 	addi	r8,r8,-8
-	cmpld	r8,r7
-	ble	L(null)
-	b	L(loop_small)
+	bne	L(loop_small)
+	blr
 
 END (__memrchr)
 weak_alias (__memrchr, memrchr)
diff --git a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
index 50a33d8..547aed7 100644
--- a/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
@@ -27,8 +27,8 @@ ENTRY (__rawmemchr)
 	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
 
 	/* Replicate byte to doubleword.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 	insrdi	r4,r4,32,0
 
 	/* Now r4 has a doubleword of c bytes.  */
@@ -36,8 +36,13 @@ ENTRY (__rawmemchr)
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
 	ld	r12,0(r8)     /* Load doubleword from memory.  */
 	cmpb	r5,r12,r4     /* Compare each byte against c byte.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
 	sld	r5,r5,r6      /* Move left to discard ignored bits.  */
 	srd	r5,r5,r6      /* Bring the bits back as zeros.  */
+#endif
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c bytes have been found.  */
 	bne	cr7,L(done)
 
@@ -91,8 +96,14 @@ L(loop):
 	   doubleword from the string.  Use that fact to find out what is
 	   the position of the byte inside the string.  */
 L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0	      /* Count trailing zeros.  */
+#else
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
-	srdi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+#endif
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching char.  */
 	blr
 END (__rawmemchr)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=62b5c6de0ec2b1a3d4779797dece796edb631e17

commit 62b5c6de0ec2b1a3d4779797dece796edb631e17
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:47:59 2013 +0930

    PowerPC LE memset
    http://sourceware.org/ml/libc-alpha/2013-08/msg00104.html
    
    One of the things I noticed when looking at power7 timing is that rlwimi
    is cracked and the two resulting insns have a register dependency.
    That makes it a little slower than the equivalent rldimi.
    
    	* sysdeps/powerpc/powerpc64/memset.S: Replace rlwimi with
            insrdi.  Formatting.
    	* sysdeps/powerpc/powerpc64/power4/memset.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power6/memset.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power7/memset.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power4/memset.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power6/memset.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power7/memset.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 3632248..99d2b70 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,16 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc64/memset.S: Replace rlwimi with
+        insrdi.  Formatting.
+	* sysdeps/powerpc/powerpc64/power4/memset.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power6/memset.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/memset.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/memset.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power6/memset.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/memset.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
 	* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
 	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
diff --git a/sysdeps/powerpc/powerpc32/power4/memset.S b/sysdeps/powerpc/powerpc32/power4/memset.S
index c2d288b..4fd9d8c 100644
--- a/sysdeps/powerpc/powerpc32/power4/memset.S
+++ b/sysdeps/powerpc/powerpc32/power4/memset.S
@@ -50,7 +50,7 @@ L(_memset):
 
 /* Align to word boundary.  */
 	cmplwi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48     /* Replicate byte to halfword.  */
 	beq+	L(aligned)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 4
@@ -65,7 +65,7 @@ L(g0):
 /* Handle the case of size < 31.  */
 L(aligned):
 	mtcrf	0x01, rLEN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32    /* Replicate halfword to word.  */
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x1C
diff --git a/sysdeps/powerpc/powerpc32/power6/memset.S b/sysdeps/powerpc/powerpc32/power6/memset.S
index ce06630..508d560 100644
--- a/sysdeps/powerpc/powerpc32/power6/memset.S
+++ b/sysdeps/powerpc/powerpc32/power6/memset.S
@@ -48,7 +48,7 @@ L(_memset):
 	ble-	cr1, L(small)
 /* Align to word boundary.  */
 	cmplwi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
 	beq+	L(aligned)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 4
@@ -64,7 +64,7 @@ L(g0):
 /* Handle the case of size < 31.  */
 L(aligned):
 	mtcrf	0x01, rLEN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x1C
diff --git a/sysdeps/powerpc/powerpc32/power7/memset.S b/sysdeps/powerpc/powerpc32/power7/memset.S
index 360ea71..aadda25 100644
--- a/sysdeps/powerpc/powerpc32/power7/memset.S
+++ b/sysdeps/powerpc/powerpc32/power7/memset.S
@@ -35,8 +35,8 @@ L(_memset):
 	cfi_offset(31,-8)
 
 	/* Replicate byte to word.  */
-	rlwimi	4,4,8,16,23
-	rlwimi	4,4,16,0,15
+	insrdi	4,4,8,48
+	insrdi	4,4,16,32
 
 	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
 
diff --git a/sysdeps/powerpc/powerpc64/memset.S b/sysdeps/powerpc/powerpc64/memset.S
index 6acf149..1027a59 100644
--- a/sysdeps/powerpc/powerpc64/memset.S
+++ b/sysdeps/powerpc/powerpc64/memset.S
@@ -55,14 +55,14 @@ L(_memset):
 
 /* Align to doubleword boundary.  */
 	cmpldi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
 	beq+	L(aligned2)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 8
 	cror	28,30,31		/* Detect odd word aligned.  */
 	add	rMEMP, rMEMP, rALIGN
 	sub	rLEN, rLEN, rALIGN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 	bt	29, L(g4)
 /* Process the even word of doubleword.  */
 	bf+	31, L(g2)
@@ -84,14 +84,14 @@ L(g0):
 
 /* Handle the case of size < 31.  */
 L(aligned2):
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 L(aligned):
 	mtcrf	0x01, rLEN
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x18
 	subfic	rALIGN, rALIGN, 0x20
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word. */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
 	beq	L(caligned)
 	mtcrf	0x01, rALIGN
 	add	rMEMP, rMEMP, rALIGN
@@ -212,7 +212,7 @@ L(le4):
 /* Memset of 0-31 bytes.  */
 	.align 5
 L(medium):
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word.  */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
 	cmpldi	cr1, rLEN, 16
 L(medium_tail2):
 	add	rMEMP, rMEMP, rLEN
diff --git a/sysdeps/powerpc/powerpc64/power4/memset.S b/sysdeps/powerpc/powerpc64/power4/memset.S
index dbecee8..ad0d381 100644
--- a/sysdeps/powerpc/powerpc64/power4/memset.S
+++ b/sysdeps/powerpc/powerpc64/power4/memset.S
@@ -50,14 +50,14 @@ L(_memset):
 
 /* Align to doubleword boundary.  */
 	cmpldi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
 	beq+	L(aligned2)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 8
 	cror	28,30,31		/* Detect odd word aligned.  */
 	add	rMEMP, rMEMP, rALIGN
 	sub	rLEN, rLEN, rALIGN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 	bt	29, L(g4)
 /* Process the even word of doubleword.  */
 	bf+	31, L(g2)
@@ -79,14 +79,14 @@ L(g0):
 
 /* Handle the case of size < 31.  */
 L(aligned2):
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 L(aligned):
 	mtcrf	0x01, rLEN
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x18
 	subfic	rALIGN, rALIGN, 0x20
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word. */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
 	beq	L(caligned)
 	mtcrf	0x01, rALIGN
 	add	rMEMP, rMEMP, rALIGN
@@ -146,24 +146,24 @@ L(zloopstart):
 L(getCacheAligned):
 	cmpldi	cr1,rLEN,32
 	andi.	rTMP,rMEMP,127
-	blt		cr1,L(handletail32)
-	beq		L(cacheAligned)
+	blt	cr1,L(handletail32)
+	beq	L(cacheAligned)
 	addi	rMEMP,rMEMP,32
 	addi	rLEN,rLEN,-32
-	std		rCHR,-32(rMEMP)
-	std		rCHR,-24(rMEMP)
-	std		rCHR,-16(rMEMP)
-	std		rCHR,-8(rMEMP)
-	b		L(getCacheAligned)
+	std	rCHR,-32(rMEMP)
+	std	rCHR,-24(rMEMP)
+	std	rCHR,-16(rMEMP)
+	std	rCHR,-8(rMEMP)
+	b	L(getCacheAligned)
 
 /* Now we are aligned to the cache line and can use dcbz.  */
 L(cacheAligned):
 	cmpld	cr1,rLEN,rCLS
-	blt		cr1,L(handletail32)
+	blt	cr1,L(handletail32)
 	dcbz	0,rMEMP
 	subf	rLEN,rCLS,rLEN
-	add		rMEMP,rMEMP,rCLS
-	b		L(cacheAligned)
+	add	rMEMP,rMEMP,rCLS
+	b	L(cacheAligned)
 
 /* We are here because the cache line size was set and was not 32-bytes
    and the remainder (rLEN) is less than the actual cache line size.
@@ -200,7 +200,7 @@ L(le4):
 /* Memset of 0-31 bytes.  */
 	.align 5
 L(medium):
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word.  */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
 	cmpldi	cr1, rLEN, 16
 L(medium_tail2):
 	add	rMEMP, rMEMP, rLEN
diff --git a/sysdeps/powerpc/powerpc64/power6/memset.S b/sysdeps/powerpc/powerpc64/power6/memset.S
index 541a45f..a3864cc 100644
--- a/sysdeps/powerpc/powerpc64/power6/memset.S
+++ b/sysdeps/powerpc/powerpc64/power6/memset.S
@@ -47,14 +47,14 @@ L(_memset):
 
 /* Align to doubleword boundary.  */
 	cmpldi	cr5, rLEN, 31
-	rlwimi	rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword.  */
+	insrdi	rCHR, rCHR, 8, 48	/* Replicate byte to halfword.  */
 	beq+	L(aligned2)
 	mtcrf	0x01, rMEMP0
 	subfic	rALIGN, rALIGN, 8
 	cror	28,30,31		/* Detect odd word aligned.  */
 	add	rMEMP, rMEMP, rALIGN
 	sub	rLEN, rLEN, rALIGN
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 	bt	29, L(g4)
 /* Process the even word of doubleword.  */
 	bf+	31, L(g2)
@@ -76,14 +76,14 @@ L(g0):
 
 /* Handle the case of size < 31.  */
 L(aligned2):
-	rlwimi	rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word.  */
+	insrdi	rCHR, rCHR, 16, 32	/* Replicate halfword to word.  */
 L(aligned):
 	mtcrf	0x01, rLEN
 	ble	cr5, L(medium)
 /* Align to 32-byte boundary.  */
 	andi.	rALIGN, rMEMP, 0x18
 	subfic	rALIGN, rALIGN, 0x20
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word. */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word. */
 	beq	L(caligned)
 	mtcrf	0x01, rALIGN
 	add	rMEMP, rMEMP, rALIGN
@@ -344,7 +344,7 @@ L(le4):
 /* Memset of 0-31 bytes.  */
 	.align 5
 L(medium):
-	insrdi	rCHR,rCHR,32,0 /* Replicate word to double word.  */
+	insrdi	rCHR, rCHR, 32, 0	/* Replicate word to double word.  */
 	cmpldi	cr1, rLEN, 16
 L(medium_tail2):
 	add	rMEMP, rMEMP, rLEN
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
index b24cfa1..8b081e8 100644
--- a/sysdeps/powerpc/powerpc64/power7/memset.S
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -32,8 +32,8 @@ L(_memset):
 	mr	10,3
 
 	/* Replicate byte to word.  */
-	rlwimi	4,4,8,16,23
-	rlwimi	4,4,16,0,15
+	insrdi	4,4,8,48
+	insrdi	4,4,16,32
 	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
 
 	neg	0,3
@@ -321,7 +321,7 @@ L(medium):
 	clrldi	0,0,62
 	beq	L(medium_aligned)
 
-	/* Force 4-bytes alignment for SRC.  */
+	/* Force 4-bytes alignment for DST.  */
 	mtocrf	0x01,0
 	subf	5,0,5
 1:	/* Copy 1 byte.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0455a7b91e468c01869cdbb8ac6b532e4cf4a60a

commit 0455a7b91e468c01869cdbb8ac6b532e4cf4a60a
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:47:22 2013 +0930

    PowerPC LE memcpy
    http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html
    
    Little-endian support for memcpy.  I spent some time cleaning up the
    64-bit power7 memcpy, in order to avoid the extra alignment traps
    power7 takes for little-endian.  It probably would have been better
    to copy the linux kernel version of memcpy.
    
    	* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
    	* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
    	* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
    	use of regs.  Use power7 mtocrf.  Tidy function tails.

diff --git a/ChangeLog b/ChangeLog
index 42d88db..3632248 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
+	* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
+	use of regs.  Use power7 mtocrf.  Tidy function tails.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc64/power7/memcmp.S: Add little-endian support.
 	Formatting.  Consistently use rXXX register defines or rN defines.
 	Use early exit labels that avoid restoring unused non-volatile regs.
diff --git a/sysdeps/powerpc/powerpc32/power4/memcpy.S b/sysdeps/powerpc/powerpc32/power4/memcpy.S
index d914663..338d3cc 100644
--- a/sysdeps/powerpc/powerpc32/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power4/memcpy.S
@@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0)
     blt   cr6,5f
     srwi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmplwi	cr1,10,16
@@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0)
     bf      30,1f
 
     /* there are at least two words to copy, so copy them */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10  /* shift 1st src word to left align it in R0 */
     srw   8,7,9   /* shift 2nd src word to right align it in R8 */
+#endif
     or    0,0,8   /* or them to get word to store */
     lwz   6,8(5)  /* load the 3rd src word */
     stw   0,0(4)  /* store the 1st dst word */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
     slw   0,7,10  /* now left align 2nd src word into R0 */
     srw   8,6,9   /* shift 3rd src word to right align it in R8 */
+#endif
     or    0,0,8   /* or them to get word to store */
     lwz   7,12(5)
     stw   0,4(4)  /* store the 2nd dst word */
@@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0)
     addi  5,5,16
     bf    31,4f
     /* there is a third word to copy, so copy it */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10  /* shift 3rd src word to left align it in R0 */
     srw   8,7,9   /* shift 4th src word to right align it in R8 */
+#endif
     or    0,0,8   /* or them to get word to store */
     stw   0,0(4)  /* store 3rd dst word */
     mr    6,7
@@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0)
     b     4f
     .align 4
 1:
+#ifdef __LITTLE_ENDIAN__
+    srw     0,6,10
+    slw     8,7,9
+#else
     slw     0,6,10  /* shift 1st src word to left align it in R0 */
     srw     8,7,9   /* shift 2nd src word to right align it in R8 */
+#endif
     addi  5,5,8
     or    0,0,8   /* or them to get word to store */
     bf    31,4f
@@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0)
     .align  4
 4:
     /* copy 16 bytes at a time */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10
     srw   8,7,9
+#endif
     or    0,0,8
     lwz   6,0(5)
     stw   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
     slw   0,7,10
     srw   8,6,9
+#endif
     or    0,0,8
     lwz   7,4(5)
     stw   0,4(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10
     srw   8,7,9
+#endif
     or    0,0,8
     lwz   6,8(5)
     stw   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
     slw   0,7,10
     srw   8,6,9
+#endif
     or    0,0,8
     lwz   7,12(5)
     stw   0,12(4)
@@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0)
     bdnz+ 4b
 8:
     /* calculate and store the final word */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
     slw   0,6,10
     srw   8,7,9
+#endif
     or    0,0,8
     stw   0,0(4)
 3:
diff --git a/sysdeps/powerpc/powerpc32/power6/memcpy.S b/sysdeps/powerpc/powerpc32/power6/memcpy.S
index c3d55b7..9c80767 100644
--- a/sysdeps/powerpc/powerpc32/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power6/memcpy.S
@@ -219,15 +219,28 @@ L(word_unaligned_short):
     blt   cr6,5f
     srwi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmplwi	cr1,10,16
@@ -577,7 +590,11 @@ L(wdu1_32):
     lwz     6,-1(4)
     cmplwi  cr6,31,4
     srwi    8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi    6,6,8
+#else
     slwi    6,6,8
+#endif
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
     blt     cr5,L(wdu1_32tail)
     mtctr   8
@@ -585,8 +602,12 @@ L(wdu1_32):
 
     lwz   8,3(4)
     lwz   7,4(4)
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
     rlwimi 6,8,8,(32-8),31
+#endif
     b      L(wdu1_loop32x)
     .align  4
 L(wdu1_loop32):
@@ -595,8 +616,12 @@ L(wdu1_loop32):
     lwz   7,4(4)
     stw   10,-8(3)
     stw   11,-4(3)
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
 /*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
     rlwimi 6,8,8,(32-8),31
+#endif
 L(wdu1_loop32x):
     lwz   10,8(4)
     lwz   11,12(4)
@@ -613,7 +638,11 @@ L(wdu1_loop32x):
     stw   6,16(3)
     stw   7,20(3)
     addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,8
+#else
     slwi  6,8,8
+#endif
     bdnz+ L(wdu1_loop32)
     stw   10,-8(3)
     stw   11,-4(3)
@@ -624,8 +653,12 @@ L(wdu1_32tail):
     blt     cr6,L(wdu_4tail)
     /* calculate and store the final word */
     lwz   8,3(4)
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
+/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
     rlwimi 6,8,8,(32-8),31
+#endif
     b     L(wdu_32tailx)
 
 L(wdu2_32):
@@ -633,7 +666,11 @@ L(wdu2_32):
     lwz     6,-2(4)
     cmplwi  cr6,31,4
     srwi    8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi    6,6,16
+#else
     slwi    6,6,16
+#endif
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
     blt     cr5,L(wdu2_32tail)
     mtctr   8
@@ -641,8 +678,11 @@ L(wdu2_32):
 
     lwz   8,2(4)
     lwz   7,4(4)
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
     rlwimi 6,8,16,(32-16),31
+#endif
     b      L(wdu2_loop32x)
     .align  4
 L(wdu2_loop32):
@@ -651,8 +691,11 @@ L(wdu2_loop32):
     lwz   7,4(4)
     stw   10,-8(3)
     stw   11,-4(3)
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
     rlwimi 6,8,16,(32-16),31
+#endif
 L(wdu2_loop32x):
     lwz   10,8(4)
     lwz   11,12(4)
@@ -670,7 +713,11 @@ L(wdu2_loop32x):
     stw   6,16(3)
     stw   7,20(3)
     addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,16
+#else
     slwi  6,8,16
+#endif
     bdnz+ L(wdu2_loop32)
     stw   10,-8(3)
     stw   11,-4(3)
@@ -681,8 +728,11 @@ L(wdu2_32tail):
     blt     cr6,L(wdu_4tail)
     /* calculate and store the final word */
     lwz   8,2(4)
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
     rlwimi 6,8,16,(32-16),31
+#endif
     b     L(wdu_32tailx)
 
 L(wdu3_32):
@@ -690,7 +740,11 @@ L(wdu3_32):
     lwz     6,-3(4)
     cmplwi  cr6,31,4
     srwi    8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi    6,6,24
+#else
     slwi    6,6,24
+#endif
     clrlwi  31,31,27   /* The remaining bytes, < 32.  */
     blt     cr5,L(wdu3_32tail)
     mtctr   8
@@ -698,8 +752,11 @@ L(wdu3_32):
 
     lwz   8,1(4)
     lwz   7,4(4)
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
     rlwimi 6,8,24,(32-24),31
+#endif
     b      L(wdu3_loop32x)
     .align  4
 L(wdu3_loop32):
@@ -708,8 +765,11 @@ L(wdu3_loop32):
     lwz   7,4(4)
     stw   10,-8(3)
     stw   11,-4(3)
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
     rlwimi 6,8,24,(32-24),31
+#endif
 L(wdu3_loop32x):
     lwz   10,8(4)
     lwz   11,12(4)
@@ -726,7 +786,11 @@ L(wdu3_loop32x):
     stw   6,16(3)
     stw   7,20(3)
     addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,24
+#else
     slwi  6,8,24
+#endif
     bdnz+ L(wdu3_loop32)
     stw   10,-8(3)
     stw   11,-4(3)
@@ -737,8 +801,11 @@ L(wdu3_32tail):
     blt     cr6,L(wdu_4tail)
     /* calculate and store the final word */
     lwz   8,1(4)
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
     rlwimi 6,8,24,(32-24),31
+#endif
     b     L(wdu_32tailx)
     .align  4
 L(wdu_32tailx):
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index 7f00778..acf3c10 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -383,7 +383,7 @@ L(copy_GE_32_unaligned):
 
 	beq    L(copy_GE_32_unaligned_cont)
 
-	/* SRC is not quadword aligned, get it aligned.  */
+	/* DST is not quadword aligned, get it aligned.  */
 
 	mtcrf   0x01,0
 	subf    31,0,5
@@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont):
 	mr      11,12
 	mtcrf   0x01,9
 	cmplwi  cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
 	lvsl    5,0,12
+#endif
 	lvx     3,0,12
 	bf      31,L(setup_unaligned_loop)
 
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 	lvx     4,12,6
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
 	vperm   6,3,4,5
+#endif
 	addi    11,12,16
 	addi    10,3,16
 	stvx    6,0,3
@@ -461,11 +469,17 @@ L(unaligned_loop):
 	vector instructions though.  */
 
 	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
 	addi    11,11,32
 	stvx    6,0,10
 	stvx    10,10,6
diff --git a/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
index 5ad4edb..4610ec5 100644
--- a/sysdeps/powerpc/powerpc32/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
@@ -325,7 +325,7 @@ L(copy_GE_32_unaligned):
 
 	beq	L(copy_GE_32_unaligned_cont)
 
-	/* SRC is not quadword aligned, get it aligned.  */
+	/* DST is not quadword aligned, get it aligned.  */
 
 	mtcrf	0x01,0
 	subf	31,0,5
@@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont):
 	mr	11,12
 	mtcrf	0x01,9
 	cmplwi	cr6,9,1
-	lvsl	5,0,12
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
+	lvsl    5,0,12
+#endif
 	lvx	3,0,12
 	bf	31,L(setup_unaligned_loop)
 
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 	lvx	4,12,6
-	vperm	6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	addi	11,12,16
 	addi	10,3,16
 	stvx	6,0,3
@@ -403,11 +411,17 @@ L(unaligned_loop):
 	vector instructions though.  */
 
 	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
 	addi	11,11,32
 	stvx	6,0,10
 	stvx	10,10,6
diff --git a/sysdeps/powerpc/powerpc64/memcpy.S b/sysdeps/powerpc/powerpc64/memcpy.S
index b8c4cc8..5fc7401 100644
--- a/sysdeps/powerpc/powerpc64/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/memcpy.S
@@ -212,15 +212,28 @@ EALIGN (memcpy, 5, 0)
     blt   cr6,5f
     srdi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmpldi	cr1,10,16
@@ -328,7 +341,11 @@ EALIGN (memcpy, 5, 0)
     ld    7,8(5)
     subfic  9,10,64
     beq   2f
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+#else
     sld   0,6,10
+#endif
     cmpldi  11,1
     mr    6,7
     addi  4,4,-8
@@ -336,15 +353,25 @@ EALIGN (memcpy, 5, 0)
     b     1f
 2:  addi  5,5,8
     .align  4
+#ifdef __LITTLE_ENDIAN__
+0:  srd   0,6,10
+    sld   8,7,9
+#else
 0:  sld   0,6,10
     srd   8,7,9
+#endif
     cmpldi  11,2
     ld    6,8(5)
     or    0,0,8
     addi  11,11,-2
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,7,10
+1:  sld   8,6,9
+#else
     sld   0,7,10
 1:  srd   8,6,9
+#endif
     or    0,0,8
     beq   8f
     ld    7,16(5)
diff --git a/sysdeps/powerpc/powerpc64/power4/memcpy.S b/sysdeps/powerpc/powerpc64/power4/memcpy.S
index 4317c7e..f9a7260 100644
--- a/sysdeps/powerpc/powerpc64/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power4/memcpy.S
@@ -214,15 +214,28 @@ EALIGN (memcpy, 5, 0)
     blt   cr6,5f
     srdi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmpldi	cr1,10,16
@@ -334,13 +347,23 @@ EALIGN (memcpy, 5, 0)
     bf      30,1f
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srd     0,6,10
+    sld     8,7,9
+#else
     sld     0,6,10
     srd     8,7,9
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srd     0,7,10
+    sld     8,6,9
+#else
     sld     0,7,10
     srd     8,6,9
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -349,8 +372,13 @@ EALIGN (memcpy, 5, 0)
     blt     cr6,8f  /* if total DWs = 3, then bypass loop */
     bf      31,4f
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srd     0,6,10
+    sld     8,7,9
+#else
     sld     0,6,10
     srd     8,7,9
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -361,8 +389,13 @@ EALIGN (memcpy, 5, 0)
     b       4f
     .align 4
 1:
+#ifdef __LITTLE_ENDIAN__
+    srd     0,6,10
+    sld     8,7,9
+#else
     sld     0,6,10
     srd     8,7,9
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,4f
@@ -373,23 +406,44 @@ EALIGN (memcpy, 5, 0)
     addi    4,4,8
     .align 4
 /* copy 32 bytes at a time */
-4:  sld   0,6,10
+4:
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+    sld   8,7,9
+#else
+    sld   0,6,10
     srd   8,7,9
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,7,10
+    sld   8,6,9
+#else
     sld   0,7,10
     srd   8,6,9
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+    sld   8,7,9
+#else
     sld   0,6,10
     srd   8,7,9
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,7,10
+    sld   8,6,9
+#else
     sld   0,7,10
     srd   8,6,9
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -399,8 +453,13 @@ EALIGN (memcpy, 5, 0)
     .align 4
 8:
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+    sld   8,7,9
+#else
     sld   0,6,10
     srd   8,7,9
+#endif
     or    0,0,8
     std   0,0(4)
 3:
diff --git a/sysdeps/powerpc/powerpc64/power6/memcpy.S b/sysdeps/powerpc/powerpc64/power6/memcpy.S
index db29e2b..aa0802e 100644
--- a/sysdeps/powerpc/powerpc64/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power6/memcpy.S
@@ -400,15 +400,28 @@ L(das_tail2):
     blt   cr6,5f
     srdi  7,6,16
     bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
     sth   6,0(3)
+#endif
     b     7f
     .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
     stb   7,0(3)
     sth   6,1(3)
+#endif
     b     7f
     .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
     stb   6,0(3)
 7:
     cmpldi	cr1,10,16
@@ -595,13 +608,24 @@ L(du1_do):
     bf      30,L(du1_1dw)
 
     /* there are at least two DWs to copy */
+    /* FIXME: can combine last shift and "or" into "rldimi" */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
     sldi     0,6, 8
     srdi     8,7, 64-8
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 8
+    sldi     8,6, 64-8
+#else
     sldi     0,7, 8
     srdi     8,6, 64-8
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -610,8 +634,13 @@ L(du1_do):
     blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du1_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
     sldi     0,6, 8
     srdi     8,7, 64-8
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -622,8 +651,13 @@ L(du1_do):
     b       L(du1_loop)
     .align 4
 L(du1_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
     sldi     0,6, 8
     srdi     8,7, 64-8
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du1_loop)
@@ -635,23 +669,43 @@ L(du1_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du1_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
     sldi   0,6, 8
     srdi   8,7, 64-8
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 8
+    sldi   8,6, 64-8
+#else
     sldi   0,7, 8
     srdi   8,6, 64-8
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
     sldi   0,6, 8
     srdi   8,7, 64-8
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 8
+    sldi   8,6, 64-8
+#else
     sldi   0,7, 8
     srdi   8,6, 64-8
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -661,8 +715,13 @@ L(du1_loop):
     .align 4
 L(du1_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
     sldi   0,6, 8
     srdi   8,7, 64-8
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -672,13 +731,23 @@ L(du2_do):
     bf      30,L(du2_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
     sldi     0,6, 16
     srdi     8,7, 64-16
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 16
+    sldi     8,6, 64-16
+#else
     sldi     0,7, 16
     srdi     8,6, 64-16
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -687,8 +756,13 @@ L(du2_do):
     blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du2_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
     sldi     0,6, 16
     srdi     8,7, 64-16
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -699,8 +773,13 @@ L(du2_do):
     b       L(du2_loop)
     .align 4
 L(du2_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
     sldi     0,6, 16
     srdi     8,7, 64-16
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du2_loop)
@@ -712,23 +791,43 @@ L(du2_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du2_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
     sldi   0,6, 16
     srdi   8,7, 64-16
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 16
+    sldi   8,6, 64-16
+#else
     sldi   0,7, 16
     srdi   8,6, 64-16
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
     sldi   0,6, 16
     srdi   8,7, 64-16
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 16
+    sldi   8,6, 64-16
+#else
     sldi   0,7, 16
     srdi   8,6, 64-16
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -738,8 +837,13 @@ L(du2_loop):
     .align 4
 L(du2_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
     sldi   0,6, 16
     srdi   8,7, 64-16
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -749,13 +853,23 @@ L(du3_do):
     bf      30,L(du3_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
     sldi     0,6, 24
     srdi     8,7, 64-24
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 24
+    sldi     8,6, 64-24
+#else
     sldi     0,7, 24
     srdi     8,6, 64-24
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -764,8 +878,13 @@ L(du3_do):
     blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du3_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
     sldi     0,6, 24
     srdi     8,7, 64-24
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -776,8 +895,13 @@ L(du3_do):
     b       L(du3_loop)
     .align 4
 L(du3_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
     sldi     0,6, 24
     srdi     8,7, 64-24
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du3_loop)
@@ -789,23 +913,43 @@ L(du3_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du3_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
     sldi   0,6, 24
     srdi   8,7, 64-24
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 24
+    sldi   8,6, 64-24
+#else
     sldi   0,7, 24
     srdi   8,6, 64-24
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
     sldi   0,6, 24
     srdi   8,7, 64-24
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 24
+    sldi   8,6, 64-24
+#else
     sldi   0,7, 24
     srdi   8,6, 64-24
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -815,8 +959,13 @@ L(du3_loop):
     .align 4
 L(du3_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
     sldi   0,6, 24
     srdi   8,7, 64-24
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -832,13 +981,23 @@ L(du4_dox):
     bf      30,L(du4_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
     sldi     0,6, 32
     srdi     8,7, 64-32
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 32
+    sldi     8,6, 64-32
+#else
     sldi     0,7, 32
     srdi     8,6, 64-32
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -847,8 +1006,13 @@ L(du4_dox):
     blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du4_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
     sldi     0,6, 32
     srdi     8,7, 64-32
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -859,8 +1023,13 @@ L(du4_dox):
     b       L(du4_loop)
     .align 4
 L(du4_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
     sldi     0,6, 32
     srdi     8,7, 64-32
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du4_loop)
@@ -872,23 +1041,43 @@ L(du4_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du4_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
     sldi   0,6, 32
     srdi   8,7, 64-32
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 32
+    sldi   8,6, 64-32
+#else
     sldi   0,7, 32
     srdi   8,6, 64-32
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
     sldi   0,6, 32
     srdi   8,7, 64-32
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 32
+    sldi   8,6, 64-32
+#else
     sldi   0,7, 32
     srdi   8,6, 64-32
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -898,8 +1087,13 @@ L(du4_loop):
     .align 4
 L(du4_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
     sldi   0,6, 32
     srdi   8,7, 64-32
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -909,13 +1103,23 @@ L(du5_do):
     bf      30,L(du5_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
     sldi     0,6, 40
     srdi     8,7, 64-40
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 40
+    sldi     8,6, 64-40
+#else
     sldi     0,7, 40
     srdi     8,6, 64-40
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -924,8 +1128,13 @@ L(du5_do):
     blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du5_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
     sldi     0,6, 40
     srdi     8,7, 64-40
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -936,8 +1145,13 @@ L(du5_do):
     b       L(du5_loop)
     .align 4
 L(du5_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
     sldi     0,6, 40
     srdi     8,7, 64-40
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du5_loop)
@@ -949,23 +1163,43 @@ L(du5_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du5_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
     sldi   0,6, 40
     srdi   8,7, 64-40
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 40
+    sldi   8,6, 64-40
+#else
     sldi   0,7, 40
     srdi   8,6, 64-40
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
     sldi   0,6, 40
     srdi   8,7, 64-40
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 40
+    sldi   8,6, 64-40
+#else
     sldi   0,7, 40
     srdi   8,6, 64-40
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -975,8 +1209,13 @@ L(du5_loop):
     .align 4
 L(du5_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
     sldi   0,6, 40
     srdi   8,7, 64-40
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -986,13 +1225,23 @@ L(du6_do):
     bf      30,L(du6_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
     sldi     0,6, 48
     srdi     8,7, 64-48
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 48
+    sldi     8,6, 64-48
+#else
     sldi     0,7, 48
     srdi     8,6, 64-48
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -1001,8 +1250,13 @@ L(du6_do):
     blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du6_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
     sldi     0,6, 48
     srdi     8,7, 64-48
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -1013,8 +1267,13 @@ L(du6_do):
     b       L(du6_loop)
     .align 4
 L(du6_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
     sldi     0,6, 48
     srdi     8,7, 64-48
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du6_loop)
@@ -1026,23 +1285,43 @@ L(du6_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du6_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
     sldi   0,6, 48
     srdi   8,7, 64-48
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 48
+    sldi   8,6, 64-48
+#else
     sldi   0,7, 48
     srdi   8,6, 64-48
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
     sldi   0,6, 48
     srdi   8,7, 64-48
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 48
+    sldi   8,6, 64-48
+#else
     sldi   0,7, 48
     srdi   8,6, 64-48
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -1052,8 +1331,13 @@ L(du6_loop):
     .align 4
 L(du6_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
     sldi   0,6, 48
     srdi   8,7, 64-48
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
@@ -1063,13 +1347,23 @@ L(du7_do):
     bf      30,L(du7_1dw)
 
     /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
     sldi     0,6, 56
     srdi     8,7, 64-56
+#endif
     or      0,0,8
     ld      6,16(5)
     std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 56
+    sldi     8,6, 64-56
+#else
     sldi     0,7, 56
     srdi     8,6, 64-56
+#endif
     or      0,0,8
     ld      7,24(5)
     std     0,8(4)
@@ -1078,8 +1372,13 @@ L(du7_do):
     blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
     bf      31,L(du7_loop)
     /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
     sldi     0,6, 56
     srdi     8,7, 64-56
+#endif
     or      0,0,8
     std     0,0(4)
     mr      6,7
@@ -1090,8 +1389,13 @@ L(du7_do):
     b       L(du7_loop)
     .align 4
 L(du7_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
     sldi     0,6, 56
     srdi     8,7, 64-56
+#endif
     addi    5,5,16
     or      0,0,8
     bf      31,L(du7_loop)
@@ -1103,23 +1407,43 @@ L(du7_1dw):
     .align 4
 /* copy 32 bytes at a time */
 L(du7_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
     sldi   0,6, 56
     srdi   8,7, 64-56
+#endif
     or    0,0,8
     ld    6,0(5)
     std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 56
+    sldi   8,6, 64-56
+#else
     sldi   0,7, 56
     srdi   8,6, 64-56
+#endif
     or    0,0,8
     ld    7,8(5)
     std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
     sldi   0,6, 56
     srdi   8,7, 64-56
+#endif
     or    0,0,8
     ld    6,16(5)
     std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 56
+    sldi   8,6, 64-56
+#else
     sldi   0,7, 56
     srdi   8,6, 64-56
+#endif
     or    0,0,8
     ld    7,24(5)
     std   0,24(4)
@@ -1129,8 +1453,13 @@ L(du7_loop):
     .align 4
 L(du7_fini):
     /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
     sldi   0,6, 56
     srdi   8,7, 64-56
+#endif
     or    0,0,8
     std   0,0(4)
     b     L(du_done)
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 800a9f1..e8df75f 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -23,418 +23,361 @@
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
    Returns 'dst'.  */
 
+#define dst 11		/* Use r11 so r3 kept unchanged.  */
+#define src 4
+#define cnt 5
+
 	.machine power7
 EALIGN (memcpy, 5, 0)
 	CALL_MCOUNT 3
 
-	cmpldi  cr1,5,31
+	cmpldi	cr1,cnt,31
 	neg	0,3
-	std	3,-16(1)
-	std	31,-8(1)
-	cfi_offset(31,-8)
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
 				    code.  */
 
-	andi.   11,3,7	      /* Check alignment of DST.  */
-
-
-	clrldi  10,4,61       /* Check alignment of SRC.  */
-	cmpld   cr6,10,11     /* SRC and DST alignments match?  */
-	mr	12,4
-	mr	31,5
+#ifdef __LITTLE_ENDIAN__
+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
+   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
+   loop is only used for quadword aligned copies.  */
+	andi.	10,3,15
+	clrldi	11,4,60
+#else
+	andi.	10,3,7		/* Check alignment of DST.  */
+	clrldi	11,4,61		/* Check alignment of SRC.  */
+#endif
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
+
+	mr	dst,3
 	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)
 
-	srdi    9,5,3	      /* Number of full quadwords remaining.  */
-
-	beq    L(copy_GE_32_aligned_cont)
-
-	clrldi  0,0,61
-	mtcrf   0x01,0
-	subf    31,0,5
-
-	/* Get the SRC aligned to 8 bytes.  */
-
-1:	bf	31,2f
-	lbz	6,0(12)
-	addi    12,12,1
-	stb	6,0(3)
-	addi    3,3,1
-2:	bf      30,4f
-	lhz     6,0(12)
-	addi    12,12,2
-	sth     6,0(3)
-	addi    3,3,2
-4:	bf      29,0f
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-0:
-	clrldi  10,12,61      /* Check alignment of SRC again.  */
-	srdi    9,31,3	      /* Number of full doublewords remaining.  */
-
-L(copy_GE_32_aligned_cont):
-
-	clrldi  11,31,61
-	mtcrf   0x01,9
-
-	srdi    8,31,5
-	cmpldi  cr1,9,4
-	cmpldi  cr6,11,0
-	mr	11,12
-
-	/* Copy 1~3 doublewords so the main loop starts
-	at a multiple of 32 bytes.  */
+	mtocrf	0x01,0
+#ifdef __LITTLE_ENDIAN__
+	clrldi	0,0,60
+#else
+	clrldi	0,0,61
+#endif
 
-	bf	30,1f
-	ld      6,0(12)
-	ld      7,8(12)
-	addi    11,12,16
-	mtctr   8
-	std     6,0(3)
-	std     7,8(3)
-	addi    10,3,16
-	bf      31,4f
-	ld      0,16(12)
-	std     0,16(3)
-	blt     cr1,3f
-	addi    11,12,24
-	addi    10,3,24
-	b       4f
-
-	.align  4
-1:	/* Copy 1 doubleword and set the counter.  */
-	mr	10,3
-	mtctr   8
-	bf      31,4f
-	ld      6,0(12)
-	addi    11,12,8
-	std     6,0(3)
-	addi    10,3,8
-
-L(aligned_copy):
-	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
-	.align  4
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
+1:
+	bf	31,2f
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
+	bf	30,4f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
 4:
-	/* check for any 32-byte or 64-byte lumps that are outside of a
-	   nice 128-byte range.  R8 contains the number of 32-byte
-	   lumps, so drop this into the CR, and use the SO/EQ bits to help
-	   handle the 32- or 64- byte lumps.  Then handle the rest with an
-	   unrolled 128-bytes-at-a-time copy loop. */
-	mtocrf	1,8
-	li	6,16	# 16() index
-	li	7,32	# 32() index
-	li	8,48	# 48() index
-
-L(aligned_32byte):
-	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
-	bns	cr7,L(aligned_64byte)
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	addi	11,11,32
-	stxvd2x	6,0,10
-	stxvd2x	7,10,6
-	addi	10,10,32
-
-L(aligned_64byte):
-	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
-	bne	cr7,L(aligned_128setup)
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	lxvd2x	8,11,7
-	lxvd2x	9,11,8
-	addi	11,11,64
-	stxvd2x	6,0,10
-	stxvd2x	7,10,6
-	stxvd2x	8,10,7
-	stxvd2x	9,10,8
-	addi	10,10,64
-
-L(aligned_128setup):
-	/* Set up for the 128-byte at a time copy loop.  */
-	srdi	8,31,7
-	cmpdi	8,0	# Any 4x lumps left?
-	beq	3f	# if not, move along.
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	mtctr	8	# otherwise, load the ctr and begin.
-	li	8,48	# 48() index
+	bf	29,8f
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
+#ifdef __LITTLE_ENDIAN__
+	bf	28,16f
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
+16:
+#endif
+	subf	cnt,0,cnt
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16
+	li	7,32
+	li	8,48
+	mtocrf	0x02,cnt
+	srdi	12,cnt,7
+	cmpdi	12,0
+	beq	L(aligned_tail)
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	mtctr	12
 	b	L(aligned_128loop)
 
+	.align  4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
 L(aligned_128loop):
-	lxvd2x	8,11,7
-	lxvd2x	9,11,8
-	stxvd2x	6,0,10
-	addi	11,11,64
-	stxvd2x	7,10,6
-	stxvd2x	8,10,7
-	stxvd2x	9,10,8
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	addi	10,10,64
-	lxvd2x	8,11,7
-	lxvd2x	9,11,8
-	addi	11,11,64
-	stxvd2x	6,0,10
-	stxvd2x	7,10,6
-	stxvd2x	8,10,7
-	stxvd2x	9,10,8
-	addi	10,10,64
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	stxvd2x	6,0,dst
+	addi	src,src,64
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	addi	dst,dst,64
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	addi	src,src,64
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	addi	dst,dst,64
 	bdnz	L(aligned_128head)
 
-3:
-	/* Check for tail bytes.  */
-	rldicr  0,31,0,60
-	mtcrf   0x01,31
-	beq	cr6,0f
-
-.L9:
-	add	3,3,0
-	add	12,12,0
-
-	/*  At this point we have a tail of 0-7 bytes and we know that the
-	destination is doubleword-aligned.  */
-4:	/* Copy 4 bytes.  */
-	bf	29,2f
-
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-2:	/* Copy 2 bytes.  */
-	bf	30,1f
-
-	lhz     6,0(12)
-	addi    12,12,2
-	sth     6,0(3)
-	addi    3,3,2
-1:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,0(12)
-	stb	6,0(3)
-0:	/* Return original DST pointer.  */
-	ld	31,-8(1)
-	ld	3,-16(1)
+L(aligned_tail):
+	mtocrf	0x01,cnt
+	bf	25,32f
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	addi	src,src,64
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	addi	dst,dst,64
+32:
+	bf	26,16f
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	addi	src,src,32
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	addi	dst,dst,32
+16:
+	bf	27,8f
+	lxvd2x	6,0,src
+	addi	src,src,16
+	stxvd2x	6,0,dst
+	addi	dst,dst,16
+8:
+	bf	28,4f
+	ld	6,0(src)
+	addi	src,src,8
+	std     6,0(dst)
+	addi	dst,dst,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw     6,0(dst)
+	bf      30,L(tail5)
+	lhz     7,4(src)
+	sth     7,4(dst)
+	bflr	31
+	lbz     8,6(src)
+	stb     8,6(dst)
+	/* Return original DST pointer.  */
 	blr
 
-	/* Handle copies of 0~31 bytes.  */
-	.align  4
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
 L(copy_LT_32):
-	cmpldi  cr6,5,8
-	mr	12,4
-	mtcrf   0x01,5
+	mr	dst,3
+	cmpldi	cr6,cnt,8
+	mtocrf	0x01,cnt
 	ble	cr6,L(copy_LE_8)
 
 	/* At least 9 bytes to go.  */
 	neg	8,4
-	clrrdi  11,4,2
-	andi.   0,8,3
-	cmpldi  cr1,5,16
-	mr	10,5
+	andi.	0,8,3
+	cmpldi	cr1,cnt,16
 	beq	L(copy_LT_32_aligned)
 
-	/* Force 4-bytes alignment for SRC.  */
-	mtocrf  0x01,0
-	subf    10,0,5
-2:	bf	30,1f
-
-	lhz	6,0(12)
-	addi    12,12,2
-	sth	6,0(3)
-	addi    3,3,2
-1:	bf	31,L(end_4bytes_alignment)
-
-	lbz	6,0(12)
-	addi    12,12,1
-	stb	6,0(3)
-	addi    3,3,1
-
-	.align  4
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
+2:
+	bf	30,1f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+
+	.align	4
 L(end_4bytes_alignment):
-	cmpldi  cr1,10,16
-	mtcrf   0x01,10
+	cmpldi	cr1,cnt,16
+	mtocrf	0x01,cnt
 
 L(copy_LT_32_aligned):
 	/* At least 6 bytes to go, and SRC is word-aligned.  */
 	blt	cr1,8f
 
 	/* Copy 16 bytes.  */
-	lwz	6,0(12)
-	lwz     7,4(12)
-	stw     6,0(3)
-	lwz     8,8(12)
-	stw     7,4(3)
-	lwz     6,12(12)
-	addi    12,12,16
-	stw     8,8(3)
-	stw     6,12(3)
-	addi    3,3,16
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	lwz	8,8(src)
+	stw	7,4(dst)
+	lwz	6,12(src)
+	addi	src,src,16
+	stw	8,8(dst)
+	stw	6,12(dst)
+	addi	dst,dst,16
 8:	/* Copy 8 bytes.  */
-	bf	28,4f
+	bf	28,L(tail4)
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
+	blr
 
-	lwz     6,0(12)
-	lwz     7,4(12)
-	addi    12,12,8
-	stw     6,0(3)
-	stw     7,4(3)
-	addi    3,3,8
-4:	/* Copy 4 bytes.  */
-	bf	29,2f
-
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-2:	/* Copy 2-3 bytes.  */
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
 	bf	30,1f
-
-	lhz     6,0(12)
-	sth     6,0(3)
-	bf      31,0f
-	lbz     7,2(12)
-	stb     7,2(3)
-	ld	3,-16(1)
+	lhz	6,0(src)
+	sth	6,0(dst)
+	bflr	31
+	lbz	7,2(src)
+	stb	7,2(dst)
 	blr
 
-	.align  4
-1:	/* Copy 1 byte.  */
-	bf	31,0f
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(src)
+	stb	6,4(dst)
+	blr
 
-	lbz	6,0(12)
-	stb	6,0(3)
-0:	/* Return original DST pointer.  */
-	ld	3,-16(1)
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(src)
+	stb	6,0(dst)
+	/* Return original DST pointer.  */
 	blr
 
-	/* Handles copies of 0~8 bytes.  */
-	.align  4
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
 L(copy_LE_8):
-	bne	cr6,4f
+	bne	cr6,L(tail4)
 
 	/* Though we could've used ld/std here, they are still
 	slow for unaligned cases.  */
 
-	lwz	6,0(4)
-	lwz     7,4(4)
-	stw     6,0(3)
-	stw     7,4(3)
-	ld      3,-16(1)      /* Return original DST pointers.  */
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	stw	7,4(dst)
 	blr
 
-	.align  4
-4:	/* Copies 4~7 bytes.  */
-	bf	29,2b
-
-	lwz	6,0(4)
-	stw     6,0(3)
-	bf      30,5f
-	lhz     7,4(4)
-	sth     7,4(3)
-	bf      31,0f
-	lbz     8,6(4)
-	stb     8,6(3)
-	ld	3,-16(1)
-	blr
-
-	.align  4
-5:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,4(4)
-	stb	6,4(3)
-
-0:	/* Return original DST pointer.  */
-	ld	3,-16(1)
-	blr
 
-	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
-	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
-	the data, allowing for aligned DST stores.  */
-	.align  4
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
 L(copy_GE_32_unaligned):
-	clrldi  0,0,60	      /* Number of bytes until the 1st
-			      quadword.  */
-	andi.   11,3,15       /* Check alignment of DST (against
-			      quadwords).  */
-	srdi    9,5,4	      /* Number of full quadwords remaining.  */
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
+#ifndef __LITTLE_ENDIAN__
+	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
+#endif
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
 
 	beq	L(copy_GE_32_unaligned_cont)
 
-	/* SRC is not quadword aligned, get it aligned.  */
+	/* DST is not quadword aligned, get it aligned.  */
 
-	mtcrf   0x01,0
-	subf    31,0,5
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
 
 	/* Vector instructions work best when proper alignment (16-bytes)
 	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
-1:	/* Copy 1 byte.  */
+1:
 	bf	31,2f
-
-	lbz	6,0(12)
-	addi    12,12,1
-	stb	6,0(3)
-	addi    3,3,1
-2:	/* Copy 2 bytes.  */
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
 	bf	30,4f
-
-	lhz     6,0(12)
-	addi    12,12,2
-	sth     6,0(3)
-	addi    3,3,2
-4:	/* Copy 4 bytes.  */
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+4:
 	bf	29,8f
-
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-8:	/* Copy 8 bytes.  */
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
 	bf	28,0f
-
-	ld	6,0(12)
-	addi    12,12,8
-	std	6,0(3)
-	addi    3,3,8
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
 0:
-	clrldi  10,12,60      /* Check alignment of SRC.  */
-	srdi    9,31,4	      /* Number of full quadwords remaining.  */
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
 
 	/* The proper alignment is present, it is OK to copy the bytes now.  */
 L(copy_GE_32_unaligned_cont):
 
 	/* Setup two indexes to speed up the indexed vector operations.  */
-	clrldi  11,31,60
-	li      6,16	      /* Index for 16-bytes offsets.  */
+	clrldi	10,cnt,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
 	li	7,32	      /* Index for 32-bytes offsets.  */
-	cmpldi  cr1,11,0
-	srdi    8,31,5	      /* Setup the loop counter.  */
-	mr      10,3
-	mr      11,12
-	mtcrf   0x01,9
-	cmpldi  cr6,9,1
-	lvsl    5,0,12
-	lvx     3,0,12
-	bf      31,L(setup_unaligned_loop)
-
-	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
-	lvx     4,12,6
-	vperm   6,3,4,5
-	addi    11,12,16
-	addi    10,3,16
-	stvx    6,0,3
+	cmpldi	cr1,10,0
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,src
+#else
+	lvsl	5,0,src
+#endif
+	lvx	3,0,src
+	li	0,0
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	addi	src,src,16
+	stvx	6,0,dst
+	addi	dst,dst,16
 	vor	3,4,4
+	clrrdi	0,src,60
 
 L(setup_unaligned_loop):
-	mtctr   8
-	ble     cr6,L(end_unaligned_loop)
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
 
 	/* Copy 32 bytes at a time using vector instructions.  */
-	.align  4
+	.align	4
 L(unaligned_loop):
 
 	/* Note: vr6/vr10 may contain data that was already copied,
@@ -442,62 +385,55 @@ L(unaligned_loop):
 	some portions again. This is faster than having unaligned
 	vector instructions though.  */
 
-	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr6.  */
-	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr10.  */
-	addi    11,11,32
-	stvx    6,0,10
-	stvx    10,10,6
-	addi    10,10,32
-
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	lvx	3,src,7
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5
+#else
+	vperm	10,4,3,5
+#endif
+	addi	src,src,32
+	stvx	6,0,dst
+	stvx	10,dst,6
+	addi	dst,dst,32
 	bdnz	L(unaligned_loop)
 
-	.align  4
+	clrrdi	0,src,60
+
+	.align	4
 L(end_unaligned_loop):
 
 	/* Check for tail bytes.  */
-	rldicr  0,31,0,59
-	mtcrf   0x01,31
-	beq	cr1,0f
+	mtocrf	0x01,cnt
+	beqlr	cr1
 
-	add	3,3,0
-	add	12,12,0
+	add	src,src,0
 
 	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
-8:	/* Copy 8 bytes.  */
+	/* Copy 8 bytes.  */
 	bf	28,4f
-
-	lwz	6,0(12)
-	lwz	7,4(12)
-	addi    12,12,8
-	stw	6,0(3)
-	stw	7,4(3)
-	addi    3,3,8
-4:	/* Copy 4 bytes.  */
-	bf	29,2f
-
-	lwz	6,0(12)
-	addi    12,12,4
-	stw	6,0(3)
-	addi    3,3,4
-2:	/* Copy 2~3 bytes.  */
-	bf	30,1f
-
-	lhz	6,0(12)
-	addi    12,12,2
-	sth	6,0(3)
-	addi    3,3,2
-1:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,0(12)
-	stb	6,0(3)
-0:	/* Return original DST pointer.  */
-	ld	31,-8(1)
-	ld	3,-16(1)
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
 	blr
 
 END_GEN_TB (memcpy,TB_TOCLESS)
diff --git a/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
index f20be93..b93ab7d 100644
--- a/sysdeps/powerpc/powerpc64/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
@@ -365,13 +365,21 @@ L(copy_GE_32_unaligned_cont):
 	mr	11,12
 	mtcrf	0x01,9
 	cmpldi	cr6,9,1
-	lvsl	5,0,12
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
+	lvsl    5,0,12
+#endif
 	lvx	3,0,12
 	bf	31,L(setup_unaligned_loop)
 
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 	lvx	4,12,6
-	vperm	6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	addi	11,12,16
 	addi	10,3,16
 	stvx	6,0,3
@@ -391,11 +399,17 @@ L(unaligned_loop):
 	vector instructions though.  */
 
 	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
 	addi	11,11,32
 	stvx	6,0,10
 	stvx	10,10,6

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c3cb0a792e5a8fe6a776e3779775079b48029df2

commit c3cb0a792e5a8fe6a776e3779775079b48029df2
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:46:47 2013 +0930

    PowerPC LE memcmp
    http://sourceware.org/ml/libc-alpha/2013-08/msg00102.html
    
    This is a rather large patch due to formatting and renaming.  The
    formatting changes were to make it possible to compare power7 and
    power4 versions of memcmp.  Using different register defines came
    about while I was wrestling with the code, trying to find spare
    registers at one stage.  I found it much simpler if we refer to a reg
    by the same name throughout a function, so it's better if short-term
    multiple use regs like rTMP are referred to using their register
    number.  I made the cr field usage changes when attempting to reload
    rWORDn regs in the exit path to byte swap before comparing when
    little-endian.  That proved a bad idea due to the pipelining involved
    in the main loop; offsets to reload the regs were different the first
    time around the loop.  Anyway, I left the cr field usage changes in
    place for consistency.
    
    Aside from these more-or-less cosmetic changes, I fixed a number of
    places where an early exit path restores regs unnecessarily, removed
    some dead code, and optimised one or two exits.
    
    	* sysdeps/powerpc/powerpc64/power7/memcmp.S: Add little-endian support.
    	Formatting.  Consistently use rXXX register defines or rN defines.
    	Use early exit labels that avoid restoring unused non-volatile regs.
    	Make cr field use more consistent with rWORDn compares.  Rename
    	regs used as shift registers for unaligned loop, using rN defines
    	for short lifetime/multiple use regs.
    	* sysdeps/powerpc/powerpc64/power4/memcmp.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power7/memcmp.S: Likewise.  Exit with
    	addi 1,1,64 to pop stack frame.  Simplify return value code.
    	* sysdeps/powerpc/powerpc32/power4/memcmp.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 37a85c2..42d88db 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc64/power7/memcmp.S: Add little-endian support.
+	Formatting.  Consistently use rXXX register defines or rN defines.
+	Use early exit labels that avoid restoring unused non-volatile regs.
+	Make cr field use more consistent with rWORDn compares.  Rename
+	regs used as shift registers for unaligned loop, using rN defines
+	for short lifetime/multiple use regs.
+	* sysdeps/powerpc/powerpc64/power4/memcmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/memcmp.S: Likewise.  Exit with
+	addi 1,1,64 to pop stack frame.  Simplify return value code.
+	* sysdeps/powerpc/powerpc32/power4/memcmp.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian
 	support.  Correct typos, formatting.  Optimize tail.  Use insrdi
 	rather than rlwimi.
diff --git a/sysdeps/powerpc/powerpc32/power4/memcmp.S b/sysdeps/powerpc/powerpc32/power4/memcmp.S
index d7050a2..652acb9 100644
--- a/sysdeps/powerpc/powerpc32/power4/memcmp.S
+++ b/sysdeps/powerpc/powerpc32/power4/memcmp.S
@@ -1,4 +1,4 @@
-/* Optimized strcmp implementation for PowerPC64.
+/* Optimized memcmp implementation for PowerPC32.
    Copyright (C) 2003-2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -18,13 +18,14 @@
 
 #include <sysdep.h>
 
-/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
+/* int [r3] memcmp (const char *s1 [r3],
+		    const char *s2 [r4],
+		    size_t size [r5])  */
 
 	.machine power4
 EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT
 
-#define rTMP	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -35,33 +36,32 @@ EALIGN (memcmp, 4, 0)
 #define rWORD4	r9	/* next word in s2 */
 #define rWORD5	r10	/* next word in s1 */
 #define rWORD6	r11	/* next word in s2 */
-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
 #define rWORD7	r30	/* next word in s1 */
 #define rWORD8	r31	/* next word in s2 */
 
-	xor	rTMP, rSTR2, rSTR1
+	xor	r0, rSTR2, rSTR1
 	cmplwi	cr6, rN, 0
 	cmplwi	cr1, rN, 12
-	clrlwi.	rTMP, rTMP, 30
-	clrlwi	rBITDIF, rSTR1, 30
-	cmplwi	cr5, rBITDIF, 0
+	clrlwi.	r0, r0, 30
+	clrlwi	r12, rSTR1, 30
+	cmplwi	cr5, r12, 0
 	beq-	cr6, L(zeroLength)
-	dcbt	0,rSTR1
-	dcbt	0,rSTR2
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
 /* If less than 8 bytes or not aligned, use the unaligned
    byte loop.  */
 	blt	cr1, L(bytealigned)
-        stwu    1,-64(1)
+	stwu	1, -64(r1)
 	cfi_adjust_cfa_offset(64)
-        stw     r31,48(1)
-	cfi_offset(31,(48-64))
-        stw     r30,44(1)
-	cfi_offset(30,(44-64))
+	stw	rWORD8, 48(r1)
+	cfi_offset(rWORD8, (48-64))
+	stw	rWORD7, 44(r1)
+	cfi_offset(rWORD7, (44-64))
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    2 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then we are already word
+   of r12 to 0.  If r12 == 0 then we are already word
    aligned and can perform the word aligned loop.
 
    Otherwise we know the two strings have the same alignment (but not
@@ -70,74 +70,95 @@ EALIGN (memcmp, 4, 0)
    eliminate bits preceding the first byte.  Since we want to join the
    normal (word aligned) compare loop, starting at the second word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first word. This insures that the loop count is
+   versioning for the first word. This ensures that the loop count is
    correct and the first word (shifted) is in the expected register pair. */
-	.align 4
+	.align	4
 L(samealignment):
 	clrrwi	rSTR1, rSTR1, 2
 	clrrwi	rSTR2, rSTR2, 2
 	beq	cr5, L(Waligned)
-	add	rN, rN, rBITDIF
-	slwi	r11, rBITDIF, 3
-	srwi	rTMP, rN, 4	 /* Divide by 16 */
-	andi.	rBITDIF, rN, 12  /* Get the word remainder */
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the word remainder */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 0(rSTR1)
 	lwz	rWORD2, 0(rSTR2)
-	cmplwi	cr1, rBITDIF, 8
+#endif
+	cmplwi	cr1, r12, 8
 	cmplwi	cr7, rN, 16
 	clrlwi	rN, rN, 30
 	beq	L(dPs4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 	bgt	cr1, L(dPs3)
 	beq	cr1, L(dPs2)
 
 /* Remainder is 4 */
-	.align 3
+	.align	3
 L(dsP1):
-	slw	rWORD5, rWORD1, r11
-	slw	rWORD6, rWORD2, r11
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
 	cmplw	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(dPs2):
-	slw	rWORD5, rWORD1, r11
-	slw	rWORD6, rWORD2, r11
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
 	cmplw	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 4(rSTR1)
 	lwz	rWORD8, 4(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 12 */
-	.align 4
+	.align	4
 L(dPs3):
-	slw	rWORD3, rWORD1, r11
-	slw	rWORD4, rWORD2, r11
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD2, rWORD6
 	cmplw	cr1, rWORD3, rWORD4
 	b	L(dP3e)
 /* Count is a multiple of 16, remainder is 0 */
-	.align 4
+	.align	4
 L(dPs4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	slw	rWORD1, rWORD1, r11
-	slw	rWORD2, rWORD2, r11
-	cmplw	cr0, rWORD1, rWORD2
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD2, rWORD6
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(dP4e)
 
 /* At this point we know both strings are word aligned and the
    compare length is at least 8 bytes.  */
-	.align 4
+	.align	4
 L(Waligned):
-	andi.	rBITDIF, rN, 12  /* Get the word remainder */
-	srwi	rTMP, rN, 4	 /* Divide by 16 */
-	cmplwi	cr1, rBITDIF, 8
+	andi.	r12, rN, 12	/* Get the word remainder */
+	srwi	r0, rN, 4	/* Divide by 16 */
+	cmplwi	cr1, r12, 8
 	cmplwi	cr7, rN, 16
 	clrlwi	rN, rN, 30
 	beq	L(dP4)
@@ -145,177 +166,352 @@ L(Waligned):
 	beq	cr1, L(dP2)
 
 /* Remainder is 4 */
-	.align 4
+	.align	4
 L(dP1):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
    (8-15 byte compare), we want to use only volatile registers.  This
    means we can avoid restoring non-volatile registers since we did not
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 0(rSTR1)
 	lwz	rWORD6, 0(rSTR2)
+#endif
 	cmplw	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 L(dP1e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 8(rSTR1)
 	lwz	rWORD4, 8(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 12(rSTR1)
 	lwz	rWORD6, 12(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5)
-	bne	cr0, L(dLcr0)
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
 
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 16(rSTR1)
 	lwzu	rWORD8, 16(rSTR2)
+#endif
 	bne	cr1, L(dLcr1)
 	cmplw	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
 	bne	cr6, L(dLcr6)
-        lwz     r30,44(1)
-        lwz     r31,48(1)
-	.align 3
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+	.align	3
 L(dP1x):
 	slwi.	r12, rN, 3
-	bne	cr5, L(dLcr5)
+	bne	cr5, L(dLcr5x)
 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Remainder is 8 */
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP2):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 0(rSTR1)
 	lwz	rWORD6, 0(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 4(rSTR1)
 	lwz	rWORD8, 4(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 L(dP2e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 8(rSTR1)
 	lwz	rWORD2, 8(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 12(rSTR1)
 	lwz	rWORD4, 12(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 4
 	addi	rSTR2, rSTR2, 4
+#endif
 	bne	cr6, L(dLcr6)
 	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
 /* Again we are on a early exit path (16-23 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
-	.align 4
+	.align	4
 L(dP2x):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 4(rSTR1)
 	lwz	rWORD4, 4(rSTR2)
-	cmplw	cr5, rWORD3, rWORD4
+#endif
+	cmplw	cr1, rWORD3, rWORD4
 	slwi.	r12, rN, 3
-	bne	cr6, L(dLcr6)
+	bne	cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 4
 	addi	rSTR2, rSTR2, 4
-	bne	cr5, L(dLcr5)
+#endif
+	bne	cr1, L(dLcr1x)
 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Remainder is 12 */
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP3):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 0(rSTR1)
 	lwz	rWORD4, 0(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 L(dP3e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 4(rSTR1)
 	lwz	rWORD6, 4(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 8(rSTR1)
 	lwz	rWORD8, 8(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 12(rSTR1)
 	lwz	rWORD2, 12(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
 	bne	cr1, L(dLcr1)
 	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
 /* Again we are on a early exit path (24-31 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
-	.align 4
+	.align	4
 L(dP3x):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 8(rSTR1)
 	lwz	rWORD2, 8(rSTR2)
-	cmplw	cr5, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	slwi.	r12, rN, 3
-	bne	cr1, L(dLcr1)
+	bne	cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-	bne	cr6, L(dLcr6)
+#endif
+	bne	cr6, L(dLcr6x)
 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
-	bne	cr5, L(dLcr5)
-        lwz     1,0(1)
+	bne	cr7, L(dLcr7x)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Count is a multiple of 16, remainder is 0 */
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 0(rSTR1)
 	lwz	rWORD2, 0(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 L(dP4e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 4(rSTR1)
 	lwz	rWORD4, 4(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 8(rSTR1)
 	lwz	rWORD6, 8(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 12(rSTR1)
 	lwzu	rWORD8, 12(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 	bne	cr1, L(dLcr1)
 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
-	.align 4
+	.align	4
 L(dLoop):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 	bne	cr6, L(dLcr6)
 L(dLoop1):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 8(rSTR1)
 	lwz	rWORD4, 8(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr5, L(dLcr5)
 L(dLoop2):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 12(rSTR1)
 	lwz	rWORD6, 12(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(dLoop3):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 16(rSTR1)
 	lwzu	rWORD8, 16(rSTR2)
+#endif
 	bne-	cr1, L(dLcr1)
-	cmplw	cr0, rWORD1, rWORD2
+	cmplw	cr7, rWORD1, rWORD2
 	bdnz+	L(dLoop)
 
 L(dL4):
@@ -325,7 +521,7 @@ L(dL4):
 	bne	cr5, L(dLcr5)
 	cmplw	cr5, rWORD7, rWORD8
 L(d44):
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(d34):
 	bne	cr1, L(dLcr1)
 L(d24):
@@ -334,69 +530,82 @@ L(d14):
 	slwi.	r12, rN, 3
 	bne	cr5, L(dLcr5)
 L(d04):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
-        lwz     1,0(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
 	beq	L(zeroLength)
 /* At this point we have a remainder of 1 to 3 bytes to compare.  Since
    we are aligned it is safe to load the whole word, and use
-   shift right to eliminate bits beyond the compare length. */
+   shift right to eliminate bits beyond the compare length.  */
 L(d00):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
+#endif
 	srw	rWORD1, rWORD1, rN
 	srw	rWORD2, rWORD2, rN
-        cmplw   rWORD1,rWORD2
-        li      rRTN,0
-        beqlr
-        li      rRTN,1
-        bgtlr
-        li      rRTN,-1
-        blr
-
-	.align 4
-L(dLcr0):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
+	sub	rRTN, rWORD1, rWORD2
+	blr
+
+	.align	4
+	cfi_adjust_cfa_offset(64)
+L(dLcr7):
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr7x):
 	li	rRTN, 1
-        lwz     1,0(1)
-	bgtlr	cr0
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr1):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr1x):
 	li	rRTN, 1
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr1
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr6):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr6x):
 	li	rRTN, 1
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr6
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr5):
-        lwz     r30,44(1)
-        lwz     r31,48(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
 L(dLcr5x):
 	li	rRTN, 1
-        lwz     1,0(1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr5
 	li	rRTN, -1
 	blr
 
-	.align 4
+	.align	4
 L(bytealigned):
-	cfi_adjust_cfa_offset(-64)
-	mtctr   rN	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
    to avoid pipe delays.  The dependent instruction latencies (load to
@@ -411,7 +620,7 @@ L(bytealigned):
 	lbz	rWORD1, 0(rSTR1)
 	lbz	rWORD2, 0(rSTR2)
 	bdz-	L(b11)
-	cmplw	cr0, rWORD1, rWORD2
+	cmplw	cr7, rWORD1, rWORD2
 	lbz	rWORD3, 1(rSTR1)
 	lbz	rWORD4, 1(rSTR2)
 	bdz-	L(b12)
@@ -419,11 +628,11 @@ L(bytealigned):
 	lbzu	rWORD5, 2(rSTR1)
 	lbzu	rWORD6, 2(rSTR2)
 	bdz-	L(b13)
-	.align 4
+	.align	4
 L(bLoop):
 	lbzu	rWORD1, 1(rSTR1)
 	lbzu	rWORD2, 1(rSTR2)
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 
 	cmplw	cr6, rWORD5, rWORD6
 	bdz-	L(b3i)
@@ -432,7 +641,7 @@ L(bLoop):
 	lbzu	rWORD4, 1(rSTR2)
 	bne-	cr1, L(bLcr1)
 
-	cmplw	cr0, rWORD1, rWORD2
+	cmplw	cr7, rWORD1, rWORD2
 	bdz-	L(b2i)
 
 	lbzu	rWORD5, 1(rSTR1)
@@ -449,23 +658,23 @@ L(bLoop):
    tested.  In this case we must complete the pending operations
    before returning.  */
 L(b1i):
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 	bne-	cr1, L(bLcr1)
 	b	L(bx56)
-	.align 4
+	.align	4
 L(b2i):
 	bne-	cr6, L(bLcr6)
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 	b	L(bx34)
-	.align 4
+	.align	4
 L(b3i):
 	bne-	cr1, L(bLcr1)
 	bne-	cr6, L(bLcr6)
 	b	L(bx12)
-	.align 4
-L(bLcr0):
+	.align	4
+L(bLcr7):
 	li	rRTN, 1
-	bgtlr	cr0
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
 L(bLcr1):
@@ -480,36 +689,31 @@ L(bLcr6):
 	blr
 
 L(b13):
-	bne-	cr0, L(bx12)
+	bne-	cr7, L(bx12)
 	bne-	cr1, L(bx34)
 L(bx56):
 	sub	rRTN, rWORD5, rWORD6
 	blr
 	nop
 L(b12):
-	bne-	cr0, L(bx12)
+	bne-	cr7, L(bx12)
 L(bx34):
 	sub	rRTN, rWORD3, rWORD4
 	blr
-
 L(b11):
 L(bx12):
 	sub	rRTN, rWORD1, rWORD2
 	blr
-
-	.align 4
-L(zeroLengthReturn):
-
+	.align	4
 L(zeroLength):
 	li	rRTN, 0
 	blr
 
-	cfi_adjust_cfa_offset(64)
-	.align 4
+	.align	4
 /* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    2 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is word aligned and can
+   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and can
    perform the Wunaligned loop.
 
    Otherwise we know that rSTR1 is not aready word aligned yet.
@@ -518,79 +722,88 @@ L(zeroLength):
    eliminate bits preceding the first byte.  Since we want to join the
    normal (Wualigned) compare loop, starting at the second word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first W. This insures that the loop count is
+   versioning for the first W. This ensures that the loop count is
    correct and the first W (shifted) is in the expected resister pair.  */
 #define rSHL		r29	/* Unaligned shift left count.  */
 #define rSHR		r28	/* Unaligned shift right count.  */
-#define rB		r27	/* Left rotation temp for rWORD2.  */
-#define rD		r26	/* Left rotation temp for rWORD4.  */
-#define rF		r25	/* Left rotation temp for rWORD6.  */
-#define rH		r24	/* Left rotation temp for rWORD8.  */
-#define rA		r0	/* Right rotation temp for rWORD2.  */
-#define rC		r12	/* Right rotation temp for rWORD4.  */
-#define rE		r0	/* Right rotation temp for rWORD6.  */
-#define rG		r12	/* Right rotation temp for rWORD8.  */
+#define rWORD8_SHIFT	r27	/* Left shift temp for rWORD2.  */
+#define rWORD2_SHIFT	r26	/* Left shift temp for rWORD4.  */
+#define rWORD4_SHIFT	r25	/* Left shift temp for rWORD6.  */
+#define rWORD6_SHIFT	r24	/* Left shift temp for rWORD8.  */
+	cfi_adjust_cfa_offset(64)
 L(unaligned):
-	stw     r29,40(r1)
-	cfi_offset(r29,(40-64))
+	stw	rSHL, 40(r1)
+	cfi_offset(rSHL, (40-64))
 	clrlwi	rSHL, rSTR2, 30
-        stw     r28,36(r1)
-	cfi_offset(r28,(36-64))
+	stw	rSHR, 36(r1)
+	cfi_offset(rSHR, (36-64))
 	beq	cr5, L(Wunaligned)
-        stw     r27,32(r1)
-	cfi_offset(r27,(32-64))
+	stw	rWORD8_SHIFT, 32(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
 /* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 W.  */
-	sub	r27, rSTR2, rBITDIF
+	sub	rWORD8_SHIFT, rSTR2, r12
 /* But do not attempt to address the W before that W that contains
    the actual start of rSTR2.  */
 	clrrwi	rSTR2, rSTR2, 2
-        stw     r26,28(r1)
-	cfi_offset(r26,(28-64))
-/* Compute the left/right shift counts for the unalign rSTR2,
+	stw	rWORD2_SHIFT, 28(r1)
+	cfi_offset(rWORD2_SHIFT, (28-64))
+/* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (W aligned) start of rSTR1.  */
-	clrlwi	rSHL, r27, 30
+	clrlwi	rSHL, rWORD8_SHIFT, 30
 	clrrwi	rSTR1, rSTR1, 2
-        stw     r25,24(r1)
-	cfi_offset(r25,(24-64))
+	stw	rWORD4_SHIFT, 24(r1)
+	cfi_offset(rWORD4_SHIFT, (24-64))
 	slwi	rSHL, rSHL, 3
-	cmplw	cr5, r27, rSTR2
-	add	rN, rN, rBITDIF
-	slwi	r11, rBITDIF, 3
-        stw     r24,20(r1)
-	cfi_offset(r24,(20-64))
+	cmplw	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD6_SHIFT, (20-64))
 	subfic	rSHR, rSHL, 32
-	srwi	rTMP, rN, 4      /* Divide by 16 */
-	andi.	rBITDIF, rN, 12  /* Get the W remainder */
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the W remainder */
 /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
    this special case those bits may be discarded anyway.  Also we
    must avoid loading a W where none of the bits are part of rSTR2 as
    this may cross a page boundary and cause a page fault.  */
 	li	rWORD8, 0
 	blt	cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD8, 0(rSTR2)
-	la	rSTR2, 4(rSTR2)
+	addi	rSTR2, rSTR2, 4
+#endif
 	slw	rWORD8, rWORD8, rSHL
 
 L(dus0):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 0(rSTR1)
 	lwz	rWORD2, 0(rSTR2)
-	cmplwi	cr1, rBITDIF, 8
+#endif
+	cmplwi	cr1, r12, 8
 	cmplwi	cr7, rN, 16
-	srw	rG, rWORD2, rSHR
+	srw	r12, rWORD2, rSHR
 	clrlwi	rN, rN, 30
 	beq	L(duPs4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	or	rWORD8, r12, rWORD8
 	bgt	cr1, L(duPs3)
 	beq	cr1, L(duPs2)
 
 /* Remainder is 4 */
-	.align 4
+	.align	4
 L(dusP1):
-	slw	rB, rWORD2, rSHL
-	slw	rWORD7, rWORD1, r11
-	slw	rWORD8, rWORD8, r11
+	slw	rWORD8_SHIFT, rWORD2, rSHL
+	slw	rWORD7, rWORD1, rWORD6
+	slw	rWORD8, rWORD8, rWORD6
 	bge	cr7, L(duP1e)
 /* At this point we exit early with the first word compare
    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
@@ -600,95 +813,133 @@ L(dusP1):
 	bne	cr5, L(duLcr5)
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD2, 4(rSTR2)
-	srw	rA, rWORD2, rSHR
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(duPs2):
-	slw	rH, rWORD2, rSHL
-	slw	rWORD5, rWORD1, r11
-	slw	rWORD6, rWORD8, r11
+	slw	rWORD6_SHIFT, rWORD2, rSHL
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD8, rWORD6
 	b	L(duP2e)
 /* Remainder is 12 */
-	.align 4
+	.align	4
 L(duPs3):
-	slw	rF, rWORD2, rSHL
-	slw	rWORD3, rWORD1, r11
-	slw	rWORD4, rWORD8, r11
+	slw	rWORD4_SHIFT, rWORD2, rSHL
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD8, rWORD6
 	b	L(duP3e)
 /* Count is a multiple of 16, remainder is 0 */
-	.align 4
+	.align	4
 L(duPs4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
-	slw	rD, rWORD2, rSHL
-	slw	rWORD1, rWORD1, r11
-	slw	rWORD2, rWORD8, r11
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	or	rWORD8, r12, rWORD8
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD8, rWORD6
 	b	L(duP4e)
 
 /* At this point we know rSTR1 is word aligned and the
    compare length is at least 8 bytes.  */
-	.align 4
+	.align	4
 L(Wunaligned):
-        stw     r27,32(r1)
-	cfi_offset(r27,(32-64))
+	stw	rWORD8_SHIFT, 32(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
 	clrrwi	rSTR2, rSTR2, 2
-        stw     r26,28(r1)
-	cfi_offset(r26,(28-64))
-	srwi	rTMP, rN, 4	 /* Divide by 16 */
-        stw     r25,24(r1)
-	cfi_offset(r25,(24-64))
-	andi.	rBITDIF, rN, 12  /* Get the W remainder */
-        stw     r24,20(r1)
-	cfi_offset(r24,(20-64))
+	stw	rWORD2_SHIFT, 28(r1)
+	cfi_offset(rWORD2_SHIFT, (28-64))
+	srwi	r0, rN, 4	/* Divide by 16 */
+	stw	rWORD4_SHIFT, 24(r1)
+	cfi_offset(rWORD4_SHIFT, (24-64))
+	andi.	r12, rN, 12	/* Get the W remainder */
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD6_SHIFT, (20-64))
 	slwi	rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD6, 0(rSTR2)
 	lwzu	rWORD8, 4(rSTR2)
-	cmplwi	cr1, rBITDIF, 8
+#endif
+	cmplwi	cr1, r12, 8
 	cmplwi	cr7, rN, 16
 	clrlwi	rN, rN, 30
 	subfic	rSHR, rSHL, 32
-	slw	rH, rWORD6, rSHL
+	slw	rWORD6_SHIFT, rWORD6, rSHL
 	beq	L(duP4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 	bgt	cr1, L(duP3)
 	beq	cr1, L(duP2)
 
 /* Remainder is 4 */
-	.align 4
+	.align	4
 L(duP1):
-	srw	rG, rWORD8, rSHR
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
 	lwz	rWORD7, 0(rSTR1)
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+#endif
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP1x)
 L(duP1e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
-	srw	rA, rWORD2, rSHR
-	slw	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 8(rSTR1)
 	lwz	rWORD4, 8(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
-	srw	rC, rWORD4, rSHR
-	slw	rF, rWORD4, rSHL
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
 	bne	cr5, L(duLcr5)
-	or	rWORD4, rC, rD
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 12(rSTR1)
 	lwz	rWORD6, 12(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
-	srw	rE, rWORD6, rSHR
-	slw	rH, rWORD6, rSHL
-	bne	cr0, L(duLcr0)
-	or	rWORD6, rE, rF
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
 	cmplw	cr6, rWORD5, rWORD6
 	b	L(duLoop3)
-	.align 4
+	.align	4
 /* At this point we exit early with the first word compare
    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
@@ -698,186 +949,321 @@ L(duP1x):
 	bne	cr5, L(duLcr5)
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
-	ld	rWORD2, 8(rSTR2)
-	srw	rA, rWORD2, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(duP2):
-	srw	rE, rWORD8, rSHR
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
 	lwz	rWORD5, 0(rSTR1)
-	or	rWORD6, rE, rH
-	slw	rH, rWORD8, rSHL
+#endif
+	or	rWORD6, r0, rWORD6_SHIFT
+	slw	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 4(rSTR1)
 	lwz	rWORD8, 4(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
-	srw	rG, rWORD8, rSHR
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 8(rSTR1)
 	lwz	rWORD2, 8(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
-	srw	rA, rWORD2, rSHR
-	slw	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 12(rSTR1)
 	lwz	rWORD4, 12(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	bne	cr5, L(duLcr5)
-	srw	rC, rWORD4, rSHR
-	slw	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 4
 	addi	rSTR2, rSTR2, 4
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
-	.align 4
+	.align	4
 L(duP2x):
 	cmplw	cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 4
 	addi	rSTR2, rSTR2, 4
+#endif
 	bne	cr6, L(duLcr6)
 	slwi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD2, 4(rSTR2)
-	srw	rA, rWORD2, rSHR
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Remainder is 12 */
-	.align 4
+	.align	4
 L(duP3):
-	srw	rC, rWORD8, rSHR
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
 	lwz	rWORD3, 0(rSTR1)
-	slw	rF, rWORD8, rSHL
-	or	rWORD4, rC, rH
+#endif
+	slw	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 4(rSTR1)
 	lwz	rWORD6, 4(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
-	srw	rE, rWORD6, rSHR
-	slw	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD7, 8(rSTR1)
 	lwz	rWORD8, 8(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
-	srw	rG, rWORD8, rSHR
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 12(rSTR1)
 	lwz	rWORD2, 12(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
-	srw	rA, rWORD2, rSHR
-	slw	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
-	.align 4
+	.align	4
 L(duP3x):
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
+#if 0
+/* Disabled as redundant: cr1 has already been tested and branched on.  */
 	bne	cr1, L(duLcr1)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	slwi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD2, 4(rSTR2)
-	srw	rA, rWORD2, rSHR
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Count is a multiple of 16, remainder is 0 */
-	.align 4
+	.align	4
 L(duP4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	srw	rA, rWORD8, rSHR
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
 	lwz	rWORD1, 0(rSTR1)
-	slw	rD, rWORD8, rSHL
-	or	rWORD2, rA, rH
+#endif
+	slw	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 4(rSTR1)
 	lwz	rWORD4, 4(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
-	srw	rC, rWORD4, rSHR
-	slw	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 8(rSTR1)
 	lwz	rWORD6, 8(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
-	bne	cr0, L(duLcr0)
-	srw	rE, rWORD6, rSHR
-	slw	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 12(rSTR1)
 	lwzu	rWORD8, 12(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
-	srw	rG, rWORD8, rSHR
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	cmplw	cr5, rWORD7, rWORD8
 	bdz-	L(du24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
-	.align 4
+	.align	4
 L(duLoop):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD1, 4(rSTR1)
 	lwz	rWORD2, 4(rSTR2)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
-	srw	rA, rWORD2, rSHR
-	slw	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD3, 8(rSTR1)
 	lwz	rWORD4, 8(rSTR2)
+#endif
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
-	srw	rC, rWORD4, rSHR
-	slw	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD5, 12(rSTR1)
 	lwz	rWORD6, 12(rSTR2)
+#endif
 	cmplw	cr5, rWORD7, rWORD8
-	bne	cr0, L(duLcr0)
-	srw	rE, rWORD6, rSHR
-	slw	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
 	lwzu	rWORD7, 16(rSTR1)
 	lwzu	rWORD8, 16(rSTR2)
-	cmplw	cr0, rWORD1, rWORD2
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	bne-	cr1, L(duLcr1)
-	srw	rG, rWORD8, rSHR
-	slw	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	bdnz+	L(duLoop)
 
 L(duL4):
+#if 0
+/* Disabled as redundant: cr1 has already been tested and branched on.  */
 	bne	cr1, L(duLcr1)
+#endif
 	cmplw	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
 	cmplw	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
 	cmplw	cr5, rWORD7, rWORD8
 L(du44):
-	bne	cr0, L(duLcr0)
+	bne	cr7, L(duLcr7)
 L(du34):
 	bne	cr1, L(duLcr1)
 L(du24):
@@ -887,95 +1273,101 @@ L(du14):
 	bne	cr5, L(duLcr5)
 /* At this point we have a remainder of 1 to 3 bytes to compare.  We use
    shift right to eliminate bits beyond the compare length.
+   This allows the use of word subtract to compute the final result.
 
    However it may not be safe to load rWORD2 which may be beyond the
    string length. So we compare the bit length of the remainder to
    the right shift count (rSHR). If the bit count is less than or equal
    we do not need to load rWORD2 (all significant bits are already in
-   rB).  */
+   rWORD8_SHIFT).  */
 	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
 	lwz	rWORD2, 4(rSTR2)
-	srw	rA, rWORD2, rSHR
-	.align 4
+#endif
+	srw	r0, rWORD2, rSHR
+	.align	4
 L(dutrim):
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+#else
 	lwz	rWORD1, 4(rSTR1)
-        lwz     r31,48(1)
+#endif
+	lwz	rWORD8, 48(r1)
 	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
-	or	rWORD2, rA, rB
-        lwz     r30,44(1)
-        lwz     r29,40(r1)
+	or	rWORD2, r0, rWORD8_SHIFT
+	lwz	rWORD7, 44(r1)
+	lwz	rSHL, 40(r1)
 	srw	rWORD1, rWORD1, rN
 	srw	rWORD2, rWORD2, rN
-        lwz     r28,36(r1)
-        lwz     r27,32(r1)
-        cmplw   rWORD1,rWORD2
-        li      rRTN,0
-        beq     L(dureturn26)
-        li      rRTN,1
-        bgt     L(dureturn26)
-        li      rRTN,-1
-	b    L(dureturn26)
-	.align 4
-L(duLcr0):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rSHR, 36(r1)
+	lwz	rWORD8_SHIFT, 32(r1)
+	sub	rRTN, rWORD1, rWORD2
+	b	L(dureturn26)
+	.align	4
+L(duLcr7):
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 	li	rRTN, 1
-	bgt	cr0, L(dureturn29)
-	lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	bgt	cr7, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr1):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 	li	rRTN, 1
 	bgt	cr1, L(dureturn29)
-        lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr6):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 	li	rRTN, 1
 	bgt	cr6, L(dureturn29)
-        lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr5):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 	li	rRTN, 1
 	bgt	cr5, L(dureturn29)
-        lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	3
 L(duZeroReturn):
-	li	rRTN,0
+	li	rRTN, 0
 	.align	4
 L(dureturn):
-        lwz     r31,48(1)
-        lwz     r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 L(dureturn29):
-        lwz     r29,40(r1)
-        lwz     r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 L(dureturn27):
-        lwz     r27,32(r1)
+	lwz	rWORD8_SHIFT, 32(r1)
 L(dureturn26):
-        lwz     r26,28(r1)
+	lwz	rWORD2_SHIFT, 28(r1)
 L(dureturn25):
-        lwz     r25,24(r1)
-        lwz     r24,20(r1)
-        lwz     1,0(1)
+	lwz	rWORD4_SHIFT, 24(r1)
+	lwz	rWORD6_SHIFT, 20(r1)
+	addi	1, 1, 64
+	cfi_adjust_cfa_offset(-64)
 	blr
 END (memcmp)
 
diff --git a/sysdeps/powerpc/powerpc32/power7/memcmp.S b/sysdeps/powerpc/powerpc32/power7/memcmp.S
index f764b7c..ea001da 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcmp.S
@@ -23,10 +23,9 @@
 		    size_t size [r5])  */
 
 	.machine power7
-EALIGN (memcmp,4,0)
+EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT
 
-#define rTMP	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -37,35 +36,32 @@ EALIGN (memcmp,4,0)
 #define rWORD4	r9	/* next word in s2 */
 #define rWORD5	r10	/* next word in s1 */
 #define rWORD6	r11	/* next word in s2 */
-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
 #define rWORD7	r30	/* next word in s1 */
 #define rWORD8	r31	/* next word in s2 */
 
-	xor	rTMP,rSTR2,rSTR1
-	cmplwi	cr6,rN,0
-	cmplwi	cr1,rN,12
-	clrlwi.	rTMP,rTMP,30
-	clrlwi	rBITDIF,rSTR1,30
-	cmplwi	cr5,rBITDIF,0
-	beq-	cr6,L(zeroLength)
-	dcbt	0,rSTR1
-	dcbt	0,rSTR2
-
-	/* If less than 8 bytes or not aligned, use the unaligned
-	   byte loop.  */
-
-	blt	cr1,L(bytealigned)
-	stwu	1,-64(1)
+	xor	r0, rSTR2, rSTR1
+	cmplwi	cr6, rN, 0
+	cmplwi	cr1, rN, 12
+	clrlwi.	r0, r0, 30
+	clrlwi	r12, rSTR1, 30
+	cmplwi	cr5, r12, 0
+	beq-	cr6, L(zeroLength)
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
+/* If less than 12 bytes (cr1 holds the compare of rN with 12), use
+   the byte loop.  */
+	blt	cr1, L(bytealigned)
+	stwu	1, -64(r1)
 	cfi_adjust_cfa_offset(64)
-	stw	r31,48(1)
-	cfi_offset(31,(48-64))
-	stw	r30,44(1)
-	cfi_offset(30,(44-64))
+	stw	rWORD8, 48(r1)
+	cfi_offset(rWORD8, (48-64))
+	stw	rWORD7, 44(r1)
+	cfi_offset(rWORD7, (44-64))
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    2 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then we are already word
+   of r12 to 0.  If r12 == 0 then we are already word
    aligned and can perform the word aligned loop.
 
    Otherwise we know the two strings have the same alignment (but not
@@ -74,332 +70,541 @@ EALIGN (memcmp,4,0)
    eliminate bits preceding the first byte.  Since we want to join the
    normal (word aligned) compare loop, starting at the second word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first word. This insures that the loop count is
+   versioning for the first word. This ensures that the loop count is
    correct and the first word (shifted) is in the expected register pair. */
 	.align	4
 L(samealignment):
-	clrrwi	rSTR1,rSTR1,2
-	clrrwi	rSTR2,rSTR2,2
-	beq	cr5,L(Waligned)
-	add	rN,rN,rBITDIF
-	slwi	r11,rBITDIF,3
-	srwi	rTMP,rN,4	/* Divide by 16 */
-	andi.	rBITDIF,rN,12	/* Get the word remainder */
-	lwz	rWORD1,0(rSTR1)
-	lwz	rWORD2,0(rSTR2)
-	cmplwi	cr1,rBITDIF,8
-	cmplwi	cr7,rN,16
-	clrlwi	rN,rN,30
+	clrrwi	rSTR1, rSTR1, 2
+	clrrwi	rSTR2, rSTR2, 2
+	beq	cr5, L(Waligned)
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the word remainder */
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+	lwz	rWORD2, 0(rSTR2)
+#endif
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	clrlwi	rN, rN, 30
 	beq	L(dPs4)
-	mtctr	rTMP
-	bgt	cr1,L(dPs3)
-	beq	cr1,L(dPs2)
+	mtctr	r0
+	bgt	cr1, L(dPs3)
+	beq	cr1, L(dPs2)
 
 /* Remainder is 4 */
 	.align	3
 L(dsP1):
-	slw	rWORD5,rWORD1,r11
-	slw	rWORD6,rWORD2,r11
-	cmplw	cr5,rWORD5,rWORD6
-	blt	cr7,L(dP1x)
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
+	cmplw	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 8 */
 	.align	4
 L(dPs2):
-	slw	rWORD5,rWORD1,r11
-	slw	rWORD6,rWORD2,r11
-	cmplw	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP2x)
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD2, rWORD6
+	cmplw	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-	lwz	rWORD7,4(rSTR1)
-	lwz	rWORD8,4(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 4(rSTR1)
+	lwz	rWORD8, 4(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 12 */
 	.align	4
 L(dPs3):
-	slw	rWORD3,rWORD1,r11
-	slw	rWORD4,rWORD2,r11
-	cmplw	cr1,rWORD3,rWORD4
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD2, rWORD6
+	cmplw	cr1, rWORD3, rWORD4
 	b	L(dP3e)
 /* Count is a multiple of 16, remainder is 0 */
 	.align	4
 L(dPs4):
-	mtctr	rTMP
-	slw	rWORD1,rWORD1,r11
-	slw	rWORD2,rWORD2,r11
-	cmplw	cr0,rWORD1,rWORD2
+	mtctr	r0
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD2, rWORD6
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(dP4e)
 
 /* At this point we know both strings are word aligned and the
    compare length is at least 8 bytes.  */
 	.align	4
 L(Waligned):
-	andi.	rBITDIF,rN,12	/* Get the word remainder */
-	srwi	rTMP,rN,4	/* Divide by 16 */
-	cmplwi	cr1,rBITDIF,8
-	cmplwi	cr7,rN,16
-	clrlwi	rN,rN,30
+	andi.	r12, rN, 12	/* Get the word remainder */
+	srwi	r0, rN, 4	/* Divide by 16 */
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	clrlwi	rN, rN, 30
 	beq	L(dP4)
-	bgt	cr1,L(dP3)
-	beq	cr1,L(dP2)
+	bgt	cr1, L(dP3)
+	beq	cr1, L(dP2)
 
 /* Remainder is 4 */
 	.align	4
 L(dP1):
-	mtctr	rTMP
+	mtctr	r0
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
    (8-15 byte compare), we want to use only volatile registers.  This
    means we can avoid restoring non-volatile registers since we did not
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
-	lwz	rWORD5,0(rSTR1)
-	lwz	rWORD6,0(rSTR2)
-	cmplw	cr5,rWORD5,rWORD6
-	blt	cr7,L(dP1x)
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 0(rSTR1)
+	lwz	rWORD6, 0(rSTR2)
+#endif
+	cmplw	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 L(dP1e):
-	lwz	rWORD3,8(rSTR1)
-	lwz	rWORD4,8(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	lwz	rWORD5,12(rSTR1)
-	lwz	rWORD6,12(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
-	bne	cr0,L(dLcr0)
-
-	lwzu	rWORD7,16(rSTR1)
-	lwzu	rWORD8,16(rSTR2)
-	bne	cr1,L(dLcr1)
-	cmplw	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
+
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 16(rSTR1)
+	lwzu	rWORD8, 16(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmplw	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
-	bne	cr6,L(dLcr6)
-	lwz	r30,44(1)
-	lwz	r31,48(1)
+	bne	cr6, L(dLcr6)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
 	.align	3
 L(dP1x):
-	slwi.	r12,rN,3
-	bne	cr5,L(dLcr5)
-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
-	lwz	1,0(1)
+	slwi.	r12, rN, 3
+	bne	cr5, L(dLcr5x)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Remainder is 8 */
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP2):
-	mtctr	rTMP
-	lwz	rWORD5,0(rSTR1)
-	lwz	rWORD6,0(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP2x)
-	lwz	rWORD7,4(rSTR1)
-	lwz	rWORD8,4(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 0(rSTR1)
+	lwz	rWORD6, 0(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 4(rSTR1)
+	lwz	rWORD8, 4(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
 L(dP2e):
-	lwz	rWORD1,8(rSTR1)
-	lwz	rWORD2,8(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	lwz	rWORD3,12(rSTR1)
-	lwz	rWORD4,12(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	addi	rSTR1,rSTR1,4
-	addi	rSTR2,rSTR2,4
-	bne	cr6,L(dLcr6)
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 8(rSTR1)
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 12(rSTR1)
+	lwz	rWORD4, 12(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	bne	cr6, L(dLcr6)
+	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
 /* Again we are on a early exit path (16-23 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
 	.align	4
 L(dP2x):
-	lwz	rWORD3,4(rSTR1)
-	lwz	rWORD4,4(rSTR2)
-	cmplw	cr5,rWORD3,rWORD4
-	slwi.	r12,rN,3
-	bne	cr6,L(dLcr6)
-	addi	rSTR1,rSTR1,4
-	addi	rSTR2,rSTR2,4
-	bne	cr5,L(dLcr5)
-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
-	lwz	1,0(1)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 4(rSTR1)
+	lwz	rWORD4, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	slwi.	r12, rN, 3
+	bne	cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	bne	cr1, L(dLcr1x)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Remainder is 12 */
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP3):
-	mtctr	rTMP
-	lwz	rWORD3,0(rSTR1)
-	lwz	rWORD4,0(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 0(rSTR1)
+	lwz	rWORD4, 0(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
 L(dP3e):
-	lwz	rWORD5,4(rSTR1)
-	lwz	rWORD6,4(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP3x)
-	lwz	rWORD7,8(rSTR1)
-	lwz	rWORD8,8(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	lwz	rWORD1,12(rSTR1)
-	lwz	rWORD2,12(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr1,L(dLcr1)
-	bne	cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 4(rSTR1)
+	lwz	rWORD6, 4(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 8(rSTR1)
+	lwz	rWORD8, 8(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 12(rSTR1)
+	lwz	rWORD2, 12(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr1, L(dLcr1)
+	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
 /* Again we are on a early exit path (24-31 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
 	.align	4
 L(dP3x):
-	lwz	rWORD1,8(rSTR1)
-	lwz	rWORD2,8(rSTR2)
-	cmplw	cr5,rWORD1,rWORD2
-	slwi.	r12,rN,3
-	bne	cr1,L(dLcr1)
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr6,L(dLcr6)
-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
-	bne	cr5,L(dLcr5)
-	lwz	1,0(1)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 8(rSTR1)
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	slwi.	r12, rN, 3
+	bne	cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr6, L(dLcr6x)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
+	bne	cr7, L(dLcr7x)
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Count is a multiple of 16, remainder is 0 */
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dP4):
-	mtctr	rTMP
-	lwz	rWORD1,0(rSTR1)
-	lwz	rWORD2,0(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+	lwz	rWORD2, 0(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 L(dP4e):
-	lwz	rWORD3,4(rSTR1)
-	lwz	rWORD4,4(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	lwz	rWORD5,8(rSTR1)
-	lwz	rWORD6,8(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	lwzu	rWORD7,12(rSTR1)
-	lwzu	rWORD8,12(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr0,L(dLcr0)
-	bne	cr1,L(dLcr1)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 4(rSTR1)
+	lwz	rWORD4, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 8(rSTR1)
+	lwz	rWORD6, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 12(rSTR1)
+	lwzu	rWORD8, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+	bne	cr1, L(dLcr1)
 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
 	.align	4
 L(dLoop):
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
 L(dLoop1):
-	lwz	rWORD3,8(rSTR1)
-	lwz	rWORD4,8(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
 L(dLoop2):
-	lwz	rWORD5,12(rSTR1)
-	lwz	rWORD6,12(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr0,L(dLcr0)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
 L(dLoop3):
-	lwzu	rWORD7,16(rSTR1)
-	lwzu	rWORD8,16(rSTR2)
-	bne	cr1,L(dLcr1)
-	cmplw	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 16(rSTR1)
+	lwzu	rWORD8, 16(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmplw	cr7, rWORD1, rWORD2
 	bdnz	L(dLoop)
 
 L(dL4):
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr6,L(dLcr6)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
-	cmplw	cr5,rWORD7,rWORD8
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+	cmplw	cr5, rWORD7, rWORD8
 L(d44):
-	bne	cr0,L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(d34):
-	bne	cr1,L(dLcr1)
+	bne	cr1, L(dLcr1)
 L(d24):
-	bne	cr6,L(dLcr6)
+	bne	cr6, L(dLcr6)
 L(d14):
-	slwi.	r12,rN,3
-	bne	cr5,L(dLcr5)
+	slwi.	r12, rN, 3
+	bne	cr5, L(dLcr5)
 L(d04):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
-	lwz	1,0(1)
-	subfic	rN,r12,32	/* Shift count is 32 - (rN * 8).  */
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
 	beq	L(zeroLength)
 /* At this point we have a remainder of 1 to 3 bytes to compare.  Since
    we are aligned it is safe to load the whole word, and use
-   shift right to eliminate bits beyond the compare length. */
+   shift right to eliminate bits beyond the compare length.  */
 L(d00):
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	srw	rWORD1,rWORD1,rN
-	srw	rWORD2,rWORD2,rN
-	cmplw	rWORD1,rWORD2
-	li	rRTN,0
-	beqlr
-	li	rRTN,1
-	bgtlr
-	li	rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	rWORD1, rWORD1, rN
+	srw	rWORD2, rWORD2, rN
+	sub	rRTN, rWORD1, rWORD2
 	blr
 
 	.align	4
-L(dLcr0):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
-	li	rRTN,1
-	lwz	1,0(1)
-	bgtlr	cr0
-	li	rRTN,-1
+	cfi_adjust_cfa_offset(64)
+L(dLcr7):
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr7x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr1):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
-	li	rRTN,1
-	lwz	1,0(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr1x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr1
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr6):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
-	li	rRTN,1
-	lwz	1,0(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
+L(dLcr6x):
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr6
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 	.align	4
+	cfi_adjust_cfa_offset(64)
 L(dLcr5):
-	lwz	r30,44(1)
-	lwz	r31,48(1)
+	lwz	rWORD7, 44(r1)
+	lwz	rWORD8, 48(r1)
 L(dLcr5x):
-	li	rRTN,1
-	lwz	1,0(1)
+	li	rRTN, 1
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	bgtlr	cr5
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 
 	.align	4
 L(bytealigned):
-	cfi_adjust_cfa_offset(-64)
 	mtctr	rN
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
@@ -411,38 +616,39 @@ L(bytealigned):
 
    So we must precondition some registers and condition codes so that
    we don't exit the loop early on the first iteration.  */
-	lbz	rWORD1,0(rSTR1)
-	lbz	rWORD2,0(rSTR2)
+
+	lbz	rWORD1, 0(rSTR1)
+	lbz	rWORD2, 0(rSTR2)
 	bdz	L(b11)
-	cmplw	cr0,rWORD1,rWORD2
-	lbz	rWORD3,1(rSTR1)
-	lbz	rWORD4,1(rSTR2)
+	cmplw	cr7, rWORD1, rWORD2
+	lbz	rWORD3, 1(rSTR1)
+	lbz	rWORD4, 1(rSTR2)
 	bdz	L(b12)
-	cmplw	cr1,rWORD3,rWORD4
-	lbzu	rWORD5,2(rSTR1)
-	lbzu	rWORD6,2(rSTR2)
+	cmplw	cr1, rWORD3, rWORD4
+	lbzu	rWORD5, 2(rSTR1)
+	lbzu	rWORD6, 2(rSTR2)
 	bdz	L(b13)
 	.align	4
 L(bLoop):
-	lbzu	rWORD1,1(rSTR1)
-	lbzu	rWORD2,1(rSTR2)
-	bne	cr0,L(bLcr0)
+	lbzu	rWORD1, 1(rSTR1)
+	lbzu	rWORD2, 1(rSTR2)
+	bne	cr7, L(bLcr7)
 
-	cmplw	cr6,rWORD5,rWORD6
+	cmplw	cr6, rWORD5, rWORD6
 	bdz	L(b3i)
 
-	lbzu	rWORD3,1(rSTR1)
-	lbzu	rWORD4,1(rSTR2)
-	bne	cr1,L(bLcr1)
+	lbzu	rWORD3, 1(rSTR1)
+	lbzu	rWORD4, 1(rSTR2)
+	bne	cr1, L(bLcr1)
 
-	cmplw	cr0,rWORD1,rWORD2
+	cmplw	cr7, rWORD1, rWORD2
 	bdz	L(b2i)
 
-	lbzu	rWORD5,1(rSTR1)
-	lbzu	rWORD6,1(rSTR2)
-	bne	cr6,L(bLcr6)
+	lbzu	rWORD5, 1(rSTR1)
+	lbzu	rWORD6, 1(rSTR2)
+	bne	cr6, L(bLcr6)
 
-	cmplw	cr1,rWORD3,rWORD4
+	cmplw	cr1, rWORD3, rWORD4
 	bdnz	L(bLoop)
 
 /* We speculatively loading bytes before we have tested the previous
@@ -452,67 +658,62 @@ L(bLoop):
    tested.  In this case we must complete the pending operations
    before returning.  */
 L(b1i):
-	bne	cr0,L(bLcr0)
-	bne	cr1,L(bLcr1)
+	bne	cr7, L(bLcr7)
+	bne	cr1, L(bLcr1)
 	b	L(bx56)
 	.align	4
 L(b2i):
-	bne	cr6,L(bLcr6)
-	bne	cr0,L(bLcr0)
+	bne	cr6, L(bLcr6)
+	bne	cr7, L(bLcr7)
 	b	L(bx34)
 	.align	4
 L(b3i):
-	bne	cr1,L(bLcr1)
-	bne	cr6,L(bLcr6)
+	bne	cr1, L(bLcr1)
+	bne	cr6, L(bLcr6)
 	b	L(bx12)
 	.align	4
-L(bLcr0):
-	li	rRTN,1
-	bgtlr	cr0
-	li	rRTN,-1
+L(bLcr7):
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 L(bLcr1):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr1
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 L(bLcr6):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr6
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 
 L(b13):
-	bne	cr0,L(bx12)
-	bne	cr1,L(bx34)
+	bne	cr7, L(bx12)
+	bne	cr1, L(bx34)
 L(bx56):
-	sub	rRTN,rWORD5,rWORD6
+	sub	rRTN, rWORD5, rWORD6
 	blr
 	nop
 L(b12):
-	bne	cr0,L(bx12)
+	bne	cr7, L(bx12)
 L(bx34):
-	sub	rRTN,rWORD3,rWORD4
+	sub	rRTN, rWORD3, rWORD4
 	blr
-
 L(b11):
 L(bx12):
-	sub	rRTN,rWORD1,rWORD2
+	sub	rRTN, rWORD1, rWORD2
 	blr
-
 	.align	4
-L(zeroLengthReturn):
-
 L(zeroLength):
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
-	cfi_adjust_cfa_offset(64)
 	.align	4
 /* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    2 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is word aligned and can
+   of r12 to 0.  If r12 == 0 then rSTR1 is word aligned and can
    perform the Wunaligned loop.
 
    Otherwise we know that rSTR1 is not aready word aligned yet.
@@ -521,465 +722,654 @@ L(zeroLength):
    eliminate bits preceding the first byte.  Since we want to join the
    normal (Wualigned) compare loop, starting at the second word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first W. This insures that the loop count is
+   versioning for the first W. This ensures that the loop count is
    correct and the first W (shifted) is in the expected resister pair.  */
 #define rSHL		r29	/* Unaligned shift left count.  */
 #define rSHR		r28	/* Unaligned shift right count.  */
-#define rB		r27	/* Left rotation temp for rWORD2.  */
-#define rD		r26	/* Left rotation temp for rWORD4.  */
-#define rF		r25	/* Left rotation temp for rWORD6.  */
-#define rH		r24	/* Left rotation temp for rWORD8.  */
-#define rA		r0	/* Right rotation temp for rWORD2.  */
-#define rC		r12	/* Right rotation temp for rWORD4.  */
-#define rE		r0	/* Right rotation temp for rWORD6.  */
-#define rG		r12	/* Right rotation temp for rWORD8.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
+	cfi_adjust_cfa_offset(64)
 L(unaligned):
-	stw	r29,40(r1)
-	cfi_offset(r29,(40-64))
-	clrlwi	rSHL,rSTR2,30
-	stw	r28,36(r1)
-	cfi_offset(r28,(36-64))
-	beq	cr5,L(Wunaligned)
-	stw	r27,32(r1)
-	cfi_offset(r27,(32-64))
+	stw	rSHL, 40(r1)
+	cfi_offset(rSHL, (40-64))
+	clrlwi	rSHL, rSTR2, 30
+	stw	rSHR, 36(r1)
+	cfi_offset(rSHR, (36-64))
+	beq	cr5, L(Wunaligned)
+	stw	rWORD8_SHIFT, 32(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
 /* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 W.  */
-	sub	r27,rSTR2,rBITDIF
+	sub	rWORD8_SHIFT, rSTR2, r12
 /* But do not attempt to address the W before that W that contains
    the actual start of rSTR2.  */
-	clrrwi	rSTR2,rSTR2,2
-	stw	r26,28(r1)
-	cfi_offset(r26,(28-64))
-/* Compute the left/right shift counts for the unalign rSTR2,
+	clrrwi	rSTR2, rSTR2, 2
+	stw	rWORD2_SHIFT, 28(r1)
+	cfi_offset(rWORD2_SHIFT, (28-64))
+/* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (W aligned) start of rSTR1.  */
-	clrlwi	rSHL,r27,30
-	clrrwi	rSTR1,rSTR1,2
-	stw	r25,24(r1)
-	cfi_offset(r25,(24-64))
-	slwi	rSHL,rSHL,3
-	cmplw	cr5,r27,rSTR2
-	add	rN,rN,rBITDIF
-	slwi	r11,rBITDIF,3
-	stw	r24,20(r1)
-	cfi_offset(r24,(20-64))
-	subfic	rSHR,rSHL,32
-	srwi	rTMP,rN,4	/* Divide by 16 */
-	andi.	rBITDIF,rN,12	/* Get the W remainder */
+	clrlwi	rSHL, rWORD8_SHIFT, 30
+	clrrwi	rSTR1, rSTR1, 2
+	stw	rWORD4_SHIFT, 24(r1)
+	cfi_offset(rWORD4_SHIFT, (24-64))
+	slwi	rSHL, rSHL, 3
+	cmplw	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	slwi	rWORD6, r12, 3
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD6_SHIFT, (20-64))
+	subfic	rSHR, rSHL, 32
+	srwi	r0, rN, 4	/* Divide by 16 */
+	andi.	r12, rN, 12	/* Get the W remainder */
 /* We normally need to load 2 Ws to start the unaligned rSTR2, but in
    this special case those bits may be discarded anyway.  Also we
    must avoid loading a W where none of the bits are part of rSTR2 as
    this may cross a page boundary and cause a page fault.  */
-	li	rWORD8,0
-	blt	cr5,L(dus0)
-	lwz	rWORD8,0(rSTR2)
-	la	rSTR2,4(rSTR2)
-	slw	rWORD8,rWORD8,rSHL
+	li	rWORD8, 0
+	blt	cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD8, 0(rSTR2)
+	addi	rSTR2, rSTR2, 4
+#endif
+	slw	rWORD8, rWORD8, rSHL
 
 L(dus0):
-	lwz	rWORD1,0(rSTR1)
-	lwz	rWORD2,0(rSTR2)
-	cmplwi	cr1,rBITDIF,8
-	cmplwi	cr7,rN,16
-	srw	rG,rWORD2,rSHR
-	clrlwi	rN,rN,30
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+	lwz	rWORD2, 0(rSTR2)
+#endif
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	srw	r12, rWORD2, rSHR
+	clrlwi	rN, rN, 30
 	beq	L(duPs4)
-	mtctr	rTMP
-	or	rWORD8,rG,rWORD8
-	bgt	cr1,L(duPs3)
-	beq	cr1,L(duPs2)
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	bgt	cr1, L(duPs3)
+	beq	cr1, L(duPs2)
 
 /* Remainder is 4 */
 	.align	4
 L(dusP1):
-	slw	rB,rWORD2,rSHL
-	slw	rWORD7,rWORD1,r11
-	slw	rWORD8,rWORD8,r11
-	bge	cr7,L(duP1e)
+	slw	rWORD8_SHIFT, rWORD2, rSHL
+	slw	rWORD7, rWORD1, rWORD6
+	slw	rWORD8, rWORD8, rWORD6
+	bge	cr7, L(duP1e)
 /* At this point we exit early with the first word compare
    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
-	cmplw	cr5,rWORD7,rWORD8
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmplw	cr7,rN,rSHR
+	cmplw	cr5, rWORD7, rWORD8
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	lwz	rWORD2,4(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 8 */
 	.align	4
 L(duPs2):
-	slw	rH,rWORD2,rSHL
-	slw	rWORD5,rWORD1,r11
-	slw	rWORD6,rWORD8,r11
+	slw	rWORD6_SHIFT, rWORD2, rSHL
+	slw	rWORD5, rWORD1, rWORD6
+	slw	rWORD6, rWORD8, rWORD6
 	b	L(duP2e)
 /* Remainder is 12 */
 	.align	4
 L(duPs3):
-	slw	rF,rWORD2,rSHL
-	slw	rWORD3,rWORD1,r11
-	slw	rWORD4,rWORD8,r11
+	slw	rWORD4_SHIFT, rWORD2, rSHL
+	slw	rWORD3, rWORD1, rWORD6
+	slw	rWORD4, rWORD8, rWORD6
 	b	L(duP3e)
 /* Count is a multiple of 16, remainder is 0 */
 	.align	4
 L(duPs4):
-	mtctr	rTMP
-	or	rWORD8,rG,rWORD8
-	slw	rD,rWORD2,rSHL
-	slw	rWORD1,rWORD1,r11
-	slw	rWORD2,rWORD8,r11
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	slw	rWORD1, rWORD1, rWORD6
+	slw	rWORD2, rWORD8, rWORD6
 	b	L(duP4e)
 
 /* At this point we know rSTR1 is word aligned and the
    compare length is at least 8 bytes.  */
 	.align	4
 L(Wunaligned):
-	stw	r27,32(r1)
-	cfi_offset(r27,(32-64))
-	clrrwi	rSTR2,rSTR2,2
-	stw	r26,28(r1)
-	cfi_offset(r26,(28-64))
-	srwi	rTMP,rN,4	/* Divide by 16 */
-	stw	r25,24(r1)
-	cfi_offset(r25,(24-64))
-	andi.	rBITDIF,rN,12	/* Get the W remainder */
-	stw	r24,20(r1)
-	cfi_offset(r24,(24-64))
-	slwi	rSHL,rSHL,3
-	lwz	rWORD6,0(rSTR2)
-	lwzu	rWORD8,4(rSTR2)
-	cmplwi	cr1,rBITDIF,8
-	cmplwi	cr7,rN,16
-	clrlwi	rN,rN,30
-	subfic	rSHR,rSHL,32
-	slw	rH,rWORD6,rSHL
+	stw	rWORD8_SHIFT, 32(r1)
+	cfi_offset(rWORD8_SHIFT, (32-64))
+	clrrwi	rSTR2, rSTR2, 2
+	stw	rWORD2_SHIFT, 28(r1)
+	cfi_offset(rWORD2_SHIFT, (28-64))
+	srwi	r0, rN, 4	/* Divide by 16 */
+	stw	rWORD4_SHIFT, 24(r1)
+	cfi_offset(rWORD4_SHIFT, (24-64))
+	andi.	r12, rN, 12	/* Get the W remainder */
+	stw	rWORD6_SHIFT, 20(r1)
+	cfi_offset(rWORD6_SHIFT, (20-64))
+	slwi	rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD6, 0(rSTR2)
+	lwzu	rWORD8, 4(rSTR2)
+#endif
+	cmplwi	cr1, r12, 8
+	cmplwi	cr7, rN, 16
+	clrlwi	rN, rN, 30
+	subfic	rSHR, rSHL, 32
+	slw	rWORD6_SHIFT, rWORD6, rSHL
 	beq	L(duP4)
-	mtctr	rTMP
-	bgt	cr1,L(duP3)
-	beq	cr1,L(duP2)
+	mtctr	r0
+	bgt	cr1, L(duP3)
+	beq	cr1, L(duP2)
 
 /* Remainder is 4 */
 	.align	4
 L(duP1):
-	srw	rG,rWORD8,rSHR
-	lwz	rWORD7,0(rSTR1)
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP1x)
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD7, 0(rSTR1)
+#endif
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP1x)
 L(duP1e):
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	srw	rA,rWORD2,rSHR
-	slw	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	lwz	rWORD3,8(rSTR1)
-	lwz	rWORD4,8(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	srw	rC,rWORD4,rSHR
-	slw	rF,rWORD4,rSHL
-	bne	cr5,L(duLcr5)
-	or	rWORD4,rC,rD
-	lwz	rWORD5,12(rSTR1)
-	lwz	rWORD6,12(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	srw	rE,rWORD6,rSHR
-	slw	rH,rWORD6,rSHL
-	bne	cr0,L(duLcr0)
-	or	rWORD6,rE,rF
-	cmplw	cr6,rWORD5,rWORD6
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	bne	cr5, L(duLcr5)
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
+	cmplw	cr6, rWORD5, rWORD6
 	b	L(duLoop3)
 	.align	4
 /* At this point we exit early with the first word compare
    complete and remainder of 0 to 3 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
 L(duP1x):
-	cmplw	cr5,rWORD7,rWORD8
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmplw	cr7,rN,rSHR
+	cmplw	cr5, rWORD7, rWORD8
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 8 */
 	.align	4
 L(duP2):
-	srw	rE,rWORD8,rSHR
-	lwz	rWORD5,0(rSTR1)
-	or	rWORD6,rE,rH
-	slw	rH,rWORD8,rSHL
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD5, 0(rSTR1)
+#endif
+	or	rWORD6, r0, rWORD6_SHIFT
+	slw	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
-	lwz	rWORD7,4(rSTR1)
-	lwz	rWORD8,4(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	srw	rG,rWORD8,rSHR
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP2x)
-	lwz	rWORD1,8(rSTR1)
-	lwz	rWORD2,8(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	srw	rA,rWORD2,rSHR
-	slw	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	lwz	rWORD3,12(rSTR1)
-	lwz	rWORD4,12(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	bne	cr5,L(duLcr5)
-	srw	rC,rWORD4,rSHR
-	slw	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
-	addi	rSTR1,rSTR1,4
-	addi	rSTR2,rSTR2,4
-	cmplw	cr1,rWORD3,rWORD4
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 4(rSTR1)
+	lwz	rWORD8, 4(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 8(rSTR1)
+	lwz	rWORD2, 8(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 12(rSTR1)
+	lwz	rWORD4, 12(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	bne	cr5, L(duLcr5)
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	cmplw	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
 	.align	4
 L(duP2x):
-	cmplw	cr5,rWORD7,rWORD8
-	addi	rSTR1,rSTR1,4
-	addi	rSTR2,rSTR2,4
-	bne	cr6,L(duLcr6)
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmplw	cr7,rN,rSHR
+	cmplw	cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#endif
+	bne	cr6, L(duLcr6)
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	lwz	rWORD2,4(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Remainder is 12 */
 	.align	4
 L(duP3):
-	srw	rC,rWORD8,rSHR
-	lwz	rWORD3,0(rSTR1)
-	slw	rF,rWORD8,rSHL
-	or	rWORD4,rC,rH
+	srw	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD3, 0(rSTR1)
+#endif
+	slw	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
-	lwz	rWORD5,4(rSTR1)
-	lwz	rWORD6,4(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	srw	rE,rWORD6,rSHR
-	slw	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
-	lwz	rWORD7,8(rSTR1)
-	lwz	rWORD8,8(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr1,L(duLcr1)
-	srw	rG,rWORD8,rSHR
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP3x)
-	lwz	rWORD1,12(rSTR1)
-	lwz	rWORD2,12(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	srw	rA,rWORD2,rSHR
-	slw	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	cmplw	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 4(rSTR1)
+	lwz	rWORD6, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD7, 8(rSTR1)
+	lwz	rWORD8, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 12(rSTR1)
+	lwz	rWORD2, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	cmplw	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
 	.align	4
 L(duP3x):
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr1,L(duLcr1)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmplw	cr7,rN,rSHR
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+#if 0
+/* Disabled as redundant: cr1 was already tested and branched on before reaching this point.  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	lwz	rWORD2,4(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Count is a multiple of 16, remainder is 0 */
 	.align	4
 L(duP4):
-	mtctr	rTMP
-	srw	rA,rWORD8,rSHR
-	lwz	rWORD1,0(rSTR1)
-	slw	rD,rWORD8,rSHL
-	or	rWORD2,rA,rH
+	mtctr	r0
+	srw	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	addi	rSTR1, rSTR1, 4
+#else
+	lwz	rWORD1, 0(rSTR1)
+#endif
+	slw	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
-	lwz	rWORD3,4(rSTR1)
-	lwz	rWORD4,4(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	srw	rC,rWORD4,rSHR
-	slw	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
-	lwz	rWORD5,8(rSTR1)
-	lwz	rWORD6,8(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr0,L(duLcr0)
-	srw	rE,rWORD6,rSHR
-	slw	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
-	lwzu	rWORD7,12(rSTR1)
-	lwzu	rWORD8,12(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr1,L(duLcr1)
-	srw	rG,rWORD8,rSHR
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	cmplw	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 4(rSTR1)
+	lwz	rWORD4, 4(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 8(rSTR1)
+	lwz	rWORD6, 8(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 12(rSTR1)
+	lwzu	rWORD8, 12(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	cmplw	cr5, rWORD7, rWORD8
 	bdz	L(du24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
 	.align	4
 L(duLoop):
-	lwz	rWORD1,4(rSTR1)
-	lwz	rWORD2,4(rSTR2)
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr6,L(duLcr6)
-	srw	rA,rWORD2,rSHR
-	slw	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD1, 4(rSTR1)
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	srw	r0, rWORD2, rSHR
+	slw	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
-	lwz	rWORD3,8(rSTR1)
-	lwz	rWORD4,8(rSTR2)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(duLcr5)
-	srw	rC,rWORD4,rSHR
-	slw	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD3, 0, rSTR1
+	lwbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD3, 8(rSTR1)
+	lwz	rWORD4, 8(rSTR2)
+#endif
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	srw	r12, rWORD4, rSHR
+	slw	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
-	lwz	rWORD5,12(rSTR1)
-	lwz	rWORD6,12(rSTR2)
-	cmplw	cr5,rWORD7,rWORD8
-	bne	cr0,L(duLcr0)
-	srw	rE,rWORD6,rSHR
-	slw	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD5, 0, rSTR1
+	lwbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD5, 12(rSTR1)
+	lwz	rWORD6, 12(rSTR2)
+#endif
+	cmplw	cr5, rWORD7, rWORD8
+	bne	cr7, L(duLcr7)
+	srw	r0, rWORD6, rSHR
+	slw	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
-	lwzu	rWORD7,16(rSTR1)
-	lwzu	rWORD8,16(rSTR2)
-	cmplw	cr0,rWORD1,rWORD2
-	bne	cr1,L(duLcr1)
-	srw	rG,rWORD8,rSHR
-	slw	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD7, 0, rSTR1
+	lwbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+#else
+	lwzu	rWORD7, 16(rSTR1)
+	lwzu	rWORD8, 16(rSTR2)
+#endif
+	cmplw	cr7, rWORD1, rWORD2
+	bne	cr1, L(duLcr1)
+	srw	r12, rWORD8, rSHR
+	slw	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	bdnz	L(duLoop)
 
 L(duL4):
-	bne	cr1,L(duLcr1)
-	cmplw	cr1,rWORD3,rWORD4
-	bne	cr6,L(duLcr6)
-	cmplw	cr6,rWORD5,rWORD6
-	bne	cr5,L(duLcr5)
-	cmplw	cr5,rWORD7,rWORD8
+#if 0
+/* Disabled as redundant: cr1 was already tested and branched on at the end of the loop.  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmplw	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	cmplw	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	cmplw	cr5, rWORD7, rWORD8
 L(du44):
-	bne	cr0,L(duLcr0)
+	bne	cr7, L(duLcr7)
 L(du34):
-	bne	cr1,L(duLcr1)
+	bne	cr1, L(duLcr1)
 L(du24):
-	bne	cr6,L(duLcr6)
+	bne	cr6, L(duLcr6)
 L(du14):
-	slwi.	rN,rN,3
-	bne	cr5,L(duLcr5)
+	slwi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
 /* At this point we have a remainder of 1 to 3 bytes to compare.  We use
    shift right to eliminate bits beyond the compare length.
+   This allows the use of word subtract to compute the final result.
 
    However it may not be safe to load rWORD2 which may be beyond the
    string length. So we compare the bit length of the remainder to
    the right shift count (rSHR). If the bit count is less than or equal
    we do not need to load rWORD2 (all significant bits are already in
-   rB).  */
-	cmplw	cr7,rN,rSHR
+   rWORD8_SHIFT).  */
+	cmplw	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	lwz	rWORD2,4(rSTR2)
-	srw	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 4
+#else
+	lwz	rWORD2, 4(rSTR2)
+#endif
+	srw	r0, rWORD2, rSHR
 	.align	4
 L(dutrim):
-	lwz	rWORD1,4(rSTR1)
-	lwz	r31,48(1)
-	subfic	rN,rN,32	/* Shift count is 32 - (rN * 8).  */
-	or	rWORD2,rA,rB
-	lwz	r30,44(1)
-	lwz	r29,40(r1)
-	srw	rWORD1,rWORD1,rN
-	srw	rWORD2,rWORD2,rN
-	lwz	r28,36(r1)
-	lwz	r27,32(r1)
-	cmplw	rWORD1,rWORD2
-	li	rRTN,0
-	beq	L(dureturn26)
-	li	rRTN,1
-	bgt	L(dureturn26)
-	li	rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+	lwbrx	rWORD1, 0, rSTR1
+#else
+	lwz	rWORD1, 4(rSTR1)
+#endif
+	lwz	rWORD8, 48(r1)
+	subfic	rN, rN, 32	/* Shift count is 32 - (rN * 8).  */
+	or	rWORD2, r0, rWORD8_SHIFT
+	lwz	rWORD7, 44(r1)
+	lwz	rSHL, 40(r1)
+	srw	rWORD1, rWORD1, rN
+	srw	rWORD2, rWORD2, rN
+	lwz	rSHR, 36(r1)
+	lwz	rWORD8_SHIFT, 32(r1)
+	sub	rRTN, rWORD1, rWORD2
 	b	L(dureturn26)
 	.align	4
-L(duLcr0):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
-	li	rRTN,1
-	bgt	cr0,L(dureturn29)
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
-	li	rRTN,-1
+L(duLcr7):
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr7, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr1):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
-	li	rRTN,1
-	bgt	cr1,L(dureturn29)
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
-	li	rRTN,-1
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr1, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr6):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
-	li	rRTN,1
-	bgt	cr6,L(dureturn29)
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
-	li	rRTN,-1
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr6, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr5):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
-	li	rRTN,1
-	bgt	cr5,L(dureturn29)
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
-	li	rRTN,-1
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
+	li	rRTN, 1
+	bgt	cr5, L(dureturn29)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	3
 L(duZeroReturn):
-	li	rRTN,0
+	li	rRTN, 0
 	.align	4
 L(dureturn):
-	lwz	r31,48(1)
-	lwz	r30,44(1)
+	lwz	rWORD8, 48(r1)
+	lwz	rWORD7, 44(r1)
 L(dureturn29):
-	lwz	r29,40(r1)
-	lwz	r28,36(r1)
+	lwz	rSHL, 40(r1)
+	lwz	rSHR, 36(r1)
 L(dureturn27):
-	lwz	r27,32(r1)
+	lwz	rWORD8_SHIFT, 32(r1)
 L(dureturn26):
-	lwz	r26,28(r1)
+	lwz	rWORD2_SHIFT, 28(r1)
 L(dureturn25):
-	lwz	r25,24(r1)
-	lwz	r24,20(r1)
-	lwz	1,0(1)
+	lwz	rWORD4_SHIFT, 24(r1)
+	lwz	rWORD6_SHIFT, 20(r1)
+	addi	r1, r1, 64
+	cfi_adjust_cfa_offset(-64)
 	blr
 END (memcmp)
+
 libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp,bcmp)
+weak_alias (memcmp, bcmp)
diff --git a/sysdeps/powerpc/powerpc64/power4/memcmp.S b/sysdeps/powerpc/powerpc64/power4/memcmp.S
index 69caedc..80d67c9 100644
--- a/sysdeps/powerpc/powerpc64/power4/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power4/memcmp.S
@@ -1,4 +1,4 @@
-/* Optimized strcmp implementation for PowerPC64.
+/* Optimized memcmp implementation for PowerPC64.
    Copyright (C) 2003-2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -18,13 +18,14 @@
 
 #include <sysdep.h>
 
-/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */
+/* int [r3] memcmp (const char *s1 [r3],
+		    const char *s2 [r4],
+		    size_t size [r5])  */
 
 	.machine power4
 EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -35,107 +36,127 @@ EALIGN (memcmp, 4, 0)
 #define rWORD4	r9	/* next word in s2 */
 #define rWORD5	r10	/* next word in s1 */
 #define rWORD6	r11	/* next word in s2 */
-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
 #define rWORD7	r30	/* next word in s1 */
 #define rWORD8	r31	/* next word in s2 */
 
-	xor	rTMP, rSTR2, rSTR1
+	xor	r0, rSTR2, rSTR1
 	cmpldi	cr6, rN, 0
 	cmpldi	cr1, rN, 12
-	clrldi.	rTMP, rTMP, 61
-	clrldi	rBITDIF, rSTR1, 61
-	cmpldi	cr5, rBITDIF, 0
+	clrldi.	r0, r0, 61
+	clrldi	r12, rSTR1, 61
+	cmpldi	cr5, r12, 0
 	beq-	cr6, L(zeroLength)
-	dcbt	0,rSTR1
-	dcbt	0,rSTR2
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
 /* If less than 8 bytes or not aligned, use the unaligned
    byte loop.  */
 	blt	cr1, L(bytealigned)
-	std	rWORD8,-8(r1)
-	cfi_offset(rWORD8,-8)
-	std	rWORD7,-16(r1)
-	cfi_offset(rWORD7,-16)
+	std	rWORD8, -8(r1)
+	cfi_offset(rWORD8, -8)
+	std	rWORD7, -16(r1)
+	cfi_offset(rWORD7, -16)
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
-   aligned and can perform the DWaligned loop.
+   of r12 to 0.  If r12 == 0 then we are already double word
+   aligned and can perform the DW aligned loop.
 
    Otherwise we know the two strings have the same alignment (but not
-   yet DW).  So we can force the string addresses to the next lower DW
-   boundary and special case this first DW word using shift left to
+   yet DW).  So we force the string addresses to the next lower DW
+   boundary and special case this first DW using shift left to
    eliminate bits preceding the first byte.  Since we want to join the
-   normal (DWaligned) compare loop, starting at the second double word,
+   normal (DW aligned) compare loop, starting at the second double word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This insures that the loop count is
-   correct and the first DW (shifted) is in the expected resister pair.  */
-	.align 4
+   versioning for the first DW. This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected register pair.  */
+	.align	4
 L(samealignment):
 	clrrdi	rSTR1, rSTR1, 3
 	clrrdi	rSTR2, rSTR2, 3
 	beq	cr5, L(DWaligned)
-	add	rN, rN, rBITDIF
-	sldi	r11, rBITDIF, 3
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 0(rSTR1)
 	ld	rWORD2, 0(rSTR2)
-	cmpldi	cr1, rBITDIF, 16
+#endif
+	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
 	beq	L(dPs4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 	bgt	cr1, L(dPs3)
 	beq	cr1, L(dPs2)
 
 /* Remainder is 8 */
-	.align 3
+	.align	3
 L(dsP1):
-	sld	rWORD5, rWORD1, r11
-	sld	rWORD6, rWORD2, r11
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
 	cmpld	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 16 */
-	.align 4
+	.align	4
 L(dPs2):
-	sld	rWORD5, rWORD1, r11
-	sld	rWORD6, rWORD2, r11
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 8(rSTR1)
 	ld	rWORD8, 8(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 24 */
-	.align 4
+	.align	4
 L(dPs3):
-	sld	rWORD3, rWORD1, r11
-	sld	rWORD4, rWORD2, r11
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD2, rWORD6
 	cmpld	cr1, rWORD3, rWORD4
 	b	L(dP3e)
 /* Count is a multiple of 32, remainder is 0 */
-	.align 4
+	.align	4
 L(dPs4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	sld	rWORD1, rWORD1, r11
-	sld	rWORD2, rWORD2, r11
-	cmpld	cr0, rWORD1, rWORD2
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD2, rWORD6
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP4e)
 
 /* At this point we know both strings are double word aligned and the
    compare length is at least 8 bytes.  */
-	.align 4
+	.align	4
 L(DWaligned):
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	cmpldi	cr1, rBITDIF, 16
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	srdi	r0, rN, 5	/* Divide by 32 */
+	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
 	beq	L(dP4)
@@ -143,174 +164,343 @@ L(DWaligned):
 	beq	cr1, L(dP2)
 
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(dP1):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
    (8-15 byte compare), we want to use only volatile registers.  This
    means we can avoid restoring non-volatile registers since we did not
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 0(rSTR1)
 	ld	rWORD6, 0(rSTR2)
+#endif
 	cmpld	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 L(dP1e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 16(rSTR1)
 	ld	rWORD4, 16(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 24(rSTR1)
 	ld	rWORD6, 24(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
-	bne	cr5, L(dLcr5)
-	bne	cr0, L(dLcr0)
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
 
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 32(rSTR1)
 	ldu	rWORD8, 32(rSTR2)
+#endif
 	bne	cr1, L(dLcr1)
 	cmpld	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
 	bne	cr6, L(dLcr6)
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	.align 3
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	.align	3
 L(dP1x):
 	sldi.	r12, rN, 3
-	bne	cr5, L(dLcr5)
+	bne	cr5, L(dLcr5x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Remainder is 16 */
-	.align 4
+	.align	4
 L(dP2):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 0(rSTR1)
 	ld	rWORD6, 0(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 8(rSTR1)
 	ld	rWORD8, 8(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 L(dP2e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 16(rSTR1)
 	ld	rWORD2, 16(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 24(rSTR1)
 	ld	rWORD4, 24(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
 	bne	cr6, L(dLcr6)
 	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
 /* Again we are on a early exit path (16-23 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
-	.align 4
+	.align	4
 L(dP2x):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 8(rSTR1)
 	ld	rWORD4, 8(rSTR2)
-	cmpld	cr5, rWORD3, rWORD4
+#endif
+	cmpld	cr1, rWORD3, rWORD4
 	sldi.	r12, rN, 3
-	bne	cr6, L(dLcr6)
+	bne	cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-	bne	cr5, L(dLcr5)
+#endif
+	bne	cr1, L(dLcr1x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Remainder is 24 */
-	.align 4
+	.align	4
 L(dP3):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 0(rSTR1)
 	ld	rWORD4, 0(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 L(dP3e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 8(rSTR1)
 	ld	rWORD6, 8(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 16(rSTR1)
 	ld	rWORD8, 16(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 24(rSTR1)
 	ld	rWORD2, 24(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
+#endif
 	bne	cr1, L(dLcr1)
 	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
 /* Again we are on a early exit path (24-31 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
-	.align 4
+	.align	4
 L(dP3x):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 16(rSTR1)
 	ld	rWORD2, 16(rSTR2)
-	cmpld	cr5, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	sldi.	r12, rN, 3
-	bne	cr1, L(dLcr1)
+	bne	cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-	bne	cr6, L(dLcr6)
+#endif
+	bne	cr6, L(dLcr6x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	bne	cr5, L(dLcr5)
+	bne	cr7, L(dLcr7x)
 	bne	L(d00)
 	li	rRTN, 0
 	blr
 
 /* Count is a multiple of 32, remainder is 0 */
-	.align 4
+	.align	4
 L(dP4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 0(rSTR1)
 	ld	rWORD2, 0(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 L(dP4e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 8(rSTR1)
 	ld	rWORD4, 8(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 16(rSTR1)
 	ld	rWORD6, 16(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 24(rSTR1)
 	ldu	rWORD8, 24(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 	bne	cr1, L(dLcr1)
 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
-	.align 4
+	.align	4
 L(dLoop):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(dLcr6)
 L(dLoop1):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 16(rSTR1)
 	ld	rWORD4, 16(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(dLcr5)
 L(dLoop2):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 24(rSTR1)
 	ld	rWORD6, 24(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(dLoop3):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 32(rSTR1)
 	ldu	rWORD8, 32(rSTR2)
+#endif
 	bne-	cr1, L(dLcr1)
-	cmpld	cr0, rWORD1, rWORD2
+	cmpld	cr7, rWORD1, rWORD2
 	bdnz+	L(dLoop)
 
 L(dL4):
@@ -320,7 +510,7 @@ L(dL4):
 	bne	cr5, L(dLcr5)
 	cmpld	cr5, rWORD7, rWORD8
 L(d44):
-	bne	cr0, L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(d34):
 	bne	cr1, L(dLcr1)
 L(d24):
@@ -329,60 +519,74 @@ L(d14):
 	sldi.	r12, rN, 3
 	bne	cr5, L(dLcr5)
 L(d04):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	beq	L(zeroLength)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
    we are aligned it is safe to load the whole double word, and use
    shift right double to eliminate bits beyond the compare length.  */
 L(d00):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
+#endif
 	srd	rWORD1, rWORD1, rN
 	srd	rWORD2, rWORD2, rN
-	cmpld	cr5, rWORD1, rWORD2
- 	bne	cr5, L(dLcr5x)
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr7, L(dLcr7x)
 	li	rRTN, 0
 	blr
-	.align 4
-L(dLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+
+	.align	4
+L(dLcr7):
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr7x):
 	li	rRTN, 1
-	bgtlr	cr0
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
 L(dLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr1x):
 	li	rRTN, 1
 	bgtlr	cr1
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
 L(dLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr6x):
 	li	rRTN, 1
 	bgtlr	cr6
 	li	rRTN, -1
 	blr
-	.align 4
+	.align	4
 L(dLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 L(dLcr5x):
 	li	rRTN, 1
 	bgtlr	cr5
 	li	rRTN, -1
 	blr
 
-	.align 4
+	.align	4
 L(bytealigned):
-	mtctr   rN	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
+#if 0
+/* Disabled as redundant: cr6 was already tested and branched on at function entry.  */
 	beq-	cr6, L(zeroLength)
+#endif
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
    to avoid pipe delays.  The dependent instruction latencies (load to
@@ -397,7 +601,7 @@ L(bytealigned):
 	lbz	rWORD1, 0(rSTR1)
 	lbz	rWORD2, 0(rSTR2)
 	bdz-	L(b11)
-	cmpld	cr0, rWORD1, rWORD2
+	cmpld	cr7, rWORD1, rWORD2
 	lbz	rWORD3, 1(rSTR1)
 	lbz	rWORD4, 1(rSTR2)
 	bdz-	L(b12)
@@ -405,11 +609,11 @@ L(bytealigned):
 	lbzu	rWORD5, 2(rSTR1)
 	lbzu	rWORD6, 2(rSTR2)
 	bdz-	L(b13)
-	.align 4
+	.align	4
 L(bLoop):
 	lbzu	rWORD1, 1(rSTR1)
 	lbzu	rWORD2, 1(rSTR2)
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 
 	cmpld	cr6, rWORD5, rWORD6
 	bdz-	L(b3i)
@@ -418,7 +622,7 @@ L(bLoop):
 	lbzu	rWORD4, 1(rSTR2)
 	bne-	cr1, L(bLcr1)
 
-	cmpld	cr0, rWORD1, rWORD2
+	cmpld	cr7, rWORD1, rWORD2
 	bdz-	L(b2i)
 
 	lbzu	rWORD5, 1(rSTR1)
@@ -435,23 +639,23 @@ L(bLoop):
    tested.  In this case we must complete the pending operations
    before returning.  */
 L(b1i):
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 	bne-	cr1, L(bLcr1)
 	b	L(bx56)
-	.align 4
+	.align	4
 L(b2i):
 	bne-	cr6, L(bLcr6)
-	bne-	cr0, L(bLcr0)
+	bne-	cr7, L(bLcr7)
 	b	L(bx34)
-	.align 4
+	.align	4
 L(b3i):
 	bne-	cr1, L(bLcr1)
 	bne-	cr6, L(bLcr6)
 	b	L(bx12)
-	.align 4
-L(bLcr0):
+	.align	4
+L(bLcr7):
 	li	rRTN, 1
-	bgtlr	cr0
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
 L(bLcr1):
@@ -466,14 +670,14 @@ L(bLcr6):
 	blr
 
 L(b13):
-	bne-	cr0, L(bx12)
+	bne-	cr7, L(bx12)
 	bne-	cr1, L(bx34)
 L(bx56):
 	sub	rRTN, rWORD5, rWORD6
 	blr
 	nop
 L(b12):
-	bne-	cr0, L(bx12)
+	bne-	cr7, L(bx12)
 L(bx34):
 	sub	rRTN, rWORD3, rWORD4
 	blr
@@ -481,101 +685,106 @@ L(b11):
 L(bx12):
 	sub	rRTN, rWORD1, rWORD2
 	blr
-	.align 4
-L(zeroLengthReturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	.align	4
 L(zeroLength):
 	li	rRTN, 0
 	blr
 
-	.align 4
+	.align	4
 /* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is double word
+   of r12 to 0.  If r12 == 0 then rSTR1 is double word
    aligned and can perform the DWunaligned loop.
 
    Otherwise we know that rSTR1 is not already DW aligned yet.
    So we can force the string addresses to the next lower DW
-   boundary and special case this first DW word using shift left to
+   boundary and special case this first DW using shift left to
    eliminate bits preceding the first byte.  Since we want to join the
    normal (DWaligned) compare loop, starting at the second double word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This insures that the loop count is
+   versioning for the first DW. This ensures that the loop count is
    correct and the first DW (shifted) is in the expected resister pair.  */
-#define rSHL	r29	/* Unaligned shift left count.  */
-#define rSHR	r28	/* Unaligned shift right count.  */
-#define rB		r27	/* Left rotation temp for rWORD2.  */
-#define rD		r26	/* Left rotation temp for rWORD4.  */
-#define rF		r25	/* Left rotation temp for rWORD6.  */
-#define rH		r24	/* Left rotation temp for rWORD8.  */
-#define rA		r0	/* Right rotation temp for rWORD2.  */
-#define rC		r12	/* Right rotation temp for rWORD4.  */
-#define rE		r0	/* Right rotation temp for rWORD6.  */
-#define rG		r12	/* Right rotation temp for rWORD8.  */
+#define rSHL		r29	/* Unaligned shift left count.  */
+#define rSHR		r28	/* Unaligned shift right count.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
 L(unaligned):
-	std	r29,-24(r1)
-	cfi_offset(r29,-24)
+	std	rSHL, -24(r1)
+	cfi_offset(rSHL, -24)
 	clrldi	rSHL, rSTR2, 61
 	beq-	cr6, L(duzeroLength)
-	std	r28,-32(r1)
-	cfi_offset(r28,-32)
+	std	rSHR, -32(r1)
+	cfi_offset(rSHR, -32)
 	beq	cr5, L(DWunaligned)
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
-/* Adjust the logical start of rSTR2 ro compensate for the extra bits
+	std	rWORD8_SHIFT, -40(r1)
+	cfi_offset(rWORD8_SHIFT, -40)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 DW.  */
-	sub	r27, rSTR2, rBITDIF
+	sub	rWORD8_SHIFT, rSTR2, r12
 /* But do not attempt to address the DW before that DW that contains
    the actual start of rSTR2.  */
 	clrrdi	rSTR2, rSTR2, 3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
-/* Compute the left/right shift counts for the unalign rSTR2,
+	std	rWORD2_SHIFT, -48(r1)
+	cfi_offset(rWORD2_SHIFT, -48)
+/* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (DW aligned) start of rSTR1.  */
-	clrldi	rSHL, r27, 61
+	clrldi	rSHL, rWORD8_SHIFT, 61
 	clrrdi	rSTR1, rSTR1, 3
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
+	std	rWORD4_SHIFT, -56(r1)
+	cfi_offset(rWORD4_SHIFT, -56)
 	sldi	rSHL, rSHL, 3
-	cmpld	cr5, r27, rSTR2
-	add	rN, rN, rBITDIF
-	sldi	r11, rBITDIF, 3
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
+	cmpld	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	std	rWORD6_SHIFT, -64(r1)
+	cfi_offset(rWORD6_SHIFT, -64)
 	subfic	rSHR, rSHL, 64
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
    this special case those bits may be discarded anyway.  Also we
    must avoid loading a DW where none of the bits are part of rSTR2 as
    this may cross a page boundary and cause a page fault.  */
 	li	rWORD8, 0
 	blt	cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD8, 0(rSTR2)
-	la	rSTR2, 8(rSTR2)
+	addi	rSTR2, rSTR2, 8
+#endif
 	sld	rWORD8, rWORD8, rSHL
 
 L(dus0):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 0(rSTR1)
 	ld	rWORD2, 0(rSTR2)
-	cmpldi	cr1, rBITDIF, 16
+#endif
+	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
-	srd	rG, rWORD2, rSHR
+	srd	r12, rWORD2, rSHR
 	clrldi	rN, rN, 61
 	beq	L(duPs4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	or	rWORD8, r12, rWORD8
 	bgt	cr1, L(duPs3)
 	beq	cr1, L(duPs2)
 
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(dusP1):
-	sld	rB, rWORD2, rSHL
-	sld	rWORD7, rWORD1, r11
-	sld	rWORD8, rWORD8, r11
+	sld	rWORD8_SHIFT, rWORD2, rSHL
+	sld	rWORD7, rWORD1, rWORD6
+	sld	rWORD8, rWORD8, rWORD6
 	bge	cr7, L(duP1e)
 /* At this point we exit early with the first double word compare
    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
@@ -585,95 +794,133 @@ L(dusP1):
 	bne	cr5, L(duLcr5)
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
-	.align 4
+	.align	4
 L(duPs2):
-	sld	rH, rWORD2, rSHL
-	sld	rWORD5, rWORD1, r11
-	sld	rWORD6, rWORD8, r11
+	sld	rWORD6_SHIFT, rWORD2, rSHL
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD8, rWORD6
 	b	L(duP2e)
 /* Remainder is 24 */
-	.align 4
+	.align	4
 L(duPs3):
-	sld	rF, rWORD2, rSHL
-	sld	rWORD3, rWORD1, r11
-	sld	rWORD4, rWORD8, r11
+	sld	rWORD4_SHIFT, rWORD2, rSHL
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD8, rWORD6
 	b	L(duP3e)
 /* Count is a multiple of 32, remainder is 0 */
-	.align 4
+	.align	4
 L(duPs4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	or	rWORD8, rG, rWORD8
-	sld	rD, rWORD2, rSHL
-	sld	rWORD1, rWORD1, r11
-	sld	rWORD2, rWORD8, r11
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	or	rWORD8, r12, rWORD8
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD8, rWORD6
 	b	L(duP4e)
 
 /* At this point we know rSTR1 is double word aligned and the
    compare length is at least 8 bytes.  */
-	.align 4
+	.align	4
 L(DWunaligned):
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
+	std	rWORD8_SHIFT, -40(r1)
+	cfi_offset(rWORD8_SHIFT, -40)
 	clrrdi	rSTR2, rSTR2, 3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
-	srdi	rTMP, rN, 5	/* Divide by 32 */
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
-	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
+	std	rWORD2_SHIFT, -48(r1)
+	cfi_offset(rWORD2_SHIFT, -48)
+	srdi	r0, rN, 5	/* Divide by 32 */
+	std	rWORD4_SHIFT, -56(r1)
+	cfi_offset(rWORD4_SHIFT, -56)
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	std	rWORD6_SHIFT, -64(r1)
+	cfi_offset(rWORD6_SHIFT, -64)
 	sldi	rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD6, 0(rSTR2)
 	ldu	rWORD8, 8(rSTR2)
-	cmpldi	cr1, rBITDIF, 16
+#endif
+	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
 	subfic	rSHR, rSHL, 64
-	sld	rH, rWORD6, rSHL
+	sld	rWORD6_SHIFT, rWORD6, rSHL
 	beq	L(duP4)
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
 	bgt	cr1, L(duP3)
 	beq	cr1, L(duP2)
 
 /* Remainder is 8 */
-	.align 4
+	.align	4
 L(duP1):
-	srd	rG, rWORD8, rSHR
+	srd	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
 	ld	rWORD7, 0(rSTR1)
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+#endif
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP1x)
 L(duP1e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 16(rSTR1)
 	ld	rWORD4, 16(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
 	bne	cr5, L(duLcr5)
-	or	rWORD4, rC, rD
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 24(rSTR1)
 	ld	rWORD6, 24(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	bne	cr0, L(duLcr0)
-	or	rWORD6, rE, rF
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
 	cmpld	cr6, rWORD5, rWORD6
 	b	L(duLoop3)
-	.align 4
+	.align	4
 /* At this point we exit early with the first double word compare
    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
@@ -683,186 +930,321 @@ L(duP1x):
 	bne	cr5, L(duLcr5)
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
-	.align 4
+	.align	4
 L(duP2):
-	srd	rE, rWORD8, rSHR
+	srd	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
 	ld	rWORD5, 0(rSTR1)
-	or	rWORD6, rE, rH
-	sld	rH, rWORD8, rSHL
+#endif
+	or	rWORD6, r0, rWORD6_SHIFT
+	sld	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 8(rSTR1)
 	ld	rWORD8, 8(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 16(rSTR1)
 	ld	rWORD2, 16(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 24(rSTR1)
 	ld	rWORD4, 24(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	bne	cr5, L(duLcr5)
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
-	.align 4
+	.align	4
 L(duP2x):
 	cmpld	cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
+#endif
 	bne	cr6, L(duLcr6)
 	sldi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Remainder is 24 */
-	.align 4
+	.align	4
 L(duP3):
-	srd	rC, rWORD8, rSHR
+	srd	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
 	ld	rWORD3, 0(rSTR1)
-	sld	rF, rWORD8, rSHL
-	or	rWORD4, rC, rH
+#endif
+	sld	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 8(rSTR1)
 	ld	rWORD6, 8(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD7, 16(rSTR1)
 	ld	rWORD8, 16(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 24(rSTR1)
 	ld	rWORD2, 24(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
-	.align 4
+	.align	4
 L(duP3x):
+#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
+#endif
+#if 0
+/* Huh?  We've already branched on cr1!  */
 	bne	cr1, L(duLcr1)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	sldi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Count is a multiple of 32, remainder is 0 */
-	.align 4
+	.align	4
 L(duP4):
-	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
-	srd	rA, rWORD8, rSHR
+	mtctr	r0	/* Power4 wants mtctr 1st in dispatch group */
+	srd	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
 	ld	rWORD1, 0(rSTR1)
-	sld	rD, rWORD8, rSHL
-	or	rWORD2, rA, rH
+#endif
+	sld	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 8(rSTR1)
 	ld	rWORD4, 8(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 16(rSTR1)
 	ld	rWORD6, 16(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
-	bne	cr0, L(duLcr0)
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 24(rSTR1)
 	ldu	rWORD8, 24(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	cmpld	cr5, rWORD7, rWORD8
 	bdz-	L(du24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
-	.align 4
+	.align	4
 L(duLoop):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD1, 8(rSTR1)
 	ld	rWORD2, 8(rSTR2)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
-	srd	rA, rWORD2, rSHR
-	sld	rD, rWORD2, rSHL
-	or	rWORD2, rA, rB
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD3, 16(rSTR1)
 	ld	rWORD4, 16(rSTR2)
+#endif
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
-	srd	rC, rWORD4, rSHR
-	sld	rF, rWORD4, rSHL
-	or	rWORD4, rC, rD
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD5, 24(rSTR1)
 	ld	rWORD6, 24(rSTR2)
+#endif
 	cmpld	cr5, rWORD7, rWORD8
-	bne	cr0, L(duLcr0)
-	srd	rE, rWORD6, rSHR
-	sld	rH, rWORD6, rSHL
-	or	rWORD6, rE, rF
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
 	ldu	rWORD7, 32(rSTR1)
 	ldu	rWORD8, 32(rSTR2)
-	cmpld	cr0, rWORD1, rWORD2
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	bne-	cr1, L(duLcr1)
-	srd	rG, rWORD8, rSHR
-	sld	rB, rWORD8, rSHL
-	or	rWORD8, rG, rH
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	bdnz+	L(duLoop)
 
 L(duL4):
+#if 0
+/* Huh?  We've already branched on cr1!  */
 	bne	cr1, L(duLcr1)
+#endif
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
 	cmpld	cr5, rWORD7, rWORD8
 L(du44):
-	bne	cr0, L(duLcr0)
+	bne	cr7, L(duLcr7)
 L(du34):
 	bne	cr1, L(duLcr1)
 L(du24):
@@ -872,103 +1254,110 @@ L(du14):
 	bne	cr5, L(duLcr5)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  We use
    shift right double to eliminate bits beyond the compare length.
-   This allows the use of double word subtract to compute the final
-   result.
 
    However it may not be safe to load rWORD2 which may be beyond the
    string length. So we compare the bit length of the remainder to
    the right shift count (rSHR). If the bit count is less than or equal
    we do not need to load rWORD2 (all significant bits are already in
-   rB).  */
+   rWORD8_SHIFT).  */
 	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA, 0
+	li	r0, 0
 	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
 	ld	rWORD2, 8(rSTR2)
-	srd	rA, rWORD2, rSHR
-	.align 4
+#endif
+	srd	r0, rWORD2, rSHR
+	.align	4
 L(dutrim):
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+#else
 	ld	rWORD1, 8(rSTR1)
-	ld	rWORD8,-8(r1)
+#endif
+	ld	rWORD8, -8(r1)
 	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
-	or	rWORD2, rA, rB
-	ld	rWORD7,-16(r1)
-	ld	r29,-24(r1)
+	or	rWORD2, r0, rWORD8_SHIFT
+	ld	rWORD7, -16(r1)
+	ld	rSHL, -24(r1)
 	srd	rWORD1, rWORD1, rN
 	srd	rWORD2, rWORD2, rN
-	ld	r28,-32(r1)
-	ld	r27,-40(r1)
+	ld	rSHR, -32(r1)
+	ld	rWORD8_SHIFT, -40(r1)
 	li	rRTN, 0
-	cmpld	cr0, rWORD1, rWORD2
-	ld	r26,-48(r1)
-	ld	r25,-56(r1)
- 	beq	cr0, L(dureturn24)
+	cmpld	cr7, rWORD1, rWORD2
+	ld	rWORD2_SHIFT, -48(r1)
+	ld	rWORD4_SHIFT, -56(r1)
+	beq	cr7, L(dureturn24)
 	li	rRTN, 1
-	ld	r24,-64(r1)
-	bgtlr	cr0
+	ld	rWORD6_SHIFT, -64(r1)
+	bgtlr	cr7
 	li	rRTN, -1
 	blr
-	.align 4
-L(duLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	.align	4
+L(duLcr7):
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	li	rRTN, 1
-	bgt	cr0, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	bgt	cr7, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	li	rRTN, 1
 	bgt	cr1, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	li	rRTN, 1
 	bgt	cr6, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
-	.align 4
+	.align	4
 L(duLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	li	rRTN, 1
 	bgt	cr5, L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	3
 L(duZeroReturn):
-	li	rRTN,0
+	li	rRTN, 0
 	.align	4
 L(dureturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 L(dureturn29):
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 L(dureturn27):
-	ld	r27,-40(r1)
+	ld	rWORD8_SHIFT, -40(r1)
 L(dureturn26):
-	ld	r26,-48(r1)
+	ld	rWORD2_SHIFT, -48(r1)
 L(dureturn25):
-	ld	r25,-56(r1)
+	ld	rWORD4_SHIFT, -56(r1)
 L(dureturn24):
-	ld	r24,-64(r1)
+	ld	rWORD6_SHIFT, -64(r1)
 	blr
 L(duzeroLength):
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 END (memcmp)
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
index f190c64..6851cdc 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -23,10 +23,9 @@
 		    size_t size [r5])  */
 
 	.machine power7
-EALIGN (memcmp,4,0)
+EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -37,354 +36,557 @@ EALIGN (memcmp,4,0)
 #define rWORD4	r9	/* next word in s2 */
 #define rWORD5	r10	/* next word in s1 */
 #define rWORD6	r11	/* next word in s2 */
-#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
 #define rWORD7	r30	/* next word in s1 */
 #define rWORD8	r31	/* next word in s2 */
 
-	xor	rTMP,rSTR2,rSTR1
-	cmpldi	cr6,rN,0
-	cmpldi	cr1,rN,12
-	clrldi.	rTMP,rTMP,61
-	clrldi	rBITDIF,rSTR1,61
-	cmpldi	cr5,rBITDIF,0
-	beq-	cr6,L(zeroLength)
-	dcbt	0,rSTR1
-	dcbt	0,rSTR2
+	xor	r0, rSTR2, rSTR1
+	cmpldi	cr6, rN, 0
+	cmpldi	cr1, rN, 12
+	clrldi.	r0, r0, 61
+	clrldi	r12, rSTR1, 61
+	cmpldi	cr5, r12, 0
+	beq-	cr6, L(zeroLength)
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
 /* If less than 8 bytes or not aligned, use the unaligned
    byte loop.  */
-	blt	cr1,L(bytealigned)
-	std	rWORD8,-8(r1)
-	cfi_offset(rWORD8,-8)
-	std	rWORD7,-16(r1)
-	cfi_offset(rWORD7,-16)
+	blt	cr1, L(bytealigned)
+	std	rWORD8, -8(r1)
+	cfi_offset(rWORD8, -8)
+	std	rWORD7, -16(r1)
+	cfi_offset(rWORD7, -16)
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
-   aligned and can perform the DWaligned loop.
+   of r12 to 0.  If r12 == 0 then we are already double word
+   aligned and can perform the DW aligned loop.
 
    Otherwise we know the two strings have the same alignment (but not
-   yet DW).  So we can force the string addresses to the next lower DW
-   boundary and special case this first DW word using shift left to
+   yet DW).  So we force the string addresses to the next lower DW
+   boundary and special case this first DW using shift left to
    eliminate bits preceding the first byte.  Since we want to join the
-   normal (DWaligned) compare loop, starting at the second double word,
+   normal (DW aligned) compare loop, starting at the second double word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This insures that the loop count is
-   correct and the first DW (shifted) is in the expected resister pair.  */
+   versioning for the first DW. This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected register pair.  */
 	.align	4
 L(samealignment):
-	clrrdi	rSTR1,rSTR1,3
-	clrrdi	rSTR2,rSTR2,3
-	beq	cr5,L(DWaligned)
-	add	rN,rN,rBITDIF
-	sldi	r11,rBITDIF,3
-	srdi	rTMP,rN,5	/* Divide by 32 */
-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
-	ld	rWORD1,0(rSTR1)
-	ld	rWORD2,0(rSTR2)
-	cmpldi	cr1,rBITDIF,16
-	cmpldi	cr7,rN,32
-	clrldi	rN,rN,61
+	clrrdi	rSTR1, rSTR1, 3
+	clrrdi	rSTR2, rSTR2, 3
+	beq	cr5, L(DWaligned)
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 0(rSTR1)
+	ld	rWORD2, 0(rSTR2)
+#endif
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
 	beq	L(dPs4)
-	mtctr	rTMP
-	bgt	cr1,L(dPs3)
-	beq	cr1,L(dPs2)
+	mtctr	r0
+	bgt	cr1, L(dPs3)
+	beq	cr1, L(dPs2)
 
 /* Remainder is 8 */
 	.align	3
 L(dsP1):
-	sld	rWORD5,rWORD1,r11
-	sld	rWORD6,rWORD2,r11
-	cmpld	cr5,rWORD5,rWORD6
-	blt	cr7,L(dP1x)
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
+	cmpld	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 16 */
 	.align	4
 L(dPs2):
-	sld	rWORD5,rWORD1,r11
-	sld	rWORD6,rWORD2,r11
-	cmpld	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP2x)
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-	ld	rWORD7,8(rSTR1)
-	ld	rWORD8,8(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 8(rSTR1)
+	ld	rWORD8, 8(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 24 */
 	.align	4
 L(dPs3):
-	sld	rWORD3,rWORD1,r11
-	sld	rWORD4,rWORD2,r11
-	cmpld	cr1,rWORD3,rWORD4
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD2, rWORD6
+	cmpld	cr1, rWORD3, rWORD4
 	b	L(dP3e)
 /* Count is a multiple of 32, remainder is 0 */
 	.align	4
 L(dPs4):
-	mtctr	rTMP
-	sld	rWORD1,rWORD1,r11
-	sld	rWORD2,rWORD2,r11
-	cmpld	cr0,rWORD1,rWORD2
+	mtctr	r0
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD2, rWORD6
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP4e)
 
 /* At this point we know both strings are double word aligned and the
    compare length is at least 8 bytes.  */
 	.align	4
 L(DWaligned):
-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
-	srdi	rTMP,rN,5	/* Divide by 32 */
-	cmpldi	cr1,rBITDIF,16
-	cmpldi	cr7,rN,32
-	clrldi	rN,rN,61
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	srdi	r0, rN, 5	/* Divide by 32 */
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
 	beq	L(dP4)
-	bgt	cr1,L(dP3)
-	beq	cr1,L(dP2)
+	bgt	cr1, L(dP3)
+	beq	cr1, L(dP2)
 
 /* Remainder is 8 */
 	.align	4
 L(dP1):
-	mtctr	rTMP
+	mtctr	r0
 /* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
    (8-15 byte compare), we want to use only volatile registers.  This
    means we can avoid restoring non-volatile registers since we did not
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
-	ld	rWORD5,0(rSTR1)
-	ld	rWORD6,0(rSTR2)
-	cmpld	cr5,rWORD5,rWORD6
-	blt	cr7,L(dP1x)
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 0(rSTR1)
+	ld	rWORD6, 0(rSTR2)
+#endif
+	cmpld	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 L(dP1e):
-	ld	rWORD3,16(rSTR1)
-	ld	rWORD4,16(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	ld	rWORD5,24(rSTR1)
-	ld	rWORD6,24(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
-	bne	cr0,L(dLcr0)
-
-	ldu	rWORD7,32(rSTR1)
-	ldu	rWORD8,32(rSTR2)
-	bne	cr1,L(dLcr1)
-	cmpld	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 16(rSTR1)
+	ld	rWORD4, 16(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 24(rSTR1)
+	ld	rWORD6, 24(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
+
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 32(rSTR1)
+	ldu	rWORD8, 32(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmpld	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
-	bne	cr6,L(dLcr6)
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	bne	cr6, L(dLcr6)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 	.align	3
 L(dP1x):
-	sldi.	r12,rN,3
-	bne	cr5,L(dLcr5)
-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
+	sldi.	r12, rN, 3
+	bne	cr5, L(dLcr5x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Remainder is 16 */
 	.align	4
 L(dP2):
-	mtctr	rTMP
-	ld	rWORD5,0(rSTR1)
-	ld	rWORD6,0(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP2x)
-	ld	rWORD7,8(rSTR1)
-	ld	rWORD8,8(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 0(rSTR1)
+	ld	rWORD6, 0(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 8(rSTR1)
+	ld	rWORD8, 8(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
 L(dP2e):
-	ld	rWORD1,16(rSTR1)
-	ld	rWORD2,16(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	ld	rWORD3,24(rSTR1)
-	ld	rWORD4,24(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr6,L(dLcr6)
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 16(rSTR1)
+	ld	rWORD2, 16(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 24(rSTR1)
+	ld	rWORD4, 24(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr6, L(dLcr6)
+	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
 /* Again we are on a early exit path (16-23 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
 	.align	4
 L(dP2x):
-	ld	rWORD3,8(rSTR1)
-	ld	rWORD4,8(rSTR2)
-	cmpld	cr5,rWORD3,rWORD4
-	sldi.	r12,rN,3
-	bne	cr6,L(dLcr6)
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr5,L(dLcr5)
-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 8(rSTR1)
+	ld	rWORD4, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	sldi.	r12, rN, 3
+	bne	cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr1, L(dLcr1x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Remainder is 24 */
 	.align	4
 L(dP3):
-	mtctr	rTMP
-	ld	rWORD3,0(rSTR1)
-	ld	rWORD4,0(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 0(rSTR1)
+	ld	rWORD4, 0(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
 L(dP3e):
-	ld	rWORD5,8(rSTR1)
-	ld	rWORD6,8(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	blt	cr7,L(dP3x)
-	ld	rWORD7,16(rSTR1)
-	ld	rWORD8,16(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	ld	rWORD1,24(rSTR1)
-	ld	rWORD2,24(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	addi	rSTR1,rSTR1,16
-	addi	rSTR2,rSTR2,16
-	bne	cr1,L(dLcr1)
-	bne	cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 8(rSTR1)
+	ld	rWORD6, 8(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 16(rSTR1)
+	ld	rWORD8, 16(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 24(rSTR1)
+	ld	rWORD2, 24(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#endif
+	bne	cr1, L(dLcr1)
+	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
 /* Again we are on a early exit path (24-31 byte compare), we want to
    only use volatile registers and avoid restoring non-volatile
    registers.  */
 	.align	4
 L(dP3x):
-	ld	rWORD1,16(rSTR1)
-	ld	rWORD2,16(rSTR2)
-	cmpld	cr5,rWORD1,rWORD2
-	sldi.	r12,rN,3
-	bne	cr1,L(dLcr1)
-	addi	rSTR1,rSTR1,16
-	addi	rSTR2,rSTR2,16
-	bne	cr6,L(dLcr6)
-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 16(rSTR1)
+	ld	rWORD2, 16(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	sldi.	r12, rN, 3
+	bne	cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#endif
+	bne	cr6, L(dLcr6x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	bne	cr7, L(dLcr7x)
 	bne	L(d00)
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 /* Count is a multiple of 32, remainder is 0 */
 	.align	4
 L(dP4):
-	mtctr	rTMP
-	ld	rWORD1,0(rSTR1)
-	ld	rWORD2,0(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
+	mtctr	r0
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 0(rSTR1)
+	ld	rWORD2, 0(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 L(dP4e):
-	ld	rWORD3,8(rSTR1)
-	ld	rWORD4,8(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	ld	rWORD5,16(rSTR1)
-	ld	rWORD6,16(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	ldu	rWORD7,24(rSTR1)
-	ldu	rWORD8,24(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr0,L(dLcr0)
-	bne	cr1,L(dLcr1)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 8(rSTR1)
+	ld	rWORD4, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 16(rSTR1)
+	ld	rWORD6, 16(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 24(rSTR1)
+	ldu	rWORD8, 24(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+	bne	cr1, L(dLcr1)
 	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
 	.align	4
 L(dLoop):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
 L(dLoop1):
-	ld	rWORD3,16(rSTR1)
-	ld	rWORD4,16(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 16(rSTR1)
+	ld	rWORD4, 16(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
 L(dLoop2):
-	ld	rWORD5,24(rSTR1)
-	ld	rWORD6,24(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr0,L(dLcr0)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 24(rSTR1)
+	ld	rWORD6, 24(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
 L(dLoop3):
-	ldu	rWORD7,32(rSTR1)
-	ldu	rWORD8,32(rSTR2)
-	bne	cr1,L(dLcr1)
-	cmpld	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 32(rSTR1)
+	ldu	rWORD8, 32(rSTR2)
+#endif
+	bne	cr1, L(dLcr1)
+	cmpld	cr7, rWORD1, rWORD2
 	bdnz	L(dLoop)
 
 L(dL4):
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr6,L(dLcr6)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(dLcr5)
-	cmpld	cr5,rWORD7,rWORD8
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+	cmpld	cr5, rWORD7, rWORD8
 L(d44):
-	bne	cr0,L(dLcr0)
+	bne	cr7, L(dLcr7)
 L(d34):
-	bne	cr1,L(dLcr1)
+	bne	cr1, L(dLcr1)
 L(d24):
-	bne	cr6,L(dLcr6)
+	bne	cr6, L(dLcr6)
 L(d14):
-	sldi.	r12,rN,3
-	bne	cr5,L(dLcr5)
+	sldi.	r12, rN, 3
+	bne	cr5, L(dLcr5)
 L(d04):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	subfic	rN,r12,64	/* Shift count is 64 - (rN * 8).  */
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	beq	L(zeroLength)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
    we are aligned it is safe to load the whole double word, and use
    shift right double to eliminate bits beyond the compare length.  */
 L(d00):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	srd	rWORD1,rWORD1,rN
-	srd	rWORD2,rWORD2,rN
-	cmpld	cr5,rWORD1,rWORD2
-	bne	cr5,L(dLcr5x)
-	li	rRTN,0
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	rWORD1, rWORD1, rN
+	srd	rWORD2, rWORD2, rN
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr7, L(dLcr7x)
+	li	rRTN, 0
 	blr
+
 	.align	4
-L(dLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgtlr	cr0
-	li	rRTN,-1
+L(dLcr7):
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr7x):
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr1x):
+	li	rRTN, 1
 	bgtlr	cr1
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+L(dLcr6x):
+	li	rRTN, 1
 	bgtlr	cr6
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 L(dLcr5x):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr5
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 
 	.align	4
 L(bytealigned):
 	mtctr	rN
-	beq	cr6,L(zeroLength)
+#if 0
+/* Huh?  We've already branched on cr6!  */
+	beq	cr6, L(zeroLength)
+#endif
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
    to avoid pipe delays.  The dependent instruction latencies (load to
@@ -396,38 +598,38 @@ L(bytealigned):
    So we must precondition some registers and condition codes so that
    we don't exit the loop early on the first iteration.  */
 
-	lbz	rWORD1,0(rSTR1)
-	lbz	rWORD2,0(rSTR2)
+	lbz	rWORD1, 0(rSTR1)
+	lbz	rWORD2, 0(rSTR2)
 	bdz	L(b11)
-	cmpld	cr0,rWORD1,rWORD2
-	lbz	rWORD3,1(rSTR1)
-	lbz	rWORD4,1(rSTR2)
+	cmpld	cr7, rWORD1, rWORD2
+	lbz	rWORD3, 1(rSTR1)
+	lbz	rWORD4, 1(rSTR2)
 	bdz	L(b12)
-	cmpld	cr1,rWORD3,rWORD4
-	lbzu	rWORD5,2(rSTR1)
-	lbzu	rWORD6,2(rSTR2)
+	cmpld	cr1, rWORD3, rWORD4
+	lbzu	rWORD5, 2(rSTR1)
+	lbzu	rWORD6, 2(rSTR2)
 	bdz	L(b13)
 	.align	4
 L(bLoop):
-	lbzu	rWORD1,1(rSTR1)
-	lbzu	rWORD2,1(rSTR2)
-	bne	cr0,L(bLcr0)
+	lbzu	rWORD1, 1(rSTR1)
+	lbzu	rWORD2, 1(rSTR2)
+	bne	cr7, L(bLcr7)
 
-	cmpld	cr6,rWORD5,rWORD6
+	cmpld	cr6, rWORD5, rWORD6
 	bdz	L(b3i)
 
-	lbzu	rWORD3,1(rSTR1)
-	lbzu	rWORD4,1(rSTR2)
-	bne	cr1,L(bLcr1)
+	lbzu	rWORD3, 1(rSTR1)
+	lbzu	rWORD4, 1(rSTR2)
+	bne	cr1, L(bLcr1)
 
-	cmpld	cr0,rWORD1,rWORD2
+	cmpld	cr7, rWORD1, rWORD2
 	bdz	L(b2i)
 
-	lbzu	rWORD5,1(rSTR1)
-	lbzu	rWORD6,1(rSTR2)
-	bne	cr6,L(bLcr6)
+	lbzu	rWORD5, 1(rSTR1)
+	lbzu	rWORD6, 1(rSTR2)
+	bne	cr6, L(bLcr6)
 
-	cmpld	cr1,rWORD3,rWORD4
+	cmpld	cr1, rWORD3, rWORD4
 	bdnz	L(bLoop)
 
 /* We speculatively loading bytes before we have tested the previous
@@ -437,542 +639,727 @@ L(bLoop):
    tested.  In this case we must complete the pending operations
    before returning.  */
 L(b1i):
-	bne	cr0,L(bLcr0)
-	bne	cr1,L(bLcr1)
+	bne	cr7, L(bLcr7)
+	bne	cr1, L(bLcr1)
 	b	L(bx56)
 	.align	4
 L(b2i):
-	bne	cr6,L(bLcr6)
-	bne	cr0,L(bLcr0)
+	bne	cr6, L(bLcr6)
+	bne	cr7, L(bLcr7)
 	b	L(bx34)
 	.align	4
 L(b3i):
-	bne	cr1,L(bLcr1)
-	bne	cr6,L(bLcr6)
+	bne	cr1, L(bLcr1)
+	bne	cr6, L(bLcr6)
 	b	L(bx12)
 	.align	4
-L(bLcr0):
-	li	rRTN,1
-	bgtlr	cr0
-	li	rRTN,-1
+L(bLcr7):
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 L(bLcr1):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr1
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 L(bLcr6):
-	li	rRTN,1
+	li	rRTN, 1
 	bgtlr	cr6
-	li	rRTN,-1
+	li	rRTN, -1
 	blr
 
 L(b13):
-	bne	cr0,L(bx12)
-	bne	cr1,L(bx34)
+	bne	cr7, L(bx12)
+	bne	cr1, L(bx34)
 L(bx56):
-	sub	rRTN,rWORD5,rWORD6
+	sub	rRTN, rWORD5, rWORD6
 	blr
 	nop
 L(b12):
-	bne	cr0,L(bx12)
+	bne	cr7, L(bx12)
 L(bx34):
-	sub	rRTN,rWORD3,rWORD4
+	sub	rRTN, rWORD3, rWORD4
 	blr
 L(b11):
 L(bx12):
-	sub	rRTN,rWORD1,rWORD2
+	sub	rRTN, rWORD1, rWORD2
 	blr
 	.align	4
-L(zeroLengthReturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
 L(zeroLength):
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 	.align	4
 /* At this point we know the strings have different alignment and the
-   compare length is at least 8 bytes.  rBITDIF contains the low order
+   compare length is at least 8 bytes.  r12 contains the low order
    3 bits of rSTR1 and cr5 contains the result of the logical compare
-   of rBITDIF to 0.  If rBITDIF == 0 then rStr1 is double word
+   of r12 to 0.  If r12 == 0 then rStr1 is double word
    aligned and can perform the DWunaligned loop.
 
    Otherwise we know that rSTR1 is not already DW aligned yet.
    So we can force the string addresses to the next lower DW
-   boundary and special case this first DW word using shift left to
+   boundary and special case this first DW using shift left to
    eliminate bits preceding the first byte.  Since we want to join the
    normal (DWaligned) compare loop, starting at the second double word,
    we need to adjust the length (rN) and special case the loop
-   versioning for the first DW. This insures that the loop count is
+   versioning for the first DW. This ensures that the loop count is
    correct and the first DW (shifted) is in the expected resister pair.  */
-#define rSHL	r29	/* Unaligned shift left count.  */
-#define rSHR	r28	/* Unaligned shift right count.  */
-#define rB		r27	/* Left rotation temp for rWORD2.  */
-#define rD		r26	/* Left rotation temp for rWORD4.  */
-#define rF		r25	/* Left rotation temp for rWORD6.  */
-#define rH		r24	/* Left rotation temp for rWORD8.  */
-#define rA		r0	/* Right rotation temp for rWORD2.  */
-#define rC		r12	/* Right rotation temp for rWORD4.  */
-#define rE		r0	/* Right rotation temp for rWORD6.  */
-#define rG		r12	/* Right rotation temp for rWORD8.  */
+#define rSHL		r29	/* Unaligned shift left count.  */
+#define rSHR		r28	/* Unaligned shift right count.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
 L(unaligned):
-	std	r29,-24(r1)
-	cfi_offset(r29,-24)
-	clrldi	rSHL,rSTR2,61
-	beq	cr6,L(duzeroLength)
-	std	r28,-32(r1)
-	cfi_offset(r28,-32)
-	beq	cr5,L(DWunaligned)
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
-/* Adjust the logical start of rSTR2 ro compensate for the extra bits
+	std	rSHL, -24(r1)
+	cfi_offset(rSHL, -24)
+	clrldi	rSHL, rSTR2, 61
+	beq	cr6, L(duzeroLength)
+	std	rSHR, -32(r1)
+	cfi_offset(rSHR, -32)
+	beq	cr5, L(DWunaligned)
+	std	rWORD8_SHIFT, -40(r1)
+	cfi_offset(rWORD8_SHIFT, -40)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 DW.  */
-	sub	r27,rSTR2,rBITDIF
+	sub	rWORD8_SHIFT, rSTR2, r12
 /* But do not attempt to address the DW before that DW that contains
    the actual start of rSTR2.  */
-	clrrdi	rSTR2,rSTR2,3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
+	clrrdi	rSTR2, rSTR2, 3
+	std	rWORD2_SHIFT, -48(r1)
+	cfi_offset(rWORD2_SHIFT, -48)
 /* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (DW aligned) start of rSTR1.  */
-	clrldi	rSHL,r27,61
-	clrrdi	rSTR1,rSTR1,3
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
-	sldi	rSHL,rSHL,3
-	cmpld	cr5,r27,rSTR2
-	add	rN,rN,rBITDIF
-	sldi	r11,rBITDIF,3
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
-	subfic	rSHR,rSHL,64
-	srdi	rTMP,rN,5	/* Divide by 32 */
-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
+	clrldi	rSHL, rWORD8_SHIFT, 61
+	clrrdi	rSTR1, rSTR1, 3
+	std	rWORD4_SHIFT, -56(r1)
+	cfi_offset(rWORD4_SHIFT, -56)
+	sldi	rSHL, rSHL, 3
+	cmpld	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	std	rWORD6_SHIFT, -64(r1)
+	cfi_offset(rWORD6_SHIFT, -64)
+	subfic	rSHR, rSHL, 64
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
 /* We normally need to load 2 DWs to start the unaligned rSTR2, but in
    this special case those bits may be discarded anyway.  Also we
    must avoid loading a DW where none of the bits are part of rSTR2 as
    this may cross a page boundary and cause a page fault.  */
-	li	rWORD8,0
-	blt	cr5,L(dus0)
-	ld	rWORD8,0(rSTR2)
-	la	rSTR2,8(rSTR2)
-	sld	rWORD8,rWORD8,rSHL
+	li	rWORD8, 0
+	blt	cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD8, 0(rSTR2)
+	addi	rSTR2, rSTR2, 8
+#endif
+	sld	rWORD8, rWORD8, rSHL
 
 L(dus0):
-	ld	rWORD1,0(rSTR1)
-	ld	rWORD2,0(rSTR2)
-	cmpldi	cr1,rBITDIF,16
-	cmpldi	cr7,rN,32
-	srd	rG,rWORD2,rSHR
-	clrldi	rN,rN,61
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 0(rSTR1)
+	ld	rWORD2, 0(rSTR2)
+#endif
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	srd	r12, rWORD2, rSHR
+	clrldi	rN, rN, 61
 	beq	L(duPs4)
-	mtctr	rTMP
-	or	rWORD8,rG,rWORD8
-	bgt	cr1,L(duPs3)
-	beq	cr1,L(duPs2)
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	bgt	cr1, L(duPs3)
+	beq	cr1, L(duPs2)
 
 /* Remainder is 8 */
 	.align	4
 L(dusP1):
-	sld	rB,rWORD2,rSHL
-	sld	rWORD7,rWORD1,r11
-	sld	rWORD8,rWORD8,r11
-	bge	cr7,L(duP1e)
+	sld	rWORD8_SHIFT, rWORD2, rSHL
+	sld	rWORD7, rWORD1, rWORD6
+	sld	rWORD8, rWORD8, rWORD6
+	bge	cr7, L(duP1e)
 /* At this point we exit early with the first double word compare
    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
-	cmpld	cr5,rWORD7,rWORD8
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmpld	cr7,rN,rSHR
+	cmpld	cr5, rWORD7, rWORD8
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
 	.align	4
 L(duPs2):
-	sld	rH,rWORD2,rSHL
-	sld	rWORD5,rWORD1,r11
-	sld	rWORD6,rWORD8,r11
+	sld	rWORD6_SHIFT, rWORD2, rSHL
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD8, rWORD6
 	b	L(duP2e)
 /* Remainder is 24 */
 	.align	4
 L(duPs3):
-	sld	rF,rWORD2,rSHL
-	sld	rWORD3,rWORD1,r11
-	sld	rWORD4,rWORD8,r11
+	sld	rWORD4_SHIFT, rWORD2, rSHL
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD8, rWORD6
 	b	L(duP3e)
 /* Count is a multiple of 32, remainder is 0 */
 	.align	4
 L(duPs4):
-	mtctr	rTMP
-	or	rWORD8,rG,rWORD8
-	sld	rD,rWORD2,rSHL
-	sld	rWORD1,rWORD1,r11
-	sld	rWORD2,rWORD8,r11
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD8, rWORD6
 	b	L(duP4e)
 
 /* At this point we know rSTR1 is double word aligned and the
    compare length is at least 8 bytes.  */
 	.align	4
 L(DWunaligned):
-	std	r27,-40(r1)
-	cfi_offset(r27,-40)
-	clrrdi	rSTR2,rSTR2,3
-	std	r26,-48(r1)
-	cfi_offset(r26,-48)
-	srdi	rTMP,rN,5	/* Divide by 32 */
-	std	r25,-56(r1)
-	cfi_offset(r25,-56)
-	andi.	rBITDIF,rN,24	/* Get the DW remainder */
-	std	r24,-64(r1)
-	cfi_offset(r24,-64)
-	sldi	rSHL,rSHL,3
-	ld	rWORD6,0(rSTR2)
-	ldu	rWORD8,8(rSTR2)
-	cmpldi	cr1,rBITDIF,16
-	cmpldi	cr7,rN,32
-	clrldi	rN,rN,61
-	subfic	rSHR,rSHL,64
-	sld	rH,rWORD6,rSHL
+	std	rWORD8_SHIFT, -40(r1)
+	cfi_offset(rWORD8_SHIFT, -40)
+	clrrdi	rSTR2, rSTR2, 3
+	std	rWORD2_SHIFT, -48(r1)
+	cfi_offset(rWORD2_SHIFT, -48)
+	srdi	r0, rN, 5	/* Divide by 32 */
+	std	rWORD4_SHIFT, -56(r1)
+	cfi_offset(rWORD4_SHIFT, -56)
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	std	rWORD6_SHIFT, -64(r1)
+	cfi_offset(rWORD6_SHIFT, -64)
+	sldi	rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD6, 0(rSTR2)
+	ldu	rWORD8, 8(rSTR2)
+#endif
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
+	subfic	rSHR, rSHL, 64
+	sld	rWORD6_SHIFT, rWORD6, rSHL
 	beq	L(duP4)
-	mtctr	rTMP
-	bgt	cr1,L(duP3)
-	beq	cr1,L(duP2)
+	mtctr	r0
+	bgt	cr1, L(duP3)
+	beq	cr1, L(duP2)
 
 /* Remainder is 8 */
 	.align	4
 L(duP1):
-	srd	rG,rWORD8,rSHR
-	ld	rWORD7,0(rSTR1)
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP1x)
+	srd	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
+	ld	rWORD7, 0(rSTR1)
+#endif
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP1x)
 L(duP1e):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	srd	rA,rWORD2,rSHR
-	sld	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	ld	rWORD3,16(rSTR1)
-	ld	rWORD4,16(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	srd	rC,rWORD4,rSHR
-	sld	rF,rWORD4,rSHL
-	bne	cr5,L(duLcr5)
-	or	rWORD4,rC,rD
-	ld	rWORD5,24(rSTR1)
-	ld	rWORD6,24(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	srd	rE,rWORD6,rSHR
-	sld	rH,rWORD6,rSHL
-	bne	cr0,L(duLcr0)
-	or	rWORD6,rE,rF
-	cmpld	cr6,rWORD5,rWORD6
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 16(rSTR1)
+	ld	rWORD4, 16(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	bne	cr5, L(duLcr5)
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 24(rSTR1)
+	ld	rWORD6, 24(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
+	cmpld	cr6, rWORD5, rWORD6
 	b	L(duLoop3)
 	.align	4
 /* At this point we exit early with the first double word compare
    complete and remainder of 0 to 7 bytes.  See L(du14) for details on
    how we handle the remaining bytes.  */
 L(duP1x):
-	cmpld	cr5,rWORD7,rWORD8
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmpld	cr7,rN,rSHR
+	cmpld	cr5, rWORD7, rWORD8
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
 	.align	4
 L(duP2):
-	srd	rE,rWORD8,rSHR
-	ld	rWORD5,0(rSTR1)
-	or	rWORD6,rE,rH
-	sld	rH,rWORD8,rSHL
+	srd	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
+	ld	rWORD5, 0(rSTR1)
+#endif
+	or	rWORD6, r0, rWORD6_SHIFT
+	sld	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
-	ld	rWORD7,8(rSTR1)
-	ld	rWORD8,8(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	srd	rG,rWORD8,rSHR
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP2x)
-	ld	rWORD1,16(rSTR1)
-	ld	rWORD2,16(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	srd	rA,rWORD2,rSHR
-	sld	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	ld	rWORD3,24(rSTR1)
-	ld	rWORD4,24(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	bne	cr5,L(duLcr5)
-	srd	rC,rWORD4,rSHR
-	sld	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	cmpld	cr1,rWORD3,rWORD4
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 8(rSTR1)
+	ld	rWORD8, 8(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 16(rSTR1)
+	ld	rWORD2, 16(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 24(rSTR1)
+	ld	rWORD4, 24(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr5, L(duLcr5)
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	cmpld	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
 	.align	4
 L(duP2x):
-	cmpld	cr5,rWORD7,rWORD8
-	addi	rSTR1,rSTR1,8
-	addi	rSTR2,rSTR2,8
-	bne	cr6,L(duLcr6)
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmpld	cr7,rN,rSHR
+	cmpld	cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#endif
+	bne	cr6, L(duLcr6)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Remainder is 24 */
 	.align	4
 L(duP3):
-	srd	rC,rWORD8,rSHR
-	ld	rWORD3,0(rSTR1)
-	sld	rF,rWORD8,rSHL
-	or	rWORD4,rC,rH
+	srd	r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
+	ld	rWORD3, 0(rSTR1)
+#endif
+	sld	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
-	ld	rWORD5,8(rSTR1)
-	ld	rWORD6,8(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	srd	rE,rWORD6,rSHR
-	sld	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
-	ld	rWORD7,16(rSTR1)
-	ld	rWORD8,16(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr1,L(duLcr1)
-	srd	rG,rWORD8,rSHR
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	blt	cr7,L(duP3x)
-	ld	rWORD1,24(rSTR1)
-	ld	rWORD2,24(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	srd	rA,rWORD2,rSHR
-	sld	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
-	addi	rSTR1,rSTR1,16
-	addi	rSTR2,rSTR2,16
-	cmpld	cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 8(rSTR1)
+	ld	rWORD6, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD7, 16(rSTR1)
+	ld	rWORD8, 16(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 24(rSTR1)
+	ld	rWORD2, 24(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#endif
+	cmpld	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
 	.align	4
 L(duP3x):
-	addi	rSTR1,rSTR1,16
-	addi	rSTR2,rSTR2,16
-	bne	cr1,L(duLcr1)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr6,L(duLcr6)
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
-	cmpld	cr7,rN,rSHR
+#ifndef __LITTLE_ENDIAN__
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+#endif
+#if 0
+/* Huh?  We've already branched on cr1!  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
 /* Count is a multiple of 32, remainder is 0 */
 	.align	4
 L(duP4):
-	mtctr	rTMP
-	srd	rA,rWORD8,rSHR
-	ld	rWORD1,0(rSTR1)
-	sld	rD,rWORD8,rSHL
-	or	rWORD2,rA,rH
+	mtctr	r0
+	srd	r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	addi	rSTR1, rSTR1, 8
+#else
+	ld	rWORD1, 0(rSTR1)
+#endif
+	sld	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
-	ld	rWORD3,8(rSTR1)
-	ld	rWORD4,8(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	srd	rC,rWORD4,rSHR
-	sld	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
-	ld	rWORD5,16(rSTR1)
-	ld	rWORD6,16(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr0,L(duLcr0)
-	srd	rE,rWORD6,rSHR
-	sld	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
-	ldu	rWORD7,24(rSTR1)
-	ldu	rWORD8,24(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr1,L(duLcr1)
-	srd	rG,rWORD8,rSHR
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
-	cmpld	cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 8(rSTR1)
+	ld	rWORD4, 8(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 16(rSTR1)
+	ld	rWORD6, 16(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 24(rSTR1)
+	ldu	rWORD8, 24(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	cmpld	cr5, rWORD7, rWORD8
 	bdz	L(du24)		/* Adjust CTR as we start with +4 */
 /* This is the primary loop */
 	.align	4
 L(duLoop):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD2,8(rSTR2)
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr6,L(duLcr6)
-	srd	rA,rWORD2,rSHR
-	sld	rD,rWORD2,rSHL
-	or	rWORD2,rA,rB
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD1, 8(rSTR1)
+	ld	rWORD2, 8(rSTR2)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
-	ld	rWORD3,16(rSTR1)
-	ld	rWORD4,16(rSTR2)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(duLcr5)
-	srd	rC,rWORD4,rSHR
-	sld	rF,rWORD4,rSHL
-	or	rWORD4,rC,rD
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD3, 0, rSTR1
+	ldbrx	rWORD4, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD3, 16(rSTR1)
+	ld	rWORD4, 16(rSTR2)
+#endif
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
-	ld	rWORD5,24(rSTR1)
-	ld	rWORD6,24(rSTR2)
-	cmpld	cr5,rWORD7,rWORD8
-	bne	cr0,L(duLcr0)
-	srd	rE,rWORD6,rSHR
-	sld	rH,rWORD6,rSHL
-	or	rWORD6,rE,rF
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD5, 0, rSTR1
+	ldbrx	rWORD6, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD5, 24(rSTR1)
+	ld	rWORD6, 24(rSTR2)
+#endif
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
-	ldu	rWORD7,32(rSTR1)
-	ldu	rWORD8,32(rSTR2)
-	cmpld	cr0,rWORD1,rWORD2
-	bne-	cr1,L(duLcr1)
-	srd	rG,rWORD8,rSHR
-	sld	rB,rWORD8,rSHL
-	or	rWORD8,rG,rH
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD7, 0, rSTR1
+	ldbrx	rWORD8, 0, rSTR2
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+#else
+	ldu	rWORD7, 32(rSTR1)
+	ldu	rWORD8, 32(rSTR2)
+#endif
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
 	bdnz	L(duLoop)
 
 L(duL4):
-	bne	cr1,L(duLcr1)
-	cmpld	cr1,rWORD3,rWORD4
-	bne	cr6,L(duLcr6)
-	cmpld	cr6,rWORD5,rWORD6
-	bne	cr5,L(duLcr5)
-	cmpld	cr5,rWORD7,rWORD8
+#if 0
+/* Huh?  We've already branched on cr1!  */
+	bne	cr1, L(duLcr1)
+#endif
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	cmpld	cr5, rWORD7, rWORD8
 L(du44):
-	bne	cr0,L(duLcr0)
+	bne	cr7, L(duLcr7)
 L(du34):
-	bne	cr1,L(duLcr1)
+	bne	cr1, L(duLcr1)
 L(du24):
-	bne	cr6,L(duLcr6)
+	bne	cr6, L(duLcr6)
 L(du14):
-	sldi.	rN,rN,3
-	bne	cr5,L(duLcr5)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  We use
    shift right double to eliminate bits beyond the compare length.
-   This allows the use of double word subtract to compute the final
-   result.
 
    However it may not be safe to load rWORD2 which may be beyond the
    string length. So we compare the bit length of the remainder to
    the right shift count (rSHR). If the bit count is less than or equal
    we do not need to load rWORD2 (all significant bits are already in
-   rB).  */
-	cmpld	cr7,rN,rSHR
+   rWORD8_SHIFT).  */
+	cmpld	cr7, rN, rSHR
 	beq	L(duZeroReturn)
-	li	rA,0
-	ble	cr7,L(dutrim)
-	ld	rWORD2,8(rSTR2)
-	srd	rA,rWORD2,rSHR
+	li	r0, 0
+	ble	cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD2, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+#else
+	ld	rWORD2, 8(rSTR2)
+#endif
+	srd	r0, rWORD2, rSHR
 	.align	4
 L(dutrim):
-	ld	rWORD1,8(rSTR1)
-	ld	rWORD8,-8(r1)
-	subfic	rN,rN,64	/* Shift count is 64 - (rN * 8).  */
-	or	rWORD2,rA,rB
-	ld	rWORD7,-16(r1)
-	ld	r29,-24(r1)
-	srd	rWORD1,rWORD1,rN
-	srd	rWORD2,rWORD2,rN
-	ld	r28,-32(r1)
-	ld	r27,-40(r1)
-	li	rRTN,0
-	cmpld	cr0,rWORD1,rWORD2
-	ld	r26,-48(r1)
-	ld	r25,-56(r1)
-	beq	cr0,L(dureturn24)
-	li	rRTN,1
-	ld	r24,-64(r1)
-	bgtlr	cr0
-	li	rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+	ldbrx	rWORD1, 0, rSTR1
+#else
+	ld	rWORD1, 8(rSTR1)
+#endif
+	ld	rWORD8, -8(r1)
+	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
+	or	rWORD2, r0, rWORD8_SHIFT
+	ld	rWORD7, -16(r1)
+	ld	rSHL, -24(r1)
+	srd	rWORD1, rWORD1, rN
+	srd	rWORD2, rWORD2, rN
+	ld	rSHR, -32(r1)
+	ld	rWORD8_SHIFT, -40(r1)
+	li	rRTN, 0
+	cmpld	cr7, rWORD1, rWORD2
+	ld	rWORD2_SHIFT, -48(r1)
+	ld	rWORD4_SHIFT, -56(r1)
+	beq	cr7, L(dureturn24)
+	li	rRTN, 1
+	ld	rWORD6_SHIFT, -64(r1)
+	bgtlr	cr7
+	li	rRTN, -1
 	blr
 	.align	4
-L(duLcr0):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgt	cr0,L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN,-1
+L(duLcr7):
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	li	rRTN, 1
+	bgt	cr7, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr1):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgt	cr1,L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN,-1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	li	rRTN, 1
+	bgt	cr1, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr6):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgt	cr6,L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN,-1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	li	rRTN, 1
+	bgt	cr6, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr5):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
-	li	rRTN,1
-	bgt	cr5,L(dureturn29)
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
-	li	rRTN,-1
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
+	li	rRTN, 1
+	bgt	cr5, L(dureturn29)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
+	li	rRTN, -1
 	b	L(dureturn27)
 	.align	3
 L(duZeroReturn):
-	li	rRTN,0
+	li	rRTN, 0
 	.align	4
 L(dureturn):
-	ld	rWORD8,-8(r1)
-	ld	rWORD7,-16(r1)
+	ld	rWORD8, -8(r1)
+	ld	rWORD7, -16(r1)
 L(dureturn29):
-	ld	r29,-24(r1)
-	ld	r28,-32(r1)
+	ld	rSHL, -24(r1)
+	ld	rSHR, -32(r1)
 L(dureturn27):
-	ld	r27,-40(r1)
+	ld	rWORD8_SHIFT, -40(r1)
 L(dureturn26):
-	ld	r26,-48(r1)
+	ld	rWORD2_SHIFT, -48(r1)
 L(dureturn25):
-	ld	r25,-56(r1)
+	ld	rWORD4_SHIFT, -56(r1)
 L(dureturn24):
-	ld	r24,-64(r1)
+	ld	rWORD6_SHIFT, -64(r1)
 	blr
 L(duzeroLength):
-	li	rRTN,0
+	li	rRTN, 0
 	blr
 
 END (memcmp)
 libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp,bcmp)
+weak_alias (memcmp, bcmp)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=509b04bf435e5411bb89f95f8d4df673c3ffcf66

commit 509b04bf435e5411bb89f95f8d4df673c3ffcf66
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:46:05 2013 +0930

    PowerPC LE strchr
    http://sourceware.org/ml/libc-alpha/2013-08/msg00101.html
    
    Adds little-endian support to optimised strchr assembly.  I've also
    tweaked the big-endian code a little.  In power7/strchr.S there's a
    check in the tail of the function that we didn't match 0 before
    finding a c match, done by comparing leading zero counts.  It's just
    as valid, and quicker, to compare the raw output from cmpb.
    
    Another little tweak is to use rldimi/insrdi in place of rlwimi for
    the power7 strchr functions.  Since rlwimi is cracked, it is a few
    cycles slower.  rldimi can be used on the 32-bit power7 functions
    too.
    
    	* sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian
    	support.  Correct typos, formatting.  Optimize tail.  Use insrdi
    	rather than rlwimi.
    	* sysdeps/powerpc/powerpc32/power7/strchr.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power7/strchrnul.S (__strchrnul): Add
    	little-endian support.  Correct typos.
    	* sysdeps/powerpc/powerpc32/power7/strchrnul.S: Likewise.  Use insrdi
    	rather than rlwimi.
    	* sysdeps/powerpc/powerpc64/strchr.S (rTMP4, rTMP5): Define.  Use
    	in loop and entry code to keep "and." results.
    	(strchr): Add little-endian support.  Comment.  Move cntlzd
    	earlier in tail.
    	* sysdeps/powerpc/powerpc32/strchr.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 9ca77fe..37a85c2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,21 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc64/power7/strchr.S (strchr): Add little-endian
+	support.  Correct typos, formatting.  Optimize tail.  Use insrdi
+	rather than rlwimi.
+	* sysdeps/powerpc/powerpc32/power7/strchr.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/strchrnul.S (__strchrnul): Add
+	little-endian support.  Correct typos.
+	* sysdeps/powerpc/powerpc32/power7/strchrnul.S: Likewise.  Use insrdi
+	rather than rlwimi.
+	* sysdeps/powerpc/powerpc64/strchr.S (rTMP4, rTMP5): Define.  Use
+	in loop and entry code to keep "and." results.
+	(strchr): Add little-endian support.  Comment.  Move cntlzd
+	earlier in tail.
+	* sysdeps/powerpc/powerpc32/strchr.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc64/strcpy.S: Add little-endian support:
 	* sysdeps/powerpc/powerpc32/strcpy.S: Likewise.
 	* sysdeps/powerpc/powerpc64/stpcpy.S: Likewise.
diff --git a/sysdeps/powerpc/powerpc32/power7/strchr.S b/sysdeps/powerpc/powerpc32/power7/strchr.S
index 0ecadb2..b662659 100644
--- a/sysdeps/powerpc/powerpc32/power7/strchr.S
+++ b/sysdeps/powerpc/powerpc32/power7/strchr.S
@@ -35,8 +35,8 @@ ENTRY (strchr)
 	beq	cr7,L(null_match)
 
 	/* Replicate byte to word.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 
 	/* Now r4 has a word of c bytes and r0 has
 	   a word of null bytes.  */
@@ -46,11 +46,17 @@ ENTRY (strchr)
 
 	/* Move the words left and right to discard the bits that are
 	   not part of the string and to bring them back as zeros.  */
-
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	srw	r11,r11,r6
+	slw	r10,r10,r6
+	slw	r11,r11,r6
+#else
 	slw	r10,r10,r6
 	slw	r11,r11,r6
 	srw	r10,r10,r6
 	srw	r11,r11,r6
+#endif
 	or	r5,r10,r11    /* OR the results to speed things up.  */
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
 				 have been found.  */
@@ -65,7 +71,7 @@ ENTRY (strchr)
 
 	/* Handle WORD2 of pair.  */
 	lwzu	r12,4(r8)
-	cmpb    r10,r12,r4
+	cmpb	r10,r12,r4
 	cmpb	r11,r12,r0
 	or	r5,r10,r11
 	cmpwi	cr7,r5,0
@@ -100,22 +106,31 @@ L(loop):
 	bne	cr6,L(done)
 
 	/* The c/null byte must be in the second word.  Adjust the address
-	   again and move the result of cmpb to r10 so we can calculate the
-	   pointer.  */
+	   again and move the result of cmpb to r10/r11 so we can calculate
+	   the pointer.  */
 
 	mr	r10,r6
 	mr	r11,r7
 	addi	r8,r8,4
 
-	/* r5 has the output of the cmpb instruction, that is, it contains
+	/* r10/r11 have the output of the cmpb instructions, that is,
 	   0xff in the same position as the c/null byte in the original
 	   word from the string.  Use that to calculate the pointer.  */
 L(done):
-	cntlzw	r4,r10	      /* Count leading zeroes before c matches.  */
-	cntlzw	r0,r11	      /* Count leading zeroes before null matches.  */
-	cmplw	cr7,r4,r0
+#ifdef __LITTLE_ENDIAN__
+	addi    r3,r10,-1
+	andc    r3,r3,r10
+	popcntw	r0,r3
+	addi    r4,r11,-1
+	andc    r4,r4,r11
+	cmplw	cr7,r3,r4
+	bgt	cr7,L(no_match)
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before c matches.  */
+	cmplw	cr7,r11,r10
 	bgt	cr7,L(no_match)
-	srwi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
+#endif
+	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching c byte
 				 or null in case c was not found.  */
 	blr
@@ -133,10 +148,14 @@ L(null_match):
 	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
 
 	/* Move the words left and right to discard the bits that are
-	   not part of the string and to bring them back as zeros.  */
-
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r5,r5,r6
+	slw	r5,r5,r6
+#else
 	slw	r5,r5,r6
 	srw	r5,r5,r6
+#endif
 	cmpwi	cr7,r5,0      /* If r10 == 0, no c or null bytes
 				 have been found.  */
 	bne	cr7,L(done_null)
@@ -191,7 +210,13 @@ L(loop_null):
 	   0xff in the same position as the null byte in the original
 	   word from the string.  Use that to calculate the pointer.  */
 L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntw	r0,r0
+#else
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
+#endif
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
 	blr
diff --git a/sysdeps/powerpc/powerpc32/power7/strchrnul.S b/sysdeps/powerpc/powerpc32/power7/strchrnul.S
index d4cacab..f5d24d4 100644
--- a/sysdeps/powerpc/powerpc32/power7/strchrnul.S
+++ b/sysdeps/powerpc/powerpc32/power7/strchrnul.S
@@ -27,8 +27,8 @@ ENTRY (__strchrnul)
 	clrrwi	r8,r3,2	      /* Align the address to word boundary.  */
 
 	/* Replicate byte to word.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
@@ -43,10 +43,17 @@ ENTRY (__strchrnul)
 
 	/* Move the words left and right to discard the bits that are
 	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	srw	r9,r9,r6
+	slw	r10,r10,r6
+	slw	r9,r9,r6
+#else
 	slw	r10,r10,r6
 	slw	r9,r9,r6
 	srw	r10,r10,r6
 	srw	r9,r9,r6
+#endif
 	or	r5,r9,r10     /* OR the results to speed things up.  */
 	cmpwi	cr7,r5,0      /* If r5 == 0, no c or null bytes
 				 have been found.  */
@@ -54,7 +61,7 @@ ENTRY (__strchrnul)
 
 	mtcrf   0x01,r8
 
-	/* Are we now aligned to a quadword boundary?  If so, skip to
+	/* Are we now aligned to a doubleword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
 
 	bt	29,L(loop)
@@ -76,7 +83,7 @@ L(loop):
 	   single register for speed.  This is an attempt
 	   to speed up the null-checking process for bigger strings.  */
 	lwz	r12,4(r8)
-	lwzu     r11,8(r8)
+	lwzu	r11,8(r8)
 	cmpb	r10,r12,r0
 	cmpb	r9,r12,r4
 	cmpb	r6,r11,r0
@@ -95,9 +102,9 @@ L(loop):
 	addi	r8,r8,-4
 	bne	cr6,L(done)
 
-	/* The c/null byte must be in the second word.  Adjust the
-	   address again and move the result of cmpb to r10 so we can calculate
-	   the pointer.  */
+	/* The c/null byte must be in the second word.  Adjust the address
+	   again and move the result of cmpb to r5 so we can calculate the
+	   pointer.  */
 	mr	r5,r10
 	addi	r8,r8,4
 
@@ -105,7 +112,13 @@ L(loop):
 	   0xff in the same position as the c/null byte in the original
 	   word from the string.  Use that to calculate the pointer.  */
 L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntw	r0,r0
+#else
 	cntlzw	r0,r5	      /* Count leading zeros before the match.  */
+#endif
 	srwi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
 	blr
diff --git a/sysdeps/powerpc/powerpc32/strchr.S b/sysdeps/powerpc/powerpc32/strchr.S
index c9952ee..6050565 100644
--- a/sysdeps/powerpc/powerpc32/strchr.S
+++ b/sysdeps/powerpc/powerpc32/strchr.S
@@ -36,6 +36,8 @@ ENTRY (strchr)
 #define rIGN	r10	/* number of bits we should ignore in the first word */
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
 #define rTMP3	r12
+#define rTMP4	rIGN
+#define rTMP5	rMASK
 
 
 	rlwimi	rCHR, rCHR, 8, 16, 23
@@ -49,64 +51,93 @@ ENTRY (strchr)
 	addi	r7F7F, r7F7F, 0x7f7f
 /* Test the first (partial?) word.  */
 	lwz	rWORD, 0(rSTR)
+#ifdef __LITTLE_ENDIAN__
+	slw	rMASK, rMASK, rIGN
+#else
 	srw	rMASK, rMASK, rIGN
+#endif
 	orc	rWORD, rWORD, rMASK
 	add	rTMP1, rFEFE, rWORD
 	nor	rTMP2, r7F7F, rWORD
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2
 	xor	rTMP3, rCHR, rWORD
 	orc	rTMP3, rTMP3, rMASK
 	b	L(loopentry)
 
 /* The loop.  */
 
-L(loop):lwzu rWORD, 4(rSTR)
-	and.	rTMP1, rTMP1, rTMP2
+L(loop):
+	lwzu	rWORD, 4(rSTR)
+	and.	rTMP5, rTMP1, rTMP2
 /* Test for 0.	*/
-	add	rTMP1, rFEFE, rWORD
-	nor	rTMP2, r7F7F, rWORD
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
 	bne	L(foundit)
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
 /* Start test for the bytes we're looking for.  */
 	xor	rTMP3, rCHR, rWORD
 L(loopentry):
 	add	rTMP1, rFEFE, rTMP3
 	nor	rTMP2, r7F7F, rTMP3
 	beq	L(loop)
+
 /* There is a zero byte in the word, but may also be a matching byte (either
    before or after the zero byte).  In fact, we may be looking for a
-   zero byte, in which case we return a match.  We guess that this hasn't
-   happened, though.  */
-L(missed):
-	and.	rTMP1, rTMP1, rTMP2
+   zero byte, in which case we return a match.  */
+	and.	rTMP5, rTMP1, rTMP2
 	li	rRTN, 0
 	beqlr
-/* It did happen. Decide which one was first...
-   I'm not sure if this is actually faster than a sequence of
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
+/* At this point:
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
+   But there may be false matches in the next most significant byte from
+   a true match due to carries.  This means we need to recalculate the
+   matches using a longer method for big-endian.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzw	rCLZB, rTMP1
+	addi	rTMP2, rTMP4, -1
+	andc	rTMP2, rTMP2, rTMP4
+	cmplw	rTMP1, rTMP2
+	bgtlr
+	subfic	rCLZB, rCLZB, 32-7
+#else
+/* I think we could reduce this by two instructions by keeping the "nor"
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
+   one instruction could be pruned from L(foundit).  */
 	and	rFEFE, r7F7F, rWORD
-	or	rMASK, r7F7F, rWORD
+	or	rTMP5, r7F7F, rWORD
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rFEFE, rFEFE, r7F7F
 	add	rTMP1, rTMP1, r7F7F
-	nor	rWORD, rMASK, rFEFE
-	nor	rTMP2, rIGN, rTMP1
+	nor	rWORD, rTMP5, rFEFE
+	nor	rTMP2, rTMP4, rTMP1
+	cntlzw	rCLZB, rTMP2
 	cmplw	rWORD, rTMP2
 	bgtlr
-	cntlzw	rCLZB, rTMP2
+#endif
 	srwi	rCLZB, rCLZB, 3
 	add	rRTN, rSTR, rCLZB
 	blr
 
 L(foundit):
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzw	rCLZB, rTMP1
+	subfic	rCLZB, rCLZB, 32-7-32
+	srawi	rCLZB, rCLZB, 3
+#else
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rTMP1, rTMP1, r7F7F
-	nor	rTMP2, rIGN, rTMP1
+	nor	rTMP2, rTMP4, rTMP1
 	cntlzw	rCLZB, rTMP2
 	subi	rSTR, rSTR, 4
 	srwi	rCLZB, rCLZB, 3
+#endif
 	add	rRTN, rSTR, rCLZB
 	blr
 END (strchr)
diff --git a/sysdeps/powerpc/powerpc64/power7/strchr.S b/sysdeps/powerpc/powerpc64/power7/strchr.S
index 3ffe7a1..4679a15 100644
--- a/sysdeps/powerpc/powerpc64/power7/strchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/strchr.S
@@ -35,8 +35,8 @@ ENTRY (strchr)
 	beq	cr7,L(null_match)
 
 	/* Replicate byte to doubleword.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 	insrdi  r4,r4,32,0
 
 	/* Now r4 has a doubleword of c bytes and r0 has
@@ -47,11 +47,17 @@ ENTRY (strchr)
 
 	/* Move the doublewords left and right to discard the bits that are
 	   not part of the string and bring them back as zeros.  */
-
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+#else
 	sld	r10,r10,r6
 	sld	r11,r11,r6
 	srd	r10,r10,r6
 	srd	r11,r11,r6
+#endif
 	or	r5,r10,r11    /* OR the results to speed things up.  */
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
 				 have been found.  */
@@ -108,15 +114,24 @@ L(loop):
 	mr	r11,r7
 	addi	r8,r8,8
 
-	/* r5 has the output of the cmpb instruction, that is, it contains
+	/* r10/r11 have the output of the cmpb instructions, that is,
 	   0xff in the same position as the c/null byte in the original
 	   doubleword from the string.  Use that to calculate the pointer.  */
 L(done):
-	cntlzd	r4,r10	      /* Count leading zeroes before c matches.  */
-	cntlzd	r0,r11	      /* Count leading zeroes before null matches.  */
-	cmpld	cr7,r4,r0
+#ifdef __LITTLE_ENDIAN__
+	addi    r3,r10,-1
+	andc    r3,r3,r10
+	popcntd	r0,r3
+	addi    r4,r11,-1
+	andc    r4,r4,r11
+	cmpld	cr7,r3,r4
 	bgt	cr7,L(no_match)
-	srdi	r0,r4,3	      /* Convert leading zeroes to bytes.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before c matches.  */
+	cmpld	cr7,r11,r10
+	bgt	cr7,L(no_match)
+#endif
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching c byte
 				 or null in case c was not found.  */
 	blr
@@ -135,9 +150,13 @@ L(null_match):
 
 	/* Move the doublewords left and right to discard the bits that are
 	   not part of the string and bring them back as zeros.  */
-
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
 	sld	r5,r5,r6
 	srd	r5,r5,r6
+#endif
 	cmpdi	cr7,r5,0      /* If r10 == 0, no c or null bytes
 				 have been found.  */
 	bne	cr7,L(done_null)
@@ -192,7 +211,13 @@ L(loop_null):
 	   0xff in the same position as the null byte in the original
 	   doubleword from the string.  Use that to calculate the pointer.  */
 L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0
+#else
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of the matching null byte.  */
 	blr
diff --git a/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/sysdeps/powerpc/powerpc64/power7/strchrnul.S
index 9dbc51b..df45752 100644
--- a/sysdeps/powerpc/powerpc64/power7/strchrnul.S
+++ b/sysdeps/powerpc/powerpc64/power7/strchrnul.S
@@ -27,8 +27,8 @@ ENTRY (__strchrnul)
 	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
 
 	/* Replicate byte to doubleword.  */
-	rlwimi	r4,r4,8,16,23
-	rlwimi	r4,r4,16,0,15
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
 	insrdi	r4,r4,32,0
 
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
@@ -44,10 +44,17 @@ ENTRY (__strchrnul)
 
 	/* Move the doublewords left and right to discard the bits that are
 	   not part of the string and to bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r9,r9,r6
+	sld	r10,r10,r6
+	sld	r9,r9,r6
+#else
 	sld	r10,r10,r6
 	sld	r9,r9,r6
 	srd	r10,r10,r6
 	srd	r9,r9,r6
+#endif
 	or	r5,r9,r10     /* OR the results to speed things up.  */
 	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
 				 have been found.  */
@@ -97,7 +104,7 @@ L(loop):
 	bne	cr6,L(done)
 
 	/* The c/null byte must be in the second doubleword.  Adjust the
-	   address again and move the result of cmpb to r10 so we can calculate
+	   address again and move the result of cmpb to r5 so we can calculate
 	   the pointer.  */
 	mr	r5,r10
 	addi	r8,r8,8
@@ -106,7 +113,13 @@ L(loop):
 	   0xff in the same position as the c/null byte in the original
 	   doubleword from the string.  Use that to calculate the pointer.  */
 L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0
+#else
 	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
 	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
 	add	r3,r8,r0      /* Return address of matching c/null byte.  */
 	blr
diff --git a/sysdeps/powerpc/powerpc64/strchr.S b/sysdeps/powerpc/powerpc64/strchr.S
index d2d8cd3..da707ae 100644
--- a/sysdeps/powerpc/powerpc64/strchr.S
+++ b/sysdeps/powerpc/powerpc64/strchr.S
@@ -37,11 +37,13 @@ ENTRY (strchr)
 #define rIGN	r10	/* number of bits we should ignore in the first word */
 #define rMASK	r11	/* mask with the bits to ignore set to 0 */
 #define rTMP3	r12
+#define rTMP4	rIGN
+#define rTMP5	rMASK
 
 	dcbt	0,rRTN
-	rlwimi	rCHR, rCHR, 8, 16, 23
+	insrdi	rCHR, rCHR, 8, 48
 	li	rMASK, -1
-	rlwimi	rCHR, rCHR, 16, 0, 15
+	insrdi	rCHR, rCHR, 16, 32
 	rlwinm	rIGN, rRTN, 3, 26, 28
 	insrdi	rCHR, rCHR, 32, 0
 	lis	rFEFE, -0x101
@@ -54,64 +56,93 @@ ENTRY (strchr)
 	add	rFEFE, rFEFE, rTMP1
 /* Test the first (partial?) word.  */
 	ld	rWORD, 0(rSTR)
+#ifdef __LITTLE_ENDIAN__
+	sld	rMASK, rMASK, rIGN
+#else
 	srd	rMASK, rMASK, rIGN
+#endif
 	orc	rWORD, rWORD, rMASK
 	add	rTMP1, rFEFE, rWORD
 	nor	rTMP2, r7F7F, rWORD
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2
 	xor	rTMP3, rCHR, rWORD
 	orc	rTMP3, rTMP3, rMASK
 	b	L(loopentry)
 
 /* The loop.  */
 
-L(loop):ldu rWORD, 8(rSTR)
-	and.	rTMP1, rTMP1, rTMP2
+L(loop):
+	ldu	rWORD, 8(rSTR)
+	and.	rTMP5, rTMP1, rTMP2
 /* Test for 0.	*/
-	add	rTMP1, rFEFE, rWORD
-	nor	rTMP2, r7F7F, rWORD
+	add	rTMP1, rFEFE, rWORD /* x - 0x01010101.  */
+	nor	rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080.  */
 	bne	L(foundit)
-	and.	rTMP1, rTMP1, rTMP2
+	and.	rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080.  */
 /* Start test for the bytes we're looking for.  */
 	xor	rTMP3, rCHR, rWORD
 L(loopentry):
 	add	rTMP1, rFEFE, rTMP3
 	nor	rTMP2, r7F7F, rTMP3
 	beq	L(loop)
+
 /* There is a zero byte in the word, but may also be a matching byte (either
    before or after the zero byte).  In fact, we may be looking for a
-   zero byte, in which case we return a match.  We guess that this hasn't
-   happened, though.  */
-L(missed):
-	and.	rTMP1, rTMP1, rTMP2
+   zero byte, in which case we return a match.  */
+	and.	rTMP5, rTMP1, rTMP2
 	li	rRTN, 0
 	beqlr
-/* It did happen. Decide which one was first...
-   I'm not sure if this is actually faster than a sequence of
-   rotates, compares, and branches (we use it anyway because it's shorter).  */
+/* At this point:
+   rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
+   rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
+   But there may be false matches in the next most significant byte from
+   a true match due to carries.  This means we need to recalculate the
+   matches using a longer method for big-endian.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzd	rCLZB, rTMP1
+	addi	rTMP2, rTMP4, -1
+	andc	rTMP2, rTMP2, rTMP4
+	cmpld	rTMP1, rTMP2
+	bgtlr
+	subfic	rCLZB, rCLZB, 64-7
+#else
+/* I think we could reduce this by two instructions by keeping the "nor"
+   results from the loop for reuse here.  See strlen.S tail.  Similarly
+   one instruction could be pruned from L(foundit).  */
 	and	rFEFE, r7F7F, rWORD
-	or	rMASK, r7F7F, rWORD
+	or	rTMP5, r7F7F, rWORD
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rFEFE, rFEFE, r7F7F
 	add	rTMP1, rTMP1, r7F7F
-	nor	rWORD, rMASK, rFEFE
-	nor	rTMP2, rIGN, rTMP1
+	nor	rWORD, rTMP5, rFEFE
+	nor	rTMP2, rTMP4, rTMP1
+	cntlzd	rCLZB, rTMP2
 	cmpld	rWORD, rTMP2
 	bgtlr
-	cntlzd	rCLZB, rTMP2
+#endif
 	srdi	rCLZB, rCLZB, 3
 	add	rRTN, rSTR, rCLZB
 	blr
 
 L(foundit):
+#ifdef __LITTLE_ENDIAN__
+	addi	rTMP1, rTMP5, -1
+	andc	rTMP1, rTMP1, rTMP5
+	cntlzd	rCLZB, rTMP1
+	subfic	rCLZB, rCLZB, 64-7-64
+	sradi	rCLZB, rCLZB, 3
+#else
 	and	rTMP1, r7F7F, rTMP3
-	or	rIGN, r7F7F, rTMP3
+	or	rTMP4, r7F7F, rTMP3
 	add	rTMP1, rTMP1, r7F7F
-	nor	rTMP2, rIGN, rTMP1
+	nor	rTMP2, rTMP4, rTMP1
 	cntlzd	rCLZB, rTMP2
 	subi	rSTR, rSTR, 8
 	srdi	rCLZB, rCLZB, 3
+#endif
 	add	rRTN, rSTR, rCLZB
 	blr
 END (strchr)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f7721c628c1d9d3c92cb4a9f7695be07a92c9293

commit f7721c628c1d9d3c92cb4a9f7695be07a92c9293
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:45:31 2013 +0930

    PowerPC LE strcpy
    http://sourceware.org/ml/libc-alpha/2013-08/msg00100.html
    
    The strcpy changes for little-endian are quite straightforward, just
    a matter of rotating the last word differently.
    
    I'll note that the powerpc64 version of stpcpy is just begging to be
    converted to use 64-bit loads and stores.
    
    	* sysdeps/powerpc/powerpc64/strcpy.S: Add little-endian support.
    	* sysdeps/powerpc/powerpc32/strcpy.S: Likewise.
    	* sysdeps/powerpc/powerpc64/stpcpy.S: Likewise.
    	* sysdeps/powerpc/powerpc32/stpcpy.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 0315ee0..9ca77fe 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc64/strcpy.S: Add little-endian support:
+	* sysdeps/powerpc/powerpc32/strcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/stpcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc32/stpcpy.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc64/strcmp.S (rTMP2): Define as r0.
 	(rTMP): Define as r11.
 	(strcmp): Add little-endian support.  Optimise tail.
diff --git a/sysdeps/powerpc/powerpc32/stpcpy.S b/sysdeps/powerpc/powerpc32/stpcpy.S
index 03c6ddd..7e106e0 100644
--- a/sysdeps/powerpc/powerpc32/stpcpy.S
+++ b/sysdeps/powerpc/powerpc32/stpcpy.S
@@ -62,7 +62,22 @@ L(g2):	add	rTMP, rFEFE, rWORD
 
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stbu	rALT, 4(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rDEST)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 1(rDEST)
+	blr
+#else
+	rlwinm.	rTMP, rALT, 8, 24, 31
 	stbu	rTMP, 4(rDEST)
 	beqlr-
 	rlwinm.	rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
 	beqlr-
 	stbu	rALT, 1(rDEST)
 	blr
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc32/strcpy.S b/sysdeps/powerpc/powerpc32/strcpy.S
index 4ae577d..e938cc4 100644
--- a/sysdeps/powerpc/powerpc32/strcpy.S
+++ b/sysdeps/powerpc/powerpc32/strcpy.S
@@ -62,7 +62,22 @@ L(g2):	add	rTMP, rFEFE, rWORD
 
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stb	rALT, 4(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stb	rTMP, 5(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stb	rTMP, 6(rDEST)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stb	rTMP, 7(rDEST)
+	blr
+#else
+	rlwinm.	rTMP, rALT, 8, 24, 31
 	stb	rTMP, 4(rDEST)
 	beqlr-
 	rlwinm.	rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
 	beqlr-
 	stb	rALT, 7(rDEST)
 	blr
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/stpcpy.S b/sysdeps/powerpc/powerpc64/stpcpy.S
index 070cd46..c0b3972 100644
--- a/sysdeps/powerpc/powerpc64/stpcpy.S
+++ b/sysdeps/powerpc/powerpc64/stpcpy.S
@@ -62,7 +62,22 @@ L(g2):	add	rTMP, rFEFE, rWORD
 
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+	rlwinm.	rTMP, rALT, 0, 24, 31
+	stbu	rALT, 4(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 24, 24, 31
+	stbu	rTMP, 1(rDEST)
+	beqlr-
+	rlwinm.	rTMP, rALT, 16, 24, 31
+	stbu	rTMP, 1(rDEST)
+	beqlr-
+	rlwinm	rTMP, rALT, 8, 24, 31
+	stbu	rTMP, 1(rDEST)
+	blr
+#else
+	rlwinm.	rTMP, rALT, 8, 24, 31
 	stbu	rTMP, 4(rDEST)
 	beqlr-
 	rlwinm.	rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1):	rlwinm.	rTMP, rALT, 8, 24, 31
 	beqlr-
 	stbu	rALT, 1(rDEST)
 	blr
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/strcpy.S b/sysdeps/powerpc/powerpc64/strcpy.S
index 4c6fd3f..a7fd85b 100644
--- a/sysdeps/powerpc/powerpc64/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/strcpy.S
@@ -68,6 +68,32 @@ L(g2):	add	rTMP, rFEFE, rWORD
 	mr	rALT, rWORD
 /* We've hit the end of the string.  Do the rest byte-by-byte.  */
 L(g1):
+#ifdef __LITTLE_ENDIAN__
+	extrdi.	rTMP, rALT, 8, 56
+	stb	rALT, 8(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 48
+	stb	rTMP, 9(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 40
+	stb	rTMP, 10(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 32
+	stb	rTMP, 11(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 24
+	stb	rTMP, 12(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 16
+	stb	rTMP, 13(rDEST)
+	beqlr-
+	extrdi.	rTMP, rALT, 8, 8
+	stb	rTMP, 14(rDEST)
+	beqlr-
+	extrdi	rTMP, rALT, 8, 0
+	stb	rTMP, 15(rDEST)
+	blr
+#else
 	extrdi.	rTMP, rALT, 8, 0
 	stb	rTMP, 8(rDEST)
 	beqlr-
@@ -91,6 +117,7 @@ L(g1):
 	beqlr-
 	stb	rALT, 15(rDEST)
 	blr
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte copy.  */
 	.align 4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=84f850520baed35f83fdce3ac9ca00929ba40e0d

commit 84f850520baed35f83fdce3ac9ca00929ba40e0d
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:41:17 2013 +0930

    PowerPC LE strcmp and strncmp
    http://sourceware.org/ml/libc-alpha/2013-08/msg00099.html
    
    More little-endian support.  I leave the main strcmp loops unchanged
    (well, except for renumbering rTMP to something other than r0 since
    it's needed in an addi insn) and modify the tail for little-endian.
    
    I noticed some of the big-endian tail code was a little untidy, so I
    have cleaned that up too.
    
    	* sysdeps/powerpc/powerpc64/strcmp.S (rTMP2): Define as r0.
    	(rTMP): Define as r11.
    	(strcmp): Add little-endian support.  Optimise tail.
    	* sysdeps/powerpc/powerpc32/strcmp.S: Similarly.
    	* sysdeps/powerpc/powerpc64/strncmp.S: Likewise.
    	* sysdeps/powerpc/powerpc32/strncmp.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power4/strncmp.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power4/strncmp.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power7/strncmp.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power7/strncmp.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 811e852..0315ee0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc64/strcmp.S (rTMP2): Define as r0.
+	(rTMP): Define as r11.
+	(strcmp): Add little-endian support.  Optimise tail.
+	* sysdeps/powerpc/powerpc32/strcmp.S: Similarly.
+	* sysdeps/powerpc/powerpc64/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/strncmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/strncmp.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc64/power7/strnlen.S (strnlen): Add
 	little-endian support.  Remove unnecessary "are we done" tests.
 	Handle "s" wrapping around zero and extremely large "size".
diff --git a/sysdeps/powerpc/powerpc32/power4/strncmp.S b/sysdeps/powerpc/powerpc32/power4/strncmp.S
index 724d908..89b961e 100644
--- a/sysdeps/powerpc/powerpc32/power4/strncmp.S
+++ b/sysdeps/powerpc/powerpc32/power4/strncmp.S
@@ -24,7 +24,7 @@
 
 EALIGN (strncmp, 4, 0)
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -37,6 +37,7 @@ EALIGN (strncmp, 4, 0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -75,12 +76,45 @@ L(g1):	add	rTMP, rFEFE, rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	slwi	rTMP, rTMP, 1
+	addi    rTMP2, rTMP, -1
+	andc    rTMP2, rTMP2, rTMP
+	and	rWORD2, rWORD2, rTMP2		/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+L(different):
+	lwz	rWORD1, -4(rSTR1)
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzw	rBITDIF, rBITDIF
@@ -88,28 +122,20 @@ L(endstring):
 	addi	rNEG, rNEG, 7
 	cmpw	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
-	blt-	cr1, L(equal)
-	srawi	rRTN, rRTN, 31
-	ori	rRTN, rRTN, 1
-	blr
+	bgelr+	cr1
 L(equal):
 	li	rRTN, 0
 	blr
 
 L(different):
-	lwzu	rWORD1, -4(rSTR1)
+	lwz	rWORD1, -4(rSTR1)
 	xor.	rBITDIF, rWORD1, rWORD2
 	sub	rRTN, rWORD1, rWORD2
-	blt-	L(highbit)
-	srawi	rRTN, rRTN, 31
-	ori	rRTN, rRTN, 1
-	blr
+	bgelr+
 L(highbit):
-	srwi	rWORD2, rWORD2, 24
-	srwi	rWORD1, rWORD1, 24
-	sub	rRTN, rWORD1, rWORD2
+	ori	rRTN, rWORD2, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc32/power7/strncmp.S b/sysdeps/powerpc/powerpc32/power7/strncmp.S
index fdae44d..10c9d25 100644
--- a/sysdeps/powerpc/powerpc32/power7/strncmp.S
+++ b/sysdeps/powerpc/powerpc32/power7/strncmp.S
@@ -26,7 +26,7 @@
 
 EALIGN (strncmp,5,0)
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -39,6 +39,7 @@ EALIGN (strncmp,5,0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	nop
@@ -78,13 +79,45 @@ L(g1):	add	rTMP,rFEFE,rWORD1
 /* OK. We've hit the end of the string. We need to be careful that
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	slwi	rTMP, rTMP, 1
+	addi    rTMP2, rTMP, -1
+	andc    rTMP2, rTMP2, rTMP
+	and	rWORD2, rWORD2, rTMP2		/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr
+	ori	rRTN, rTMP2, 1
+	blr
+
+L(different):
+	lwz	rWORD1, -4(rSTR1)
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rldimi	rTMP2, rWORD2, 24, 32
+	rldimi	rTMP, rWORD1, 24, 32
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr
+	ori	rRTN, rTMP2, 1
+	blr
 
+#else
 L(endstring):
 	and	rTMP,r7F7F,rWORD1
 	beq	cr1,L(equal)
 	add	rTMP,rTMP,r7F7F
 	xor.	rBITDIF,rWORD1,rWORD2
-
 	andc	rNEG,rNEG,rTMP
 	blt	L(highbit)
 	cntlzw	rBITDIF,rBITDIF
@@ -92,28 +125,20 @@ L(endstring):
 	addi	rNEG,rNEG,7
 	cmpw	cr1,rNEG,rBITDIF
 	sub	rRTN,rWORD1,rWORD2
-	blt	cr1,L(equal)
-	srawi	rRTN,rRTN,31
-	ori	rRTN,rRTN,1
-	blr
+	bgelr	cr1
 L(equal):
 	li	rRTN,0
 	blr
 
 L(different):
-	lwzu	rWORD1,-4(rSTR1)
+	lwz	rWORD1,-4(rSTR1)
 	xor.	rBITDIF,rWORD1,rWORD2
 	sub	rRTN,rWORD1,rWORD2
-	blt	L(highbit)
-	srawi	rRTN,rRTN,31
-	ori	rRTN,rRTN,1
-	blr
+	bgelr
 L(highbit):
-	srwi	rWORD2,rWORD2,24
-	srwi	rWORD1,rWORD1,24
-	sub	rRTN,rWORD1,rWORD2
+	ori	rRTN, rWORD2, 1
 	blr
-
+#endif
 
 /* Oh well. In this case, we just do a byte-by-byte comparison.  */
 	.align	4
diff --git a/sysdeps/powerpc/powerpc32/strcmp.S b/sysdeps/powerpc/powerpc32/strcmp.S
index 297ca3c..91d60c9 100644
--- a/sysdeps/powerpc/powerpc32/strcmp.S
+++ b/sysdeps/powerpc/powerpc32/strcmp.S
@@ -24,7 +24,7 @@
 
 EALIGN (strcmp, 4, 0)
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -34,6 +34,7 @@ EALIGN (strcmp, 4, 0)
 #define r7F7F	r8	/* constant 0x7f7f7f7f */
 #define rNEG	r9	/* ~(word in s1 | 0x7f7f7f7f) */
 #define rBITDIF	r10	/* bits that differ in s1 & s2 words */
+#define rTMP	r11
 
 
 	or	rTMP, rSTR2, rSTR1
@@ -56,10 +57,45 @@ L(g1):	add	rTMP, rFEFE, rWORD1
 	and.	rTMP, rTMP, rNEG
 	cmpw	cr1, rWORD1, rWORD2
 	beq+	L(g0)
-L(endstring):
+
 /* OK. We've hit the end of the string. We need to be careful that
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	andc    rTMP2, rTMP2, rTMP
+	rlwimi	rTMP2, rTMP2, 1, 0, 30
+	and	rWORD2, rWORD2, rTMP2		/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rlwimi	rTMP2, rWORD2, 24, 0, 7
+	rlwimi	rTMP, rWORD1, 24, 0, 7
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+L(different):
+	lwz	rWORD1, -4(rSTR1)
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rlwimi	rTMP2, rWORD2, 24, 0, 7
+	rlwimi	rTMP, rWORD1, 24, 0, 7
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+#else
+L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
@@ -84,7 +120,7 @@ L(different):
 L(highbit):
 	ori	rRTN, rWORD2, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc32/strncmp.S b/sysdeps/powerpc/powerpc32/strncmp.S
index fa345d2..e36a160 100644
--- a/sysdeps/powerpc/powerpc32/strncmp.S
+++ b/sysdeps/powerpc/powerpc32/strncmp.S
@@ -24,7 +24,7 @@
 
 EALIGN (strncmp, 4, 0)
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -35,6 +35,7 @@ EALIGN (strncmp, 4, 0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -73,12 +74,45 @@ L(g1):	add	rTMP, rFEFE, rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	slwi	rTMP, rTMP, 1
+	addi    rTMP2, rTMP, -1
+	andc    rTMP2, rTMP2, rTMP
+	and	rWORD2, rWORD2, rTMP2		/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rlwimi	rTMP2, rWORD2, 24, 0, 7
+	rlwimi	rTMP, rWORD1, 24, 0, 7
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+L(different):
+	lwz	rWORD1, -4(rSTR1)
+	rlwinm	rTMP2, rWORD2, 8, 0xffffffff	/* Byte reverse word.  */
+	rlwinm	rTMP, rWORD1, 8, 0xffffffff
+	rlwimi	rTMP2, rWORD2, 24, 0, 7
+	rlwimi	rTMP, rWORD1, 24, 0, 7
+	rlwimi	rTMP2, rWORD2, 24, 16, 23
+	rlwimi	rTMP, rWORD1, 24, 16, 23
+	xor.	rBITDIF, rTMP, rTMP2
+	sub	rRTN, rTMP, rTMP2
+	bgelr+
+	ori	rRTN, rTMP2, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzw	rBITDIF, rBITDIF
@@ -86,28 +120,20 @@ L(endstring):
 	addi	rNEG, rNEG, 7
 	cmpw	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
-	blt-	cr1, L(equal)
-	srawi	rRTN, rRTN, 31
-	ori	rRTN, rRTN, 1
-	blr
+	bgelr+	cr1
 L(equal):
 	li	rRTN, 0
 	blr
 
 L(different):
-	lwzu	rWORD1, -4(rSTR1)
+	lwz	rWORD1, -4(rSTR1)
 	xor.	rBITDIF, rWORD1, rWORD2
 	sub	rRTN, rWORD1, rWORD2
-	blt-	L(highbit)
-	srawi	rRTN, rRTN, 31
-	ori	rRTN, rRTN, 1
-	blr
+	bgelr+
 L(highbit):
-	srwi	rWORD2, rWORD2, 24
-	srwi	rWORD1, rWORD1, 24
-	sub	rRTN, rWORD1, rWORD2
+	ori	rRTN, rWORD2, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/power4/strncmp.S b/sysdeps/powerpc/powerpc64/power4/strncmp.S
index 1276e16..5d136cf 100644
--- a/sysdeps/powerpc/powerpc64/power4/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/power4/strncmp.S
@@ -25,7 +25,7 @@
 EALIGN (strncmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -38,6 +38,7 @@ EALIGN (strncmp, 4, 0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -79,12 +80,59 @@ L(g1):	add	rTMP, rFEFE, rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	beq	cr1, L(equal)
+	andc    rTMP2, rTMP2, rTMP
+	rldimi	rTMP2, rTMP2, 1, 0
+	and	rWORD2, rWORD2, rTMP2	/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	cmpd	cr1, rWORD1, rWORD2
+	beq	cr1, L(equal)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
+	ori	rRTN, rRTN, 1
+	blr
+L(equal):
+	li	rRTN, 0
+	blr
+
+L(different):
+	ld	rWORD1, -8(rSTR1)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63
+	ori	rRTN, rRTN, 1
+	blr
+L(highbit):
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzd	rBITDIF, rBITDIF
@@ -93,7 +141,7 @@ L(endstring):
 	cmpd	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
 	blt-	cr1, L(equal)
-	sradi	rRTN, rRTN, 63
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
 	ori	rRTN, rRTN, 1
 	blr
 L(equal):
@@ -101,7 +149,7 @@ L(equal):
 	blr
 
 L(different):
-	ldu	rWORD1, -8(rSTR1)
+	ld	rWORD1, -8(rSTR1)
 	xor.	rBITDIF, rWORD1, rWORD2
 	sub	rRTN, rWORD1, rWORD2
 	blt-	L(highbit)
@@ -109,11 +157,10 @@ L(different):
 	ori	rRTN, rRTN, 1
 	blr
 L(highbit):
-	srdi	rWORD2, rWORD2, 56
-	srdi	rWORD1, rWORD1, 56
-	sub	rRTN, rWORD1, rWORD2
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/power7/strncmp.S b/sysdeps/powerpc/powerpc64/power7/strncmp.S
index 77ecad5..e618b01 100644
--- a/sysdeps/powerpc/powerpc64/power7/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strncmp.S
@@ -27,7 +27,7 @@
 EALIGN (strncmp,5,0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -40,6 +40,7 @@ EALIGN (strncmp,5,0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	nop
@@ -83,12 +84,57 @@ L(g1):	add	rTMP,rFEFE,rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	beq	cr1, L(equal)
+	andc    rTMP2, rTMP2, rTMP
+	rldimi	rTMP2, rTMP2, 1, 0
+	and	rWORD2, rWORD2, rTMP2	/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	cmpd	cr1, rWORD1, rWORD2
+	beq	cr1, L(equal)
+	cmpb	rBITDIF, rWORD1, rWORD2	/* 0xff on equal bytes.  */
+	addi	rNEG, rBITDIF, 1
+	orc	rNEG, rNEG, rBITDIF	/* 0's below LS differing byte.  */
+	sldi	rNEG, rNEG, 8		/* 1's above LS differing byte.  */
+	andc	rWORD1, rWORD1, rNEG	/* mask off MS bytes.  */
+	andc	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt	L(highbit)
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
+	ori	rRTN, rRTN, 1
+	blr
+L(equal):
+	li	rRTN, 0
+	blr
+
+L(different):
+	ld	rWORD1, -8(rSTR1)
+	cmpb	rBITDIF, rWORD1, rWORD2	/* 0xff on equal bytes.  */
+	addi	rNEG, rBITDIF, 1
+	orc	rNEG, rNEG, rBITDIF	/* 0's below LS differing byte.  */
+	sldi	rNEG, rNEG, 8		/* 1's above LS differing byte.  */
+	andc	rWORD1, rWORD1, rNEG	/* mask off MS bytes.  */
+	andc	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt	L(highbit)
+	sradi	rRTN, rRTN, 63
+	ori	rRTN, rRTN, 1
+	blr
+L(highbit):
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP,r7F7F,rWORD1
 	beq	cr1,L(equal)
 	add	rTMP,rTMP,r7F7F
 	xor.	rBITDIF,rWORD1,rWORD2
-
 	andc	rNEG,rNEG,rTMP
 	blt	L(highbit)
 	cntlzd	rBITDIF,rBITDIF
@@ -97,7 +143,7 @@ L(endstring):
 	cmpd	cr1,rNEG,rBITDIF
 	sub	rRTN,rWORD1,rWORD2
 	blt	cr1,L(equal)
-	sradi	rRTN,rRTN,63
+	sradi	rRTN,rRTN,63		/* must return an int.  */
 	ori	rRTN,rRTN,1
 	blr
 L(equal):
@@ -105,7 +151,7 @@ L(equal):
 	blr
 
 L(different):
-	ldu	rWORD1,-8(rSTR1)
+	ld	rWORD1,-8(rSTR1)
 	xor.	rBITDIF,rWORD1,rWORD2
 	sub	rRTN,rWORD1,rWORD2
 	blt	L(highbit)
@@ -113,11 +159,10 @@ L(different):
 	ori	rRTN,rRTN,1
 	blr
 L(highbit):
-	srdi	rWORD2,rWORD2,56
-	srdi	rWORD1,rWORD1,56
-	sub	rRTN,rWORD1,rWORD2
+	sradi	rRTN,rWORD2,63
+	ori	rRTN,rRTN,1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align	4
diff --git a/sysdeps/powerpc/powerpc64/strcmp.S b/sysdeps/powerpc/powerpc64/strcmp.S
index c9d6dac..7085468 100644
--- a/sysdeps/powerpc/powerpc64/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/strcmp.S
@@ -25,7 +25,7 @@
 EALIGN (strcmp, 4, 0)
 	CALL_MCOUNT 2
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -35,6 +35,7 @@ EALIGN (strcmp, 4, 0)
 #define r7F7F	r8	/* constant 0x7f7f7f7f7f7f7f7f */
 #define rNEG	r9	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
 #define rBITDIF	r10	/* bits that differ in s1 & s2 words */
+#define rTMP	r11
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -58,19 +59,66 @@ L(g0):	ldu	rWORD1, 8(rSTR1)
 	ldu	rWORD2, 8(rSTR2)
 L(g1):	add	rTMP, rFEFE, rWORD1
 	nor	rNEG, r7F7F, rWORD1
-
 	and.	rTMP, rTMP, rNEG
 	cmpd	cr1, rWORD1, rWORD2
 	beq+	L(g0)
-L(endstring):
+
 /* OK. We've hit the end of the string. We need to be careful that
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	beq	cr1, L(equal)
+	andc    rTMP2, rTMP2, rTMP
+	rldimi	rTMP2, rTMP2, 1, 0
+	and	rWORD2, rWORD2, rTMP2	/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	cmpd	cr1, rWORD1, rWORD2
+	beq	cr1, L(equal)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
+	ori	rRTN, rRTN, 1
+	blr
+L(equal):
+	li	rRTN, 0
+	blr
+
+L(different):
+	ld	rWORD1, -8(rSTR1)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63
+	ori	rRTN, rRTN, 1
+	blr
+L(highbit):
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
+	blr
+
+#else
+L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzd	rBITDIF, rBITDIF
@@ -79,7 +127,7 @@ L(endstring):
 	cmpd	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
 	blt-	cr1, L(equal)
-	sradi	rRTN, rRTN, 63
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
 	ori	rRTN, rRTN, 1
 	blr
 L(equal):
@@ -95,11 +143,10 @@ L(different):
 	ori	rRTN, rRTN, 1
 	blr
 L(highbit):
-	srdi	rWORD2, rWORD2, 56
-	srdi	rWORD1, rWORD1, 56
-	sub	rRTN, rWORD1, rWORD2
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4
diff --git a/sysdeps/powerpc/powerpc64/strncmp.S b/sysdeps/powerpc/powerpc64/strncmp.S
index 779d9f7..8f842c4 100644
--- a/sysdeps/powerpc/powerpc64/strncmp.S
+++ b/sysdeps/powerpc/powerpc64/strncmp.S
@@ -25,7 +25,7 @@
 EALIGN (strncmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rTMP	r0
+#define rTMP2	r0
 #define rRTN	r3
 #define rSTR1	r3	/* first string arg */
 #define rSTR2	r4	/* second string arg */
@@ -36,6 +36,7 @@ EALIGN (strncmp, 4, 0)
 #define r7F7F	r9	/* constant 0x7f7f7f7f7f7f7f7f */
 #define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
 #define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
 
 	dcbt	0,rSTR1
 	or	rTMP, rSTR2, rSTR1
@@ -77,12 +78,59 @@ L(g1):	add	rTMP, rFEFE, rWORD1
    we don't compare two strings as different because of gunk beyond
    the end of the strings...  */
 
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	beq	cr1, L(equal)
+	andc    rTMP2, rTMP2, rTMP
+	rldimi	rTMP2, rTMP2, 1, 0
+	and	rWORD2, rWORD2, rTMP2	/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	cmpd	cr1, rWORD1, rWORD2
+	beq	cr1, L(equal)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
+	ori	rRTN, rRTN, 1
+	blr
+L(equal):
+	li	rRTN, 0
+	blr
+
+L(different):
+	ld	rWORD1, -8(rSTR1)
+	xor	rBITDIF, rWORD1, rWORD2	/* rBITDIF has bits that differ.  */
+	neg	rNEG, rBITDIF
+	and	rNEG, rNEG, rBITDIF	/* rNEG has LS bit that differs.  */
+	cntlzd	rNEG, rNEG		/* bitcount of the bit.  */
+	andi.	rNEG, rNEG, 56		/* bitcount to LS byte that differs. */
+	sld	rWORD1, rWORD1, rNEG	/* shift left to clear MS bytes.  */
+	sld	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt-	L(highbit)
+	sradi	rRTN, rRTN, 63
+	ori	rRTN, rRTN, 1
+	blr
+L(highbit):
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
+	blr
+
+#else
 L(endstring):
 	and	rTMP, r7F7F, rWORD1
 	beq	cr1, L(equal)
 	add	rTMP, rTMP, r7F7F
 	xor.	rBITDIF, rWORD1, rWORD2
-
 	andc	rNEG, rNEG, rTMP
 	blt-	L(highbit)
 	cntlzd	rBITDIF, rBITDIF
@@ -91,7 +139,7 @@ L(endstring):
 	cmpd	cr1, rNEG, rBITDIF
 	sub	rRTN, rWORD1, rWORD2
 	blt-	cr1, L(equal)
-	sradi	rRTN, rRTN, 63
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
 	ori	rRTN, rRTN, 1
 	blr
 L(equal):
@@ -99,7 +147,7 @@ L(equal):
 	blr
 
 L(different):
-	ldu	rWORD1, -8(rSTR1)
+	ld	rWORD1, -8(rSTR1)
 	xor.	rBITDIF, rWORD1, rWORD2
 	sub	rRTN, rWORD1, rWORD2
 	blt-	L(highbit)
@@ -107,11 +155,10 @@ L(different):
 	ori	rRTN, rRTN, 1
 	blr
 L(highbit):
-	srdi	rWORD2, rWORD2, 56
-	srdi	rWORD1, rWORD1, 56
-	sub	rRTN, rWORD1, rWORD2
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
 	blr
-
+#endif
 
 /* Oh well.  In this case, we just do a byte-by-byte comparison.  */
 	.align 4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=65d975581da8f616f0a8dfca1b961f3a006186de

commit 65d975581da8f616f0a8dfca1b961f3a006186de
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:40:48 2013 +0930

    PowerPC LE strnlen
    http://sourceware.org/ml/libc-alpha/2013-08/msg00098.html
    
    The existing strnlen code has a number of defects, so this patch is more
    than just adding little-endian support.  The changes here are similar to
    those for memchr.
    
    	* sysdeps/powerpc/powerpc64/power7/strnlen.S (strnlen): Add
    	little-endian support.  Remove unnecessary "are we done" tests.
    	Handle "s" wrapping around zero and extremely large "size".
    	Correct main loop count.  Handle single left-over word from main
    	loop inline rather than by using small_loop.  Correct comments.
    	Delete "zero" tail, use "end_max" instead.
    	* sysdeps/powerpc/powerpc32/power7/strnlen.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 690d370..811e852 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc64/power7/strnlen.S (strnlen): Add
+	little-endian support.  Remove unnecessary "are we done" tests.
+	Handle "s" wrapping around zero and extremely large "size".
+	Correct main loop count.  Handle single left-over word from main
+	loop inline rather than by using small_loop.  Correct comments.
+	Delete "zero" tail, use "end_max" instead.
+	* sysdeps/powerpc/powerpc32/power7/strnlen.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian
 	support.  Don't branch over align.
 	* sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise.
diff --git a/sysdeps/powerpc/powerpc32/power7/strnlen.S b/sysdeps/powerpc/powerpc32/power7/strnlen.S
index ed08836..eb52afd 100644
--- a/sysdeps/powerpc/powerpc32/power7/strnlen.S
+++ b/sysdeps/powerpc/powerpc32/power7/strnlen.S
@@ -28,51 +28,47 @@ ENTRY (__strnlen)
 	add	r7,r3,r4      /* Calculate the last acceptable address.  */
 	cmplwi	r4,16
 	li	r0,0	      /* Word with null chars.  */
+	addi	r7,r7,-1
 	ble	L(small_range)
 
-	cmplw	cr7,r3,r7     /* Is the address equal or less than r3?  If
-				 it's equal or less, it means size is either 0
-				 or a negative number.  */
-	ble	cr7,L(proceed)
-
-	li	r7,-1	      /* Make r11 the biggest if r4 <= 0.  */
-L(proceed):
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
 	cmpb	r10,r12,r0    /* Check for null bytes in DWORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	slw	r10,r10,r6
+#else
 	slw	r10,r10,r6
 	srw	r10,r10,r6
+#endif
 	cmplwi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	bge	cr6,L(end_max)
-
+	clrrwi	r7,r7,2       /* Address of last word.  */
 	mtcrf   0x01,r8
 	/* Are we now aligned to a doubleword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
 
 	bt	29,L(loop_setup)
 
-	/* Handle DWORD2 of pair.  */
+	/* Handle WORD2 of pair.  */
 	lwzu	r12,4(r8)
 	cmpb	r10,r12,r0
 	cmplwi	cr7,r10,0
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	bge	cr6,L(end_max)
-
 L(loop_setup):
-	sub	r5,r7,r9
+	/* The last word we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the word at
+	   (s + size - 1) & ~3, or r7.  The first word read is at
+	   r8 + 4, we read 2 * cnt words, so the last word read will
+	   be at r8 + 4 + 8 * cnt - 4.  Solving for cnt gives
+	   cnt = (r7 - r8) / 8  */
+	sub	r5,r7,r8
 	srwi	r6,r5,3	      /* Number of loop iterations.  */
 	mtctr	r6	      /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for the null byte backwards in the string.  Since
+
+	/* Main loop to look for the null byte in the string.  Since
 	   it's a small loop (< 8 instructions), align it to 32-bytes.  */
 	.p2align  5
 L(loop):
@@ -88,15 +84,18 @@ L(loop):
 	cmplwi	cr7,r5,0
 	bne	cr7,L(found)
 	bdnz	L(loop)
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for null in the whole range.  Just return
-	   the original size.  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
-	blt	cr6,L(loop_small)
+
+	/* We may have one more word to read.  */
+	cmplw	cr6,r8,r7
+	beq	cr6,L(end_max)
+
+	lwzu	r12,4(r8)
+	cmpb	r10,r12,r0
+	cmplwi	cr6,r10,0
+	bne	cr6,L(done)
 
 L(end_max):
-	sub	r3,r7,r3
+	mr	r3,r4
 	blr
 
 	/* OK, one (or both) of the words contains a null byte.  Check
@@ -121,49 +120,56 @@ L(found):
 	   We need to make sure the null char is *before* the end of the
 	   range.  */
 L(done):
-	cntlzw	r0,r10	      /* Count leading zeroes before the match.  */
-	srwi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
-	add	r9,r8,r0
-	sub	r6,r9,r3      /* Length until the match.  */
-	cmplw	r9,r7
-	bgt	L(end_max)
-	mr	r3,r6
-	blr
-
-	.align	4
-L(zero):
-	li	r3,0
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r10,-1
+	andc	r0,r0,r10
+	popcntw	r0,r0
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	sub	r3,r8,r3
+	srwi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r3,r0      /* Length until the match.  */
+	cmplw	r3,r4
+	blelr
+	mr	r3,r4
 	blr
 
-/* Deals with size <= 32.  */
+/* Deals with size <= 16.  */
 	.align	4
 L(small_range):
 	cmplwi	r4,0
-	beq	L(zero)
+	beq	L(end_max)
+
+	clrrwi	r7,r7,2       /* Address of last word.  */
 
 	rlwinm	r6,r3,3,27,28 /* Calculate padding.  */
 	lwz	r12,0(r8)     /* Load word from memory.  */
 	cmpb	r10,r12,r0    /* Check for null bytes in WORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srw	r10,r10,r6
+	slw	r10,r10,r6
+#else
 	slw	r10,r10,r6
 	srw	r10,r10,r6
+#endif
 	cmplwi	cr7,r10,0
 	bne	cr7,L(done)
 
-	addi    r9,r8,4
-	cmplw	r9,r7
-	bge	L(end_max)
-	b	L(loop_small)
+	cmplw	r8,r7
+	beq	L(end_max)
 
 	.p2align  5
 L(loop_small):
 	lwzu	r12,4(r8)
 	cmpb	r10,r12,r0
-	addi	r9,r8,4
 	cmplwi	cr6,r10,0
 	bne	cr6,L(done)
-	cmplw	r9,r7
-	bge	L(end_max)
-	b	L(loop_small)
+	cmplw	r8,r7
+	bne	L(loop_small)
+	mr	r3,r4
+	blr
+
 END (__strnlen)
 weak_alias (__strnlen, strnlen)
 libc_hidden_builtin_def (strnlen)
diff --git a/sysdeps/powerpc/powerpc64/power7/strnlen.S b/sysdeps/powerpc/powerpc64/power7/strnlen.S
index 37c7dbf..5159106 100644
--- a/sysdeps/powerpc/powerpc64/power7/strnlen.S
+++ b/sysdeps/powerpc/powerpc64/power7/strnlen.S
@@ -24,33 +24,29 @@
 ENTRY (__strnlen)
 	CALL_MCOUNT 2
 	dcbt	0,r3
-	clrrdi  r8,r3,3
+	clrrdi	r8,r3,3
 	add	r7,r3,r4      /* Calculate the last acceptable address.  */
 	cmpldi	r4,32
 	li	r0,0	      /* Doubleword with null chars.  */
+	addi	r7,r7,-1
+
 	/* If we have less than 33 bytes to search, skip to a faster code.  */
 	ble	L(small_range)
 
-	cmpld	cr7,r3,r7    /* Is the address equal or less than r3?  If
-				it's equal or less, it means size is either 0
-				or a negative number.  */
-	ble	cr7,L(proceed)
-
-	li	r7,-1	      /* Make r11 the biggest if r4 <= 0.  */
-L(proceed):
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
 	ld	r12,0(r8)     /* Load doubleword from memory.  */
 	cmpb	r10,r12,r0    /* Check for null bytes in DWORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	sld	r10,r10,r6
+#else
 	sld	r10,r10,r6
 	srd	r10,r10,r6
+#endif
 	cmpldi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	bge	cr6,L(end_max)
-
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
 	mtcrf   0x01,r8
 	/* Are we now aligned to a quadword boundary?  If so, skip to
 	   the main loop.  Otherwise, go through the alignment code.  */
@@ -63,17 +59,18 @@ L(proceed):
 	cmpldi	cr7,r10,0
 	bne	cr7,L(done)
 
-	/* Are we done already?  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	bge	cr6,L(end_max)
-
 L(loop_setup):
-	sub	r5,r7,r9
+	/* The last dword we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the dword at
+	   (s + size - 1) & ~7, or r7.  The first dword read is at
+	   r8 + 8, we read 2 * cnt dwords, so the last dword read will
+	   be at r8 + 8 + 16 * cnt - 8.  Solving for cnt gives
+	   cnt = (r7 - r8) / 16  */
+	sub	r5,r7,r8
 	srdi	r6,r5,4	      /* Number of loop iterations.  */
 	mtctr	r6	      /* Setup the counter.  */
-	b	L(loop)
-	/* Main loop to look for the null byte backwards in the string.  Since
+
+	/* Main loop to look for the null byte in the string.  Since
 	   it's a small loop (< 8 instructions), align it to 32-bytes.  */
 	.p2align  5
 L(loop):
@@ -89,15 +86,18 @@ L(loop):
 	cmpldi	cr7,r5,0
 	bne	cr7,L(found)
 	bdnz	L(loop)
-	/* We're here because the counter reached 0, and that means we
-	   didn't have any matches for null in the whole range.  Just return
-	   the original size.  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
-	blt	cr6,L(loop_small)
+
+	/* We may have one more dword to read.  */
+	cmpld	cr6,r8,r7
+	beq	cr6,L(end_max)
+
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r0
+	cmpldi	cr6,r10,0
+	bne	cr6,L(done)
 
 L(end_max):
-	sub	r3,r7,r3
+	mr	r3,r4
 	blr
 
 	/* OK, one (or both) of the doublewords contains a null byte.  Check
@@ -119,52 +119,59 @@ L(found):
 	/* r10 has the output of the cmpb instruction, that is, it contains
 	   0xff in the same position as the null byte in the original
 	   doubleword from the string.  Use that to calculate the length.
-	   We need to make sure the null char is *before* the start of the
-	   range (since we're going backwards).  */
+	   We need to make sure the null char is *before* the end of the
+	   range.  */
 L(done):
-	cntlzd	r0,r10	      /* Count leading zeroes before the match.  */
-	srdi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
-	add	r9,r8,r0
-	sub	r6,r9,r3      /* Length until the match.  */
-	cmpld	r9,r7
-	bgt	L(end_max)
-	mr	r3,r6
-	blr
-
-	.align	4
-L(zero):
-	li	r3,0
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r10,-1
+	andc	r0,r0,r10
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	sub	r3,r8,r3
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r3,r0      /* Length until the match.  */
+	cmpld	r3,r4
+	blelr
+	mr	r3,r4
 	blr
 
 /* Deals with size <= 32.  */
 	.align	4
 L(small_range):
 	cmpldi	r4,0
-	beq	L(zero)
+	beq	L(end_max)
+
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
 
 	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
-	ld	r12,0(r8)     /* Load word from memory.  */
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
 	cmpb	r10,r12,r0    /* Check for null bytes in DWORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	sld	r10,r10,r6
+#else
 	sld	r10,r10,r6
 	srd	r10,r10,r6
+#endif
 	cmpldi	cr7,r10,0
 	bne	cr7,L(done)
 
-	addi    r9,r8,8
-	cmpld	r9,r7
-	bge	L(end_max)
-	b	L(loop_small)
+	cmpld	r8,r7
+	beq	L(end_max)
 
 	.p2align  5
 L(loop_small):
 	ldu	r12,8(r8)
 	cmpb	r10,r12,r0
-	addi	r9,r8,8
 	cmpldi	cr6,r10,0
 	bne	cr6,L(done)
-	cmpld	r9,r7
-	bge	L(end_max)
-	b	L(loop_small)
+	cmpld	r8,r7
+	bne	L(loop_small)
+	mr	r3,r4
+	blr
+
 END (__strnlen)
 weak_alias (__strnlen, strnlen)
 libc_hidden_builtin_def (strnlen)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=23fcb887a028aa322b8c4d0da881fc13ffe2ac0a

commit 23fcb887a028aa322b8c4d0da881fc13ffe2ac0a
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:40:11 2013 +0930

    PowerPC LE strlen
    http://sourceware.org/ml/libc-alpha/2013-08/msg00097.html
    
    This is the first of nine patches adding little-endian support to the
    existing optimised string and memory functions.  I did spend some
    time with a power7 simulator looking at cycle by cycle behaviour for
    memchr, but most of these patches have not been run on cpu simulators
    to check that we are going as fast as possible.  I'm sure PowerPC can
    do better.  However, the little-endian support mostly leaves main
    loops unchanged, so I'm banking on previous authors having done a
    good job on big-endian.  As with most code you stare at long enough,
    I found some improvements for big-endian too.
    
    Little-endian support for strlen.  Like most of the string functions,
    I leave the main word or multiple-word loops substantially unchanged,
    just needing to modify the tail.
    
    Removing the branch in the power7 functions is just a tidy.  .align
    produces a branch anyway.  Modifying regs in the non-power7 functions
    is to suit the new little-endian tail.
    
    	* sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian
    	support.  Don't branch over align.
    	* sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise.
    	* sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian support.
    	Rearrange tmp reg use to suit.  Comment.
    	* sysdeps/powerpc/powerpc32/strlen.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 99fdeb1..690d370 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc64/power7/strlen.S (strlen): Add little-endian
+	support.  Don't branch over align.
+	* sysdeps/powerpc/powerpc32/power7/strlen.S: Likewise.
+	* sysdeps/powerpc/powerpc64/strlen.S (strlen): Add little-endian
+	support.  Rearrange tmp reg use to suit.  Comment.
+	* sysdeps/powerpc/powerpc32/strlen.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h: New file.
 
 2013-10-04  Alan Modra  <amodra@gmail.com>
diff --git a/sysdeps/powerpc/powerpc32/power7/strlen.S b/sysdeps/powerpc/powerpc32/power7/strlen.S
index b71a10f..b08d6c0 100644
--- a/sysdeps/powerpc/powerpc32/power7/strlen.S
+++ b/sysdeps/powerpc/powerpc32/power7/strlen.S
@@ -29,7 +29,11 @@ ENTRY (strlen)
 	li	r0,0	      /* Word with null chars to use with cmpb.  */
 	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
 	lwz	r12,0(r4)     /* Load word from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	slw	r5,r5,r6
+#else
 	srw	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
 	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
 	cmpb	r10,r9,r0     /* Check for null bytes in WORD1.  */
 	cmpwi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
@@ -47,9 +51,6 @@ ENTRY (strlen)
 	cmpb	r10,r12,r0
 	cmpwi	cr7,r10,0
 	bne	cr7,L(done)
-	b	L(loop)	      /* We branch here (rather than falling through)
-				 to skip the nops due to heavy alignment
-				 of the loop below.  */
 
 	/* Main loop to look for the end of the string.  Since it's a
 	   small loop (< 8 instructions), align it to 32-bytes.  */
@@ -86,9 +87,15 @@ L(loop):
 	   0xff in the same position as the null byte in the original
 	   word from the string.  Use that to calculate the length.  */
 L(done):
-	cntlzw	r0,r10	      /* Count leading zeroes before the match.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10, -1   /* Form a mask from trailing zeros.  */
+	andc	r9, r9, r10
+	popcntw r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzw	r0,r10	      /* Count leading zeros before the match.  */
+#endif
 	subf	r5,r3,r4
-	srwi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+	srwi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
 	add	r3,r5,r0      /* Compute final length.  */
 	blr
 END (strlen)
diff --git a/sysdeps/powerpc/powerpc32/strlen.S b/sysdeps/powerpc/powerpc32/strlen.S
index 9a6eafc..a7153ed 100644
--- a/sysdeps/powerpc/powerpc32/strlen.S
+++ b/sysdeps/powerpc/powerpc32/strlen.S
@@ -29,7 +29,12 @@
       1 is subtracted you get a value in the range 0x00-0x7f, none of which
       have their high bit set. The expression here is
       (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
-      there were no 0x00 bytes in the word.
+      there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+      match, but possibly false 0x80 matches in the next more significant
+      byte to a true match due to carries.  For little-endian this is
+      of no consequence since the least significant match is the one
+      we're interested in, but big-endian needs method 2 to find which
+      byte matches.
 
    2) Given a word 'x', we can test to see _which_ byte was zero by
       calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@@ -72,7 +77,7 @@
 
 ENTRY (strlen)
 
-#define rTMP1	r0
+#define rTMP4	r0
 #define rRTN	r3	/* incoming STR arg, outgoing result */
 #define rSTR	r4	/* current string position */
 #define rPADN	r5	/* number of padding bits we prepend to the
@@ -82,9 +87,9 @@ ENTRY (strlen)
 #define rWORD1	r8	/* current string word */
 #define rWORD2	r9	/* next string word */
 #define rMASK	r9	/* mask for first string word */
-#define rTMP2	r10
-#define rTMP3	r11
-#define rTMP4	r12
+#define rTMP1	r10
+#define rTMP2	r11
+#define rTMP3	r12
 
 
 	clrrwi	rSTR, rRTN, 2
@@ -93,15 +98,20 @@ ENTRY (strlen)
 	lwz	rWORD1, 0(rSTR)
 	li	rMASK, -1
 	addi	r7F7F, r7F7F, 0x7f7f
-/* That's the setup done, now do the first pair of words.
-   We make an exception and use method (2) on the first two words, to reduce
-   overhead.  */
+/* We use method (2) on the first two words, because rFEFE isn't
+   required which reduces setup overhead.  Also gives a faster return
+   for small strings on big-endian due to needing to recalculate with
+   method (2) anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	slw	rMASK, rMASK, rPADN
+#else
 	srw	rMASK, rMASK, rPADN
+#endif
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
-	nor	rTMP1, rTMP2, rTMP1
-	and.	rWORD1, rTMP1, rMASK
+	nor	rTMP3, rTMP2, rTMP1
+	and.	rTMP3, rTMP3, rMASK
 	mtcrf	0x01, rRTN
 	bne	L(done0)
 	lis	rFEFE, -0x101
@@ -110,11 +120,12 @@ ENTRY (strlen)
 	bt	29, L(loop)
 
 /* Handle second word of pair.  */
+/* Perhaps use method (1) here for little-endian, saving one instruction?  */
 	lwzu	rWORD1, 4(rSTR)
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
-	nor.	rWORD1, rTMP2, rTMP1
+	nor.	rTMP3, rTMP2, rTMP1
 	bne	L(done0)
 
 /* The loop.  */
@@ -128,28 +139,52 @@ L(loop):
 	add	rTMP3, rFEFE, rWORD2
 	nor	rTMP4, r7F7F, rWORD2
 	bne	L(done1)
-	and.	rTMP1, rTMP3, rTMP4
+	and.	rTMP3, rTMP3, rTMP4
 	beq	L(loop)
 
+#ifndef __LITTLE_ENDIAN__
 	and	rTMP1, r7F7F, rWORD2
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP4, rTMP1
+	andc	rTMP3, rTMP4, rTMP1
 	b	L(done0)
 
 L(done1):
 	and	rTMP1, r7F7F, rWORD1
 	subi	rSTR, rSTR, 4
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP2, rTMP1
+	andc	rTMP3, rTMP2, rTMP1
 
 /* When we get to here, rSTR points to the first word in the string that
-   contains a zero byte, and the most significant set bit in rWORD1 is in that
-   byte.  */
+   contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
+   and 0x00 otherwise.  */
 L(done0):
-	cntlzw	rTMP3, rWORD1
+	cntlzw	rTMP3, rTMP3
 	subf	rTMP1, rRTN, rSTR
 	srwi	rTMP3, rTMP3, 3
 	add	rRTN, rTMP1, rTMP3
 	blr
+#else
+
+L(done0):
+	addi	rTMP1, rTMP3, -1	/* Form a mask from trailing zeros.  */
+	andc	rTMP1, rTMP1, rTMP3
+	cntlzw	rTMP1, rTMP1		/* Count bits not in the mask.  */
+	subf	rTMP3, rRTN, rSTR
+	subfic	rTMP1, rTMP1, 32-7
+	srwi	rTMP1, rTMP1, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+
+L(done1):
+	addi	rTMP3, rTMP1, -1
+	andc	rTMP3, rTMP3, rTMP1
+	cntlzw	rTMP3, rTMP3
+	subf	rTMP1, rRTN, rSTR
+	subfic	rTMP3, rTMP3, 32-7-32
+	srawi	rTMP3, rTMP3, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+#endif
+
 END (strlen)
 libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/powerpc/powerpc64/power7/strlen.S b/sysdeps/powerpc/powerpc64/power7/strlen.S
index 3432169..807ef10 100644
--- a/sysdeps/powerpc/powerpc64/power7/strlen.S
+++ b/sysdeps/powerpc/powerpc64/power7/strlen.S
@@ -30,7 +30,11 @@ ENTRY (strlen)
 				 with cmpb.  */
 	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
 	ld	r12,0(r4)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r5,r5,r6
+#else
 	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
 	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
 	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
 	cmpdi	cr7,r10,0     /* If r10 == 0, no null's have been found.  */
@@ -48,9 +52,6 @@ ENTRY (strlen)
 	cmpb	r10,r12,r0
 	cmpdi	cr7,r10,0
 	bne	cr7,L(done)
-	b	L(loop)	      /* We branch here (rather than falling through)
-				 to skip the nops due to heavy alignment
-				 of the loop below.  */
 
 	/* Main loop to look for the end of the string.  Since it's a
 	   small loop (< 8 instructions), align it to 32-bytes.  */
@@ -87,9 +88,15 @@ L(loop):
 	   0xff in the same position as the null byte in the original
 	   doubleword from the string.  Use that to calculate the length.  */
 L(done):
-	cntlzd	r0,r10	      /* Count leading zeroes before the match.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10, -1   /* Form a mask from trailing zeros.  */
+	andc	r9, r9, r10
+	popcntd r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
 	subf	r5,r3,r4
-	srdi	r0,r0,3	      /* Convert leading zeroes to bytes.  */
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
 	add	r3,r5,r0      /* Compute final length.  */
 	blr
 END (strlen)
diff --git a/sysdeps/powerpc/powerpc64/strlen.S b/sysdeps/powerpc/powerpc64/strlen.S
index 0f9b5ee..4ed1ba3 100644
--- a/sysdeps/powerpc/powerpc64/strlen.S
+++ b/sysdeps/powerpc/powerpc64/strlen.S
@@ -29,7 +29,12 @@
       1 is subtracted you get a value in the range 0x00-0x7f, none of which
       have their high bit set. The expression here is
       (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
-      there were no 0x00 bytes in the word.
+      there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+      match, but possibly false 0x80 matches in the next more significant
+      byte to a true match due to carries.  For little-endian this is
+      of no consequence since the least significant match is the one
+      we're interested in, but big-endian needs method 2 to find which
+      byte matches.
 
    2) Given a word 'x', we can test to see _which_ byte was zero by
       calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@@ -62,7 +67,7 @@
    Answer:
    1) Added a Data Cache Block Touch early to prefetch the first 128
    byte cache line. Adding dcbt instructions to the loop would not be
-   effective since most strings will be shorter than the cache line.*/
+   effective since most strings will be shorter than the cache line.  */
 
 /* Some notes on register usage: Under the SVR4 ABI, we can use registers
    0 and 3 through 12 (so long as we don't call any procedures) without
@@ -78,7 +83,7 @@
 ENTRY (strlen)
 	CALL_MCOUNT 1
 
-#define rTMP1	r0
+#define rTMP4	r0
 #define rRTN	r3	/* incoming STR arg, outgoing result */
 #define rSTR	r4	/* current string position */
 #define rPADN	r5	/* number of padding bits we prepend to the
@@ -88,9 +93,9 @@ ENTRY (strlen)
 #define rWORD1	r8	/* current string doubleword */
 #define rWORD2	r9	/* next string doubleword */
 #define rMASK	r9	/* mask for first string doubleword */
-#define rTMP2	r10
-#define rTMP3	r11
-#define rTMP4	r12
+#define rTMP1	r10
+#define rTMP2	r11
+#define rTMP3	r12
 
 	dcbt	0,rRTN
 	clrrdi	rSTR, rRTN, 3
@@ -100,30 +105,36 @@ ENTRY (strlen)
 	addi	r7F7F, r7F7F, 0x7f7f
 	li	rMASK, -1
 	insrdi	r7F7F, r7F7F, 32, 0
-/* That's the setup done, now do the first pair of doublewords.
-   We make an exception and use method (2) on the first two doublewords,
-   to reduce overhead.  */
+/* We use method (2) on the first two doublewords, because rFEFE isn't
+   required which reduces setup overhead.  Also gives a faster return
+   for small strings on big-endian due to needing to recalculate with
+   method (2) anyway.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	rMASK, rMASK, rPADN
+#else
 	srd	rMASK, rMASK, rPADN
+#endif
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	lis	rFEFE, -0x101
 	add	rTMP1, rTMP1, r7F7F
 	addi	rFEFE, rFEFE, -0x101
-	nor	rTMP1, rTMP2, rTMP1
-	and.	rWORD1, rTMP1, rMASK
+	nor	rTMP3, rTMP2, rTMP1
+	and.	rTMP3, rTMP3, rMASK
 	mtcrf	0x01, rRTN
 	bne	L(done0)
-	sldi  rTMP1, rFEFE, 32
-	add  rFEFE, rFEFE, rTMP1
+	sldi	rTMP1, rFEFE, 32
+	add	rFEFE, rFEFE, rTMP1
 /* Are we now aligned to a doubleword boundary?  */
 	bt	28, L(loop)
 
 /* Handle second doubleword of pair.  */
+/* Perhaps use method (1) here for little-endian, saving one instruction?  */
 	ldu	rWORD1, 8(rSTR)
 	and	rTMP1, r7F7F, rWORD1
 	or	rTMP2, r7F7F, rWORD1
 	add	rTMP1, rTMP1, r7F7F
-	nor.	rWORD1, rTMP2, rTMP1
+	nor.	rTMP3, rTMP2, rTMP1
 	bne	L(done0)
 
 /* The loop.  */
@@ -137,28 +148,52 @@ L(loop):
 	add	rTMP3, rFEFE, rWORD2
 	nor	rTMP4, r7F7F, rWORD2
 	bne	L(done1)
-	and.	rTMP1, rTMP3, rTMP4
+	and.	rTMP3, rTMP3, rTMP4
 	beq	L(loop)
 
+#ifndef __LITTLE_ENDIAN__
 	and	rTMP1, r7F7F, rWORD2
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP4, rTMP1
+	andc	rTMP3, rTMP4, rTMP1
 	b	L(done0)
 
 L(done1):
 	and	rTMP1, r7F7F, rWORD1
 	subi	rSTR, rSTR, 8
 	add	rTMP1, rTMP1, r7F7F
-	andc	rWORD1, rTMP2, rTMP1
+	andc	rTMP3, rTMP2, rTMP1
 
 /* When we get to here, rSTR points to the first doubleword in the string that
-   contains a zero byte, and the most significant set bit in rWORD1 is in that
-   byte.  */
+   contains a zero byte, and rTMP3 has 0x80 for bytes that are zero, and 0x00
+   otherwise.  */
 L(done0):
-	cntlzd	rTMP3, rWORD1
+	cntlzd	rTMP3, rTMP3
 	subf	rTMP1, rRTN, rSTR
 	srdi	rTMP3, rTMP3, 3
 	add	rRTN, rTMP1, rTMP3
 	blr
+#else
+
+L(done0):
+	addi	rTMP1, rTMP3, -1	/* Form a mask from trailing zeros.  */
+	andc	rTMP1, rTMP1, rTMP3
+	cntlzd	rTMP1, rTMP1		/* Count bits not in the mask.  */
+	subf	rTMP3, rRTN, rSTR
+	subfic	rTMP1, rTMP1, 64-7
+	srdi	rTMP1, rTMP1, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+
+L(done1):
+	addi	rTMP3, rTMP1, -1
+	andc	rTMP3, rTMP3, rTMP1
+	cntlzd	rTMP3, rTMP3
+	subf	rTMP1, rRTN, rSTR
+	subfic	rTMP3, rTMP3, 64-7-64
+	sradi	rTMP3, rTMP3, 3
+	add	rRTN, rTMP1, rTMP3
+	blr
+#endif
+
 END (strlen)
 libc_hidden_builtin_def (strlen)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=319cb16b517026f42a338f30a8d8d503f9530e67

commit 319cb16b517026f42a338f30a8d8d503f9530e67
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:37:18 2013 +0930

    PowerPC SIGSTKSZ
    http://sourceware.org/ml/libc-alpha/2013-08/msg00093.html
    
    This copies the sparc version of sigstack.h, which gives powerpc
     #define MINSIGSTKSZ     4096
     #define SIGSTKSZ        16384
    
    Before the VSX changes, struct rt_sigframe size was 1920 plus 128 for
    __SIGNAL_FRAMESIZE giving ppc64 exactly the default MINSIGSTKSZ of
    2048.
    
    After VSX, ucontext increased by 256 bytes.  Oops, we're over
    MINSIGSTKSZ, so powerpc has been using the wrong value for quite a
    while.  Add another ucontext for TM and rt_sigframe is now at 3872,
    giving actual MINSIGSTKSZ of 4000.
    
    The glibc testcase that I was looking at was tst-cancel21, which
    allocates 2*SIGSTKSZ (not because the test is trying to be
    conservative, but because the test actually has nested signal stack
    frames).  We blew the allocation by 48 bytes when using current
    mainline gcc to compile glibc (le ppc64).
    
    The required stack depth in _dl_lookup_symbol_x from the top of the
    next signal frame was 10944 bytes.  I guess you'd want to add 288 to
    that, implying an actual SIGSTKSZ of 11232.
    
    	* sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h: New file.

diff --git a/ChangeLog b/ChangeLog
index 696a340..99fdeb1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h: New file.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S: Use
 	conditional form of branch and link when obtaining pc.
 	* sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S: Likewise.
diff --git a/sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h b/sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h
new file mode 100644
index 0000000..33be9e8
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/bits/sigstack.h
@@ -0,0 +1,54 @@
+/* sigstack, sigaltstack definitions.
+   Copyright (C) 1998-2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SIGNAL_H
+# error "Never include this file directly.  Use <signal.h> instead"
+#endif
+
+
+/* Structure describing a signal stack (obsolete).  */
+struct sigstack
+  {
+    void *ss_sp;		/* Signal stack pointer.  */
+    int ss_onstack;		/* Nonzero if executing on this stack.  */
+  };
+
+
+/* Possible values for `ss_flags.'.  */
+enum
+{
+  SS_ONSTACK = 1,
+#define SS_ONSTACK	SS_ONSTACK
+  SS_DISABLE
+#define SS_DISABLE	SS_DISABLE
+};
+
+/* Minimum stack size for a signal handler.  */
+#define MINSIGSTKSZ	4096
+
+/* System default stack size.  */
+#define SIGSTKSZ	16384
+
+
+/* Alternate, preferred interface.  */
+typedef struct sigaltstack
+  {
+    void *ss_sp;
+    int ss_flags;
+    size_t ss_size;
+  } stack_t;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fb2a1205a251c0fd1458fdfffc6641c382e83717

commit fb2a1205a251c0fd1458fdfffc6641c382e83717
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:36:45 2013 +0930

    PowerPC makecontext
    http://sourceware.org/ml/libc-alpha/2013-08/msg00092.html
    
    Use conditional form of branch and link to avoid destroying the cpu
    link stack used to predict blr return addresses.
    
    	* sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S: Use
    	conditional form of branch and link when obtaining pc.
    	* sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 2e6eb84..696a340 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S: Use
+	conditional form of branch and link when obtaining pc.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S: Use
 	HIWORD/LOWORD.
 	* sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S: Ditto.
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S
index 95902b1..70e3c97 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/makecontext.S
@@ -47,7 +47,9 @@ ENTRY(__makecontext)
 #ifdef PIC
 	mflr	r0
 	cfi_register(lr,r0)
-	bl	1f
+	/* Use this conditional form of branch and link to avoid destroying
+	   the cpu link stack used to predict blr return addresses.  */
+	bcl	20,31,1f
 1:	mflr	r6
 	addi	r6,r6,L(exitcode)-1b
 	mtlr	r0
@@ -136,7 +138,9 @@ ENTRY(__novec_makecontext)
 #ifdef PIC
 	mflr	r0
 	cfi_register(lr,r0)
-	bl	1f
+	/* Use this conditional form of branch and link to avoid destroying
+	   the cpu link stack used to predict blr return addresses.  */
+	bcl	20,31,1f
 1:	mflr	r6
 	addi	r6,r6,L(novec_exitcode)-1b
 	mtlr	r0
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
index 4a16669..32fc47c 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/makecontext.S
@@ -124,8 +124,10 @@ L(noparms):
 
   /* If the target function returns we need to do some cleanup.  We use a
      code trick to get the address of our cleanup function into the link
-     register.  Do not add any code between here and L(exitcode).  */
-  bl  L(gotexitcodeaddr);
+     register.  Do not add any code between here and L(exitcode).
+     Use this conditional form of branch and link to avoid destroying
+     the cpu link stack used to predict blr return addresses.  */
+  bcl	20,31,L(gotexitcodeaddr);
 
 	/* This is the helper code which gets called if a function which
 	   is registered with 'makecontext' returns.  In this case we

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f50a734f453d9993dabbbd91e5e276f83c3b38e5

commit f50a734f453d9993dabbbd91e5e276f83c3b38e5
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:36:11 2013 +0930

    PowerPC LE _dl_hwcap access
    http://sourceware.org/ml/libc-alpha/2013-08/msg00091.html
    
    More LE support, correcting word accesses to _dl_hwcap.
    
    	* sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S: Use
    	HIWORD/LOWORD.
    	* sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S: Ditto.
    	* sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S: Ditto.

diff --git a/ChangeLog b/ChangeLog
index f37cd7b..2e6eb84 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S: Use
+	HIWORD/LOWORD.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S: Ditto.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S: Ditto.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/longjmp.c: Use proper symbol versioning macros.
 	* sysdeps/powerpc/novmx-longjmp.c: Likewise.
 	* sysdeps/powerpc/powerpc32/bsd-_setjmp.S: Likewise.
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
index 989899e..7e108f9 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/getcontext-common.S
@@ -151,15 +151,15 @@ ENTRY(__CONTEXT_FUNC_NAME)
 #   ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
-	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r7)
+	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r7)
 #   else
 	lwz     r7,_dl_hwcap@got(r7)
 	mtlr    r8
-	lwz     r7,4(r7)
+	lwz     r7,LOWORD(r7)
 #   endif
 #  else
-	lis	r7,(_dl_hwcap+4)@ha
-	lwz     r7,(_dl_hwcap+4)@l(r7)
+	lis	r7,(_dl_hwcap+LOWORD)@ha
+	lwz     r7,(_dl_hwcap+LOWORD)@l(r7)
 #  endif
 	andis.	r7,r7,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
index 900ce04..3121995 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/setcontext-common.S
@@ -79,15 +79,15 @@ ENTRY(__CONTEXT_FUNC_NAME)
 # ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
-	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r7)
+	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r7)
 # else
 	lwz     r7,_dl_hwcap@got(r7)
 	mtlr    r8
-	lwz     r7,4(r7)
+	lwz     r7,LOWORD(r7)
 # endif
 #else
-	lis	r7,(_dl_hwcap+4)@ha
-	lwz     r7,(_dl_hwcap+4)@l(r7)
+	lis	r7,(_dl_hwcap+LOWORD)@ha
+	lwz     r7,(_dl_hwcap+LOWORD)@l(r7)
 #endif
 
 #ifdef __CONTEXT_ENABLE_FPRS
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
index 77e47a7..0e942d3 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/swapcontext-common.S
@@ -152,15 +152,15 @@ ENTRY(__CONTEXT_FUNC_NAME)
 #  ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
 	mtlr    r8
-	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r7)
+	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r7)
 #  else
 	lwz     r7,_dl_hwcap@got(r7)
 	mtlr    r8
-	lwz     r7,4(r7)
+	lwz     r7,LOWORD(r7)
 #  endif
 # else
-	lis	r7,(_dl_hwcap+4)@ha
-	lwz     r7,(_dl_hwcap+4)@l(r7)
+	lis	r7,(_dl_hwcap+LOWORD)@ha
+	lwz     r7,(_dl_hwcap+LOWORD)@l(r7)
 # endif
 
 # ifdef __CONTEXT_ENABLE_VRS
@@ -308,14 +308,14 @@ ENTRY(__CONTEXT_FUNC_NAME)
 	mtlr    r8
 #   ifdef SHARED
 	lwz     r7,_rtld_global_ro@got(r7)
-	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r7)
+	lwz     r7,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r7)
 #   else
 	lwz     r7,_dl_hwcap@got(r7)
-	lwz     r7,4(r7)
+	lwz     r7,LOWORD(r7)
 #   endif
 #  else
-	lis	r7,(_dl_hwcap+4)@ha
-	lwz     r7,(_dl_hwcap+4)@l(r7)
+	lis	r7,(_dl_hwcap+LOWORD)@ha
+	lwz     r7,(_dl_hwcap+LOWORD)@l(r7)
 #  endif
 	andis.	r7,r7,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 	la	r10,(_UC_VREGS)(r31)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e9c9326efdd0e7ad3ee06d60d6fb3b9a9370cd4b

commit e9c9326efdd0e7ad3ee06d60d6fb3b9a9370cd4b
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:35:40 2013 +0930

    PowerPC ugly symbol versioning
    http://sourceware.org/ml/libc-alpha/2013-08/msg00090.html
    
    This patch fixes symbol versioning in setjmp/longjmp.  The existing
    code uses raw versions, which results in wrong symbol versioning when
    you want to build glibc with a base version of 2.19 for LE.
    
    Note that merging the 64-bit and 32-bit versions in novmx-longjmp.c
    and pt-longjmp.c doesn't result in GLIBC_2.0 versions for 64-bit, due
    to the base in shlib_versions.
    
    	* sysdeps/powerpc/longjmp.c: Use proper symbol versioning macros.
    	* sysdeps/powerpc/novmx-longjmp.c: Likewise.
    	* sysdeps/powerpc/powerpc32/bsd-_setjmp.S: Likewise.
    	* sysdeps/powerpc/powerpc32/bsd-setjmp.S: Likewise.
    	* sysdeps/powerpc/powerpc32/fpu/__longjmp.S: Likewise.
    	* sysdeps/powerpc/powerpc32/fpu/setjmp.S: Likewise.
    	* sysdeps/powerpc/powerpc32/mcount.c: Likewise.
    	* sysdeps/powerpc/powerpc32/setjmp.S: Likewise.
    	* sysdeps/powerpc/powerpc64/setjmp.S: Likewise.
    	* nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 56e77be..f37cd7b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/powerpc/longjmp.c: Use proper symbol versioning macros.
+	* sysdeps/powerpc/novmx-longjmp.c: Likewise.
+	* sysdeps/powerpc/powerpc32/bsd-_setjmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/bsd-setjmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/__longjmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/setjmp.S: Likewise.
+	* sysdeps/powerpc/powerpc32/mcount.c: Likewise.
+	* sysdeps/powerpc/powerpc32/setjmp.S: Likewise.
+	* sysdeps/powerpc/powerpc64/setjmp.S: Likewise.
+	* nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c: Likewise.
+
 2013-10-04  Anton Blanchard <anton@au1.ibm.com>
 	    Alistair Popple <alistair@ozlabs.au.ibm.com>
 	    Alan Modra <amodra@gmail.com>
diff --git a/nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c b/nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c
index ace858f..4ac913c 100644
--- a/nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c
+++ b/nptl/sysdeps/unix/sysv/linux/powerpc/pt-longjmp.c
@@ -41,13 +41,8 @@ void __novmx_longjmp (jmp_buf env, int val)
   __novmx__libc_longjmp (env, val);
 }
 
-# if __WORDSIZE == 64
-symbol_version (__novmx_longjmp,longjmp,GLIBC_2.3);
-symbol_version (__novmx_siglongjmp,siglongjmp,GLIBC_2.3);
-# else
-symbol_version (__novmx_longjmp,longjmp,GLIBC_2.0);
-symbol_version (__novmx_siglongjmp,siglongjmp,GLIBC_2.0);
-# endif
+compat_symbol (libpthread, __novmx_longjmp, longjmp, GLIBC_2_0);
+compat_symbol (libpthread, __novmx_siglongjmp, siglongjmp, GLIBC_2_0);
 #endif /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4))  */
 
 void
@@ -62,5 +57,5 @@ __vmx_siglongjmp (jmp_buf env, int val)
   __libc_siglongjmp (env, val);
 }
 
-versioned_symbol (libc, __vmx_longjmp, longjmp, GLIBC_2_3_4);
-versioned_symbol (libc, __vmx_siglongjmp, siglongjmp, GLIBC_2_3_4);
+versioned_symbol (libpthread, __vmx_longjmp, longjmp, GLIBC_2_3_4);
+versioned_symbol (libpthread, __vmx_siglongjmp, siglongjmp, GLIBC_2_3_4);
diff --git a/sysdeps/powerpc/longjmp.c b/sysdeps/powerpc/longjmp.c
index 198c894..189fc03 100644
--- a/sysdeps/powerpc/longjmp.c
+++ b/sysdeps/powerpc/longjmp.c
@@ -55,6 +55,6 @@ weak_alias (__vmx__libc_siglongjmp, __vmxsiglongjmp)
 
 default_symbol_version (__vmx__libc_longjmp, __libc_longjmp, GLIBC_PRIVATE);
 default_symbol_version (__vmx__libc_siglongjmp, __libc_siglongjmp, GLIBC_PRIVATE);
-default_symbol_version (__vmx_longjmp, _longjmp, GLIBC_2.3.4);
-default_symbol_version (__vmxlongjmp, longjmp, GLIBC_2.3.4);
-default_symbol_version (__vmxsiglongjmp, siglongjmp, GLIBC_2.3.4);
+versioned_symbol (libc, __vmx_longjmp, _longjmp, GLIBC_2_3_4);
+versioned_symbol (libc, __vmxlongjmp, longjmp, GLIBC_2_3_4);
+versioned_symbol (libc, __vmxsiglongjmp, siglongjmp, GLIBC_2_3_4);
diff --git a/sysdeps/powerpc/novmx-longjmp.c b/sysdeps/powerpc/novmx-longjmp.c
index 8f6ea35..b2c0e4c 100644
--- a/sysdeps/powerpc/novmx-longjmp.c
+++ b/sysdeps/powerpc/novmx-longjmp.c
@@ -50,13 +50,7 @@ weak_alias (__novmx__libc_siglongjmp, __novmx_longjmp)
 weak_alias (__novmx__libc_siglongjmp, __novmxlongjmp)
 weak_alias (__novmx__libc_siglongjmp, __novmxsiglongjmp)
 
-# if __WORDSIZE == 64
-symbol_version (__novmx_longjmp,_longjmp,GLIBC_2.3);
-symbol_version (__novmxlongjmp,longjmp,GLIBC_2.3);
-symbol_version (__novmxsiglongjmp,siglongjmp,GLIBC_2.3);
-# else
-symbol_version (__novmx_longjmp,_longjmp,GLIBC_2.0);
-symbol_version (__novmxlongjmp,longjmp,GLIBC_2.0);
-symbol_version (__novmxsiglongjmp,siglongjmp,GLIBC_2.0);
-# endif
+compat_symbol (libc, __novmx_longjmp, _longjmp, GLIBC_2_0);
+compat_symbol (libc, __novmxlongjmp, longjmp, GLIBC_2_0);
+compat_symbol (libc, __novmxsiglongjmp, siglongjmp, GLIBC_2_0);
 #endif /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4))  */
diff --git a/sysdeps/powerpc/powerpc32/bsd-_setjmp.S b/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
index 95e8a5a..ad2b5ff 100644
--- a/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
+++ b/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
@@ -30,7 +30,7 @@ libc_hidden_def (_setjmp)
 /* Build a versioned object for libc.  */
 
 # if defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
-symbol_version (__novmx_setjmp,_setjmp,GLIBC_2.0);
+compat_symbol (libc, __novmx_setjmp, _setjmp, GLIBC_2_0);
 
 ENTRY (__novmx_setjmp)
 	li r4,0			/* Set second argument to 0.  */
@@ -39,7 +39,7 @@ END (__novmx_setjmp)
 libc_hidden_def (__novmx_setjmp)
 # endif /* defined SHARED  && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) */
 
-default_symbol_version (__vmx_setjmp,_setjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx_setjmp, _setjmp, GLIBC_2_3_4)
 /* __GI__setjmp prototype is needed for ntpl i.e. _setjmp is defined
    as a libc_hidden_proto & is used in sysdeps/generic/libc-start.c
    if HAVE_CLEANUP_JMP_BUF is defined */
diff --git a/sysdeps/powerpc/powerpc32/bsd-setjmp.S b/sysdeps/powerpc/powerpc32/bsd-setjmp.S
index 1113ea5..5e1e860 100644
--- a/sysdeps/powerpc/powerpc32/bsd-setjmp.S
+++ b/sysdeps/powerpc/powerpc32/bsd-setjmp.S
@@ -26,7 +26,7 @@ ENTRY (__novmxsetjmp)
 	b __novmx__sigsetjmp@local
 END (__novmxsetjmp)
 strong_alias (__novmxsetjmp, __novmx__setjmp)
-symbol_version (__novmxsetjmp, setjmp, GLIBC_2.0)
+compat_symbol (libc, __novmxsetjmp, setjmp, GLIBC_2_0)
 
 #endif  /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) ) */
 
@@ -36,4 +36,4 @@ ENTRY (__vmxsetjmp)
 END (__vmxsetjmp)
 strong_alias (__vmxsetjmp, __vmx__setjmp)
 strong_alias (__vmx__setjmp, __setjmp)
-default_symbol_version (__vmxsetjmp,setjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmxsetjmp, setjmp, GLIBC_2_3_4)
diff --git a/sysdeps/powerpc/powerpc32/fpu/__longjmp.S b/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
index 96e50de..27166c4 100644
--- a/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
+++ b/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
@@ -26,14 +26,14 @@
 
 #else /* !NOT_IN_libc */
 /* Build a versioned object for libc.  */
-default_symbol_version (__vmx__longjmp,__longjmp,GLIBC_2.3.4);
+versioned_symbol (libc, __vmx__longjmp, __longjmp, GLIBC_2_3_4);
 # define __longjmp  __vmx__longjmp
 # include "__longjmp-common.S"
 
 # if defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 #  define __NO_VMX__
 #  undef JB_SIZE
-symbol_version (__novmx__longjmp,__longjmp,GLIBC_2.0);
+compat_symbol (libc, __novmx__longjmp, __longjmp, GLIBC_2_0);
 #  undef __longjmp
 #  define __longjmp  __novmx__longjmp
 #  include "__longjmp-common.S"
diff --git a/sysdeps/powerpc/powerpc32/fpu/setjmp.S b/sysdeps/powerpc/powerpc32/fpu/setjmp.S
index dc93db3..671032d 100644
--- a/sysdeps/powerpc/powerpc32/fpu/setjmp.S
+++ b/sysdeps/powerpc/powerpc32/fpu/setjmp.S
@@ -26,7 +26,7 @@
 
 #else /* !NOT_IN_libc */
 /* Build a versioned object for libc.  */
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
 # define __sigsetjmp __vmx__sigsetjmp
 # define __sigjmp_save __vmx__sigjmp_save
 # include "setjmp-common.S"
@@ -36,7 +36,7 @@ default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
 #  undef __sigsetjmp
 #  undef __sigjmp_save
 #  undef JB_SIZE
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.0)
+compat_symbol (libc, __novmx__sigsetjmp, __sigsetjmp, GLIBC_2_0)
 #  define __sigsetjmp __novmx__sigsetjmp
 #  define __sigjmp_save __novmx__sigjmp_save
 #  include "setjmp-common.S"
diff --git a/sysdeps/powerpc/powerpc32/mcount.c b/sysdeps/powerpc/powerpc32/mcount.c
index 0476bf6..d8c0632 100644
--- a/sysdeps/powerpc/powerpc32/mcount.c
+++ b/sysdeps/powerpc/powerpc32/mcount.c
@@ -9,7 +9,7 @@
 /* __mcount_internal was added in glibc 2.15 with version GLIBC_PRIVATE,
    but it should have been put in version GLIBC_2.15.  Mark the
    GLIBC_PRIVATE version obsolete and add it to GLIBC_2.16 instead.  */
-default_symbol_version (___mcount_internal, __mcount_internal, GLIBC_2.16);
+versioned_symbol (libc, ___mcount_internal, __mcount_internal, GLIBC_2_16);
 
 #if SHLIB_COMPAT (libc, GLIBC_2_15, GLIBC_2_16)
 strong_alias (___mcount_internal, ___mcount_internal_private);
diff --git a/sysdeps/powerpc/powerpc32/setjmp.S b/sysdeps/powerpc/powerpc32/setjmp.S
index 851a5b9..f0c1507 100644
--- a/sysdeps/powerpc/powerpc32/setjmp.S
+++ b/sysdeps/powerpc/powerpc32/setjmp.S
@@ -25,7 +25,7 @@
 
 #else /* !NOT_IN_libc */
 /* Build a versioned object for libc.  */
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
 # define __sigsetjmp __vmx__sigsetjmp
 # define __sigjmp_save __vmx__sigjmp_save
 # include "setjmp-common.S"
@@ -35,7 +35,7 @@ default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
 #  undef __sigsetjmp
 #  undef __sigjmp_save
 #  undef JB_SIZE
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.0)
+compat_symbol (libc, __novmx__sigsetjmp, __sigsetjmp, GLIBC_2_0)
 #  define __sigsetjmp __novmx__sigsetjmp
 #  define __sigjmp_save __novmx__sigjmp_save
 #  include "setjmp-common.S"
diff --git a/sysdeps/powerpc/powerpc64/setjmp.S b/sysdeps/powerpc/powerpc64/setjmp.S
index 667b9d1..0a3b2fc 100644
--- a/sysdeps/powerpc/powerpc64/setjmp.S
+++ b/sysdeps/powerpc/powerpc64/setjmp.S
@@ -26,9 +26,9 @@
 
 #else /* !NOT_IN_libc */
 /* Build a versioned object for libc.  */
-default_symbol_version (__vmxsetjmp, setjmp, GLIBC_2.3.4)
-default_symbol_version (__vmx_setjmp,_setjmp,GLIBC_2.3.4)
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmxsetjmp, setjmp, GLIBC_2_3_4)
+versioned_symbol (libc, __vmx_setjmp, _setjmp, GLIBC_2_3_4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
 # define setjmp __vmxsetjmp
 # define _setjmp __vmx_setjmp
 # define __sigsetjmp __vmx__sigsetjmp
@@ -44,9 +44,9 @@ strong_alias (__vmx__sigsetjmp, __setjmp)
 #  undef __sigjmp_save
 #  undef JB_SIZE
 #  define __NO_VMX__
-symbol_version (__novmxsetjmp, setjmp, GLIBC_2.3)
-symbol_version (__novmx_setjmp,_setjmp,GLIBC_2.3);
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.3)
+compat_symbol (libc, __novmxsetjmp, setjmp, GLIBC_2_3)
+compat_symbol (libc, __novmx_setjmp,_setjmp, GLIBC_2_3);
+compat_symbol (libc, __novmx__sigsetjmp,__sigsetjmp, GLIBC_2_3)
 #  define setjmp __novmxsetjmp
 #  define _setjmp __novmx_setjmp
 #  define __sigsetjmp __novmx__sigsetjmp

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9ed6497ec694382feabd02f16a0c7c1eda97b1c5

commit 9ed6497ec694382feabd02f16a0c7c1eda97b1c5
Author: Anton Blanchard <anton@au1.ibm.com>
Date:   Sat Aug 17 18:34:40 2013 +0930

    PowerPC LE setjmp/longjmp
    http://sourceware.org/ml/libc-alpha/2013-08/msg00089.html
    
    Little-endian fixes for setjmp/longjmp.  When writing these I noticed
    the setjmp code corrupts the non volatile VMX registers when using an
    unaligned buffer.  Anton fixed this, and also simplified it quite a
    bit.
    
    The current code uses boilerplate for the case where we want to store
    16 bytes to an unaligned address.  For that we have to do a
    read/modify/write of two aligned 16 byte quantities.  In our case we
    are storing a bunch of back to back data (consecutive VMX registers),
    and only the start and end of the region need the read/modify/write.
    
    	[BZ #15723]
    	* sysdeps/powerpc/jmpbuf-offsets.h: Comment fix.
    	* sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S: Correct
    	_dl_hwcap access for little-endian.
    	* sysdeps/powerpc/powerpc32/fpu/setjmp-common.S: Likewise.  Don't
    	destroy vmx regs when saving unaligned.
    	* sysdeps/powerpc/powerpc64/__longjmp-common.S: Correct CR load.
    	* sysdeps/powerpc/powerpc64/setjmp-common.S: Likewise CR save.  Don't
    	destroy vmx regs when saving unaligned.

diff --git a/ChangeLog b/ChangeLog
index 1e3a6c7..56e77be 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2013-10-04  Anton Blanchard <anton@au1.ibm.com>
+	    Alistair Popple <alistair@ozlabs.au.ibm.com>
+	    Alan Modra <amodra@gmail.com>
+
+	[BZ #15723]
+	* sysdeps/powerpc/jmpbuf-offsets.h: Comment fix.
+	* sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S: Correct
+	_dl_hwcap access for little-endian.
+	* sysdeps/powerpc/powerpc32/fpu/setjmp-common.S: Likewise.  Don't
+	destroy vmx regs when saving unaligned.
+	* sysdeps/powerpc/powerpc64/__longjmp-common.S: Correct CR load.
+	* sysdeps/powerpc/powerpc64/setjmp-common.S: Likewise CR save.  Don't
+	destroy vmx regs when saving unaligned.
+
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
 	* sysdeps/powerpc/powerpc32/power4/hp-timing.h (HP_TIMING_NOW):
diff --git a/sysdeps/powerpc/jmpbuf-offsets.h b/sysdeps/powerpc/jmpbuf-offsets.h
index 64c658a..f2116bd 100644
--- a/sysdeps/powerpc/jmpbuf-offsets.h
+++ b/sysdeps/powerpc/jmpbuf-offsets.h
@@ -21,12 +21,10 @@
 #define JB_LR     2  /* The address we will return to */
 #if __WORDSIZE == 64
 # define JB_GPRS   3  /* GPRs 14 through 31 are saved, 18*2 words total.  */
-# define JB_CR     21 /* Condition code registers with the VRSAVE at */
-                       /* offset 172 (low half of the double word.  */
+# define JB_CR     21 /* Shared dword with VRSAVE.  CR word at offset 172.  */
 # define JB_FPRS   22 /* FPRs 14 through 31 are saved, 18*2 words total.  */
 # define JB_SIZE   (64 * 8) /* As per PPC64-VMX ABI.  */
-# define JB_VRSAVE 21 /* VRSAVE shares a double word with the CR at offset */
-                       /* 168 (high half of the double word).  */
+# define JB_VRSAVE 21 /* Shared dword with CR.  VRSAVE word at offset 168.  */
 # define JB_VRS    40 /* VRs 20 through 31 are saved, 12*4 words total.  */
 #else
 # define JB_GPRS   3  /* GPRs 14 through 31 are saved, 18 in total.  */
diff --git a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
index 9d34cd9..d02aa57 100644
--- a/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
+++ b/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
@@ -43,16 +43,16 @@ ENTRY (__longjmp)
 #   endif
 	mtlr    r6
 	cfi_same_value (lr)
-	lwz     r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r5)
+	lwz     r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r5)
 #  else
 	lwz     r5,_dl_hwcap@got(r5)
 	mtlr    r6
 	cfi_same_value (lr)
-	lwz     r5,4(r5)
+	lwz     r5,LOWORD(r5)
 #  endif
 # else
-	lis	r5,(_dl_hwcap+4)@ha
-	lwz     r5,(_dl_hwcap+4)@l(r5)
+	lis	r5,(_dl_hwcap+LOWORD)@ha
+	lwz     r5,(_dl_hwcap+LOWORD)@l(r5)
 # endif
 	andis.	r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 	beq	L(no_vmx)
diff --git a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
index 46ea2b0..f324406 100644
--- a/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
+++ b/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
@@ -94,14 +94,14 @@ ENTRY (__sigsetjmp)
 #   else
 	lwz     r5,_rtld_global_ro@got(r5)
 #   endif
-	lwz     r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r5)
+	lwz     r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r5)
 #  else
 	lwz     r5,_dl_hwcap@got(r5)
-	lwz     r5,4(r5)
+	lwz     r5,LOWORD(r5)
 #  endif
 # else
-	lis	r6,(_dl_hwcap+4)@ha
-	lwz     r5,(_dl_hwcap+4)@l(r6)
+	lis	r6,(_dl_hwcap+LOWORD)@ha
+	lwz     r5,(_dl_hwcap+LOWORD)@l(r6)
 # endif
 	andis.	r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16)
 	beq	L(no_vmx)
@@ -111,44 +111,43 @@ ENTRY (__sigsetjmp)
 	stw	r0,((JB_VRSAVE)*4)(3)
 	addi	r6,r5,16
 	beq+	L(aligned_save_vmx)
-	lvsr	v0,0,r5
-	vspltisb v1,-1         /* set v1 to all 1's */
-	vspltisb v2,0          /* set v2 to all 0's */
-	vperm   v3,v2,v1,v0   /* v3 contains shift mask with num all 1 bytes on left = misalignment  */
 
+	lvsr	v0,0,r5
+	lvsl	v1,0,r5
+	addi	r6,r5,-16
 
-	/* Special case for v20 we need to preserve what is in save area below v20 before obliterating it */
-	lvx     v5,0,r5
-	vperm   v20,v20,v20,v0
-	vsel    v5,v5,v20,v3
-	vsel    v20,v20,v2,v3
-	stvx    v5,0,r5
+# define save_misaligned_vmx(savevr,prevvr,shiftvr,tmpvr,savegpr,addgpr) \
+	addi	addgpr,addgpr,32;					 \
+	vperm	tmpvr,prevvr,savevr,shiftvr;				 \
+	stvx	tmpvr,0,savegpr
 
-#define save_2vmx_partial(savevr,prev_savevr,hivr,shiftvr,maskvr,savegpr,addgpr) \
-	addi    addgpr,addgpr,32; \
-	vperm   savevr,savevr,savevr,shiftvr; \
-	vsel    hivr,prev_savevr,savevr,maskvr; \
-	stvx    hivr,0,savegpr;
+	/*
+	 * We have to be careful not to corrupt the data below v20 and
+	 * above v31. To keep things simple we just rotate both ends in
+	 * the opposite direction to our main permute so we can use
+	 * the common macro.
+	 */
 
-	save_2vmx_partial(v21,v20,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v22,v21,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v23,v22,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v24,v23,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v25,v24,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v26,v25,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v27,v26,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v28,v27,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v29,v28,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v30,v29,v5,v0,v3,r5,r6)
+	/* load and rotate data below v20 */
+	lvx	v2,0,r5
+	vperm	v2,v2,v2,v1
+	save_misaligned_vmx(v20,v2,v0,v3,r5,r6)
+	save_misaligned_vmx(v21,v20,v0,v3,r6,r5)
+	save_misaligned_vmx(v22,v21,v0,v3,r5,r6)
+	save_misaligned_vmx(v23,v22,v0,v3,r6,r5)
+	save_misaligned_vmx(v24,v23,v0,v3,r5,r6)
+	save_misaligned_vmx(v25,v24,v0,v3,r6,r5)
+	save_misaligned_vmx(v26,v25,v0,v3,r5,r6)
+	save_misaligned_vmx(v27,v26,v0,v3,r6,r5)
+	save_misaligned_vmx(v28,v27,v0,v3,r5,r6)
+	save_misaligned_vmx(v29,v28,v0,v3,r6,r5)
+	save_misaligned_vmx(v30,v29,v0,v3,r5,r6)
+	save_misaligned_vmx(v31,v30,v0,v3,r6,r5)
+	/* load and rotate data above v31 */
+	lvx	v2,0,r6
+	vperm	v2,v2,v2,v1
+	save_misaligned_vmx(v2,v31,v0,v3,r5,r6)
 
-	/* Special case for r31 we need to preserve what is in save area above v31 before obliterating it */
-	addi    r5,r5,32
-	vperm   v31,v31,v31,v0
-	lvx     v4,0,r5
-	vsel    v5,v30,v31,v3
-	stvx    v5,0,r6
-	vsel    v4,v31,v4,v3
-	stvx    v4,0,r5
 	b	L(no_vmx)
 
 L(aligned_save_vmx):
diff --git a/sysdeps/powerpc/powerpc64/__longjmp-common.S b/sysdeps/powerpc/powerpc64/__longjmp-common.S
index 70c3704..4f1e3c8 100644
--- a/sysdeps/powerpc/powerpc64/__longjmp-common.S
+++ b/sysdeps/powerpc/powerpc64/__longjmp-common.S
@@ -57,7 +57,7 @@ ENTRY (__longjmp)
 	beq	L(no_vmx)
 	la	r5,((JB_VRS)*8)(3)
 	andi.	r6,r5,0xf
-	lwz	r0,((JB_VRSAVE)*8)(3)
+	lwz	r0,((JB_VRSAVE)*8)(3)	/* 32-bit VRSAVE.  */
 	mtspr	VRSAVE,r0
 	beq+	L(aligned_restore_vmx)
 	addi    r6,r5,16
@@ -153,7 +153,7 @@ L(no_vmx):
 	lfd fp21,((JB_FPRS+7)*8)(r3)
 	ld r22,((JB_GPRS+8)*8)(r3)
 	lfd fp22,((JB_FPRS+8)*8)(r3)
-	ld r0,(JB_CR*8)(r3)
+	lwz r0,((JB_CR*8)+4)(r3)	/* 32-bit CR.  */
 	ld r23,((JB_GPRS+9)*8)(r3)
 	lfd fp23,((JB_FPRS+9)*8)(r3)
 	ld r24,((JB_GPRS+10)*8)(r3)
diff --git a/sysdeps/powerpc/powerpc64/setjmp-common.S b/sysdeps/powerpc/powerpc64/setjmp-common.S
index 58ec610..1829b9a 100644
--- a/sysdeps/powerpc/powerpc64/setjmp-common.S
+++ b/sysdeps/powerpc/powerpc64/setjmp-common.S
@@ -95,7 +95,7 @@ JUMPTARGET(GLUE(__sigsetjmp,_ent)):
 	mfcr r0
 	std  r16,((JB_GPRS+2)*8)(3)
 	stfd fp16,((JB_FPRS+2)*8)(3)
-	std  r0,(JB_CR*8)(3)
+	stw  r0,((JB_CR*8)+4)(3)	/* 32-bit CR.  */
 	std  r17,((JB_GPRS+3)*8)(3)
 	stfd fp17,((JB_FPRS+3)*8)(3)
 	std  r18,((JB_GPRS+4)*8)(3)
@@ -139,50 +139,46 @@ JUMPTARGET(GLUE(__sigsetjmp,_ent)):
 	la	r5,((JB_VRS)*8)(3)
 	andi.	r6,r5,0xf
 	mfspr	r0,VRSAVE
-	stw	r0,((JB_VRSAVE)*8)(3)
+	stw	r0,((JB_VRSAVE)*8)(3)	/* 32-bit VRSAVE.  */
 	addi	r6,r5,16
 	beq+	L(aligned_save_vmx)
-	lvsr	v0,0,r5
-	vspltisb v1,-1         /* set v1 to all 1's */
-	vspltisb v2,0          /* set v2 to all 0's */
-	vperm   v3,v2,v1,v0   /* v3 contains shift mask with num all 1 bytes
-				 on left = misalignment  */
 
+	lvsr	v0,0,r5
+	lvsl	v1,0,r5
+	addi	r6,r5,-16
 
-	/* Special case for v20 we need to preserve what is in save area
-	   below v20 before obliterating it */
-	lvx     v5,0,r5
-	vperm   v20,v20,v20,v0
-	vsel    v5,v5,v20,v3
-	vsel    v20,v20,v2,v3
-	stvx    v5,0,r5
+# define save_misaligned_vmx(savevr,prevvr,shiftvr,tmpvr,savegpr,addgpr) \
+	addi	addgpr,addgpr,32;					 \
+	vperm	tmpvr,prevvr,savevr,shiftvr;				 \
+	stvx	tmpvr,0,savegpr
 
-# define save_2vmx_partial(savevr,prev_savevr,hivr,shiftvr,maskvr,savegpr,addgpr) \
-	addi    addgpr,addgpr,32; \
-	vperm   savevr,savevr,savevr,shiftvr; \
-	vsel    hivr,prev_savevr,savevr,maskvr; \
-	stvx    hivr,0,savegpr;
+	/*
+	 * We have to be careful not to corrupt the data below v20 and
+	 * above v31. To keep things simple we just rotate both ends in
+	 * the opposite direction to our main permute so we can use
+	 * the common macro.
+	 */
 
-	save_2vmx_partial(v21,v20,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v22,v21,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v23,v22,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v24,v23,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v25,v24,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v26,v25,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v27,v26,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v28,v27,v5,v0,v3,r5,r6)
-	save_2vmx_partial(v29,v28,v5,v0,v3,r6,r5)
-	save_2vmx_partial(v30,v29,v5,v0,v3,r5,r6)
+	/* load and rotate data below v20 */
+	lvx	v2,0,r5
+	vperm	v2,v2,v2,v1
+	save_misaligned_vmx(v20,v2,v0,v3,r5,r6)
+	save_misaligned_vmx(v21,v20,v0,v3,r6,r5)
+	save_misaligned_vmx(v22,v21,v0,v3,r5,r6)
+	save_misaligned_vmx(v23,v22,v0,v3,r6,r5)
+	save_misaligned_vmx(v24,v23,v0,v3,r5,r6)
+	save_misaligned_vmx(v25,v24,v0,v3,r6,r5)
+	save_misaligned_vmx(v26,v25,v0,v3,r5,r6)
+	save_misaligned_vmx(v27,v26,v0,v3,r6,r5)
+	save_misaligned_vmx(v28,v27,v0,v3,r5,r6)
+	save_misaligned_vmx(v29,v28,v0,v3,r6,r5)
+	save_misaligned_vmx(v30,v29,v0,v3,r5,r6)
+	save_misaligned_vmx(v31,v30,v0,v3,r6,r5)
+	/* load and rotate data above v31 */
+	lvx	v2,0,r6
+	vperm	v2,v2,v2,v1
+	save_misaligned_vmx(v2,v31,v0,v3,r5,r6)
 
-	/* Special case for r31 we need to preserve what is in save area
-	   above v31 before obliterating it */
-	addi    r5,r5,32
-	vperm   v31,v31,v31,v0
-	lvx     v4,0,r5
-	vsel    v5,v30,v31,v3
-	stvx    v5,0,r6
-	vsel    v4,v31,v4,v3
-	stvx    v4,0,r5
 	b	L(no_vmx)
 
 L(aligned_save_vmx):

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4ec9dde3cc121c1167824b5c5a6fe4a4a683c4e5

commit 4ec9dde3cc121c1167824b5c5a6fe4a4a683c4e5
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:33:45 2013 +0930

    PowerPC floating point little-endian [15 of 15]
    http://sourceware.org/ml/libc-alpha/2013-07/msg00206.html
    
    The union packs the hi/low words in the wrong order when little-endian.
    
    	* sysdeps/powerpc/powerpc32/power4/hp-timing.h (HP_TIMING_NOW):
    	Don't use a union to pack hi/low value.

diff --git a/ChangeLog b/ChangeLog
index 81dc345..1e3a6c7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/powerpc/powerpc32/power4/hp-timing.h (HP_TIMING_NOW):
+	Don't use a union to pack hi/low value.
+
 2013-10-04  Anton Blanchard <anton@au1.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/fpu/s_ceilf.S: Correct float constants
diff --git a/sysdeps/powerpc/powerpc32/power4/hp-timing.h b/sysdeps/powerpc/powerpc32/power4/hp-timing.h
index 7d6c96e..4e42374 100644
--- a/sysdeps/powerpc/powerpc32/power4/hp-timing.h
+++ b/sysdeps/powerpc/powerpc32/power4/hp-timing.h
@@ -87,18 +87,15 @@ typedef unsigned long long int hp_timing_t;
 
 #define HP_TIMING_NOW(Var)						\
   do {									\
-        union { long long ll; long ii[2]; } _var;			\
-	long tmp;							\
-        __asm__ __volatile__ (						\
-		"1:	mfspr	%0,269;"				\
-		"	mfspr	%1,268;"				\
-		"	mfspr	%2,269;"				\
-		"	cmpw	%0,%2;"					\
-		"	bne	1b;"					\
-		: "=r" (_var.ii[0]), "=r" (_var.ii[1]) , "=r" (tmp)	\
-		: : "cr0"						\
-		);							\
-	Var = _var.ll;							\
+    unsigned int hi, lo, tmp;						\
+    __asm__ __volatile__ ("1:	mfspr	%0,269;"			\
+			  "	mfspr	%1,268;"			\
+			  "	mfspr	%2,269;"			\
+			  "	cmpw	%0,%2;"				\
+			  "	bne	1b;"				\
+			  : "=&r" (hi), "=&r" (lo), "=&r" (tmp)		\
+			  : : "cr0");					\
+    Var = ((hp_timing_t) hi << 32) | lo;				\
   } while (0)
 
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=91f32e9adb55272f231009028b4d0f9d12400f25

commit 91f32e9adb55272f231009028b4d0f9d12400f25
Author: Anton Blanchard <anton@au1.ibm.com>
Date:   Sat Aug 17 18:33:02 2013 +0930

    PowerPC floating point little-endian [14 of 15]
    http://sourceware.org/ml/libc-alpha/2013-07/msg00205.html
    
    These files all wrongly specified float constants as the high half of a
    64-bit word, which puts the float at the wrong offset on little-endian.
    
    	* sysdeps/powerpc/powerpc64/fpu/s_ceilf.S: Correct float constants
    	for little-endian.
    	* sysdeps/powerpc/powerpc64/fpu/s_floorf.S: Likewise.
    	* sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S: Likewise.
    	* sysdeps/powerpc/powerpc64/fpu/s_rintf.S: Likewise.
    	* sysdeps/powerpc/powerpc64/fpu/s_roundf.S: Likewise.
    	* sysdeps/powerpc/powerpc64/fpu/s_truncf.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 8ed3d80..81dc345 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2013-10-04  Anton Blanchard <anton@au1.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/fpu/s_ceilf.S: Correct float constants
+	for little-endian.
+	* sysdeps/powerpc/powerpc64/fpu/s_floorf.S: Likewise.
+	* sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S: Likewise.
+	* sysdeps/powerpc/powerpc64/fpu/s_rintf.S: Likewise.
+	* sysdeps/powerpc/powerpc64/fpu/s_roundf.S: Likewise.
+	* sysdeps/powerpc/powerpc64/fpu/s_truncf.S: Likewise.
+
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
 	* sysdeps/powerpc/powerpc32/fpu/s_roundf.S: Increase alignment of
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_ceilf.S b/sysdeps/powerpc/powerpc64/fpu/s_ceilf.S
index 801af5d..45f71d7 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_ceilf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_ceilf.S
@@ -19,8 +19,10 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 EALIGN (__ceilf, 4, 0)
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_floorf.S b/sysdeps/powerpc/powerpc64/fpu/s_floorf.S
index a0a22e7..e85b820 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_floorf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_floorf.S
@@ -19,8 +19,10 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 EALIGN (__floorf, 4, 0)
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S b/sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S
index 876707c..b1a2b8c 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_nearbyintf.S
@@ -26,8 +26,10 @@
 /* float [fp1] nearbyintf(float [fp1]) */
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 EALIGN (__nearbyintf, 4, 0)
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_rintf.S b/sysdeps/powerpc/powerpc64/fpu/s_rintf.S
index cb28ec7..1887717 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_rintf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_rintf.S
@@ -19,8 +19,10 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 EALIGN (__rintf, 4, 0)
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_roundf.S b/sysdeps/powerpc/powerpc64/fpu/s_roundf.S
index 980a77b..4f2c851 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_roundf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_roundf.S
@@ -19,10 +19,12 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
 .LC1:	/* 0.5 */
-	.tc FD_3f000000_0[TC],0x3f00000000000000
+	.long 0x3f000000
+
 	.section	".text"
 
 /* float [fp1] roundf  (float x [fp1])
diff --git a/sysdeps/powerpc/powerpc64/fpu/s_truncf.S b/sysdeps/powerpc/powerpc64/fpu/s_truncf.S
index 5ea5f3d..b8fd050 100644
--- a/sysdeps/powerpc/powerpc64/fpu/s_truncf.S
+++ b/sysdeps/powerpc/powerpc64/fpu/s_truncf.S
@@ -19,8 +19,10 @@
 #include <sysdep.h>
 
 	.section	".toc","aw"
+	.p2align 3
 .LC0:	/* 2**23 */
-	.tc FD_4b000000_0[TC],0x4b00000000000000
+	.long 0x4b000000
+	.long 0x0
 	.section	".text"
 
 /* float [fp1] truncf (float x [fp1])

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e3c82825a789573337cf4d712cf21b3ba3d1fa65

commit e3c82825a789573337cf4d712cf21b3ba3d1fa65
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:32:18 2013 +0930

    PowerPC floating point little-endian [13 of 15]
    http://sourceware.org/ml/libc-alpha/2013-08/msg00088.html
    
    	* sysdeps/powerpc/powerpc32/fpu/s_roundf.S: Increase alignment of
    	constants to usual value for .cst8 section, and remove redundant
    	high address load.
    	* sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S: Use float
    	constant for 0x1p52.  Load little-endian words of double from
    	correct stack offsets.

diff --git a/ChangeLog b/ChangeLog
index e8e9cd9..8ed3d80 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/powerpc32/fpu/s_roundf.S: Increase alignment of
+	constants to usual value for .cst8 section, and remove redundant
+	high address load.
+	* sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S: Use float
+	constant for 0x1p52.  Load little-endian words of double from
+	correct stack offsets.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/sysdep.h (LOWORD, HIWORD, HISHORT): Define.
 	* sysdeps/powerpc/powerpc32/fpu/s_copysign.S: Load little-endian
 	words of double from correct stack offsets.
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
index 2ed9ca7..8cff156 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
@@ -19,7 +19,7 @@
 #include <sysdep.h>
 
 	.section	.rodata.cst8,"aM",@progbits,8
-	.align	2
+	.align	3
 .LC0:	/* 2**23 */
 	.long 0x4b000000
 .LC1:	/* 0.5 */
@@ -60,7 +60,6 @@ ENTRY (__roundf )
 #ifdef SHARED
 	lfs	fp10,.LC1-.LC0(r9)
 #else
-	lis	r9,.LC1@ha
 	lfs	fp10,.LC1@l(r9)
 #endif
 	ble-	cr6,.L4
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
index 631180f..7246ca4 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
@@ -19,12 +19,10 @@
 #include <sysdep.h>
 #include <math_ldbl_opt.h>
 
- .section .rodata.cst12,"aM",@progbits,12
+ .section .rodata.cst8,"aM",@progbits,8
  .align 3
- .LC0:   /* 0x1.0000000000000p+52 == 2^52 */
-	.long 0x43300000
-	.long 0x00000000
-	.long 0x3f000000 /* Use this for 0.5  */
+ .LC0:	.long (52+127)<<23 /* 0x1p+52  */
+	.long (-1+127)<<23 /* 0.5  */
 
 	.section	".text"
 
@@ -57,12 +55,12 @@ ENTRY (__llround)
 	addi	r9,r9,.LC0-got_label@l
 	mtlr	r11
 	cfi_same_value (lr)
-	lfd	fp9,0(r9)
-	lfs	fp10,8(r9)
+	lfs	fp9,0(r9)
+	lfs	fp10,4(r9)
 #else
 	lis r9,.LC0@ha
-	lfd fp9,.LC0@l(r9)	/* Load 2^52 into fpr9.  */
-	lfs fp10,.LC0@l+8(r9)	/* Load 0.5 into fpr10.  */
+	lfs fp9,.LC0@l(r9)	/* Load 2^52 into fpr9.  */
+	lfs fp10,.LC0@l+4(r9)	/* Load 0.5 into fpr10.  */
 #endif
 	fabs	fp2,fp1		/* Get the absolute value of x.  */
 	fsub	fp12,fp10,fp10	/* Compute 0.0 into fpr12.  */
@@ -80,8 +78,8 @@ ENTRY (__llround)
 	nop
 	nop
 	nop
-	lwz	r4,12(r1)	/* Load return as integer.  */
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)	/* Load return as integer.  */
+	lwz	r4,8+LOWORD(r1)
 .Lout:
 	addi	r1,r1,16
 	blr

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f1a45ccc7e37f63b76d996ffff86dfbc6f7f0676

commit f1a45ccc7e37f63b76d996ffff86dfbc6f7f0676
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:31:45 2013 +0930

    PowerPC floating point little-endian [12 of 15]
    http://sourceware.org/ml/libc-alpha/2013-08/msg00087.html
    
    Fixes for little-endian in 32-bit assembly.
    
    	* sysdeps/powerpc/sysdep.h (LOWORD, HIWORD, HISHORT): Define.
    	* sysdeps/powerpc/powerpc32/fpu/s_copysign.S: Load little-endian
    	words of double from correct stack offsets.
    	* sysdeps/powerpc/powerpc32/fpu/s_copysignl.S: Likewise.
    	* sysdeps/powerpc/powerpc32/fpu/s_lrint.S: Likewise.
    	* sysdeps/powerpc/powerpc32/fpu/s_lround.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S: Likewise.
    	* sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S: Likewise.
    	* sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S: Use HISHORT.
    	* sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 5327d98..e8e9cd9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,28 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/sysdep.h (LOWORD, HIWORD, HISHORT): Define.
+	* sysdeps/powerpc/powerpc32/fpu/s_copysign.S: Load little-endian
+	words of double from correct stack offsets.
+	* sysdeps/powerpc/powerpc32/fpu/s_copysignl.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_lrint.S: Likewise.
+	* sysdeps/powerpc/powerpc32/fpu/s_lround.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S: Use HISHORT.
+	* sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S: Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/fpu_control.h (_FPU_GETCW): Rewrite using
 	64-bit int/double union.
 	(_FPU_SETCW): Likewise.
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_copysign.S b/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
index 840891f..1da24f4 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
@@ -29,7 +29,7 @@ ENTRY(__copysign)
 	stwu	r1,-16(r1)
 	cfi_adjust_cfa_offset (16)
 	stfd	fp2,8(r1)
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)
 	cmpwi   r3,0
 	addi    r1,r1,16
 	cfi_adjust_cfa_offset (-16)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S b/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
index 4ec8389..2ad6de2 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
@@ -30,7 +30,7 @@ ENTRY(__copysignl)
 	fmr	fp0,fp1
 	fabs	fp1,fp1
 	fcmpu	cr7,fp0,fp1
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)
 	cmpwi	cr6,r3,0
 	addi	r1,r1,16
 	cfi_adjust_cfa_offset (-16)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_lrint.S b/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
index 27881f8..249fda5 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
@@ -24,10 +24,10 @@ ENTRY (__lrint)
 	stwu	r1,-16(r1)
 	fctiw	fp13,fp1
 	stfd	fp13,8(r1)
-	nop	/* Insure the following load is in a different dispatch group */
+	nop	/* Ensure the following load is in a different dispatch group */
 	nop	/* to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,12(r1)
+	lwz	r3,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__lrint)
diff --git a/sysdeps/powerpc/powerpc32/fpu/s_lround.S b/sysdeps/powerpc/powerpc32/fpu/s_lround.S
index 92dc378..6309f86 100644
--- a/sysdeps/powerpc/powerpc32/fpu/s_lround.S
+++ b/sysdeps/powerpc/powerpc32/fpu/s_lround.S
@@ -67,7 +67,7 @@ ENTRY (__lround)
 	nop	/* Ensure the following load is in a different dispatch  */
 	nop	/* group to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,12(r1)	/* Load return as integer.  */
+	lwz	r3,8+LOWORD(r1)	/* Load return as integer.  */
 .Lout:
 	addi	r1,r1,16
 	blr
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
index 55b2850..e7a88fe 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
@@ -29,8 +29,8 @@ ENTRY (__llrint)
 	nop	/* Insure the following load is in a different dispatch group */
 	nop	/* to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,8(r1)
-	lwz	r4,12(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llrint)
diff --git a/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S b/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
index cc80fcb..da24ad3 100644
--- a/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
+++ b/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
@@ -28,8 +28,8 @@ ENTRY (__llrintf)
 	nop	/* Insure the following load is in a different dispatch group */
 	nop	/* to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,8(r1)
-	lwz	r4,12(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llrintf)
diff --git a/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
index ecd37c3..49c8a08 100644
--- a/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
@@ -39,8 +39,8 @@ ENTRY (__llround)
 	nop	/* Ensure the following load is in a different dispatch  */
 	nop	/* group to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r4,12(r1)
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llround)
diff --git a/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S b/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
index d4da625..780dd9c 100644
--- a/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
+++ b/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
@@ -38,7 +38,7 @@ ENTRY (__lround)
 	nop	/* Ensure the following load is in a different dispatch  */
 	nop	/* group to avoid pipe stall on POWER4&5.  */
 	nop
-	lwz	r3,12(r1)
+	lwz	r3,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__lround)
diff --git a/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
index f2417fd..5f7ba43 100644
--- a/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
@@ -27,8 +27,8 @@ EALIGN (__isnan, 4, 0)
 	ori	r1,r1,0
 	stfd	fp1,24(r1)	/* copy FPR to GPR */
 	ori	r1,r1,0
-	lwz	r4,24(r1)
-	lwz	r5,28(r1)
+	lwz	r4,24+HIWORD(r1)
+	lwz	r5,24+LOWORD(r1)
 	lis	r0,0x7ff0	/* const long r0 0x7ff00000 00000000 */
 	clrlwi	r4,r4,1		/* x = fabs(x) */
 	cmpw	cr7,r4,r0	/* if (fabs(x) =< inf) */
diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
index 2c095db..3ea1858 100644
--- a/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
@@ -27,8 +27,8 @@ EALIGN (__isnan, 4, 0)
 	ori	r1,r1,0
 	stfd	fp1,24(r1)	/* copy FPR to GPR */
 	ori	r1,r1,0
-	lwz	r4,24(r1)
-	lwz	r5,28(r1)
+	lwz	r4,24+HIWORD(r1)
+	lwz	r5,24+LOWORD(r1)
 	lis	r0,0x7ff0	/* const long r0 0x7ff00000 00000000 */
 	clrlwi	r4,r4,1		/* x = fabs(x) */
 	cmpw	cr7,r4,r0	/* if (fabs(x) =< inf) */
diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
index 3344b31..c0660cf 100644
--- a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
@@ -29,8 +29,8 @@ ENTRY (__llrint)
 /* Insure the following load is in a different dispatch group by
    inserting "group ending nop".  */
 	ori	r1,r1,0
-	lwz	r3,8(r1)
-	lwz	r4,12(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llrint)
diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
index 7f64f8d..ce29890 100644
--- a/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
+++ b/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
@@ -28,8 +28,8 @@ ENTRY (__llrintf)
 /* Insure the following load is in a different dispatch group by
    inserting "group ending nop".  */
 	ori	r1,r1,0
-	lwz	r3,8(r1)
-	lwz	r4,12(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llrintf)
diff --git a/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S b/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
index 0ff04cb..abb0840 100644
--- a/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
@@ -39,8 +39,8 @@ ENTRY (__llround)
 /* Insure the following load is in a different dispatch group by
    inserting "group ending nop".  */
 	ori	r1,r1,0
-	lwz	r4,12(r1)
-	lwz	r3,8(r1)
+	lwz	r3,8+HIWORD(r1)
+	lwz	r4,8+LOWORD(r1)
 	addi	r1,r1,16
 	blr
 	END (__llround)
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
index b2ab5bf..095c155 100644
--- a/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
@@ -54,9 +54,8 @@ ENTRY (__finite)
 	stfd    fp1,8(r1)     /* Transfer FP to GPR's.  */
 
 	ori	2,2,0	      /* Force a new dispatch group.  */
-	lhz     r0,8(r1)      /* Fetch the upper portion of the high word of
-			      the FP value (where the exponent and sign bits
-			      are).  */
+	lhz	r0,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
 	clrlwi	r0,r0,17      /* r0 = abs(r0).  */
 	addi	r1,r1,16      /* Reset the stack pointer.  */
 	cmpwi	cr7,r0,0x7ff0 /* r4 == 0x7ff0?.  */
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
index 3f8af60..0101c8f 100644
--- a/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
@@ -48,14 +48,13 @@ ENTRY (__isinf)
 	li	r3,0
 	bflr    29	      /* If not INF, return.  */
 
-	/* Either we have -INF/+INF or a denormal.  */
+	/* Either we have +INF or -INF.  */
 
 	stwu    r1,-16(r1)    /* Allocate stack space.  */
 	stfd    fp1,8(r1)     /* Transfer FP to GPR's.  */
 	ori	2,2,0	      /* Force a new dispatch group.  */
-	lhz	r4,8(r1)      /* Fetch the upper portion of the high word of
-			      the FP value (where the exponent and sign bits
-			      are).  */
+	lhz	r4,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
 	addi	r1,r1,16      /* Reset the stack pointer.  */
 	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
 	li	r3,1
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
index 99ff126..0ad1dcf 100644
--- a/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
@@ -53,8 +53,8 @@ ENTRY (__isnan)
 	stwu	r1,-16(r1)    /* Allocate stack space.  */
 	stfd	fp1,8(r1)     /* Transfer FP to GPR's.  */
 	ori	2,2,0	      /* Force a new dispatch group.  */
-	lwz     r4,8(r1)      /* Load the upper half of the FP value.  */
-	lwz     r5,12(r1)     /* Load the lower half of the FP value.  */
+	lwz     r4,8+HIWORD(r1) /* Load the upper half of the FP value.  */
+	lwz     r5,8+LOWORD(r1) /* Load the lower half of the FP value.  */
 	addi	r1,r1,16      /* Reset the stack pointer.  */
 	lis     r0,0x7ff0     /* Load the upper portion for an INF/NaN.  */
 	clrlwi  r4,r4,1	      /* r4 = abs(r4).  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
index d0071c7..ebec0e0 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -39,10 +39,8 @@ EALIGN (__finite, 4, 0)
 
 	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
 	ori	2,2,0	      /* Force a new dispatch group.  */
-
-	lhz     r4,-16(r1)    /* Fetch the upper portion of the high word of
-			      the FP value (where the exponent and sign bits
-			      are).  */
+	lhz     r4,-16+HISHORT(r1)  /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
 	clrlwi  r4,r4,17      /* r4 = abs(r4).  */
 	cmpwi   cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
 	bltlr   cr7	      /* LT means finite, other non-finite.  */
diff --git a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
index 1aea123..8d088db 100644
--- a/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
@@ -38,9 +38,8 @@ EALIGN (__isinf, 4, 0)
 
 	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
 	ori	2,2,0	      /* Force a new dispatch group.  */
-	lhz	r4,-16(r1)    /* Fetch the upper portion of the high word of
-			      the FP value (where the exponent and sign bits
-			      are).  */
+	lhz	r4,-16+HISHORT(r1)  /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
 	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
 	li	r3,1
 	beqlr   cr7	      /* EQ means INF, otherwise -INF.  */
diff --git a/sysdeps/powerpc/sysdep.h b/sysdeps/powerpc/sysdep.h
index 1b5334a..bc2cb66 100644
--- a/sysdeps/powerpc/sysdep.h
+++ b/sysdeps/powerpc/sysdep.h
@@ -144,6 +144,21 @@
 
 #define VRSAVE	256
 
+/* The 32-bit words of a 64-bit dword are at these offsets in memory.  */
+#if defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
+# define LOWORD 0
+# define HIWORD 4
+#else
+# define LOWORD 4
+# define HIWORD 0
+#endif
+
+/* The high 16-bit word of a 64-bit dword is at this offset in memory.  */
+#if defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
+# define HISHORT 6
+#else
+# define HISHORT 0
+#endif
 
 /* This seems to always be the case on PPC.  */
 #define ALIGNARG(log2) log2

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5febc7ff25ecdbab6b445286887a4cf3b1083273

commit 5febc7ff25ecdbab6b445286887a4cf3b1083273
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:31:05 2013 +0930

    PowerPC floating point little-endian [11 of 15]
    http://sourceware.org/ml/libc-alpha/2013-07/msg00202.html
    
    Another little-endian fix.
    
    	* sysdeps/powerpc/fpu_control.h (_FPU_GETCW): Rewrite using
    	64-bit int/double union.
    	(_FPU_SETCW): Likewise.
    	* sysdeps/powerpc/fpu/tst-setcontext-fpscr.c (_GET_DI_FPSCR): Likewise.
    	(_SET_DI_FPSCR, _GET_SI_FPSCR, _SET_SI_FPSCR): Likewise.

diff --git a/ChangeLog b/ChangeLog
index 34cc9c1..5327d98 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/fpu_control.h (_FPU_GETCW): Rewrite using
+	64-bit int/double union.
+	(_FPU_SETCW): Likewise.
+	* sysdeps/powerpc/fpu/tst-setcontext-fpscr.c (_GET_DI_FPSCR): Likewise.
+	(_SET_DI_FPSCR, _GET_SI_FPSCR, _SET_SI_FPSCR): Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/fpu/s_llround.c (__llround): Rewrite.
 	* sysdeps/powerpc/fpu/s_llroundf.c (__llroundf): Rewrite.
 
diff --git a/sysdeps/powerpc/fpu/tst-setcontext-fpscr.c b/sysdeps/powerpc/fpu/tst-setcontext-fpscr.c
index feffa6b..cc9b320 100644
--- a/sysdeps/powerpc/fpu/tst-setcontext-fpscr.c
+++ b/sysdeps/powerpc/fpu/tst-setcontext-fpscr.c
@@ -83,7 +83,7 @@ ElfW(Addr) query_auxv(int type)
   return 0;
 }
 
-typedef unsigned long long di_fpscr_t __attribute__ ((__mode__ (__DI__)));
+typedef unsigned int di_fpscr_t __attribute__ ((__mode__ (__DI__)));
 typedef unsigned int si_fpscr_t __attribute__ ((__mode__ (__SI__)));
 
 #define _FPSCR_RESERVED 0xfffffff8ffffff04ULL
@@ -95,50 +95,51 @@ typedef unsigned int si_fpscr_t __attribute__ ((__mode__ (__SI__)));
 #define _FPSCR_TEST1_RN  0x0000000000000002ULL
 
 /* Macros for accessing the hardware control word on Power6[x].  */
-# define _GET_DI_FPSCR(__fpscr) ({					     \
-   union { double d;							     \
-           di_fpscr_t fpscr; }						     \
-     tmp __attribute__ ((__aligned__(8)));				     \
-   __asm__ ("mffs 0; stfd%U0 0,%0" : "=m" (tmp.d) : : "fr0");		     \
-   (__fpscr)=tmp.fpscr;							     \
-   tmp.fpscr; })
-
-/* We make sure to zero fp0 after we use it in order to prevent stale data
+#define _GET_DI_FPSCR(__fpscr)						\
+  ({union { double d; di_fpscr_t fpscr; } u;				\
+    register double fr;							\
+    __asm__ ("mffs %0" : "=f" (fr));					\
+    u.d = fr;								\
+    (__fpscr) = u.fpscr;						\
+    u.fpscr;								\
+  })
+
+/* We make sure to zero fp after we use it in order to prevent stale data
    in an fp register from making a test-case pass erroneously.  */
-# define _SET_DI_FPSCR(__fpscr) {					     \
-  union { double d; di_fpscr_t fpscr; }					     \
-    tmp __attribute__ ((__aligned__(8)));				     \
-  tmp.fpscr = __fpscr;							     \
-  /* Set the entire 64-bit FPSCR.  */					     \
-  __asm__ ("lfd%U0 0,%0; "						     \
-	   ".machine push; "						     \
-	   ".machine \"power6\"; "					     \
-	   "mtfsf 255,0,1,0; "						     \
-	   ".machine pop" : : "m" (tmp.d) : "fr0");			     \
-  tmp.d = 0;								     \
-  __asm__("lfd%U0 0,%0" : : "m" (tmp.d) : "fr0");			     \
-}
-
-# define _GET_SI_FPSCR(__fpscr) ({					     \
-   union { double d;							     \
-           si_fpscr_t cw[2]; }						     \
-     tmp __attribute__ ((__aligned__(8)));				     \
-   __asm__ ("mffs 0; stfd%U0 0,%0" : "=m" (tmp.d) : : "fr0");		     \
-   (__fpscr)=tmp.cw[1];							     \
-   tmp.cw[0]; })
-
-/* We make sure to zero fp0 after we use it in order to prevent stale data
+# define _SET_DI_FPSCR(__fpscr)						\
+  { union { double d; di_fpscr_t fpscr; } u;				\
+    register double fr;							\
+    u.fpscr = __fpscr;							\
+    fr = u.d;								\
+    /* Set the entire 64-bit FPSCR.  */					\
+    __asm__ (".machine push; "						\
+	     ".machine \"power6\"; "					\
+	     "mtfsf 255,%0,1,0; "					\
+	     ".machine pop" : : "f" (fr));				\
+    fr = 0.0;								\
+  }
+
+# define _GET_SI_FPSCR(__fpscr)						\
+  ({union { double d; di_fpscr_t fpscr; } u;				\
+    register double fr;							\
+    __asm__ ("mffs %0" : "=f" (fr));					\
+    u.d = fr;								\
+    (__fpscr) = (si_fpscr_t) u.fpscr;					\
+    (si_fpscr_t) u.fpscr;						\
+  })
+
+/* We make sure to zero fp after we use it in order to prevent stale data
    in an fp register from making a test-case pass erroneously.  */
-# define _SET_SI_FPSCR(__fpscr) {					     \
-  union { double d; si_fpscr_t fpscr[2]; }				     \
-    tmp __attribute__ ((__aligned__(8)));				     \
-  /* More-or-less arbitrary; this is a QNaN. */				     \
-  tmp.fpscr[0] = 0xFFF80000;						     \
-  tmp.fpscr[1] = __fpscr;						     \
-  __asm__ ("lfd%U0 0,%0; mtfsf 255,0" : : "m" (tmp.d) : "fr0");		     \
-  tmp.d = 0;								     \
-  __asm__("lfd%U0 0,%0" : : "m" (tmp.d) : "fr0");			     \
-}
+# define _SET_SI_FPSCR(__fpscr)						\
+  { union { double d; di_fpscr_t fpscr; } u;				\
+    register double fr;							\
+    /* More-or-less arbitrary; this is a QNaN. */			\
+    u.fpscr = 0xfff80000ULL << 32;					\
+    u.fpscr |= __fpscr & 0xffffffffULL;					\
+    fr = u.d;								\
+    __asm__ ("mtfsf 255,%0" : : "f" (fr));				\
+    fr = 0.0;								\
+  }
 
 void prime_special_regs(int which)
 {
diff --git a/sysdeps/powerpc/fpu_control.h b/sysdeps/powerpc/fpu_control.h
index 159543b..c6c4cb9 100644
--- a/sysdeps/powerpc/fpu_control.h
+++ b/sysdeps/powerpc/fpu_control.h
@@ -56,22 +56,26 @@ extern fpu_control_t __fpu_control;
 # define _FPU_IEEE     0x000000f0
 
 /* Type of the control word.  */
-typedef unsigned int fpu_control_t __attribute__ ((__mode__ (__SI__)));
+typedef unsigned int fpu_control_t;
 
 /* Macros for accessing the hardware control word.  */
-# define _FPU_GETCW(__cw) ( { \
-  union { double d; fpu_control_t cw[2]; } \
-    tmp __attribute__ ((__aligned__(8))); \
-  __asm__ ("mffs 0; stfd%U0 0,%0" : "=m" (tmp.d) : : "fr0"); \
-  (__cw)=tmp.cw[1]; \
-  tmp.cw[1]; } )
-# define _FPU_SETCW(__cw) { \
-  union { double d; fpu_control_t cw[2]; } \
-    tmp __attribute__ ((__aligned__(8))); \
-  tmp.cw[0] = 0xFFF80000; /* More-or-less arbitrary; this is a QNaN. */ \
-  tmp.cw[1] = __cw; \
-  __asm__ ("lfd%U0 0,%0; mtfsf 255,0" : : "m" (tmp.d) : "fr0"); \
-}
+# define _FPU_GETCW(cw)						\
+  ({union { double __d; unsigned long long __ll; } __u;		\
+    register double __fr;					\
+    __asm__ ("mffs %0" : "=f" (__fr));				\
+    __u.__d = __fr;						\
+    (cw) = (fpu_control_t) __u.__ll;				\
+    (fpu_control_t) __u.__ll;					\
+  })
+
+# define _FPU_SETCW(cw)						\
+  { union { double __d; unsigned long long __ll; } __u;		\
+    register double __fr;					\
+    __u.__ll = 0xfff80000LL << 32; /* This is a QNaN.  */	\
+    __u.__ll |= (cw) & 0xffffffffLL;				\
+    __fr = __u.__d;						\
+    __asm__ ("mtfsf 255,%0" : : "f" (__fr));			\
+  }
 
 /* Default control word set at startup.  */
 extern fpu_control_t __fpu_control;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c88484088a11983552f59cafcccbcd7a16b5ab09

commit c88484088a11983552f59cafcccbcd7a16b5ab09
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:30:23 2013 +0930

    PowerPC floating point little-endian [10 of 15]
    http://sourceware.org/ml/libc-alpha/2013-07/msg00201.html
    
    These two functions oddly test x+1>0 when a double x is >= 0.0, and
    similarly when x is negative.  I don't see the point of that since the
    test should always be true.  I also don't see any need to convert x+1
    to integer rather than simply using xr+1.  Note that the standard
    allows these functions to return any value when the input is outside
    the range of long long, but it's not too hard to prevent xr+1
    overflowing so that's what I've done.
    
    (With rounding mode FE_UPWARD, x+1 can be a lot more than what you
    might naively expect, but perhaps that situation was covered by the
    x - xrf < 1.0 test.)
    
    	* sysdeps/powerpc/fpu/s_llround.c (__llround): Rewrite.
    	* sysdeps/powerpc/fpu/s_llroundf.c (__llroundf): Rewrite.

diff --git a/ChangeLog b/ChangeLog
index 62923b5..34cc9c1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/powerpc/fpu/s_llround.c (__llround): Rewrite.
+	* sysdeps/powerpc/fpu/s_llroundf.c (__llroundf): Rewrite.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/powerpc/fpu/s_float_bitwise.h (__float_and_test28): Don't
 	use vector int constants.
 	(__float_and_test24, __float_and8, __float_get_exp): Likewise.
diff --git a/sysdeps/powerpc/fpu/s_llround.c b/sysdeps/powerpc/fpu/s_llround.c
index 9a01826..995d0a7 100644
--- a/sysdeps/powerpc/fpu/s_llround.c
+++ b/sysdeps/powerpc/fpu/s_llround.c
@@ -19,29 +19,28 @@
 #include <math.h>
 #include <math_ldbl_opt.h>
 
-/* I think that what this routine is supposed to do is round a value
-   to the nearest integer, with values exactly on the boundary rounded
-   away from zero.  */
-/* This routine relies on (long long)x, when x is out of range of a long long,
-   clipping to MAX_LLONG or MIN_LLONG.  */
+/* Round to the nearest integer, with values exactly on a 0.5 boundary
+   rounded away from zero, regardless of the current rounding mode.
+   If (long long)x, when x is out of range of a long long, clips at
+   LLONG_MAX or LLONG_MIN, then this implementation also clips.  */
 
 long long int
 __llround (double x)
 {
-  double xrf;
-  long long int xr;
-  xr = (long long int) x;
-  xrf = (double) xr;
+  long long xr = (long long) x;
+  double xrf = (double) xr;
+
   if (x >= 0.0)
-    if (x - xrf >= 0.5 && x - xrf < 1.0 && x+1 > 0)
-      return x+1;
-    else
-      return x;
+    {
+      if (x - xrf >= 0.5)
+	xr += (long long) ((unsigned long long) xr + 1) > 0;
+    }
   else
-    if (xrf - x >= 0.5 && xrf - x < 1.0 && x-1 < 0)
-      return x-1;
-    else
-      return x;
+    {
+      if (xrf - x >= 0.5)
+	xr -= (long long) ((unsigned long long) xr - 1) < 0;
+    }
+  return xr;
 }
 weak_alias (__llround, llround)
 #ifdef NO_LONG_DOUBLE
diff --git a/sysdeps/powerpc/fpu/s_llroundf.c b/sysdeps/powerpc/fpu/s_llroundf.c
index 07d12ad..0935de6 100644
--- a/sysdeps/powerpc/fpu/s_llroundf.c
+++ b/sysdeps/powerpc/fpu/s_llroundf.c
@@ -18,28 +18,27 @@
 
 #include <math.h>
 
-/* I think that what this routine is supposed to do is round a value
-   to the nearest integer, with values exactly on the boundary rounded
-   away from zero.  */
-/* This routine relies on (long long)x, when x is out of range of a long long,
-   clipping to MAX_LLONG or MIN_LLONG.  */
+/* Round to the nearest integer, with values exactly on a 0.5 boundary
+   rounded away from zero, regardless of the current rounding mode.
+   If (long long)x, when x is out of range of a long long, clips at
+   LLONG_MAX or LLONG_MIN, then this implementation also clips.  */
 
 long long int
 __llroundf (float x)
 {
-  float xrf;
-  long long int xr;
-  xr = (long long int) x;
-  xrf = (float) xr;
+  long long xr = (long long) x;
+  float xrf = (float) xr;
+
   if (x >= 0.0)
-    if (x - xrf >= 0.5 && x - xrf < 1.0 && x+1 > 0)
-      return x+1;
-    else
-      return x;
+    {
+      if (x - xrf >= 0.5)
+	xr += (long long) ((unsigned long long) xr + 1) > 0;
+    }
   else
-    if (xrf - x >= 0.5 && xrf - x < 1.0 && x-1 < 0)
-      return x-1;
-    else
-      return x;
+    {
+      if (xrf - x >= 0.5)
+	xr -= (long long) ((unsigned long long) xr - 1) < 0;
+    }
+  return xr;
 }
 weak_alias (__llroundf, llroundf)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cbeb3f0edee7afc90bb02bb973863a43ab5b076d

commit cbeb3f0edee7afc90bb02bb973863a43ab5b076d
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:29:43 2013 +0930

    PowerPC floating point little-endian [9 of 15]
    http://sourceware.org/ml/libc-alpha/2013-07/msg00200.html
    
    This works around the fact that vsx is disabled in current
    little-endian gcc.  Also, float constants take 4 bytes in memory
    vs. 16 bytes for vector constants, and we don't need to write one lot
    of masks for double (register format) and another for float (mem
    format).
    
    	* sysdeps/powerpc/fpu/s_float_bitwise.h (__float_and_test28): Don't
    	use vector int constants.
    	(__float_and_test24, __float_and8, __float_get_exp): Likewise.

diff --git a/ChangeLog b/ChangeLog
index e9ed5d8..62923b5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/powerpc/fpu/s_float_bitwise.h (__float_and_test28): Don't
+	use vector int constants.
+	(__float_and_test24, __float_and8, __float_get_exp): Likewise.
+
 2013-10-04  Anton Blanchard <anton@au1.ibm.com>
 
 	* sysdeps/powerpc/fpu/fenv_libc.h (fenv_union_t): Replace int
diff --git a/sysdeps/powerpc/fpu/s_float_bitwise.h b/sysdeps/powerpc/fpu/s_float_bitwise.h
index 8e4adca..c0a4e56 100644
--- a/sysdeps/powerpc/fpu/s_float_bitwise.h
+++ b/sysdeps/powerpc/fpu/s_float_bitwise.h
@@ -23,18 +23,19 @@
 #include <math_private.h>
 
 /* Returns (int)(num & 0x7FFFFFF0 == value) */
-static inline
-int __float_and_test28 (float num, float value)
+static inline int
+__float_and_test28 (float num, float value)
 {
   float ret;
 #ifdef _ARCH_PWR7
-  vector int mask = (vector int) {
-    0x7ffffffe, 0x00000000, 0x00000000, 0x0000000
-  };
+  union {
+    int i;
+    float f;
+  } mask = { .i = 0x7ffffff0 };
   __asm__ (
-  /* the 'f' constrain is use on mask because we just need
+  /* the 'f' constraint is used on mask because we just need
    * to compare floats, not full vector */
-    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask)
+    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask.f)
   );
 #else
   int32_t inum;
@@ -46,16 +47,17 @@ int __float_and_test28 (float num, float value)
 }
 
 /* Returns (int)(num & 0x7FFFFF00 == value) */
-static inline
-int __float_and_test24 (float num, float value)
+static inline int
+__float_and_test24 (float num, float value)
 {
   float ret;
 #ifdef _ARCH_PWR7
-  vector int mask = (vector int) {
-    0x7fffffe0, 0x00000000, 0x00000000, 0x0000000
-  };
+  union {
+    int i;
+    float f;
+  } mask = { .i = 0x7fffff00 };
   __asm__ (
-    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask)
+    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask.f)
   );
 #else
   int32_t inum;
@@ -67,16 +69,17 @@ int __float_and_test24 (float num, float value)
 }
 
 /* Returns (float)(num & 0x7F800000) */
-static inline
-float __float_and8 (float num)
+static inline float
+__float_and8 (float num)
 {
   float ret;
 #ifdef _ARCH_PWR7
-  vector int mask = (vector int) {
-    0x7ff00000, 0x00000000, 0x00000000, 0x00000000
-  };
+  union {
+    int i;
+    float f;
+  } mask = { .i = 0x7f800000 };
   __asm__ (
-    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask)
+    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask.f)
   );
 #else
   int32_t inum;
@@ -88,17 +91,18 @@ float __float_and8 (float num)
 }
 
 /* Returns ((int32_t)(num & 0x7F800000) >> 23) */
-static inline
-int32_t __float_get_exp (float num)
+static inline int32_t
+__float_get_exp (float num)
 {
   int32_t inum;
 #ifdef _ARCH_PWR7
   float ret;
-  vector int mask = (vector int) {
-    0x7ff00000, 0x00000000, 0x00000000, 0x00000000
-  };
+  union {
+    int i;
+    float f;
+  } mask = { .i = 0x7f800000 };
   __asm__ (
-    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask)
+    "xxland %x0,%x1,%x2" : "=f" (ret) : "f" (num), "f" (mask.f)
   );
   GET_FLOAT_WORD(inum, ret);
 #else

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=977432e4f3061cde059ec9afbf8dd38d64dcf8e9

commit 977432e4f3061cde059ec9afbf8dd38d64dcf8e9
Author: Anton Blanchard <anton@au1.ibm.com>
Date:   Sat Aug 17 18:28:55 2013 +0930

    PowerPC floating point little-endian [8 of 15]
    http://sourceware.org/ml/libc-alpha/2013-07/msg00199.html
    
    Corrects floating-point environment code for little-endian.
    
    	* sysdeps/powerpc/fpu/fenv_libc.h (fenv_union_t): Replace int
    	array with long long.
    	* sysdeps/powerpc/fpu/e_sqrt.c (__slow_ieee754_sqrt): Adjust.
    	* sysdeps/powerpc/fpu/e_sqrtf.c (__slow_ieee754_sqrtf): Adjust.
    	* sysdeps/powerpc/fpu/fclrexcpt.c (__feclearexcept): Adjust.
    	* sysdeps/powerpc/fpu/fedisblxcpt.c (fedisableexcept): Adjust.
    	* sysdeps/powerpc/fpu/feenablxcpt.c (feenableexcept): Adjust.
    	* sysdeps/powerpc/fpu/fegetexcept.c (__fegetexcept): Adjust.
    	* sysdeps/powerpc/fpu/feholdexcpt.c (feholdexcept): Adjust.
    	* sysdeps/powerpc/fpu/fesetenv.c (__fesetenv): Adjust.
    	* sysdeps/powerpc/fpu/feupdateenv.c (__feupdateenv): Adjust.
    	* sysdeps/powerpc/fpu/fgetexcptflg.c (__fegetexceptflag): Adjust.
    	* sysdeps/powerpc/fpu/fraiseexcpt.c (__feraiseexcept): Adjust.
    	* sysdeps/powerpc/fpu/fsetexcptflg.c (__fesetexceptflag): Adjust.
    	* sysdeps/powerpc/fpu/ftestexcept.c (fetestexcept): Adjust.

diff --git a/ChangeLog b/ChangeLog
index 5c0f524..e9ed5d8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,23 @@
 2013-10-04  Anton Blanchard <anton@au1.ibm.com>
 
+	* sysdeps/powerpc/fpu/fenv_libc.h (fenv_union_t): Replace int
+	array with long long.
+	* sysdeps/powerpc/fpu/e_sqrt.c (__slow_ieee754_sqrt): Adjust.
+	* sysdeps/powerpc/fpu/e_sqrtf.c (__slow_ieee754_sqrtf): Adjust.
+	* sysdeps/powerpc/fpu/fclrexcpt.c (__feclearexcept): Adjust.
+	* sysdeps/powerpc/fpu/fedisblxcpt.c (fedisableexcept): Adjust.
+	* sysdeps/powerpc/fpu/feenablxcpt.c (feenableexcept): Adjust.
+	* sysdeps/powerpc/fpu/fegetexcept.c (__fegetexcept): Adjust.
+	* sysdeps/powerpc/fpu/feholdexcpt.c (feholdexcept): Adjust.
+	* sysdeps/powerpc/fpu/fesetenv.c (__fesetenv): Adjust.
+	* sysdeps/powerpc/fpu/feupdateenv.c (__feupdateenv): Adjust.
+	* sysdeps/powerpc/fpu/fgetexcptflg.c (__fegetexceptflag): Adjust.
+	* sysdeps/powerpc/fpu/fraiseexcpt.c (__feraiseexcept): Adjust.
+	* sysdeps/powerpc/fpu/fsetexcptflg.c (__fesetexceptflag): Adjust.
+	* sysdeps/powerpc/fpu/ftestexcept.c (fetestexcept): Adjust.
+
+2013-10-04  Anton Blanchard <anton@au1.ibm.com>
+
 	* sysdeps/powerpc/bits/mathinline.h (__signbitf): Use builtin.
 	(__signbit): Likewise.  Correct for little-endian.
 	(__signbitl): Call __signbit.
diff --git a/sysdeps/powerpc/fpu/e_sqrt.c b/sysdeps/powerpc/fpu/e_sqrt.c
index 3efe277..2d50fb5 100644
--- a/sysdeps/powerpc/fpu/e_sqrt.c
+++ b/sysdeps/powerpc/fpu/e_sqrt.c
@@ -145,7 +145,7 @@ __slow_ieee754_sqrt (double x)
       feraiseexcept (FE_INVALID_SQRT);
 
       fenv_union_t u = { .fenv = fegetenv_register () };
-      if ((u.l[1] & FE_INVALID) == 0)
+      if ((u.l & FE_INVALID) == 0)
 #endif
 	feraiseexcept (FE_INVALID);
       x = a_nan.value;
diff --git a/sysdeps/powerpc/fpu/e_sqrtf.c b/sysdeps/powerpc/fpu/e_sqrtf.c
index 6e50a3c..91d2d37 100644
--- a/sysdeps/powerpc/fpu/e_sqrtf.c
+++ b/sysdeps/powerpc/fpu/e_sqrtf.c
@@ -121,7 +121,7 @@ __slow_ieee754_sqrtf (float x)
       feraiseexcept (FE_INVALID_SQRT);
 
       fenv_union_t u = { .fenv = fegetenv_register () };
-      if ((u.l[1] & FE_INVALID) == 0)
+      if ((u.l & FE_INVALID) == 0)
 #endif
 	feraiseexcept (FE_INVALID);
       x = a_nan.value;
diff --git a/sysdeps/powerpc/fpu/fclrexcpt.c b/sysdeps/powerpc/fpu/fclrexcpt.c
index 86575db..7f66e21 100644
--- a/sysdeps/powerpc/fpu/fclrexcpt.c
+++ b/sysdeps/powerpc/fpu/fclrexcpt.c
@@ -28,8 +28,8 @@ __feclearexcept (int excepts)
   u.fenv = fegetenv_register ();
 
   /* Clear the relevant bits.  */
-  u.l[1] = u.l[1] & ~((-(excepts >> (31 - FPSCR_VX) & 1) & FE_ALL_INVALID)
-		      | (excepts & FPSCR_STICKY_BITS));
+  u.l = u.l & ~((-(excepts >> (31 - FPSCR_VX) & 1) & FE_ALL_INVALID)
+		| (excepts & FPSCR_STICKY_BITS));
 
   /* Put the new state in effect.  */
   fesetenv_register (u.fenv);
diff --git a/sysdeps/powerpc/fpu/fedisblxcpt.c b/sysdeps/powerpc/fpu/fedisblxcpt.c
index 659566b..f2c45a6 100644
--- a/sysdeps/powerpc/fpu/fedisblxcpt.c
+++ b/sysdeps/powerpc/fpu/fedisblxcpt.c
@@ -32,15 +32,15 @@ fedisableexcept (int excepts)
 
   fe.fenv = fegetenv_register ();
   if (excepts & FE_INEXACT)
-    fe.l[1] &= ~(1 << (31 - FPSCR_XE));
+    fe.l &= ~(1 << (31 - FPSCR_XE));
   if (excepts & FE_DIVBYZERO)
-    fe.l[1] &= ~(1 << (31 - FPSCR_ZE));
+    fe.l &= ~(1 << (31 - FPSCR_ZE));
   if (excepts & FE_UNDERFLOW)
-    fe.l[1] &= ~(1 << (31 - FPSCR_UE));
+    fe.l &= ~(1 << (31 - FPSCR_UE));
   if (excepts & FE_OVERFLOW)
-    fe.l[1] &= ~(1 << (31 - FPSCR_OE));
+    fe.l &= ~(1 << (31 - FPSCR_OE));
   if (excepts & FE_INVALID)
-    fe.l[1] &= ~(1 << (31 - FPSCR_VE));
+    fe.l &= ~(1 << (31 - FPSCR_VE));
   fesetenv_register (fe.fenv);
 
   new = __fegetexcept ();
diff --git a/sysdeps/powerpc/fpu/feenablxcpt.c b/sysdeps/powerpc/fpu/feenablxcpt.c
index fc4bfff..472796d 100644
--- a/sysdeps/powerpc/fpu/feenablxcpt.c
+++ b/sysdeps/powerpc/fpu/feenablxcpt.c
@@ -32,15 +32,15 @@ feenableexcept (int excepts)
 
   fe.fenv = fegetenv_register ();
   if (excepts & FE_INEXACT)
-    fe.l[1] |= (1 << (31 - FPSCR_XE));
+    fe.l |= (1 << (31 - FPSCR_XE));
   if (excepts & FE_DIVBYZERO)
-    fe.l[1] |= (1 << (31 - FPSCR_ZE));
+    fe.l |= (1 << (31 - FPSCR_ZE));
   if (excepts & FE_UNDERFLOW)
-    fe.l[1] |= (1 << (31 - FPSCR_UE));
+    fe.l |= (1 << (31 - FPSCR_UE));
   if (excepts & FE_OVERFLOW)
-    fe.l[1] |= (1 << (31 - FPSCR_OE));
+    fe.l |= (1 << (31 - FPSCR_OE));
   if (excepts & FE_INVALID)
-    fe.l[1] |= (1 << (31 - FPSCR_VE));
+    fe.l |= (1 << (31 - FPSCR_VE));
   fesetenv_register (fe.fenv);
 
   new = __fegetexcept ();
diff --git a/sysdeps/powerpc/fpu/fegetexcept.c b/sysdeps/powerpc/fpu/fegetexcept.c
index f3d5724..23d47a2 100644
--- a/sysdeps/powerpc/fpu/fegetexcept.c
+++ b/sysdeps/powerpc/fpu/fegetexcept.c
@@ -27,15 +27,15 @@ __fegetexcept (void)
 
   fe.fenv = fegetenv_register ();
 
-  if (fe.l[1] & (1 << (31 - FPSCR_XE)))
+  if (fe.l & (1 << (31 - FPSCR_XE)))
       result |= FE_INEXACT;
-  if (fe.l[1] & (1 << (31 - FPSCR_ZE)))
+  if (fe.l & (1 << (31 - FPSCR_ZE)))
       result |= FE_DIVBYZERO;
-  if (fe.l[1] & (1 << (31 - FPSCR_UE)))
+  if (fe.l & (1 << (31 - FPSCR_UE)))
       result |= FE_UNDERFLOW;
-  if (fe.l[1] & (1 << (31 - FPSCR_OE)))
+  if (fe.l & (1 << (31 - FPSCR_OE)))
       result |= FE_OVERFLOW;
-  if (fe.l[1] & (1 << (31 - FPSCR_VE)))
+  if (fe.l & (1 << (31 - FPSCR_VE)))
       result |= FE_INVALID;
 
   return result;
diff --git a/sysdeps/powerpc/fpu/feholdexcpt.c b/sysdeps/powerpc/fpu/feholdexcpt.c
index 013d2bf..0ecf0f7 100644
--- a/sysdeps/powerpc/fpu/feholdexcpt.c
+++ b/sysdeps/powerpc/fpu/feholdexcpt.c
@@ -30,13 +30,12 @@ feholdexcept (fenv_t *envp)
 
   /* Clear everything except for the rounding modes and non-IEEE arithmetic
      flag.  */
-  new.l[1] = old.l[1] & 7;
-  new.l[0] = old.l[0];
+  new.l = old.l & 0xffffffff00000007LL;
 
   /* If the old env had any enabled exceptions, then mask SIGFPE in the
      MSR FE0/FE1 bits.  This may allow the FPU to run faster because it
      always takes the default action and can not generate SIGFPE. */
-  if ((old.l[1] & _FPU_MASK_ALL) != 0)
+  if ((old.l & _FPU_MASK_ALL) != 0)
     (void)__fe_mask_env ();
 
   /* Put the new state in effect.  */
diff --git a/sysdeps/powerpc/fpu/fenv_libc.h b/sysdeps/powerpc/fpu/fenv_libc.h
index 1910951..baa2a7d 100644
--- a/sysdeps/powerpc/fpu/fenv_libc.h
+++ b/sysdeps/powerpc/fpu/fenv_libc.h
@@ -69,7 +69,7 @@ libm_hidden_proto (__fe_nomask_env)
 typedef union
 {
   fenv_t fenv;
-  unsigned int l[2];
+  unsigned long long l;
 } fenv_union_t;
 
 
diff --git a/sysdeps/powerpc/fpu/fesetenv.c b/sysdeps/powerpc/fpu/fesetenv.c
index e92adb4..6c00b26 100644
--- a/sysdeps/powerpc/fpu/fesetenv.c
+++ b/sysdeps/powerpc/fpu/fesetenv.c
@@ -34,14 +34,14 @@ __fesetenv (const fenv_t *envp)
      exceptions, then unmask SIGFPE in the MSR FE0/FE1 bits.  This will put the
      hardware into "precise mode" and may cause the FPU to run slower on some
      hardware.  */
-  if ((old.l[1] & _FPU_MASK_ALL) == 0 && (new.l[1] & _FPU_MASK_ALL) != 0)
+  if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
     (void)__fe_nomask_env ();
 
   /* If the old env had any enabled exceptions and the new env has no enabled
      exceptions, then mask SIGFPE in the MSR FE0/FE1 bits.  This may allow the
      FPU to run faster because it always takes the default action and can not
      generate SIGFPE. */
-  if ((old.l[1] & _FPU_MASK_ALL) != 0 && (new.l[1] & _FPU_MASK_ALL) == 0)
+  if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
     (void)__fe_mask_env ();
 
   fesetenv_register (*envp);
diff --git a/sysdeps/powerpc/fpu/feupdateenv.c b/sysdeps/powerpc/fpu/feupdateenv.c
index 6500ea1..6775044 100644
--- a/sysdeps/powerpc/fpu/feupdateenv.c
+++ b/sysdeps/powerpc/fpu/feupdateenv.c
@@ -34,20 +34,20 @@ __feupdateenv (const fenv_t *envp)
   /* Restore rounding mode and exception enable from *envp and merge
      exceptions.  Leave fraction rounded/inexact and FP result/CC bits
      unchanged.  */
-  new.l[1] = (old.l[1] & 0x1FFFFF00) | (new.l[1] & 0x1FF80FFF);
+  new.l = (old.l & 0xffffffff1fffff00LL) | (new.l & 0x1ff80fff);
 
   /* If the old env has no enabled exceptions and the new env has any enabled
      exceptions, then unmask SIGFPE in the MSR FE0/FE1 bits.  This will put
      the hardware into "precise mode" and may cause the FPU to run slower on
      some hardware.  */
-  if ((old.l[1] & _FPU_MASK_ALL) == 0 && (new.l[1] & _FPU_MASK_ALL) != 0)
+  if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
     (void)__fe_nomask_env ();
 
   /* If the old env had any enabled exceptions and the new env has no enabled
      exceptions, then mask SIGFPE in the MSR FE0/FE1 bits.  This may allow the
      FPU to run faster because it always takes the default action and can not
      generate SIGFPE. */
-  if ((old.l[1] & _FPU_MASK_ALL) != 0 && (new.l[1] & _FPU_MASK_ALL) == 0)
+  if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
     (void)__fe_mask_env ();
 
   /* Atomically enable and raise (if appropriate) exceptions set in `new'. */
diff --git a/sysdeps/powerpc/fpu/fgetexcptflg.c b/sysdeps/powerpc/fpu/fgetexcptflg.c
index f6327ce..1395bed 100644
--- a/sysdeps/powerpc/fpu/fgetexcptflg.c
+++ b/sysdeps/powerpc/fpu/fgetexcptflg.c
@@ -27,7 +27,7 @@ __fegetexceptflag (fexcept_t *flagp, int excepts)
   u.fenv = fegetenv_register ();
 
   /* Return (all of) it.  */
-  *flagp = u.l[1] & excepts & FE_ALL_EXCEPT;
+  *flagp = u.l & excepts & FE_ALL_EXCEPT;
 
   /* Success.  */
   return 0;
diff --git a/sysdeps/powerpc/fpu/fraiseexcpt.c b/sysdeps/powerpc/fpu/fraiseexcpt.c
index 9118c19..6193071 100644
--- a/sysdeps/powerpc/fpu/fraiseexcpt.c
+++ b/sysdeps/powerpc/fpu/fraiseexcpt.c
@@ -33,11 +33,11 @@ __feraiseexcept (int excepts)
   u.fenv = fegetenv_register ();
 
   /* Add the exceptions */
-  u.l[1] = (u.l[1]
-	    | (excepts & FPSCR_STICKY_BITS)
-	    /* Turn FE_INVALID into FE_INVALID_SOFTWARE.  */
-	    | (excepts >> ((31 - FPSCR_VX) - (31 - FPSCR_VXSOFT))
-	       & FE_INVALID_SOFTWARE));
+  u.l = (u.l
+	 | (excepts & FPSCR_STICKY_BITS)
+	 /* Turn FE_INVALID into FE_INVALID_SOFTWARE.  */
+	 | (excepts >> ((31 - FPSCR_VX) - (31 - FPSCR_VXSOFT))
+	    & FE_INVALID_SOFTWARE));
 
   /* Store the new status word (along with the rest of the environment),
      triggering any appropriate exceptions.  */
@@ -49,7 +49,7 @@ __feraiseexcept (int excepts)
 	 don't have FE_INVALID_SOFTWARE implemented.  Detect this
 	 case and raise FE_INVALID_SNAN instead.  */
       u.fenv = fegetenv_register ();
-      if ((u.l[1] & FE_INVALID) == 0)
+      if ((u.l & FE_INVALID) == 0)
 	set_fpscr_bit (FPSCR_VXSNAN);
     }
 
diff --git a/sysdeps/powerpc/fpu/fsetexcptflg.c b/sysdeps/powerpc/fpu/fsetexcptflg.c
index c050d40..0d309c8 100644
--- a/sysdeps/powerpc/fpu/fsetexcptflg.c
+++ b/sysdeps/powerpc/fpu/fsetexcptflg.c
@@ -31,10 +31,10 @@ __fesetexceptflag (const fexcept_t *flagp, int excepts)
   flag = *flagp & excepts;
 
   /* Replace the exception status */
-  u.l[1] = ((u.l[1] & ~(FPSCR_STICKY_BITS & excepts))
-	    | (flag & FPSCR_STICKY_BITS)
-	    | (flag >> ((31 - FPSCR_VX) - (31 - FPSCR_VXSOFT))
-	       & FE_INVALID_SOFTWARE));
+  u.l = ((u.l & ~(FPSCR_STICKY_BITS & excepts))
+	 | (flag & FPSCR_STICKY_BITS)
+	 | (flag >> ((31 - FPSCR_VX) - (31 - FPSCR_VXSOFT))
+	    & FE_INVALID_SOFTWARE));
 
   /* Store the new status word (along with the rest of the environment).
      This may cause floating-point exceptions if the restored state
diff --git a/sysdeps/powerpc/fpu/ftestexcept.c b/sysdeps/powerpc/fpu/ftestexcept.c
index 0dbc3be..86eea0f 100644
--- a/sysdeps/powerpc/fpu/ftestexcept.c
+++ b/sysdeps/powerpc/fpu/ftestexcept.c
@@ -28,6 +28,6 @@ fetestexcept (int excepts)
 
   /* The FE_INVALID bit is dealt with correctly by the hardware, so we can
      just:  */
-  return u.l[1] & excepts;
+  return u.l & excepts;
 }
 libm_hidden_def (fetestexcept)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d6bbeb3f4ee99cdfcf7935780d710dfc2d2b3cbc

commit d6bbeb3f4ee99cdfcf7935780d710dfc2d2b3cbc
Author: Anton Blanchard <anton@au1.ibm.com>
Date:   Sat Aug 17 18:28:06 2013 +0930

    PowerPC floating point little-endian [7 of 15]
    http://sourceware.org/ml/libc-alpha/2013-08/msg00086.html
    
    	* sysdeps/powerpc/bits/mathinline.h (__signbitf): Use builtin.
    	(__signbit): Likewise.  Correct for little-endian.
    	(__signbitl): Call __signbit.
    	(lrint): Correct for little-endian.
    	(lrintf): Call lrint.

diff --git a/ChangeLog b/ChangeLog
index b090c79..5c0f524 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2013-10-04  Anton Blanchard <anton@au1.ibm.com>
+
+	* sysdeps/powerpc/bits/mathinline.h (__signbitf): Use builtin.
+	(__signbit): Likewise.  Correct for little-endian.
+	(__signbitl): Call __signbit.
+	(lrint): Correct for little-endian.
+	(lrintf): Call lrint.
+
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
 	* sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c (mynumber): Replace
diff --git a/sysdeps/powerpc/bits/mathinline.h b/sysdeps/powerpc/bits/mathinline.h
index 140fff0..cef5b29 100644
--- a/sysdeps/powerpc/bits/mathinline.h
+++ b/sysdeps/powerpc/bits/mathinline.h
@@ -61,21 +61,28 @@
 __MATH_INLINE int
 __NTH (__signbitf (float __x))
 {
+#if __GNUC_PREREQ (4, 0)
+  return __builtin_signbitf (__x);
+#else
   __extension__ union { float __f; int __i; } __u = { __f: __x };
   return __u.__i < 0;
+#endif
 }
 __MATH_INLINE int
 __NTH (__signbit (double __x))
 {
-  __extension__ union { double __d; int __i[2]; } __u = { __d: __x };
-  return __u.__i[0] < 0;
+#if __GNUC_PREREQ (4, 0)
+  return __builtin_signbit (__x);
+#else
+  __extension__ union { double __d; long long __i; } __u = { __d: __x };
+  return __u.__i < 0;
+#endif
 }
 #  ifdef __LONG_DOUBLE_128__
 __MATH_INLINE int
 __NTH (__signbitl (long double __x))
 {
-  __extension__ union { long double __d; int __i[4]; } __u = { __d: __x };
-  return __u.__i[0] < 0;
+  return __signbit ((double) __x);
 }
 #  endif
 # endif
@@ -92,22 +99,17 @@ __NTH (lrint (double __x))
 {
   union {
     double __d;
-    int __ll[2];
+    long long __ll;
   } __u;
   __asm__ ("fctiw %0,%1" : "=f"(__u.__d) : "f"(__x));
-  return __u.__ll[1];
+  return __u.__ll;
 }
 
 __MATH_INLINE long int lrintf (float __x) __THROW;
 __MATH_INLINE long int
 __NTH (lrintf (float __x))
 {
-  union {
-    double __d;
-    int __ll[2];
-  } __u;
-  __asm__ ("fctiw %0,%1" : "=f"(__u.__d) : "f"(__x));
-  return __u.__ll[1];
+  return lrint ((double) __x);
 }
 # endif
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e1120ce9e80828f3619c7dc0d619be2d75aecc46

commit e1120ce9e80828f3619c7dc0d619be2d75aecc46
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:27:19 2013 +0930

    PowerPC floating point little-endian [6 of 15]
    http://sourceware.org/ml/libc-alpha/2013-07/msg00197.html
    
    A rewrite to make this code correct for little-endian.
    
    	* sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c (mynumber): Replace
    	union 32-bit int array member with 64-bit int array.
    	(t512, tm256): Double rather than long double.
    	(__ieee754_sqrtl): Rewrite using 64-bit arithmetic.

diff --git a/ChangeLog b/ChangeLog
index 13b7ea3..b090c79 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c (mynumber): Replace
+	union 32-bit int array member with 64-bit int array.
+	(t515, tm256): Double rather than long double.
+	(__ieee754_sqrtl): Rewrite using 64-bit arithmetic.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/ieee754/ldbl-128ibm/ieee754.h (union ieee854_long_double):
 	Delete.
 	(IEEE854_LONG_DOUBLE_BIAS): Delete.
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c b/sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c
index 2b0f7c6..61feb36 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_sqrtl.c
@@ -34,15 +34,13 @@
 
 #include <math_private.h>
 
-typedef unsigned int int4;
-typedef union {int4 i[4]; long double x; double d[2]; } mynumber;
+typedef union {int64_t i[2]; long double x; double d[2]; } mynumber;
 
-static const  mynumber
-  t512 = {{0x5ff00000, 0x00000000, 0x00000000, 0x00000000 }},  /* 2^512  */
-  tm256 = {{0x2ff00000, 0x00000000, 0x00000000, 0x00000000 }};  /* 2^-256 */
 static const double
-two54 = 1.80143985094819840000e+16, /* 0x4350000000000000 */
-twom54 = 5.55111512312578270212e-17; /* 0x3C90000000000000 */
+  t512 = 0x1p512,
+  tm256 = 0x1p-256,
+  two54 = 0x1p54,	/* 0x4350000000000000 */
+  twom54 = 0x1p-54;	/* 0x3C90000000000000 */
 
 /*********************************************************************/
 /* An ultimate sqrt routine. Given an IEEE double machine number x   */
@@ -54,56 +52,53 @@ long double __ieee754_sqrtl(long double x)
   static const long double big = 134217728.0, big1 = 134217729.0;
   long double t,s,i;
   mynumber a,c;
-  int4 k, l, m;
-  int n;
+  uint64_t k, l;
+  int64_t m, n;
   double d;
 
   a.x=x;
-  k=a.i[0] & 0x7fffffff;
+  k=a.i[0] & INT64_C(0x7fffffffffffffff);
   /*----------------- 2^-1022  <= | x |< 2^1024  -----------------*/
-  if (k>0x000fffff && k<0x7ff00000) {
+  if (k>INT64_C(0x000fffff00000000) && k<INT64_C(0x7ff0000000000000)) {
     if (x < 0) return (big1-big1)/(big-big);
-    l = (k&0x001fffff)|0x3fe00000;
-    if (((a.i[2] & 0x7fffffff) | a.i[3]) != 0) {
-      n = (int) ((l - k) * 2) >> 21;
-      m = (a.i[2] >> 20) & 0x7ff;
+    l = (k&INT64_C(0x001fffffffffffff))|INT64_C(0x3fe0000000000000);
+    if ((a.i[1] & INT64_C(0x7fffffffffffffff)) != 0) {
+      n = (int64_t) ((l - k) * 2) >> 53;
+      m = (a.i[1] >> 52) & 0x7ff;
       if (m == 0) {
 	a.d[1] *= two54;
-	m = ((a.i[2] >> 20) & 0x7ff) - 54;
+	m = ((a.i[1] >> 52) & 0x7ff) - 54;
       }
       m += n;
-      if ((int) m > 0)
-	a.i[2] = (a.i[2] & 0x800fffff) | (m << 20);
-      else if ((int) m <= -54) {
-	a.i[2] &= 0x80000000;
-	a.i[3] = 0;
+      if (m > 0)
+	a.i[1] = (a.i[1] & INT64_C(0x800fffffffffffff)) | (m << 52);
+      else if (m <= -54) {
+	a.i[1] &= INT64_C(0x8000000000000000);
       } else {
 	m += 54;
-	a.i[2] = (a.i[2] & 0x800fffff) | (m << 20);
+	a.i[1] = (a.i[1] & INT64_C(0x800fffffffffffff)) | (m << 52);
 	a.d[1] *= twom54;
       }
     }
     a.i[0] = l;
     s = a.x;
     d = __ieee754_sqrt (a.d[0]);
-    c.i[0] = 0x20000000+((k&0x7fe00000)>>1);
+    c.i[0] = INT64_C(0x2000000000000000)+((k&INT64_C(0x7fe0000000000000))>>1);
     c.i[1] = 0;
-    c.i[2] = 0;
-    c.i[3] = 0;
     i = d;
     t = 0.5L * (i + s / i);
     i = 0.5L * (t + s / t);
     return c.x * i;
   }
   else {
-    if (k>=0x7ff00000) {
-      if (a.i[0] == 0xfff00000 && a.i[1] == 0)
+    if (k>=INT64_C(0x7ff0000000000000)) {
+      if (a.i[0] == INT64_C(0xfff0000000000000))
 	return (big1-big1)/(big-big); /* sqrt (-Inf) = NaN.  */
       return x; /* sqrt (NaN) = NaN, sqrt (+Inf) = +Inf.  */
     }
     if (x == 0) return x;
     if (x < 0) return (big1-big1)/(big-big);
-    return tm256.x*__ieee754_sqrtl(x*t512.x);
+    return tm256*__ieee754_sqrtl(x*t512);
   }
 }
 strong_alias (__ieee754_sqrtl, __sqrtl_finite)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=09b803a2824424a31cf92093b29105920d1f7ad5

commit 09b803a2824424a31cf92093b29105920d1f7ad5
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:26:39 2013 +0930

    PowerPC floating point little-endian [5 of 15]
    http://sourceware.org/ml/libc-alpha/2013-08/msg00085.html
    
    Rid ourselves of ieee854.
    
    	* sysdeps/ieee754/ldbl-128ibm/ieee754.h (union ieee854_long_double):
    	Delete.
    	(IEEE854_LONG_DOUBLE_BIAS): Delete.
    	* sysdeps/ieee754/ldbl-128ibm/math_ldbl.h: Don't include ieee854
    	version of math_ldbl.h.

diff --git a/ChangeLog b/ChangeLog
index 993f6bf..13b7ea3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/ieee754/ldbl-128ibm/ieee754.h (union ieee854_long_double):
+	Delete.
+	(IEEE854_LONG_DOUBLE_BIAS): Delete.
+	* sysdeps/ieee754/ldbl-128ibm/math_ldbl.h: Don't include ieee854
+	version of math_ldbl.h.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	[BZ #15734], [BZ #15735]
 	* sysdeps/ieee754/ldbl-128ibm/e_fmodl.c (__ieee754_fmodl): Rewrite
 	all uses of ieee875 long double macros and unions.  Simplify test
diff --git a/sysdeps/ieee754/ldbl-128ibm/ieee754.h b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
index 0778b1f..0c97a99 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ieee754.h
+++ b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
@@ -111,61 +111,6 @@ union ieee754_double
 #define IEEE754_DOUBLE_BIAS	0x3ff /* Added to exponent.  */
 
 
-union ieee854_long_double
-  {
-    long double d;
-
-    /* This is the IEEE 854 quad-precision format.  */
-    struct
-      {
-#if	__BYTE_ORDER == __BIG_ENDIAN
-	unsigned int negative:1;
-	unsigned int exponent:15;
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa0:16;
-	unsigned int mantissa1:32;
-	unsigned int mantissa2:32;
-	unsigned int mantissa3:32;
-#endif				/* Big endian.  */
-#if	__BYTE_ORDER == __LITTLE_ENDIAN
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa3:32;
-	unsigned int mantissa2:32;
-	unsigned int mantissa1:32;
-	unsigned int mantissa0:16;
-	unsigned int exponent:15;
-	unsigned int negative:1;
-#endif				/* Little endian.  */
-      } ieee;
-
-    /* This format makes it easier to see if a NaN is a signalling NaN.  */
-    struct
-      {
-#if	__BYTE_ORDER == __BIG_ENDIAN
-	unsigned int negative:1;
-	unsigned int exponent:15;
-	unsigned int quiet_nan:1;
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa0:15;
-	unsigned int mantissa1:32;
-	unsigned int mantissa2:32;
-	unsigned int mantissa3:32;
-#else
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa3:32;
-	unsigned int mantissa2:32;
-	unsigned int mantissa1:32;
-	unsigned int mantissa0:15;
-	unsigned int quiet_nan:1;
-	unsigned int exponent:15;
-	unsigned int negative:1;
-#endif
-      } ieee_nan;
-  };
-
-#define IEEE854_LONG_DOUBLE_BIAS 0x3fff /* Added to exponent.  */
-
-
 /* IBM extended format for long double.
 
    Each long double is made up of two IEEE doubles.  The value of the
diff --git a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
index 8adb081..1b6e27a 100644
--- a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
+++ b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
@@ -2,7 +2,6 @@
 #error "Never use <math_ldbl.h> directly; include <math_private.h> instead."
 #endif
 
-#include <sysdeps/ieee754/ldbl-128/math_ldbl.h>
 #include <ieee754.h>
 #include <stdint.h>
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2d025af1cbc32649e30a84253689b84ec9ae5a6f

commit 2d025af1cbc32649e30a84253689b84ec9ae5a6f
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:25:51 2013 +0930

    PowerPC floating point little-endian [4 of 15]
    http://sourceware.org/ml/libc-alpha/2013-08/msg00084.html
    
    Another batch of ieee854 macros and union replacement.  These four
    files also have bugs fixed with this patch.  The fact that the two
    doubles in an IBM long double may have different signs means that
    negation and absolute value operations can't just twiddle one sign bit
    as you can with ieee854 style extended double.  fmodl, remainderl,
    erfl and erfcl all had errors of this type.  erfl also returned +1 for
    large magnitude negative input where it should return -1.  The hypotl
    error is innocuous since the value adjusted twice is only used as a
    flag.  The e_hypotl.c tests for large "a" and small "b" are mutually
    exclusive because we've already exited when x/y > 2**120.  That allows
    some further small simplifications.
    
    	[BZ #15734], [BZ #15735]
    	* sysdeps/ieee754/ldbl-128ibm/e_fmodl.c (__ieee754_fmodl): Rewrite
    	all uses of ieee854 long double macros and unions.  Simplify test
    	for 0.0L.  Correct |x|<|y| and |x|=|y| test.  Use
    	ldbl_extract_mantissa value for ix,iy exponents.  Properly
    	normalize after ldbl_extract_mantissa, and don't add hidden bit
    	already handled.  Don't treat low word of ieee854 mantissa like
    	low word of IBM long double and mask off bit when testing for
    	zero.
    	* sysdeps/ieee754/ldbl-128ibm/e_hypotl.c (__ieee754_hypotl): Rewrite
    	all uses of ieee854 long double macros and unions.  Simplify tests
    	for 0.0L and inf.  Correct double adjustment of k.  Delete dead code
    	adjusting ha,hb.  Simplify code setting kld.  Delete two600 and
    	two1022, instead use their values.  Recognise that tests for large
    	"a" and small "b" are mutually exclusive.  Rename vars.  Comment.
    	* sysdeps/ieee754/ldbl-128ibm/e_remainderl.c (__ieee754_remainderl):
    	Rewrite all uses of ieee854 long double macros and unions.  Simplify
    	test for 0.0L and nan.  Correct negation.
    	* sysdeps/ieee754/ldbl-128ibm/s_erfl.c (__erfl): Rewrite all uses of
    	ieee854 long double macros and unions.  Correct output for large
    	magnitude x.  Correct absolute value calculation.
    	(__erfcl): Likewise.
    	* math/libm-test.inc: Add tests for errors discovered in IBM long
    	double versions of fmodl, remainderl, erfl and erfcl.

diff --git a/ChangeLog b/ChangeLog
index aeb2f2c..993f6bf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,32 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	[BZ #15734], [BZ #15735]
+	* sysdeps/ieee754/ldbl-128ibm/e_fmodl.c (__ieee754_fmodl): Rewrite
+	all uses of ieee875 long double macros and unions.  Simplify test
+	for 0.0L.  Correct |x|<|y| and |x|=|y| test.  Use
+	ldbl_extract_mantissa value for ix,iy exponents.  Properly
+	normalize after ldbl_extract_mantissa, and don't add hidden bit
+	already handled.  Don't treat low word of ieee854 mantissa like
+	low word of IBM long double and mask off bit when testing for
+	zero.
+	* sysdeps/ieee754/ldbl-128ibm/e_hypotl.c (__ieee754_hypotl): Rewrite
+	all uses of ieee875 long double macros and unions.  Simplify tests
+	for 0.0L and inf.  Correct double adjustment of k.  Delete dead code
+	adjusting ha,hb.  Simplify code setting kld.  Delete two600 and
+	two1022, instead use their values.  Recognise that tests for large
+	"a" and small "b" are mutually exclusive.  Rename vars.  Comment.
+	* sysdeps/ieee754/ldbl-128ibm/e_remainderl.c (__ieee754_remainderl):
+	Rewrite all uses of ieee875 long double macros and unions.  Simplify
+	test for 0.0L and nan.  Correct negation.
+	* sysdeps/ieee754/ldbl-128ibm/s_erfl.c (__erfl): Rewrite all uses of
+	ieee875 long double macros and unions.  Correct output for large
+	magnitude x.  Correct absolute value calculation.
+	(__erfcl): Likewise.
+	* math/libm-test.inc: Add tests for errors discovered in IBM long
+	double versions of fmodl, remainderl, erfl and erfcl.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/ieee754/ldbl-128ibm/e_atan2l.c (__ieee754_atan2l): Rewrite
 	all uses of ieee854 long double macros and unions.  Simplify tests
 	for long doubles that are fully specified by the high double.
diff --git a/math/libm-test.inc b/math/libm-test.inc
index 7a11c90..a84cf24 100644
--- a/math/libm-test.inc
+++ b/math/libm-test.inc
@@ -7816,6 +7816,11 @@ static const struct test_f_f_data erf_test_data[] =
     TEST_f_f (erf, 2.0L, 0.995322265018952734162069256367252929L),
     TEST_f_f (erf, 4.125L, 0.999999994576599200434933994687765914L),
     TEST_f_f (erf, 27.0L, 1.0L),
+    TEST_f_f (erf, -27.0L, -1.0L),
+#if defined TEST_LDOUBLE && LDBL_MANT_DIG >= 54
+    /* The input is not exactly representable as a double.  */
+    TEST_f_f (erf, -0x1.fffffffffffff8p-2L, -0.5204998778130465132916303345518417673509L),
+#endif
   };
 
 static void
@@ -7844,6 +7849,10 @@ static const struct test_f_f_data erfc_test_data[] =
     TEST_f_f (erfc, 0x1.ffa002p+2L, 1.233585992097580296336099501489175967033e-29L),
     TEST_f_f (erfc, 0x1.ffffc8p+2L, 1.122671365033056305522366683719541099329e-29L),
 #ifdef TEST_LDOUBLE
+# if LDBL_MANT_DIG >= 54
+    /* The input is not exactly representable as a double.  */
+    TEST_f_f (erfc, -0x1.fffffffffffff8p-2L, 1.52049987781304651329163033455184176735L),
+# endif
     /* The result can only be represented in long double.  */
 # if LDBL_MIN_10_EXP < -319
     TEST_f_f (erfc, 27.0L, 0.523704892378925568501606768284954709e-318L),
@@ -9342,6 +9351,13 @@ static const struct test_ff_f_data fmod_test_data[] =
 #if defined TEST_LDOUBLE && LDBL_MIN_EXP <= -16381
     TEST_ff_f (fmod, 0x0.fffffffffffffffep-16382L, 0x1p-16445L, plus_zero, NO_INEXACT_EXCEPTION),
 #endif
+#if defined TEST_LDOUBLE && LDBL_MANT_DIG >= 56
+    TEST_ff_f (fmod, -0x1.00000000000004p+0L, 0x1.fffffffffffff8p-1L, -0x1p-53L, NO_INEXACT_EXCEPTION),
+    TEST_ff_f (fmod, 0x1.fffffffffffffap-1L, 0x1.fffffffffffff8p-1L, 0x1p-56L, NO_INEXACT_EXCEPTION),
+    TEST_ff_f (fmod, -0x1.fffffffffffffap-1L, 0x1.fffffffffffff8p-1L, -0x1p-56L, NO_INEXACT_EXCEPTION),
+    TEST_ff_f (fmod, 0x1.fffffffffffffap-1L, -0x1.fffffffffffff8p-1L, 0x1p-56L, NO_INEXACT_EXCEPTION),
+    TEST_ff_f (fmod, -0x1.fffffffffffffap-1L, -0x1.fffffffffffff8p-1L, -0x1p-56L, NO_INEXACT_EXCEPTION),
+#endif
   };
 
 static void
@@ -12303,6 +12319,9 @@ static const struct test_ff_f_data remainder_test_data[] =
     TEST_ff_f (remainder, -1.625, -1.0, 0.375, NO_INEXACT_EXCEPTION),
     TEST_ff_f (remainder, 5.0, 2.0, 1.0, NO_INEXACT_EXCEPTION),
     TEST_ff_f (remainder, 3.0, 2.0, -1.0, NO_INEXACT_EXCEPTION),
+#if defined TEST_LDOUBLE && LDBL_MANT_DIG >= 56
+    TEST_ff_f (remainder, -0x1.80000000000002p1L, 2.0, 0x1.fffffffffffff8p-1L, NO_INEXACT_EXCEPTION),
+#endif
   };
 
 static void
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_fmodl.c b/sysdeps/ieee754/ldbl-128ibm/e_fmodl.c
index a60963c..a140fb3 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_fmodl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_fmodl.c
@@ -27,76 +27,83 @@ static const long double one = 1.0, Zero[] = {0.0, -0.0,};
 long double
 __ieee754_fmodl (long double x, long double y)
 {
-	int64_t n,hx,hy,hz,ix,iy,sx, i;
-	u_int64_t lx,ly,lz;
-	int temp;
+	int64_t hx, hy, hz, sx, sy;
+	uint64_t lx, ly, lz;
+	int n, ix, iy;
+	double xhi, xlo, yhi, ylo;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
+	ldbl_unpack (y, &yhi, &ylo);
+	EXTRACT_WORDS64 (hy, yhi);
+	EXTRACT_WORDS64 (ly, ylo);
 	sx = hx&0x8000000000000000ULL;		/* sign of x */
-	hx ^=sx;				/* |x| */
-	hy &= 0x7fffffffffffffffLL;		/* |y| */
+	hx ^= sx;				/* |x| */
+	sy = hy&0x8000000000000000ULL;		/* sign of y */
+	hy ^= sy;				/* |y| */
 
     /* purge off exception values */
-	if(__builtin_expect((hy|(ly&0x7fffffffffffffff))==0 ||
+	if(__builtin_expect(hy==0 ||
 			    (hx>=0x7ff0000000000000LL)|| /* y=0,or x not finite */
 			    (hy>0x7ff0000000000000LL),0))	/* or y is NaN */
 	    return (x*y)/(x*y);
-	if(__builtin_expect(hx<=hy,0)) {
-	    if((hx<hy)||(lx<ly)) return x;	/* |x|<|y| return x */
-	    if(lx==ly)
-		return Zero[(u_int64_t)sx>>63];	/* |x|=|y| return x*0*/
+	if (__builtin_expect (hx <= hy, 0))
+	  {
+	    /* If |x| < |y| return x.  */
+	    if (hx < hy)
+	      return x;
+	    /* At this point the absolute value of the high doubles of
+	       x and y must be equal.  */
+	    /* If the low double of y is the same sign as the high
+	       double of y (ie. the low double increases |y|)...  */
+	    if (((ly ^ sy) & 0x8000000000000000LL) == 0
+		/* ... then a different sign low double to high double
+		   for x or same sign but lower magnitude...  */
+		&& (int64_t) (lx ^ sx) < (int64_t) (ly ^ sy))
+	      /* ... means |x| < |y|.  */
+	      return x;
+	    /* If the low double of x differs in sign to the high
+	       double of x (ie. the low double decreases |x|)...  */
+	    if (((lx ^ sx) & 0x8000000000000000LL) != 0
+		/* ... then a different sign low double to high double
+		   for y with lower magnitude (we've already caught
+		   the same sign for y case above)...  */
+		&& (int64_t) (lx ^ sx) > (int64_t) (ly ^ sy))
+	      /* ... means |x| < |y|.  */
+	      return x;
+	    /* If |x| == |y| return x*0.  */
+	    if ((lx ^ sx) == (ly ^ sy))
+	      return Zero[(uint64_t) sx >> 63];
 	}
 
-    /* determine ix = ilogb(x) */
-	if(__builtin_expect(hx<0x0010000000000000LL,0)) {	/* subnormal x */
-	    if(hx==0) {
-		for (ix = -1043, i=lx; i>0; i<<=1) ix -=1;
-	    } else {
-		for (ix = -1022, i=(hx<<11); i>0; i<<=1) ix -=1;
-	    }
-	} else ix = (hx>>52)-0x3ff;
-
-    /* determine iy = ilogb(y) */
-	if(__builtin_expect(hy<0x0010000000000000LL,0)) {	/* subnormal y */
-	    if(hy==0) {
-		for (iy = -1043, i=ly; i>0; i<<=1) iy -=1;
-	    } else {
-		for (iy = -1022, i=(hy<<11); i>0; i<<=1) iy -=1;
-	    }
-	} else iy = (hy>>52)-0x3ff;
-
     /* Make the IBM extended format 105 bit mantissa look like the ieee854 112
        bit mantissa so the following operations will give the correct
        result.  */
-	ldbl_extract_mantissa(&hx, &lx, &temp, x);
-	ldbl_extract_mantissa(&hy, &ly, &temp, y);
+	ldbl_extract_mantissa(&hx, &lx, &ix, x);
+	ldbl_extract_mantissa(&hy, &ly, &iy, y);
 
-    /* set up {hx,lx}, {hy,ly} and align y to x */
-	if(__builtin_expect(ix >= -1022, 1))
-	    hx = 0x0001000000000000LL|(0x0000ffffffffffffLL&hx);
-	else {		/* subnormal x, shift x to normal */
-	    n = -1022-ix;
-	    if(n<=63) {
-		hx = (hx<<n)|(lx>>(64-n));
-		lx <<= n;
-	    } else {
-		hx = lx<<(n-64);
-		lx = 0;
-	    }
-	}
-	if(__builtin_expect(iy >= -1022, 1))
-	    hy = 0x0001000000000000LL|(0x0000ffffffffffffLL&hy);
-	else {		/* subnormal y, shift y to normal */
-	    n = -1022-iy;
-	    if(n<=63) {
-		hy = (hy<<n)|(ly>>(64-n));
-		ly <<= n;
-	    } else {
-		hy = ly<<(n-64);
-		ly = 0;
-	    }
-	}
+	if (__builtin_expect (ix == -IEEE754_DOUBLE_BIAS, 0))
+	  {
+	    /* subnormal x, shift x to normal.  */
+	    while ((hx & (1LL << 48)) == 0)
+	      {
+		hx = (hx << 1) | (lx >> 63);
+		lx = lx << 1;
+		ix -= 1;
+	      }
+	  }
+
+	if (__builtin_expect (iy == -IEEE754_DOUBLE_BIAS, 0))
+	  {
+	    /* subnormal y, shift y to normal.  */
+	    while ((hy & (1LL << 48)) == 0)
+	      {
+		hy = (hy << 1) | (ly >> 63);
+		ly = ly << 1;
+		iy -= 1;
+	      }
+	  }
 
     /* fix point fmod */
 	n = ix - iy;
@@ -104,7 +111,7 @@ __ieee754_fmodl (long double x, long double y)
 	    hz=hx-hy;lz=lx-ly; if(lx<ly) hz -= 1;
 	    if(hz<0){hx = hx+hx+(lx>>63); lx = lx+lx;}
 	    else {
-		if((hz|(lz&0x7fffffffffffffff))==0)		/* return sign(x)*0 */
+		if((hz|lz)==0)		/* return sign(x)*0 */
 		    return Zero[(u_int64_t)sx>>63];
 		hx = hz+hz+(lz>>63); lx = lz+lz;
 	    }
@@ -113,7 +120,7 @@ __ieee754_fmodl (long double x, long double y)
 	if(hz>=0) {hx=hz;lx=lz;}
 
     /* convert back to floating value and restore the sign */
-	if((hx|(lx&0x7fffffffffffffff))==0)			/* return sign(x)*0 */
+	if((hx|lx)==0)			/* return sign(x)*0 */
 	    return Zero[(u_int64_t)sx>>63];
 	while(hx<0x0001000000000000LL) {	/* normalize x */
 	    hx = hx+hx+(lx>>63); lx = lx+lx;
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_hypotl.c b/sysdeps/ieee754/ldbl-128ibm/e_hypotl.c
index 768bd3b..3b07a47 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_hypotl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_hypotl.c
@@ -45,76 +45,84 @@
 #include <math.h>
 #include <math_private.h>
 
-static const long double two600 = 0x1.0p+600L;
-static const long double two1022 = 0x1.0p+1022L;
-
 long double
 __ieee754_hypotl(long double x, long double y)
 {
-	long double a,b,t1,t2,y1,y2,w,kld;
+	long double a,b,a1,a2,b1,b2,w,kld;
 	int64_t j,k,ha,hb;
+	double xhi, yhi, hi, lo;
 
-	GET_LDOUBLE_MSW64(ha,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ha, xhi);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64 (hb, yhi);
 	ha &= 0x7fffffffffffffffLL;
-	GET_LDOUBLE_MSW64(hb,y);
 	hb &= 0x7fffffffffffffffLL;
 	if(hb > ha) {a=y;b=x;j=ha; ha=hb;hb=j;} else {a=x;b=y;}
 	a = fabsl(a);	/* a <- |a| */
 	b = fabsl(b);	/* b <- |b| */
-	if((ha-hb)>0x780000000000000LL) {return a+b;} /* x/y > 2**120 */
+	if((ha-hb)>0x0780000000000000LL) {return a+b;} /* x/y > 2**120 */
 	k=0;
 	kld = 1.0L;
 	if(ha > 0x5f30000000000000LL) {	/* a>2**500 */
 	   if(ha >= 0x7ff0000000000000LL) {	/* Inf or NaN */
-	       u_int64_t low;
 	       w = a+b;			/* for sNaN */
-	       GET_LDOUBLE_LSW64(low,a);
-	       if(((ha&0xfffffffffffffLL)|(low&0x7fffffffffffffffLL))==0)
+	       if(ha == 0x7ff0000000000000LL)
 		 w = a;
-	       GET_LDOUBLE_LSW64(low,b);
-	       if(((hb^0x7ff0000000000000LL)|(low&0x7fffffffffffffffLL))==0)
+	       if(hb == 0x7ff0000000000000LL)
 		 w = b;
 	       return w;
 	   }
 	   /* scale a and b by 2**-600 */
-	   ha -= 0x2580000000000000LL; hb -= 0x2580000000000000LL; k += 600;
-	   a /= two600;
-	   b /= two600;
-	   k += 600;
-	   kld = two600;
+	   a *= 0x1p-600L;
+	   b *= 0x1p-600L;
+	   k = 600;
+	   kld = 0x1p+600L;
 	}
-	if(hb < 0x23d0000000000000LL) {	/* b < 2**-450 */
+	else if(hb < 0x23d0000000000000LL) {	/* b < 2**-450 */
 	    if(hb <= 0x000fffffffffffffLL) {	/* subnormal b or 0 */
-		u_int64_t low;
-		GET_LDOUBLE_LSW64(low,b);
-		if((hb|(low&0x7fffffffffffffffLL))==0) return a;
-		t1=two1022;	/* t1=2^1022 */
-		b *= t1;
-		a *= t1;
-		k -= 1022;
-		kld = kld / two1022;
+		if(hb==0) return a;
+		a *= 0x1p+1022L;
+		b *= 0x1p+1022L;
+		k = -1022;
+		kld = 0x1p-1022L;
 	    } else {		/* scale a and b by 2^600 */
-		ha += 0x2580000000000000LL;	/* a *= 2^600 */
-		hb += 0x2580000000000000LL;	/* b *= 2^600 */
-		k -= 600;
-		a *= two600;
-		b *= two600;
-		kld = kld / two600;
+		a *= 0x1p+600L;
+		b *= 0x1p+600L;
+		k = -600;
+		kld = 0x1p-600L;
 	    }
 	}
     /* medium size a and b */
 	w = a-b;
 	if (w>b) {
-	    SET_LDOUBLE_WORDS64(t1,ha,0);
-	    t2 = a-t1;
-	    w  = __ieee754_sqrtl(t1*t1-(b*(-b)-t2*(a+t1)));
+	    ldbl_unpack (a, &hi, &lo);
+	    a1 = hi;
+	    a2 = lo;
+	    /* a*a + b*b
+	       = (a1+a2)*a + b*b
+	       = a1*a + a2*a + b*b
+	       = a1*(a1+a2) + a2*a + b*b
+	       = a1*a1 + a1*a2 + a2*a + b*b
+	       = a1*a1 + a2*(a+a1) + b*b  */
+	    w  = __ieee754_sqrtl(a1*a1-(b*(-b)-a2*(a+a1)));
 	} else {
 	    a  = a+a;
-	    SET_LDOUBLE_WORDS64(y1,hb,0);
-	    y2 = b - y1;
-	    SET_LDOUBLE_WORDS64(t1,ha+0x0010000000000000LL,0);
-	    t2 = a - t1;
-	    w  = __ieee754_sqrtl(t1*y1-(w*(-w)-(t1*y2+t2*b)));
+	    ldbl_unpack (b, &hi, &lo);
+	    b1 = hi;
+	    b2 = lo;
+	    ldbl_unpack (a, &hi, &lo);
+	    a1 = hi;
+	    a2 = lo;
+	    /* a*a + b*b
+	       = a*a + (a-b)*(a-b) - (a-b)*(a-b) + b*b
+	       = a*a + w*w  - (a*a - 2*a*b + b*b) + b*b
+	       = w*w + 2*a*b
+	       = w*w + (a1+a2)*b
+	       = w*w + a1*b + a2*b
+	       = w*w + a1*(b1+b2) + a2*b
+	       = w*w + a1*b1 + a1*b2 + a2*b  */
+	    w  = __ieee754_sqrtl(a1*b1-(w*(-w)-(a1*b2+a2*b)));
 	}
 	if(k!=0)
 	    return w*kld;
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_remainderl.c b/sysdeps/ieee754/ldbl-128ibm/e_remainderl.c
index 67d7db7..800416f 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_remainderl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_remainderl.c
@@ -33,18 +33,22 @@ __ieee754_remainderl(long double x, long double p)
 	int64_t hx,hp;
 	u_int64_t sx,lx,lp;
 	long double p_half;
+	double xhi, xlo, phi, plo;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	GET_LDOUBLE_WORDS64(hp,lp,p);
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
+	ldbl_unpack (p, &phi, &plo);
+	EXTRACT_WORDS64 (hp, phi);
+	EXTRACT_WORDS64 (lp, plo);
 	sx = hx&0x8000000000000000ULL;
 	hp &= 0x7fffffffffffffffLL;
 	hx &= 0x7fffffffffffffffLL;
 
     /* purge off exception values */
-	if((hp|(lp&0x7fffffffffffffff))==0) return (x*p)/(x*p);	/* p = 0 */
+	if(hp==0) return (x*p)/(x*p);	/* p = 0 */
 	if((hx>=0x7ff0000000000000LL)||			/* x not finite */
-	  ((hp>=0x7ff0000000000000LL)&&			/* p is NaN */
-	  (((hp-0x7ff0000000000000LL)|lp)!=0)))
+	   (hp>0x7ff0000000000000LL))			/* p is NaN */
 	    return (x*p)/(x*p);
 
 
@@ -64,8 +68,8 @@ __ieee754_remainderl(long double x, long double p)
 		if(x>=p_half) x -= p;
 	    }
 	}
-	GET_LDOUBLE_MSW64(hx,x);
-	SET_LDOUBLE_MSW64(x,hx^sx);
+	if (sx)
+	  x = -x;
 	return x;
 }
 strong_alias (__ieee754_remainderl, __remainderl_finite)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_erfl.c b/sysdeps/ieee754/ldbl-128ibm/s_erfl.c
index 6a4475e..c861c65 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_erfl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_erfl.c
@@ -760,16 +760,16 @@ long double
 __erfl (long double x)
 {
   long double a, y, z;
-  int32_t i, ix, sign;
-  ieee854_long_double_shape_type u;
+  int32_t i, ix, hx;
+  double xhi;
 
-  u.value = x;
-  sign = u.parts32.w0;
-  ix = sign & 0x7fffffff;
+  xhi = ldbl_high (x);
+  GET_HIGH_WORD (hx, xhi);
+  ix = hx & 0x7fffffff;
 
   if (ix >= 0x7ff00000)
     {				/* erf(nan)=nan */
-      i = ((sign & 0xfff00000) >> 31) << 1;
+      i = ((uint32_t) hx >> 31) << 1;
       return (long double) (1 - i) + one / x;	/* erf(+-inf)=+-1 */
     }
 
@@ -778,7 +778,7 @@ __erfl (long double x)
       if (ix >= 0x4039A0DE)
 	{
 	/* __erfcl (x) underflows if x > 25.6283 */
-	  if (sign)
+	  if ((hx & 0x80000000) == 0)
 	    return one-tiny;
 	  else
 	    return tiny-one;
@@ -789,8 +789,9 @@ __erfl (long double x)
 	  return (one - y);
 	}
     }
-  u.parts32.w0 = ix;
-  a = u.value;
+  a = x;
+  if ((hx & 0x80000000) != 0)
+    a = -a;
   z = x * x;
   if (ix < 0x3fec0000)  /* a < 0.875 */
     {
@@ -814,7 +815,7 @@ __erfl (long double x)
       y = erf_const + neval (a, TN2, NTN2) / deval (a, TD2, NTD2);
     }
 
-  if (sign & 0x80000000) /* x < 0 */
+  if (hx & 0x80000000) /* x < 0 */
     y = -y;
   return( y );
 }
@@ -824,18 +825,18 @@ long double
 __erfcl (long double x)
 {
   long double y, z, p, r;
-  int32_t i, ix, sign;
-  ieee854_long_double_shape_type u;
+  int32_t i, ix;
+  uint32_t hx;
+  double xhi;
 
-  u.value = x;
-  sign = u.parts32.w0;
-  ix = sign & 0x7fffffff;
-  u.parts32.w0 = ix;
+  xhi = ldbl_high (x);
+  GET_HIGH_WORD (hx, xhi);
+  ix = hx & 0x7fffffff;
 
   if (ix >= 0x7ff00000)
     {				/* erfc(nan)=nan */
       /* erfc(+-inf)=0,2 */
-      return (long double) (((u_int32_t) sign >> 31) << 1) + one / x;
+      return (long double) ((hx >> 31) << 1) + one / x;
     }
 
   if (ix < 0x3fd00000) /* |x| <1/4 */
@@ -846,7 +847,8 @@ __erfcl (long double x)
     }
   if (ix < 0x3ff40000) /* 1.25 */
     {
-      x = u.value;
+      if ((hx & 0x80000000) != 0)
+	x = -x;
       i = 8.0 * x;
       switch (i)
 	{
@@ -891,7 +893,7 @@ __erfcl (long double x)
 	  y += C20a;
 	  break;
 	}
-      if (sign & 0x80000000)
+      if (hx & 0x80000000)
 	y = 2.0L - y;
       return y;
     }
@@ -899,10 +901,11 @@ __erfcl (long double x)
   if (ix < 0x405ac000)
     {
       /* x < -9 */
-      if ((ix >= 0x40220000) && (sign & 0x80000000))
+      if (hx >= 0xc0220000)
 	return two - tiny;
 
-      x = fabsl (x);
+      if ((hx & 0x80000000) != 0)
+	x = -x;
       z = one / (x * x);
       i = 8.0 / x;
       switch (i)
@@ -933,21 +936,17 @@ __erfcl (long double x)
 	  p = neval (z, RNr8, NRNr8) / deval (z, RDr8, NRDr8);
 	  break;
 	}
-      u.value = x;
-      u.parts32.w3 = 0;
-      u.parts32.w2 = 0;
-      u.parts32.w1 &= 0xf8000000;
-      z = u.value;
+      z = (float) x;
       r = __ieee754_expl (-z * z - 0.5625) *
 	__ieee754_expl ((z - x) * (z + x) + p);
-      if ((sign & 0x80000000) == 0)
+      if ((hx & 0x80000000) == 0)
 	return r / x;
       else
 	return two - r / x;
     }
   else
     {
-      if ((sign & 0x80000000) == 0)
+      if ((hx & 0x80000000) == 0)
 	return tiny * tiny;
       else
 	return two - tiny;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1ce1b171c86c73ea6b52d4784c43942a2060664c

commit 1ce1b171c86c73ea6b52d4784c43942a2060664c
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:24:58 2013 +0930

    PowerPC floating point little-endian [3 of 15]
    http://sourceware.org/ml/libc-alpha/2013-08/msg00083.html
    
    Further replacement of ieee854 macros and unions.  These files also
    have some optimisations for comparison against 0.0L, infinity and nan.
    Since the ABI specifies that the high double of an IBM long double
    pair is the value rounded to double, a high double of 0.0 means the
    low double must also be 0.0.  The ABI also says that infinity and
    nan are encoded in the high double, with the low double unspecified.
    This means that tests for 0.0L, +/-Infinity and +/-NaN need only check
    the high double.
    
    	* sysdeps/ieee754/ldbl-128ibm/e_atan2l.c (__ieee754_atan2l): Rewrite
    	all uses of ieee854 long double macros and unions.  Simplify tests
    	for long doubles that are fully specified by the high double.
    	* sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c (__ieee754_gammal_r):
    	Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c (__ieee754_ilogbl): Likewise.
    	Remove dead code too.
    	* sysdeps/ieee754/ldbl-128ibm/e_jnl.c (__ieee754_jnl): Likewise.
    	(__ieee754_ynl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_log10l.c (__ieee754_log10l): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_logl.c (__ieee754_logl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_powl.c (__ieee754_powl): Likewise.
    	Remove dead code too.
    	* sysdeps/ieee754/ldbl-128ibm/k_tanl.c (__kernel_tanl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_expm1l.c (__expm1l): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_frexpl.c (__frexpl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c (__isinf_nsl): Likewise.
    	Simplify.
    	* sysdeps/ieee754/ldbl-128ibm/s_isinfl.c (___isinfl): Likewise.
    	Simplify.
    	* sysdeps/ieee754/ldbl-128ibm/s_log1pl.c (__log1pl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_modfl.c (__modfl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c (__nextafterl): Likewise.
    	Comment on variable precision.
    	* sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c (__nexttoward): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c (__nexttowardf):
    	Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_remquol.c (__remquol): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c (__scalblnl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c (__scalbnl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_tanhl.c (__tanhl): Likewise.
    	* sysdeps/powerpc/fpu/libm-test-ulps: Adjust tan_towardzero ulps.

diff --git a/ChangeLog b/ChangeLog
index 38a8b7d..aeb2f2c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,40 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/ieee754/ldbl-128ibm/e_atan2l.c (__ieee754_atan2l): Rewrite
+	all uses of ieee854 long double macros and unions.  Simplify tests
+	for long doubles that are fully specified by the high double.
+	* sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c (__ieee754_gammal_r):
+	Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c (__ieee754_ilogbl): Likewise.
+	Remove dead code too.
+	* sysdeps/ieee754/ldbl-128ibm/e_jnl.c (__ieee754_jnl): Likewise.
+	(__ieee754_ynl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_log10l.c (__ieee754_log10l): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_logl.c (__ieee754_logl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_powl.c (__ieee754_powl): Likewise.
+	Remove dead code too.
+	* sysdeps/ieee754/ldbl-128ibm/k_tanl.c (__kernel_tanl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_expm1l.c (__expm1l): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_frexpl.c (__frexpl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c (__isinf_nsl): Likewise.
+	Simplify.
+	* sysdeps/ieee754/ldbl-128ibm/s_isinfl.c (___isinfl): Likewise.
+	Simplify.
+	* sysdeps/ieee754/ldbl-128ibm/s_log1pl.c (__log1pl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_modfl.c (__modfl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c (__nextafterl): Likewise.
+	Comment on variable precision.
+	* sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c (__nexttoward): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c (__nexttowardf):
+	Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_remquol.c (__remquol): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c (__scalblnl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c (__scalbnl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_tanhl.c (__tanhl): Likewise.
+	* sysdeps/powerpc/fpu/libm-test-ulps: Adjust tan_towardzero ulps.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/ieee754/ldbl-128ibm/math_ldbl.h (ldbl_high): Define.
 	* sysdeps/ieee754/ldbl-128ibm/e_acoshl.c (__ieee754_acoshl): Rewrite
 	all uses of ieee854 long double macros and unions.
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_atan2l.c b/sysdeps/ieee754/ldbl-128ibm/e_atan2l.c
index 3e05355..b625323 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_atan2l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_atan2l.c
@@ -56,11 +56,15 @@ __ieee754_atan2l(long double y, long double x)
 {
 	long double z;
 	int64_t k,m,hx,hy,ix,iy;
-	u_int64_t lx,ly;
+	uint64_t lx;
+	double xhi, xlo, yhi;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	ix = hx&0x7fffffffffffffffLL;
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64 (hy, yhi);
 	iy = hy&0x7fffffffffffffffLL;
 	if(((ix)>0x7ff0000000000000LL)||
 	   ((iy)>0x7ff0000000000000LL))	/* x or y is NaN */
@@ -70,7 +74,7 @@ __ieee754_atan2l(long double y, long double x)
 	m = ((hy>>63)&1)|((hx>>62)&2);	/* 2*sign(x)+sign(y) */
 
     /* when y = 0 */
-	if((iy|(ly&0x7fffffffffffffffLL))==0) {
+	if(iy==0) {
 	    switch(m) {
 		case 0:
 		case 1: return y;	/* atan(+-0,+anything)=+-0 */
@@ -79,7 +83,7 @@ __ieee754_atan2l(long double y, long double x)
 	    }
 	}
     /* when x = 0 */
-	if((ix|(lx&0x7fffffffffffffff))==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
+	if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
 
     /* when x is INF */
 	if(ix==0x7ff0000000000000LL) {
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c b/sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c
index 90d8e3f..84c13de 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_gammal_r.c
@@ -122,11 +122,12 @@ long double
 __ieee754_gammal_r (long double x, int *signgamp)
 {
   int64_t hx;
-  u_int64_t lx;
+  double xhi;
 
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
 
-  if (((hx | lx) & 0x7fffffffffffffffLL) == 0)
+  if ((hx & 0x7fffffffffffffffLL) == 0)
     {
       /* Return value for x == 0 is Inf with divide by zero exception.  */
       *signgamp = 0;
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c b/sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c
index 55f87ed..aeace7c 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_ilogbl.c
@@ -31,26 +31,24 @@ static char rcsid[] = "$NetBSD: $";
 
 int __ieee754_ilogbl(long double x)
 {
-	int64_t hx,lx;
+	int64_t hx;
 	int ix;
+	double xhi;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (hx, xhi);
 	hx &= 0x7fffffffffffffffLL;
 	if(hx <= 0x0010000000000000LL) {
-	    if((hx|(lx&0x7fffffffffffffffLL))==0)
+	    if(hx==0)
 		return FP_ILOGB0;	/* ilogbl(0) = FP_ILOGB0 */
 	    else			/* subnormal x */
-		if(hx==0) {
-		    for (ix = -1043; lx>0; lx<<=1) ix -=1;
-		} else {
-		    for (ix = -1022, hx<<=11; hx>0; hx<<=1) ix -=1;
-		}
+		for (ix = -1022, hx<<=11; hx>0; hx<<=1) ix -=1;
 	    return ix;
 	}
 	else if (hx<0x7ff0000000000000LL) return (hx>>52)-0x3ff;
 	else if (FP_ILOGBNAN != INT_MAX) {
 	    /* ISO C99 requires ilogbl(+-Inf) == INT_MAX.  */
-	    if (((hx^0x7ff0000000000000LL)|lx) == 0)
+	    if (hx==0x7ff0000000000000LL)
 		return INT_MAX;
 	}
 	return FP_ILOGBNAN;
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_jnl.c b/sysdeps/ieee754/ldbl-128ibm/e_jnl.c
index 40012e4..817977d 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_jnl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_jnl.c
@@ -70,26 +70,25 @@ static const long double
 long double
 __ieee754_jnl (int n, long double x)
 {
-  u_int32_t se;
+  uint32_t se, lx;
   int32_t i, ix, sgn;
   long double a, b, temp, di;
   long double z, w;
-  ieee854_long_double_shape_type u;
+  double xhi;
 
 
   /* J(-n,x) = (-1)^n * J(n, x), J(n, -x) = (-1)^n * J(n, x)
    * Thus, J(-n,x) = J(n,-x)
    */
 
-  u.value = x;
-  se = u.parts32.w0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (se, lx, xhi);
   ix = se & 0x7fffffff;
 
   /* if J(n,NaN) is NaN */
   if (ix >= 0x7ff00000)
     {
-      if ((u.parts32.w0 & 0xfffff) | u.parts32.w1
-	  | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3)
+      if (((ix - 0x7ff00000) | lx) != 0)
 	return x + x;
     }
 
@@ -298,21 +297,20 @@ strong_alias (__ieee754_jnl, __jnl_finite)
 long double
 __ieee754_ynl (int n, long double x)
 {
-  u_int32_t se;
+  uint32_t se, lx;
   int32_t i, ix;
   int32_t sign;
   long double a, b, temp;
-  ieee854_long_double_shape_type u;
+  double xhi;
 
-  u.value = x;
-  se = u.parts32.w0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (se, lx, xhi);
   ix = se & 0x7fffffff;
 
   /* if Y(n,NaN) is NaN */
   if (ix >= 0x7ff00000)
     {
-      if ((u.parts32.w0 & 0xfffff) | u.parts32.w1
-	  | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3)
+      if (((ix - 0x7ff00000) | lx) != 0)
 	return x + x;
     }
   if (x <= 0.0L)
@@ -377,14 +375,16 @@ __ieee754_ynl (int n, long double x)
       a = __ieee754_y0l (x);
       b = __ieee754_y1l (x);
       /* quit if b is -inf */
-      u.value = b;
-      se = u.parts32.w0 & 0xfff00000;
+      xhi = ldbl_high (b);
+      GET_HIGH_WORD (se, xhi);
+      se &= 0xfff00000;
       for (i = 1; i < n && se != 0xfff00000; i++)
 	{
 	  temp = b;
 	  b = ((long double) (i + i) / x) * b - a;
-	  u.value = b;
-	  se = u.parts32.w0 & 0xfff00000;
+	  xhi = ldbl_high (b);
+	  GET_HIGH_WORD (se, xhi);
+	  se &= 0xfff00000;
 	  a = temp;
 	}
     }
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_log10l.c b/sysdeps/ieee754/ldbl-128ibm/e_log10l.c
index fae774c..1a6a4a0 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_log10l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_log10l.c
@@ -182,11 +182,13 @@ __ieee754_log10l (long double x)
   long double z;
   long double y;
   int e;
-  int64_t hx, lx;
+  int64_t hx;
+  double xhi;
 
 /* Test for domain */
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
-  if (((hx & 0x7fffffffffffffffLL) | (lx & 0x7fffffffffffffffLL)) == 0)
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+  if ((hx & 0x7fffffffffffffffLL) == 0)
     return (-1.0L / (x - x));
   if (hx < 0)
     return (x - x) / (x - x);
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_logl.c b/sysdeps/ieee754/ldbl-128ibm/e_logl.c
index 15b5edf..b7db2b9 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_logl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_logl.c
@@ -188,18 +188,20 @@ static const long double
 long double
 __ieee754_logl(long double x)
 {
-  long double z, y, w;
-  ieee854_long_double_shape_type u, t;
+  long double z, y, w, t;
   unsigned int m;
   int k, e;
+  double xhi;
+  uint32_t hx, lx;
 
-  u.value = x;
-  m = u.parts32.w0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (hx, lx, xhi);
+  m = hx;
 
   /* Check for IEEE special cases.  */
   k = m & 0x7fffffff;
   /* log(0) = -infinity. */
-  if ((k | u.parts32.w1 | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3) == 0)
+  if ((k | lx) == 0)
     {
       return -0.5L / ZERO;
     }
@@ -219,7 +221,7 @@ __ieee754_logl(long double x)
     {
       z = x - 1.0L;
       k = 64;
-      t.value  = 1.0L;
+      t = 1.0L;
       e = 0;
     }
   else
@@ -236,10 +238,8 @@ __ieee754_logl(long double x)
 	  k = (m - 0xff000) >> 13;
 	  /* t is the argument 0.5 + (k+26)/128
 	     of the nearest item to u in the lookup table.  */
-	  t.parts32.w0 = 0x3ff00000 + (k << 13);
-	  t.parts32.w1 = 0;
-	  t.parts32.w2 = 0;
-	  t.parts32.w3 = 0;
+	  INSERT_WORDS (xhi, 0x3ff00000 + (k << 13), 0);
+	  t = xhi;
 	  w0 += 0x100000;
 	  e -= 1;
 	  k += 64;
@@ -247,17 +247,15 @@ __ieee754_logl(long double x)
       else
 	{
 	  k = (m - 0xfe000) >> 14;
-	  t.parts32.w0 = 0x3fe00000 + (k << 14);
-	  t.parts32.w1 = 0;
-	  t.parts32.w2 = 0;
-	  t.parts32.w3 = 0;
+	  INSERT_WORDS (xhi, 0x3fe00000 + (k << 14), 0);
+	  t = xhi;
 	}
-      u.value = __scalbnl (u.value, ((int) ((w0 - u.parts32.w0) * 2)) >> 21);
+      x = __scalbnl (x, ((int) ((w0 - hx) * 2)) >> 21);
       /* log(u) = log( t u/t ) = log(t) + log(u/t)
 	 log(t) is tabulated in the lookup table.
 	 Express log(u/t) = log(1+z),  where z = u/t - 1 = (u-t)/t.
 	 cf. Cody & Waite. */
-      z = (u.value - t.value) / t.value;
+      z = (x - t) / t;
     }
   /* Series expansion of log(1+z).  */
   w = z * z;
@@ -284,7 +282,7 @@ __ieee754_logl(long double x)
   y += e * ln2b;  /* Base 2 exponent offset times ln(2).  */
   y += z;
   y += logtbl[k-26]; /* log(t) - (t-1) */
-  y += (t.value - 1.0L);
+  y += (t - 1.0L);
   y += e * ln2a;
   return y;
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_powl.c b/sysdeps/ieee754/ldbl-128ibm/e_powl.c
index 8bd35d0..c942f2f 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_powl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_powl.c
@@ -151,37 +151,32 @@ __ieee754_powl (long double x, long double y)
   long double y1, t1, t2, r, s, t, u, v, w;
   long double s2, s_h, s_l, t_h, t_l, ay;
   int32_t i, j, k, yisint, n;
-  u_int32_t ix, iy;
-  int32_t hx, hy;
-  ieee854_long_double_shape_type o, p, q;
+  uint32_t ix, iy;
+  int32_t hx, hy, hax;
+  double ohi, xhi, xlo, yhi, ylo;
+  uint32_t lx, ly, lj;
 
-  p.value = x;
-  hx = p.parts32.w0;
+  ldbl_unpack (x, &xhi, &xlo);
+  EXTRACT_WORDS (hx, lx, xhi);
   ix = hx & 0x7fffffff;
 
-  q.value = y;
-  hy = q.parts32.w0;
+  ldbl_unpack (y, &yhi, &ylo);
+  EXTRACT_WORDS (hy, ly, yhi);
   iy = hy & 0x7fffffff;
 
-
   /* y==zero: x**0 = 1 */
-  if ((iy | q.parts32.w1 | (q.parts32.w2 & 0x7fffffff) | q.parts32.w3) == 0)
+  if ((iy | ly) == 0)
     return one;
 
   /* 1.0**y = 1; -1.0**+-Inf = 1 */
   if (x == one)
     return one;
-  if (x == -1.0L && iy == 0x7ff00000
-      && (q.parts32.w1 | (q.parts32.w2 & 0x7fffffff) | q.parts32.w3) == 0)
+  if (x == -1.0L && ((iy - 0x7ff00000) | ly) == 0)
     return one;
 
   /* +-NaN return x+y */
-  if ((ix > 0x7ff00000)
-      || ((ix == 0x7ff00000)
-	  && ((p.parts32.w1 | (p.parts32.w2 & 0x7fffffff) | p.parts32.w3) != 0))
-      || (iy > 0x7ff00000)
-      || ((iy == 0x7ff00000)
-	  && ((q.parts32.w1 | (q.parts32.w2 & 0x7fffffff) | q.parts32.w3) != 0)))
+  if ((ix >= 0x7ff00000 && ((ix - 0x7ff00000) | lx) != 0)
+      || (iy >= 0x7ff00000 && ((iy - 0x7ff00000) | ly) != 0))
     return x + y;
 
   /* determine if y is an odd int when x < 0
@@ -192,7 +187,10 @@ __ieee754_powl (long double x, long double y)
   yisint = 0;
   if (hx < 0)
     {
-      if ((q.parts32.w2 & 0x7fffffff) >= 0x43400000)	/* Low part >= 2^53 */
+      uint32_t low_ye;
+
+      GET_HIGH_WORD (low_ye, ylo);
+      if ((low_ye & 0x7fffffff) >= 0x43400000)	/* Low part >= 2^53 */
 	yisint = 2;		/* even integer y */
       else if (iy >= 0x3ff00000)	/* 1.0 */
 	{
@@ -207,42 +205,43 @@ __ieee754_powl (long double x, long double y)
 	}
     }
 
+  ax = fabsl (x);
+
   /* special value of y */
-  if ((q.parts32.w1 | (q.parts32.w2 & 0x7fffffff) | q.parts32.w3) == 0)
+  if (ly == 0)
     {
-      if (iy == 0x7ff00000 && q.parts32.w1 == 0)	/* y is +-inf */
+      if (iy == 0x7ff00000)	/* y is +-inf */
 	{
-	  if (((ix - 0x3ff00000) | p.parts32.w1
-	       | (p.parts32.w2 & 0x7fffffff) | p.parts32.w3) == 0)
-	    return y - y;	/* inf**+-1 is NaN */
-	  else if (ix > 0x3ff00000 || fabsl (x) > 1.0L)
+	  if (ax > one)
 	    /* (|x|>1)**+-inf = inf,0 */
 	    return (hy >= 0) ? y : zero;
 	  else
 	    /* (|x|<1)**-,+inf = inf,0 */
 	    return (hy < 0) ? -y : zero;
 	}
-      if (iy == 0x3ff00000)
-	{			/* y is  +-1 */
-	  if (hy < 0)
-	    return one / x;
-	  else
-	    return x;
-	}
-      if (hy == 0x40000000)
-	return x * x;		/* y is  2 */
-      if (hy == 0x3fe00000)
-	{			/* y is  0.5 */
-	  if (hx >= 0)		/* x >= +0 */
-	    return __ieee754_sqrtl (x);
+      if (ylo == 0.0)
+	{
+	  if (iy == 0x3ff00000)
+	    {			/* y is  +-1 */
+	      if (hy < 0)
+		return one / x;
+	      else
+		return x;
+	    }
+	  if (hy == 0x40000000)
+	    return x * x;		/* y is  2 */
+	  if (hy == 0x3fe00000)
+	    {			/* y is  0.5 */
+	      if (hx >= 0)		/* x >= +0 */
+		return __ieee754_sqrtl (x);
+	    }
 	}
     }
 
-  ax = fabsl (x);
   /* special value of x */
-  if ((p.parts32.w1 | (p.parts32.w2 & 0x7fffffff) | p.parts32.w3) == 0)
+  if (lx == 0)
     {
-      if (ix == 0x7ff00000 || ix == 0 || ix == 0x3ff00000)
+      if (ix == 0x7ff00000 || ix == 0 || (ix == 0x3ff00000 && xlo == 0.0))
 	{
 	  z = ax;		/*x is +-0,+-inf,+-1 */
 	  if (hy < 0)
@@ -294,8 +293,8 @@ __ieee754_powl (long double x, long double y)
     {
       ax *= two113;
       n -= 113;
-      o.value = ax;
-      ix = o.parts32.w0;
+      ohi = ldbl_high (ax);
+      GET_HIGH_WORD (ix, ohi);
     }
   n += ((ix) >> 20) - 0x3ff;
   j = ix & 0x000fffff;
@@ -312,26 +311,19 @@ __ieee754_powl (long double x, long double y)
       ix -= 0x00100000;
     }
 
-  o.value = ax;
-  o.value = __scalbnl (o.value, ((int) ((ix - o.parts32.w0) * 2)) >> 21);
-  ax = o.value;
+  ohi = ldbl_high (ax);
+  GET_HIGH_WORD (hax, ohi);
+  ax = __scalbnl (ax, ((int) ((ix - hax) * 2)) >> 21);
 
   /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
   u = ax - bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
   v = one / (ax + bp[k]);
   s = u * v;
-  s_h = s;
+  s_h = ldbl_high (s);
 
-  o.value = s_h;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  s_h = o.value;
   /* t_h=ax+bp[k] High */
   t_h = ax + bp[k];
-  o.value = t_h;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  t_h = o.value;
+  t_h = ldbl_high (t_h);
   t_l = ax - (t_h - bp[k]);
   s_l = v * ((u - s_h * t_h) - s_h * t_l);
   /* compute log(ax) */
@@ -342,30 +334,21 @@ __ieee754_powl (long double x, long double y)
   r += s_l * (s_h + s);
   s2 = s_h * s_h;
   t_h = 3.0 + s2 + r;
-  o.value = t_h;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  t_h = o.value;
+  t_h = ldbl_high (t_h);
   t_l = r - ((t_h - 3.0) - s2);
   /* u+v = s*(1+...) */
   u = s_h * t_h;
   v = s_l * t_h + t_l * s;
   /* 2/(3log2)*(s+...) */
   p_h = u + v;
-  o.value = p_h;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  p_h = o.value;
+  p_h = ldbl_high (p_h);
   p_l = v - (p_h - u);
   z_h = cp_h * p_h;		/* cp_h+cp_l = 2/(3*log2) */
   z_l = cp_l * p_h + p_l * cp + dp_l[k];
   /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
   t = (long double) n;
   t1 = (((z_h + z_l) + dp_h[k]) + t);
-  o.value = t1;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  t1 = o.value;
+  t1 = ldbl_high (t1);
   t2 = z_l - (((t1 - t) - dp_h[k]) - z_h);
 
   /* s (sign of result -ve**odd) = -1 else = 1 */
@@ -374,21 +357,16 @@ __ieee754_powl (long double x, long double y)
     s = -one;			/* (-ve)**(odd int) */
 
   /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
-  y1 = y;
-  o.value = y1;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  y1 = o.value;
+  y1 = ldbl_high (y);
   p_l = (y - y1) * t1 + y * t2;
   p_h = y1 * t1;
   z = p_l + p_h;
-  o.value = z;
-  j = o.parts32.w0;
+  ohi = ldbl_high (z);
+  EXTRACT_WORDS (j, lj, ohi);
   if (j >= 0x40d00000) /* z >= 16384 */
     {
       /* if z > 16384 */
-      if (((j - 0x40d00000) | o.parts32.w1
-	| (o.parts32.w2 & 0x7fffffff) | o.parts32.w3) != 0)
+      if (((j - 0x40d00000) | lj) != 0)
 	return s * huge * huge;	/* overflow */
       else
 	{
@@ -399,8 +377,7 @@ __ieee754_powl (long double x, long double y)
   else if ((j & 0x7fffffff) >= 0x40d01b90)	/* z <= -16495 */
     {
       /* z < -16495 */
-      if (((j - 0xc0d01bc0) | o.parts32.w1
-	 | (o.parts32.w2 & 0x7fffffff) | o.parts32.w3) != 0)
+      if (((j - 0xc0d01bc0) | lj) != 0)
 	return s * tiny * tiny;	/* underflow */
       else
 	{
@@ -419,10 +396,7 @@ __ieee754_powl (long double x, long double y)
       p_h -= t;
     }
   t = p_l + p_h;
-  o.value = t;
-  o.parts32.w3 = 0;
-  o.parts32.w2 = 0;
-  t = o.value;
+  t = ldbl_high (t);
   u = t * lg2_h;
   v = (p_l - (t - p_h)) * lg2 + t * lg2_l;
   z = u + v;
diff --git a/sysdeps/ieee754/ldbl-128ibm/k_tanl.c b/sysdeps/ieee754/ldbl-128ibm/k_tanl.c
index 1f6bad2..bcf8b5e 100644
--- a/sysdeps/ieee754/ldbl-128ibm/k_tanl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/k_tanl.c
@@ -85,17 +85,17 @@ long double
 __kernel_tanl (long double x, long double y, int iy)
 {
   long double z, r, v, w, s;
-  int32_t ix, sign;
-  ieee854_long_double_shape_type u, u1;
+  int32_t ix, sign, hx, lx;
+  double xhi;
 
-  u.value = x;
-  ix = u.parts32.w0 & 0x7fffffff;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (hx, lx, xhi);
+  ix = hx & 0x7fffffff;
   if (ix < 0x3c600000)		/* x < 2**-57 */
     {
-      if ((int) x == 0)
-	{			/* generate inexact */
-	  if ((ix | u.parts32.w1 | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3
-	       | (iy + 1)) == 0)
+      if ((int) x == 0)		/* generate inexact */
+	{
+	  if ((ix | lx | (iy + 1)) == 0)
 	    return one / fabs (x);
 	  else
 	    return (iy == 1) ? x : -one / x;
@@ -103,7 +103,7 @@ __kernel_tanl (long double x, long double y, int iy)
     }
   if (ix >= 0x3fe59420) /* |x| >= 0.6743316650390625 */
     {
-      if ((u.parts32.w0 & 0x80000000) != 0)
+      if ((hx & 0x80000000) != 0)
 	{
 	  x = -x;
 	  y = -y;
@@ -139,15 +139,13 @@ __kernel_tanl (long double x, long double y, int iy)
     {				/* if allow error up to 2 ulp,
 				   simply return -1.0/(x+r) here */
       /*  compute -1.0/(x+r) accurately */
-      u1.value = w;
-      u1.parts32.w2 = 0;
-      u1.parts32.w3 = 0;
-      v = r - (u1.value - x);		/* u1+v = r+x */
+      long double u1, z1;
+
+      u1 = ldbl_high (w);
+      v = r - (u1 - x);		/* u1+v = r+x */
       z = -1.0 / w;
-      u.value = z;
-      u.parts32.w2 = 0;
-      u.parts32.w3 = 0;
-      s = 1.0 + u.value * u1.value;
-      return u.value + z * (s + u.value * v);
+      z1 = ldbl_high (z);
+      s = 1.0 + z1 * u1;
+      return z1 + z * (s + z1 * v);
     }
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_expm1l.c b/sysdeps/ieee754/ldbl-128ibm/s_expm1l.c
index 8808dcd..007e785 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_expm1l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_expm1l.c
@@ -92,19 +92,19 @@ long double
 __expm1l (long double x)
 {
   long double px, qx, xx;
-  int32_t ix, sign;
-  ieee854_long_double_shape_type u;
+  int32_t ix, lx, sign;
   int k;
+  double xhi;
 
   /* Detect infinity and NaN.  */
-  u.value = x;
-  ix = u.parts32.w0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (ix, lx, xhi);
   sign = ix & 0x80000000;
   ix &= 0x7fffffff;
   if (ix >= 0x7ff00000)
     {
       /* Infinity. */
-      if (((ix & 0xfffff) | u.parts32.w1 | (u.parts32.w2&0x7fffffff) | u.parts32.w3) == 0)
+      if (((ix - 0x7ff00000) | lx) == 0)
 	{
 	  if (sign)
 	    return -1.0L;
@@ -116,7 +116,7 @@ __expm1l (long double x)
     }
 
   /* expm1(+- 0) = +- 0.  */
-  if ((ix == 0) && (u.parts32.w1 | (u.parts32.w2&0x7fffffff) | u.parts32.w3) == 0)
+  if ((ix | lx) == 0)
     return x;
 
   /* Overflow.  */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_frexpl.c b/sysdeps/ieee754/ldbl-128ibm/s_frexpl.c
index 3ac5374..7e40663 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_frexpl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_frexpl.c
@@ -36,16 +36,21 @@ two107 = 162259276829213363391578010288128.0; /* 0x4670000000000000, 0 */
 
 long double __frexpl(long double x, int *eptr)
 {
-	u_int64_t hx, lx, ix, ixl;
+	uint64_t hx, lx, ix, ixl;
 	int64_t explo;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	ixl = 0x7fffffffffffffffULL&lx;
 	ix =  0x7fffffffffffffffULL&hx;
 	*eptr = 0;
-	if(ix>=0x7ff0000000000000ULL||((ix|ixl)==0)) return x;	/* 0,inf,nan */
+	if(ix>=0x7ff0000000000000ULL||ix==0) return x;	/* 0,inf,nan */
 	if (ix<0x0010000000000000ULL) {		/* subnormal */
 	    x *= two107;
-	    GET_LDOUBLE_MSW64(hx,x);
+	    xhi = ldbl_high (x);
+	    EXTRACT_WORDS64 (hx, xhi);
 	    ix = hx&0x7fffffffffffffffULL;
 	    *eptr = -107;
 	}
@@ -54,7 +59,7 @@ long double __frexpl(long double x, int *eptr)
 	if (ixl != 0ULL) {
 	  explo = (ixl>>52) - (ix>>52) + 0x3fe;
 	  if ((ixl&0x7ff0000000000000ULL) == 0LL) {
-	    /* the lower double is a denomal so we need to correct its
+	    /* the lower double is a denormal so we need to correct its
 	       mantissa and perhaps its exponent.  */
 	    int cnt;
 
@@ -73,7 +78,9 @@ long double __frexpl(long double x, int *eptr)
 	  lx = 0ULL;
 
 	hx = (hx&0x800fffffffffffffULL) | 0x3fe0000000000000ULL;
-	SET_LDOUBLE_WORDS64(x,hx,lx);
+	INSERT_WORDS64 (xhi, hx);
+	INSERT_WORDS64 (xlo, lx);
+	x = ldbl_pack (xhi, xlo);
 	return x;
 }
 #ifdef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c b/sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c
index c8dd9ff..54e72c9 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_isinf_nsl.c
@@ -1,6 +1,7 @@
 /*
  * __isinf_nsl(x) returns != 0 if x is ±inf, else 0;
  * no branching!
+ * slightly dodgy in relying on signed shift right copying sign bit
  */
 
 #include <math.h>
@@ -9,8 +10,14 @@
 int
 __isinf_nsl (long double x)
 {
-	int64_t hx,lx;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	return !((lx & 0x7fffffffffffffffLL)
-		 | ((hx & 0x7fffffffffffffffLL) ^ 0x7ff0000000000000LL));
+  double xhi;
+  int64_t hx, mask;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+
+  mask = (hx & 0x7fffffffffffffffLL) ^ 0x7ff0000000000000LL;
+  mask |= -mask;
+  mask >>= 63;
+  return ~mask;
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_isinfl.c b/sysdeps/ieee754/ldbl-128ibm/s_isinfl.c
index 5f5b014..6a72822 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_isinfl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_isinfl.c
@@ -11,6 +11,7 @@ static char rcsid[] = "$NetBSD: $";
 /*
  * isinfl(x) returns 1 if x is inf, -1 if x is -inf, else 0;
  * no branching!
+ * slightly dodgy in relying on signed shift right copying sign bit
  */
 
 #include <math.h>
@@ -20,12 +21,16 @@ static char rcsid[] = "$NetBSD: $";
 int
 ___isinfl (long double x)
 {
-	int64_t hx,lx;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	lx = (lx & 0x7fffffffffffffffLL);
-	lx |= (hx & 0x7fffffffffffffffLL) ^ 0x7ff0000000000000LL;
-	lx |= -lx;
-	return ~(lx >> 63) & (hx >> 62);
+  double xhi;
+  int64_t hx, mask;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+
+  mask = (hx & 0x7fffffffffffffffLL) ^ 0x7ff0000000000000LL;
+  mask |= -mask;
+  mask >>= 63;
+  return ~mask & (hx >> 62);
 }
 hidden_ver (___isinfl, __isinfl)
 #ifndef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_log1pl.c b/sysdeps/ieee754/ldbl-128ibm/s_log1pl.c
index 77c4fde..a346383 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_log1pl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_log1pl.c
@@ -126,19 +126,18 @@ long double
 __log1pl (long double xm1)
 {
   long double x, y, z, r, s;
-  ieee854_long_double_shape_type u;
-  int32_t hx;
+  double xhi;
+  int32_t hx, lx;
   int e;
 
   /* Test for NaN or infinity input. */
-  u.value = xm1;
-  hx = u.parts32.w0;
+  xhi = ldbl_high (xm1);
+  EXTRACT_WORDS (hx, lx, xhi);
   if (hx >= 0x7ff00000)
     return xm1;
 
   /* log1p(+- 0) = +- 0.  */
-  if (((hx & 0x7fffffff) == 0)
-      && (u.parts32.w1 | (u.parts32.w2 & 0x7fffffff) | u.parts32.w3) == 0)
+  if (((hx & 0x7fffffff) | lx) == 0)
     return xm1;
 
   x = xm1 + 1.0L;
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_modfl.c b/sysdeps/ieee754/ldbl-128ibm/s_modfl.c
index 39de9d4..ed03ce2 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_modfl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_modfl.c
@@ -37,43 +37,54 @@ long double __modfl(long double x, long double *iptr)
 {
 	int64_t i0,i1,j0;
 	u_int64_t i;
-	GET_LDOUBLE_WORDS64(i0,i1,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (i0, xhi);
+	EXTRACT_WORDS64 (i1, xlo);
 	i1 &= 0x000fffffffffffffLL;
 	j0 = ((i0>>52)&0x7ff)-0x3ff;	/* exponent of x */
 	if(j0<52) {			/* integer part in high x */
 	    if(j0<0) {			/* |x|<1 */
 		/* *iptr = +-0 */
-	        SET_LDOUBLE_WORDS64(*iptr,i0&0x8000000000000000ULL,0);
+		INSERT_WORDS64 (xhi, i0&0x8000000000000000ULL);
+		*iptr = xhi;
 		return x;
 	    } else {
 		i = (0x000fffffffffffffLL)>>j0;
 		if(((i0&i)|(i1&0x7fffffffffffffffLL))==0) {		/* x is integral */
 		    *iptr = x;
 		    /* return +-0 */
-		    SET_LDOUBLE_WORDS64(x,i0&0x8000000000000000ULL,0);
+		    INSERT_WORDS64 (xhi, i0&0x8000000000000000ULL);
+		    x = xhi;
 		    return x;
 		} else {
-		    SET_LDOUBLE_WORDS64(*iptr,i0&(~i),0);
+		    INSERT_WORDS64 (xhi, i0&(~i));
+		    *iptr = xhi;
 		    return x - *iptr;
 		}
 	    }
 	} else if (j0>103) {		/* no fraction part */
 	    *iptr = x*one;
 	    /* We must handle NaNs separately.  */
-	    if (j0 == 0x400 && ((i0 & 0x000fffffffffffffLL) | i1))
+	    if ((i0 & 0x7fffffffffffffffLL) > 0x7ff0000000000000LL)
 	      return x*one;
 	    /* return +-0 */
-	    SET_LDOUBLE_WORDS64(x,i0&0x8000000000000000ULL,0);
+	    INSERT_WORDS64 (xhi, i0&0x8000000000000000ULL);
+	    x = xhi;
 	    return x;
 	} else {			/* fraction part in low x */
 	    i = -1ULL>>(j0-52);
 	    if((i1&i)==0) { 		/* x is integral */
 		*iptr = x;
 		/* return +-0 */
-		SET_LDOUBLE_WORDS64(x,i0&0x8000000000000000ULL,0);
+		INSERT_WORDS64 (xhi, i0&0x8000000000000000ULL);
+		x = xhi;
 		return x;
 	    } else {
-		SET_LDOUBLE_WORDS64(*iptr,i0,i1&(~i));
+		INSERT_WORDS64 (xhi, i0);
+		INSERT_WORDS64 (xlo, i1&(~i));
+		*iptr = ldbl_pack (xhi, xlo);
 		return x - *iptr;
 	    }
 	}
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c b/sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c
index 7e58127..c050944 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_nextafterl.c
@@ -30,27 +30,28 @@ static char rcsid[] = "$NetBSD: $";
 
 long double __nextafterl(long double x, long double y)
 {
-	int64_t hx,hy,ihx,ihy,ilx;
-	u_int64_t lx;
-	u_int64_t ly __attribute__ ((unused));
+	int64_t hx,hy,ihx,ihy;
+	uint64_t lx;
+	double xhi, xlo, yhi;
 
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64 (hy, yhi);
 	ihx = hx&0x7fffffffffffffffLL;		/* |hx| */
-	ilx = lx&0x7fffffffffffffffLL;		/* |lx| */
 	ihy = hy&0x7fffffffffffffffLL;		/* |hy| */
 
-	if((((ihx&0x7ff0000000000000LL)==0x7ff0000000000000LL)&&
-	    ((ihx&0x000fffffffffffffLL)!=0)) ||   /* x is nan */
-	   (((ihy&0x7ff0000000000000LL)==0x7ff0000000000000LL)&&
-	    ((ihy&0x000fffffffffffffLL)!=0)))     /* y is nan */
+	if((ihx>0x7ff0000000000000LL) ||	/* x is nan */
+	   (ihy>0x7ff0000000000000LL))		/* y is nan */
 	    return x+y; /* signal the nan */
 	if(x==y)
 	    return y;		/* x=y, return y */
-	if(ihx == 0 && ilx == 0) {			/* x == 0 */
-	    long double u;
+	if(ihx == 0) {				/* x == 0 */
+	    long double u;			/* return +-minsubnormal */
 	    hy = (hy & 0x8000000000000000ULL) | 1;
-	    SET_LDOUBLE_WORDS64(x,hy,0ULL);/* return +-minsubnormal */
+	    INSERT_WORDS64 (yhi, hy);
+	    x = yhi;
 	    u = math_opt_barrier (x);
 	    u = u * u;
 	    math_force_eval (u);		/* raise underflow flag */
@@ -59,10 +60,16 @@ long double __nextafterl(long double x, long double y)
 
 	long double u;
 	if(x > y) {	/* x > y, x -= ulp */
+	    /* This isn't the largest magnitude correctly rounded
+	       long double as you can see from the lowest mantissa
+	       bit being zero.  It is however the largest magnitude
+	       long double with a 106 bit mantissa, and nextafterl
+	       is insane with variable precision.  So to make
+	       nextafterl sane we assume 106 bit precision.  */
 	    if((hx==0xffefffffffffffffLL)&&(lx==0xfc8ffffffffffffeLL))
 	      return x+x;	/* overflow, return -inf */
 	    if (hx >= 0x7ff0000000000000LL) {
-	      SET_LDOUBLE_WORDS64(u,0x7fefffffffffffffLL,0x7c8ffffffffffffeLL);
+	      u = 0x1.fffffffffffff7ffffffffffff8p+1023L;
 	      return u;
 	    }
 	    if(ihx <= 0x0360000000000000LL) {  /* x <= LDBL_MIN */
@@ -77,16 +84,19 @@ long double __nextafterl(long double x, long double y)
 	      return x;
 	    }
 	    if (ihx < 0x06a0000000000000LL) { /* ulp will denormal */
-	      SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL),0ULL);
+	      INSERT_WORDS64 (yhi, hx & (0x7ffLL<<52));
+	      u = yhi;
 	      u *= 0x1.0000000000000p-105L;
-	    } else
-	      SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL)-0x0690000000000000LL,0ULL);
+	    } else {
+	      INSERT_WORDS64 (yhi, (hx & (0x7ffLL<<52))-(0x069LL<<52));
+	      u = yhi;
+	    }
 	    return x - u;
 	} else {				/* x < y, x += ulp */
 	    if((hx==0x7fefffffffffffffLL)&&(lx==0x7c8ffffffffffffeLL))
 	      return x+x;	/* overflow, return +inf */
-	    if ((u_int64_t) hx >= 0xfff0000000000000ULL) {
-	      SET_LDOUBLE_WORDS64(u,0xffefffffffffffffLL,0xfc8ffffffffffffeLL);
+	    if ((uint64_t) hx >= 0xfff0000000000000ULL) {
+	      u = -0x1.fffffffffffff7ffffffffffff8p+1023L;
 	      return u;
 	    }
 	    if(ihx <= 0x0360000000000000LL) {  /* x <= LDBL_MIN */
@@ -103,10 +113,13 @@ long double __nextafterl(long double x, long double y)
 	      return x;
 	    }
 	    if (ihx < 0x06a0000000000000LL) { /* ulp will denormal */
-	      SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL),0ULL);
+	      INSERT_WORDS64 (yhi, hx & (0x7ffLL<<52));
+	      u = yhi;
 	      u *= 0x1.0000000000000p-105L;
-	    } else
-	      SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL)-0x0690000000000000LL,0ULL);
+	    } else {
+	      INSERT_WORDS64 (yhi, (hx & (0x7ffLL<<52))-(0x069LL<<52));
+	      u = yhi;
+	    }
 	    return x + u;
 	}
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c b/sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c
index 7e288a4..b40cf16 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_nexttoward.c
@@ -34,23 +34,22 @@ double __nexttoward(double x, long double y)
 {
 	int32_t hx,ix;
 	int64_t hy,iy;
-	u_int32_t lx;
-	u_int64_t ly,uly;
+	uint32_t lx;
+	double yhi;
 
 	EXTRACT_WORDS(hx,lx,x);
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64(hy,yhi);
 	ix = hx&0x7fffffff;		/* |x| */
 	iy = hy&0x7fffffffffffffffLL;	/* |y| */
-	uly = ly&0x7fffffffffffffffLL;	/* |y| */
 
 	if(((ix>=0x7ff00000)&&((ix-0x7ff00000)|lx)!=0) ||   /* x is nan */
-	   ((iy>=0x7ff0000000000000LL)&&((iy-0x7ff0000000000000LL)|uly)!=0))
-							    /* y is nan */
+	   iy>0x7ff0000000000000LL)			    /* y is nan */
 	   return x+y;
 	if((long double) x==y) return y;	/* x=y, return y */
 	if((ix|lx)==0) {			/* x == 0 */
 	    double u;
-	    INSERT_WORDS(x,(u_int32_t)((hy>>32)&0x80000000),1);/* return +-minsub */
+	    INSERT_WORDS(x,(uint32_t)((hy>>32)&0x80000000),1);/* return +-minsub */
 	    u = math_opt_barrier (x);
 	    u = u * u;
 	    math_force_eval (u);		/* raise underflow flag */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c b/sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c
index b387a91..19522f4 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_nexttowardf.c
@@ -27,16 +27,16 @@ float __nexttowardf(float x, long double y)
 {
 	int32_t hx,ix;
 	int64_t hy,iy;
-	u_int64_t ly, uly;
+	double yhi;
 
 	GET_FLOAT_WORD(hx,x);
-	GET_LDOUBLE_WORDS64(hy,ly,y);
+	yhi = ldbl_high (y);
+	EXTRACT_WORDS64 (hy, yhi);
 	ix = hx&0x7fffffff;		/* |x| */
 	iy = hy&0x7fffffffffffffffLL;	/* |y| */
-	uly = ly&0x7fffffffffffffffLL;	/* |y| */
 
 	if((ix>0x7f800000) ||   /* x is nan */
-	   ((iy>=0x7ff0000000000000LL)&&((iy-0x7ff0000000000000LL)|uly)!=0))
+	   (iy>0x7ff0000000000000LL))
 				/* y is nan */
 	   return x+y;
 	if((long double) x==y) return y;	/* x=y, return y */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_remquol.c b/sysdeps/ieee754/ldbl-128ibm/s_remquol.c
index f4777a0..195e108 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_remquol.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_remquol.c
@@ -33,20 +33,24 @@ __remquol (long double x, long double y, int *quo)
   int64_t hx,hy;
   u_int64_t sx,lx,ly,qs;
   int cquo;
-
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
-  GET_LDOUBLE_WORDS64 (hy, ly, y);
+  double xhi, xlo, yhi, ylo;
+
+  ldbl_unpack (x, &xhi, &xlo);
+  EXTRACT_WORDS64 (hx, xhi);
+  EXTRACT_WORDS64 (lx, xlo);
+  ldbl_unpack (y, &yhi, &ylo);
+  EXTRACT_WORDS64 (hy, yhi);
+  EXTRACT_WORDS64 (ly, ylo);
   sx = hx & 0x8000000000000000ULL;
   qs = sx ^ (hy & 0x8000000000000000ULL);
   hy &= 0x7fffffffffffffffLL;
   hx &= 0x7fffffffffffffffLL;
 
   /* Purge off exception values.  */
-  if ((hy | (ly & 0x7fffffffffffffff)) == 0)
+  if (hy == 0)
     return (x * y) / (x * y); 			/* y = 0 */
   if ((hx >= 0x7ff0000000000000LL)		/* x not finite */
-      || ((hy >= 0x7ff0000000000000LL)		/* y is NaN */
-	  && (((hy - 0x7ff0000000000000LL) | ly) != 0)))
+      || (hy > 0x7ff0000000000000LL))		/* y is NaN */
     return (x * y) / (x * y);
 
   if (hy <= 0x7fbfffffffffffffLL)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c b/sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c
index d752568..03d4597 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_scalblnl.c
@@ -41,11 +41,15 @@ long double __scalblnl (long double x, long int n)
 {
 	int64_t k,l,hx,lx;
 	union { int64_t i; double d; } u;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	k = (hx>>52)&0x7ff;		/* extract exponent */
 	l = (lx>>52)&0x7ff;
 	if (k==0) {				/* 0 or subnormal x */
-	    if (((hx|lx)&0x7fffffffffffffffULL)==0) return x; /* +-0 */
+	    if ((hx&0x7fffffffffffffffULL)==0) return x; /* +-0 */
 	    u.i = hx;
 	    u.d *= two54;
 	    hx = u.i;
@@ -61,7 +65,9 @@ long double __scalblnl (long double x, long int n)
 	if (k > 0) {				/* normal result */
 	    hx = (hx&0x800fffffffffffffULL)|(k<<52);
 	    if ((lx & 0x7fffffffffffffffULL) == 0) { /* low part +-0 */
-		SET_LDOUBLE_WORDS64(x,hx,lx);
+		INSERT_WORDS64 (xhi, hx);
+		INSERT_WORDS64 (xlo, lx);
+		x = ldbl_pack (xhi, xlo);
 		return x;
 	    }
 	    if (l == 0) { /* low part subnormal */
@@ -81,14 +87,19 @@ long double __scalblnl (long double x, long int n)
 		u.d *= twom54;
 		lx = u.i;
 	    }
-	    SET_LDOUBLE_WORDS64(x,hx,lx);
+	    INSERT_WORDS64 (xhi, hx);
+	    INSERT_WORDS64 (xlo, lx);
+	    x = ldbl_pack (xhi, xlo);
 	    return x;
 	}
 	if (k <= -54)
 	  return tiny*__copysignl(tiny,x); 	/*underflow*/
 	k += 54;				/* subnormal result */
 	lx &= 0x8000000000000000ULL;
-	SET_LDOUBLE_WORDS64(x,(hx&0x800fffffffffffffULL)|(k<<52),lx);
+	hx &= 0x800fffffffffffffULL;
+	INSERT_WORDS64 (xhi, hx|(k<<52));
+	INSERT_WORDS64 (xlo, lx);
+	x = ldbl_pack (xhi, xlo);
 	return x*twolm54;
 }
 long_double_symbol (libm, __scalblnl, scalblnl);
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c b/sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c
index bcdb23b..161172d 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_scalbnl.c
@@ -41,11 +41,15 @@ long double __scalbnl (long double x, int n)
 {
 	int64_t k,l,hx,lx;
 	union { int64_t i; double d; } u;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	k = (hx>>52)&0x7ff;		/* extract exponent */
 	l = (lx>>52)&0x7ff;
 	if (k==0) {				/* 0 or subnormal x */
-	    if (((hx|lx)&0x7fffffffffffffffULL)==0) return x; /* +-0 */
+	    if ((hx&0x7fffffffffffffffULL)==0) return x; /* +-0 */
 	    u.i = hx;
 	    u.d *= two54;
 	    hx = u.i;
@@ -61,7 +65,9 @@ long double __scalbnl (long double x, int n)
 	if (k > 0) {				/* normal result */
 	    hx = (hx&0x800fffffffffffffULL)|(k<<52);
 	    if ((lx & 0x7fffffffffffffffULL) == 0) { /* low part +-0 */
-		SET_LDOUBLE_WORDS64(x,hx,lx);
+		INSERT_WORDS64 (xhi, hx);
+		INSERT_WORDS64 (xlo, lx);
+		x = ldbl_pack (xhi, xlo);
 		return x;
 	    }
 	    if (l == 0) { /* low part subnormal */
@@ -81,14 +87,19 @@ long double __scalbnl (long double x, int n)
 		u.d *= twom54;
 		lx = u.i;
 	    }
-	    SET_LDOUBLE_WORDS64(x,hx,lx);
+	    INSERT_WORDS64 (xhi, hx);
+	    INSERT_WORDS64 (xlo, lx);
+	    x = ldbl_pack (xhi, xlo);
 	    return x;
 	}
 	if (k <= -54)
 	  return tiny*__copysignl(tiny,x); 	/*underflow*/
 	k += 54;				/* subnormal result */
 	lx &= 0x8000000000000000ULL;
-	SET_LDOUBLE_WORDS64(x,(hx&0x800fffffffffffffULL)|(k<<52),lx);
+	hx &= 0x800fffffffffffffULL;
+	INSERT_WORDS64 (xhi, hx|(k<<52));
+	INSERT_WORDS64 (xlo, lx);
+	x = ldbl_pack (xhi, xlo);
 	return x*twolm54;
 }
 #ifdef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_tanhl.c b/sysdeps/ieee754/ldbl-128ibm/s_tanhl.c
index 138b63c..c63e253 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_tanhl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_tanhl.c
@@ -47,10 +47,12 @@ static const long double one=1.0L, two=2.0L, tiny = 1.0e-300L;
 long double __tanhl(long double x)
 {
 	long double t,z;
-	int64_t jx,ix,lx;
+	int64_t jx,ix;
+	double xhi;
 
     /* High word of |x|. */
-	GET_LDOUBLE_WORDS64(jx,lx,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (jx, xhi);
 	ix = jx&0x7fffffffffffffffLL;
 
     /* x is INF or NaN */
@@ -61,7 +63,7 @@ long double __tanhl(long double x)
 
     /* |x| < 22 */
 	if (ix < 0x4036000000000000LL) {		/* |x|<22 */
-	    if ((ix | (lx&0x7fffffffffffffffLL)) == 0)
+	    if (ix == 0)
 		return x;		/* x == +-0 */
 	    if (ix<0x3c60000000000000LL) 	/* |x|<2**-57 */
 		return x*(one+x);    	/* tanh(small) = small */
diff --git a/sysdeps/powerpc/fpu/libm-test-ulps b/sysdeps/powerpc/fpu/libm-test-ulps
index 6fdace9..3dcc344 100644
--- a/sysdeps/powerpc/fpu/libm-test-ulps
+++ b/sysdeps/powerpc/fpu/libm-test-ulps
@@ -6592,6 +6592,9 @@ float: 1
 ifloat: 1
 ildouble: 2
 ldouble: 2
+Test "tan_towardzero (2)":
+ildouble: 1
+ldouble: 1
 Test "tan_towardzero (3)":
 float: 1
 ifloat: 1

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=eac0b9f9daac0ede19f1689e9a0f0aa5aa3ac811

commit eac0b9f9daac0ede19f1689e9a0f0aa5aa3ac811
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:24:05 2013 +0930

    PowerPC floating point little-endian [2 of 15]
    http://sourceware.org/ml/libc-alpha/2013-08/msg00082.html
    
    This patch replaces occurrences of GET_LDOUBLE_* and SET_LDOUBLE_*
    macros, and union ieee854_long_double_shape_type in ldbl-128ibm/,
    and a stray one in the 32-bit fpu support.  These files have no
    significant changes apart from rewriting the long double bit access.
    
    	* sysdeps/ieee754/ldbl-128ibm/math_ldbl.h (ldbl_high): Define.
    	* sysdeps/ieee754/ldbl-128ibm/e_acoshl.c (__ieee754_acoshl): Rewrite
    	all uses of ieee854 long double macros and unions.
    	* sysdeps/ieee754/ldbl-128ibm/e_acosl.c (__ieee754_acosl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_asinl.c (__ieee754_asinl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_atanhl.c (__ieee754_atanhl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_coshl.c (__ieee754_coshl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_log2l.c (__ieee754_log2l): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c (__ieee754_rem_pio2l):
    	Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_sinhl.c (__ieee754_sinhl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/k_cosl.c (__kernel_cosl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/k_sincosl.c (__kernel_sincosl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/k_sinl.c (__kernel_sinl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_asinhl.c (__asinhl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_atanl.c (__atanl): Likewise.
    	Simplify sign and nan test too.
    	* sysdeps/ieee754/ldbl-128ibm/s_cosl.c (__cosl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_fabsl.c (__fabsl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_finitel.c (___finitel): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c (___fpclassifyl):
    	Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_isnanl.c (___isnanl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c (__issignalingl):
    	Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_logbl.c (__logbl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_signbitl.c (___signbitl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_sincosl.c (__sincosl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_sinl.c (__sinl): Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_tanl.c (__tanl): Likewise.
    	* sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c (__logbl): Likewise.

diff --git a/ChangeLog b/ChangeLog
index 8bc1698..38a8b7d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,39 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* sysdeps/ieee754/ldbl-128ibm/math_ldbl.h (ldbl_high): Define.
+	* sysdeps/ieee754/ldbl-128ibm/e_acoshl.c (__ieee754_acoshl): Rewrite
+	all uses of ieee854 long double macros and unions.
+	* sysdeps/ieee754/ldbl-128ibm/e_acosl.c (__ieee754_acosl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_asinl.c (__ieee754_asinl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_atanhl.c (__ieee754_atanhl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_coshl.c (__ieee754_coshl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_log2l.c (__ieee754_log2l): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c (__ieee754_rem_pio2l):
+	Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_sinhl.c (__ieee754_sinhl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/k_cosl.c (__kernel_cosl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/k_sincosl.c (__kernel_sincosl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/k_sinl.c (__kernel_sinl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_asinhl.c (__asinhl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_atanl.c (__atanl): Likewise.
+	Simplify sign and nan test too.
+	* sysdeps/ieee754/ldbl-128ibm/s_cosl.c (__cosl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_fabsl.c (__fabsl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_finitel.c (___finitel): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c (___fpclassifyl):
+	Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_isnanl.c (___isnanl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c (__issignalingl):
+	Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_logbl.c (__logbl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_signbitl.c (___signbitl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_sincosl.c (__sincosl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_sinl.c (__sinl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_tanl.c (__tanl): Likewise.
+	* sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c (__logbl): Likewise.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* stdio-common/printf_size.c (__printf_size): Don't use
 	union ieee854_long_double in fpnum union.
 	* stdio-common/printf_fphex.c (__printf_fphex): Likewise.  Use
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_acoshl.c b/sysdeps/ieee754/ldbl-128ibm/e_acoshl.c
index abc78a3..8a4a5bb 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_acoshl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_acoshl.c
@@ -36,8 +36,12 @@ __ieee754_acoshl(long double x)
 {
 	long double t;
 	int64_t hx;
-	u_int64_t lx;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	uint64_t lx;
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	if(hx<0x3ff0000000000000LL) {		/* x < 1 */
 	    return (x-x)/(x-x);
 	} else if(hx >=0x41b0000000000000LL) {	/* x > 2**28 */
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_acosl.c b/sysdeps/ieee754/ldbl-128ibm/e_acosl.c
index 5d2af30..8663993 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_acosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_acosl.c
@@ -151,26 +151,25 @@ static const long double
 long double
 __ieee754_acosl (long double x)
 {
-  long double z, r, w, p, q, s, t, f2;
-  ieee854_long_double_shape_type u;
+  long double a, z, r, w, p, q, s, t, f2;
 
-  u.value = __builtin_fabsl (x);
-  if (u.value == 1.0L)
+  a = __builtin_fabsl (x);
+  if (a == 1.0L)
     {
       if (x > 0.0L)
 	return 0.0;		/* acos(1) = 0  */
       else
 	return (2.0 * pio2_hi) + (2.0 * pio2_lo);	/* acos(-1)= pi */
     }
-  else if (u.value > 1.0L)
+  else if (a > 1.0L)
     {
       return (x - x) / (x - x);	/* acos(|x| > 1) is NaN */
     }
-  if (u.value < 0.5L)
+  if (a < 0.5L)
     {
-      if (u.value < 6.938893903907228e-18L)	/* |x| < 2**-57 */
+      if (a < 6.938893903907228e-18L)	/* |x| < 2**-57 */
 	return pio2_hi + pio2_lo;
-      if (u.value < 0.4375L)
+      if (a < 0.4375L)
 	{
 	  /* Arcsine of x.  */
 	  z = x * x;
@@ -199,7 +198,7 @@ __ieee754_acosl (long double x)
 	  return z;
 	}
       /* .4375 <= |x| < .5 */
-      t = u.value - 0.4375L;
+      t = a - 0.4375L;
       p = ((((((((((P10 * t
 		    + P9) * t
 		   + P8) * t
@@ -230,9 +229,9 @@ __ieee754_acosl (long double x)
 	r = acosr4375 + r;
       return r;
     }
-  else if (u.value < 0.625L)
+  else if (a < 0.625L)
     {
-      t = u.value - 0.5625L;
+      t = a - 0.5625L;
       p = ((((((((((rS10 * t
 		    + rS9) * t
 		   + rS8) * t
@@ -264,7 +263,9 @@ __ieee754_acosl (long double x)
     }
   else
     {				/* |x| >= .625 */
-      z = (one - u.value) * 0.5;
+      double shi, slo;
+
+      z = (one - a) * 0.5;
       s = __ieee754_sqrtl (z);
       /* Compute an extended precision square root from
 	 the Newton iteration  s -> 0.5 * (s + z / s).
@@ -273,12 +274,11 @@ __ieee754_acosl (long double x)
 	  Express s = f1 + f2 where f1 * f1 is exactly representable.
 	  w = (z - s^2)/2s = (z - f1^2 - 2 f1 f2 - f2^2)/2s .
 	  s + w has extended precision.  */
-      u.value = s;
-      u.parts32.w2 = 0;
-      u.parts32.w3 = 0;
-      f2 = s - u.value;
-      w = z - u.value * u.value;
-      w = w - 2.0 * u.value * f2;
+      ldbl_unpack (s, &shi, &slo);
+      a = shi;
+      f2 = slo;
+      w = z - a * a;
+      w = w - 2.0 * a * f2;
       w = w - f2 * f2;
       w = w / (2.0 * s);
       /* Arcsine of s.  */
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_asinl.c b/sysdeps/ieee754/ldbl-128ibm/e_asinl.c
index b395439..99a5b85 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_asinl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_asinl.c
@@ -131,19 +131,18 @@ static const long double
 long double
 __ieee754_asinl (long double x)
 {
-  long double t, w, p, q, c, r, s;
+  long double a, t, w, p, q, c, r, s;
   int flag;
-  ieee854_long_double_shape_type u;
 
   flag = 0;
-  u.value = __builtin_fabsl (x);
-  if (u.value == 1.0L)	/* |x|>= 1 */
+  a = __builtin_fabsl (x);
+  if (a == 1.0L)	/* |x|>= 1 */
     return x * pio2_hi + x * pio2_lo;	/* asin(1)=+-pi/2 with inexact */
-  else if (u.value >= 1.0L)
+  else if (a >= 1.0L)
     return (x - x) / (x - x);	/* asin(|x|>1) is NaN */
-  else if (u.value < 0.5L)
+  else if (a < 0.5L)
     {
-      if (u.value < 6.938893903907228e-18L) /* |x| < 2**-57 */
+      if (a < 6.938893903907228e-18L) /* |x| < 2**-57 */
 	{
 	  if (huge + x > one)
 	    return x;		/* return x with inexact if x!=0 */
@@ -155,9 +154,9 @@ __ieee754_asinl (long double x)
 	  flag = 1;
 	}
     }
-  else if (u.value < 0.625L)
+  else if (a < 0.625L)
     {
-      t = u.value - 0.5625;
+      t = a - 0.5625;
       p = ((((((((((rS10 * t
 		    + rS9) * t
 		   + rS8) * t
@@ -190,7 +189,7 @@ __ieee754_asinl (long double x)
   else
     {
       /* 1 > |x| >= 0.625 */
-      w = one - u.value;
+      w = one - a;
       t = w * 0.5;
     }
 
@@ -223,17 +222,14 @@ __ieee754_asinl (long double x)
     }
 
   s = __ieee754_sqrtl (t);
-  if (u.value > 0.975L)
+  if (a > 0.975L)
     {
       w = p / q;
       t = pio2_hi - (2.0 * (s + s * w) - pio2_lo);
     }
   else
     {
-      u.value = s;
-      u.parts32.w3 = 0;
-      u.parts32.w2 = 0;
-      w = u.value;
+      w = ldbl_high (s);
       c = (t - w * w) / (s + w);
       r = p / q;
       p = 2.0 * s * r - (pio2_lo - 2.0 * c);
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_atanhl.c b/sysdeps/ieee754/ldbl-128ibm/e_atanhl.c
index f35182f..29f2e92 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_atanhl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_atanhl.c
@@ -40,8 +40,10 @@ __ieee754_atanhl(long double x)
 {
 	long double t;
 	int64_t hx,ix;
-	u_int64_t lx __attribute__ ((unused));
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi;
+
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (hx, xhi);
 	ix = hx&0x7fffffffffffffffLL;
 	if (ix >= 0x3ff0000000000000LL) { /* |x|>=1 */
 	    if (ix > 0x3ff0000000000000LL)
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_coshl.c b/sysdeps/ieee754/ldbl-128ibm/e_coshl.c
index 3e8e187..05683bc 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_coshl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_coshl.c
@@ -41,9 +41,11 @@ __ieee754_coshl (long double x)
 {
 	long double t,w;
 	int64_t ix;
+	double xhi;
 
     /* High word of |x|. */
-	GET_LDOUBLE_MSW64(ix,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ix, xhi);
 	ix &= 0x7fffffffffffffffLL;
 
     /* x is INF or NaN */
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_log2l.c b/sysdeps/ieee754/ldbl-128ibm/e_log2l.c
index f0098f6..323ded0 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_log2l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_log2l.c
@@ -177,11 +177,13 @@ __ieee754_log2l (x)
   long double z;
   long double y;
   int e;
-  int64_t hx, lx;
+  int64_t hx;
+  double xhi;
 
 /* Test for domain */
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
-  if (((hx & 0x7fffffffffffffffLL) | (lx & 0x7fffffffffffffffLL)) == 0)
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+  if ((hx & 0x7fffffffffffffffLL) == 0)
     return (-1.0L / (x - x));
   if (hx < 0)
     return (x - x) / (x - x);
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c b/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
index 8885def..36bc032 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
@@ -200,10 +200,11 @@ int32_t __ieee754_rem_pio2l(long double x, long double *y)
   double tx[8];
   int exp;
   int64_t n, ix, hx, ixd;
-  u_int64_t lx __attribute__ ((unused));
   u_int64_t lxd;
+  double xhi;
 
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
   ix = hx & 0x7fffffffffffffffLL;
   if (ix <= 0x3fe921fb54442d10LL)	/* x in <-pi/4, pi/4> */
     {
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_sinhl.c b/sysdeps/ieee754/ldbl-128ibm/e_sinhl.c
index 4e8481c..1790bef 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_sinhl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_sinhl.c
@@ -38,9 +38,11 @@ __ieee754_sinhl(long double x)
 {
 	long double t,w,h;
 	int64_t ix,jx;
+	double xhi;
 
     /* High word of |x|. */
-	GET_LDOUBLE_MSW64(jx,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (jx, xhi);
 	ix = jx&0x7fffffffffffffffLL;
 
     /* x is INF or NaN */
diff --git a/sysdeps/ieee754/ldbl-128ibm/k_cosl.c b/sysdeps/ieee754/ldbl-128ibm/k_cosl.c
index 0b81782..046f3b5 100644
--- a/sysdeps/ieee754/ldbl-128ibm/k_cosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/k_cosl.c
@@ -81,8 +81,11 @@ __kernel_cosl(long double x, long double y)
 {
   long double h, l, z, sin_l, cos_l_m1;
   int64_t ix;
-  u_int32_t tix, hix, index;
-  GET_LDOUBLE_MSW64 (ix, x);
+  uint32_t tix, hix, index;
+  double xhi, hhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (ix, xhi);
   tix = ((u_int64_t)ix) >> 32;
   tix &= ~0x80000000;			/* tix = |x|'s high 32 bits */
   if (tix < 0x3fc30000)			/* |x| < 0.1484375 */
@@ -136,7 +139,8 @@ __kernel_cosl(long double x, long double y)
 	case 2: index = (hix - 0x3fc30000) >> 14; break;
 	}
 */
-      SET_LDOUBLE_WORDS64(h, ((u_int64_t)hix) << 32, 0);
+      INSERT_WORDS64 (hhi, ((uint64_t)hix) << 32);
+      h = hhi;
       l = y - (h - x);
       z = l * l;
       sin_l = l*(ONE+z*(SSIN1+z*(SSIN2+z*(SSIN3+z*(SSIN4+z*SSIN5)))));
diff --git a/sysdeps/ieee754/ldbl-128ibm/k_sincosl.c b/sysdeps/ieee754/ldbl-128ibm/k_sincosl.c
index fc1ead6..3ba9d7e 100644
--- a/sysdeps/ieee754/ldbl-128ibm/k_sincosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/k_sincosl.c
@@ -100,9 +100,12 @@ __kernel_sincosl(long double x, long double y, long double *sinx, long double *c
 {
   long double h, l, z, sin_l, cos_l_m1;
   int64_t ix;
-  u_int32_t tix, hix, index;
-  GET_LDOUBLE_MSW64 (ix, x);
-  tix = ((u_int64_t)ix) >> 32;
+  uint32_t tix, hix, index;
+  double xhi, hhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (ix, xhi);
+  tix = ((uint64_t)ix) >> 32;
   tix &= ~0x80000000;			/* tix = |x|'s high 32 bits */
   if (tix < 0x3fc30000)			/* |x| < 0.1484375 */
     {
@@ -164,7 +167,8 @@ __kernel_sincosl(long double x, long double y, long double *sinx, long double *c
 	case 2: index = (hix - 0x3fc30000) >> 14; break;
 	}
 */
-      SET_LDOUBLE_WORDS64(h, ((u_int64_t)hix) << 32, 0);
+      INSERT_WORDS64 (hhi, ((uint64_t)hix) << 32);
+      h = hhi;
       if (iy)
 	l = y - (h - x);
       else
diff --git a/sysdeps/ieee754/ldbl-128ibm/k_sinl.c b/sysdeps/ieee754/ldbl-128ibm/k_sinl.c
index f17c0ae..b12ea13 100644
--- a/sysdeps/ieee754/ldbl-128ibm/k_sinl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/k_sinl.c
@@ -82,7 +82,10 @@ __kernel_sinl(long double x, long double y, int iy)
   long double h, l, z, sin_l, cos_l_m1;
   int64_t ix;
   u_int32_t tix, hix, index;
-  GET_LDOUBLE_MSW64 (ix, x);
+  double xhi, hhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (ix, xhi);
   tix = ((u_int64_t)ix) >> 32;
   tix &= ~0x80000000;			/* tix = |x|'s high 32 bits */
   if (tix < 0x3fc30000)			/* |x| < 0.1484375 */
@@ -132,7 +135,8 @@ __kernel_sinl(long double x, long double y, int iy)
 	case 2: index = (hix - 0x3fc30000) >> 14; break;
 	}
 */
-      SET_LDOUBLE_WORDS64(h, ((u_int64_t)hix) << 32, 0);
+      INSERT_WORDS64 (hhi, ((uint64_t)hix) << 32);
+      h = hhi;
       if (iy)
 	l = (ix < 0 ? -y : y) - (h - x);
       else
diff --git a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
index 4bb49c8..8adb081 100644
--- a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
+++ b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
@@ -192,6 +192,9 @@ default_ldbl_unpack (long double l, double *a, double *aa)
 # define ldbl_unpack default_ldbl_unpack
 #endif
 
+/* Extract high double.  */
+#define ldbl_high(x) ((double) x)
+
 /* Convert a finite long double to canonical form.
    Does not handle +/-Inf properly.  */
 static inline void
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_asinhl.c b/sysdeps/ieee754/ldbl-128ibm/s_asinhl.c
index a833457..63c6edb 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_asinhl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_asinhl.c
@@ -38,7 +38,10 @@ long double __asinhl(long double x)
 {
 	long double t,w;
 	int64_t hx,ix;
-	GET_LDOUBLE_MSW64(hx,x);
+	double xhi;
+
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (hx, xhi);
 	ix = hx&0x7fffffffffffffffLL;
 	if(ix>=0x7ff0000000000000LL) return x+x;	/* x is inf or NaN */
 	if(ix< 0x3e20000000000000LL) {	/* |x|<2**-29 */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_atanl.c b/sysdeps/ieee754/ldbl-128ibm/s_atanl.c
index 2a36d16..41dde23 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_atanl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_atanl.c
@@ -173,23 +173,20 @@ static const long double
 long double
 __atanl (long double x)
 {
-  int k, sign;
+  int32_t k, sign, lx;
   long double t, u, p, q;
-  ieee854_long_double_shape_type s;
+  double xhi;
 
-  s.value = x;
-  k = s.parts32.w0;
-  if (k & 0x80000000)
-    sign = 1;
-  else
-    sign = 0;
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS (k, lx, xhi);
+  sign = k & 0x80000000;
 
   /* Check for IEEE special cases.  */
   k &= 0x7fffffff;
   if (k >= 0x7ff00000)
     {
       /* NaN. */
-      if ((k & 0xfffff) | s.parts32.w1 )
+      if (((k - 0x7ff00000) | lx) != 0)
 	return (x + x);
 
       /* Infinity. */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_cosl.c b/sysdeps/ieee754/ldbl-128ibm/s_cosl.c
index 2314839..54c6cc7 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_cosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_cosl.c
@@ -53,9 +53,11 @@ long double __cosl(long double x)
 {
 	long double y[2],z=0.0L;
 	int64_t n, ix;
+	double xhi;
 
     /* High word of x. */
-	GET_LDOUBLE_MSW64(ix,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ix, xhi);
 
     /* |x| ~< pi/4 */
 	ix &= 0x7fffffffffffffffLL;
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_fabsl.c b/sysdeps/ieee754/ldbl-128ibm/s_fabsl.c
index 99146d8..c801c97 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_fabsl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_fabsl.c
@@ -29,10 +29,16 @@ static char rcsid[] = "$NetBSD: $";
 long double __fabsl(long double x)
 {
 	u_int64_t hx, lx;
-	GET_LDOUBLE_WORDS64(hx,lx,x);
+	double xhi, xlo;
+
+	ldbl_unpack (x, &xhi, &xlo);
+	EXTRACT_WORDS64 (hx, xhi);
+	EXTRACT_WORDS64 (lx, xlo);
 	lx = lx ^ ( hx & 0x8000000000000000LL );
 	hx = hx & 0x7fffffffffffffffLL;
-	SET_LDOUBLE_WORDS64(x,hx,lx);
+	INSERT_WORDS64 (xhi, hx);
+	INSERT_WORDS64 (xlo, lx);
+	x = ldbl_pack (xhi, xlo);
 	return x;
 }
 long_double_symbol (libm, __fabsl, fabsl);
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_finitel.c b/sysdeps/ieee754/ldbl-128ibm/s_finitel.c
index 8edb341..7b4655f 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_finitel.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_finitel.c
@@ -29,10 +29,14 @@ static char rcsid[] = "$NetBSD: $";
 int
 ___finitel (long double x)
 {
-	int64_t hx;
-	GET_LDOUBLE_MSW64(hx,x);
-	return (int)((u_int64_t)((hx&0x7fffffffffffffffLL)
-				 -0x7ff0000000000000LL)>>63);
+  uint64_t hx;
+  double xhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+  hx &= 0x7fffffffffffffffLL;
+  hx -= 0x7ff0000000000000LL;
+  return hx >> 63;
 }
 hidden_ver (___finitel, __finitel)
 weak_alias (___finitel, ____finitel)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c b/sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c
index f4a90b0..90586e8 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_fpclassifyl.c
@@ -46,8 +46,10 @@ ___fpclassifyl (long double x)
 {
   u_int64_t hx, lx;
   int retval = FP_NORMAL;
+  double xhi, xlo;
 
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
+  ldbl_unpack (x, &xhi, &xlo);
+  EXTRACT_WORDS64 (hx, xhi);
   if ((hx & 0x7ff0000000000000ULL) == 0x7ff0000000000000ULL) {
       /* +/-NaN or +/-Inf */
       if (hx & 0x000fffffffffffffULL) {
@@ -65,6 +67,7 @@ ___fpclassifyl (long double x)
 	      retval = FP_NORMAL;
 	  } else {
 	      if ((hx & 0x7ff0000000000000ULL) == 0x0360000000000000ULL) {
+		  EXTRACT_WORDS64 (lx, xlo);
 		  if ((lx & 0x7fffffffffffffff)	/* lower is non-zero */
 		  && ((lx^hx) & 0x8000000000000000ULL)) { /* and sign differs */
 		      /* +/- denormal */
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_isnanl.c b/sysdeps/ieee754/ldbl-128ibm/s_isnanl.c
index 264dec7..d12f1d3 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_isnanl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_isnanl.c
@@ -29,12 +29,14 @@ static char rcsid[] = "$NetBSD: $";
 int
 ___isnanl (long double x)
 {
-	int64_t hx;
-	int64_t lx __attribute__ ((unused));
-	GET_LDOUBLE_WORDS64(hx,lx,x);
-	hx &= 0x7fffffffffffffffLL;
-	hx = 0x7ff0000000000000LL - hx;
-	return (int)((u_int64_t)hx>>63);
+  uint64_t hx;
+  double xhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
+  hx &= 0x7fffffffffffffffLL;
+  hx = 0x7ff0000000000000LL - hx;
+  return (int) (hx >> 63);
 }
 hidden_ver (___isnanl, __isnanl)
 #ifndef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c b/sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c
index 96fab1a..bdd58f8 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_issignalingl.c
@@ -22,10 +22,13 @@
 int
 __issignalingl (long double x)
 {
-  u_int64_t xi;
+  uint64_t xi;
   /* For inspecting NaN status, we only have to look at the first of the pair
      of IEEE 754 64-bit precision numbers.  */
-  GET_LDOUBLE_MSW64 (xi, x);
+  double xhi;
+
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (xi, xhi);
 #ifdef HIGH_ORDER_BIT_IS_SET_FOR_SNAN
 # error untested
   /* We only have to care about the high-order bit of x's significand, because
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_logbl.c b/sysdeps/ieee754/ldbl-128ibm/s_logbl.c
index 6cbfcfa..e140288 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_logbl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_logbl.c
@@ -27,9 +27,10 @@ long double
 __logbl (long double x)
 {
   int64_t hx, rhx;
-  int64_t lx __attribute__ ((unused));
+  double xhi;
 
-  GET_LDOUBLE_WORDS64 (hx, lx, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (hx, xhi);
   hx &= 0x7fffffffffffffffLL;	/* high |x| */
   if (hx == 0)
     return -1.0 / fabs (x);
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_signbitl.c b/sysdeps/ieee754/ldbl-128ibm/s_signbitl.c
index ee4aea6..aecb1fd 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_signbitl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_signbitl.c
@@ -25,8 +25,10 @@ int
 ___signbitl (long double x)
 {
   int64_t e;
+  double xhi;
 
-  GET_LDOUBLE_MSW64 (e, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (e, xhi);
   return e < 0;
 }
 #ifdef IS_IN_libm
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_sincosl.c b/sysdeps/ieee754/ldbl-128ibm/s_sincosl.c
index 3b1e547..a9e2f3d 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_sincosl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_sincosl.c
@@ -27,9 +27,11 @@ void
 __sincosl (long double x, long double *sinx, long double *cosx)
 {
   int64_t ix;
+  double xhi;
 
   /* High word of x. */
-  GET_LDOUBLE_MSW64 (ix, x);
+  xhi = ldbl_high (x);
+  EXTRACT_WORDS64 (ix, xhi);
 
   /* |x| ~< pi/4 */
   ix &= 0x7fffffffffffffffLL;
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_sinl.c b/sysdeps/ieee754/ldbl-128ibm/s_sinl.c
index 6fec16f..087921a 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_sinl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_sinl.c
@@ -53,9 +53,11 @@ long double __sinl(long double x)
 {
 	long double y[2],z=0.0L;
 	int64_t n, ix;
+	double xhi;
 
     /* High word of x. */
-	GET_LDOUBLE_MSW64(ix,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ix, xhi);
 
     /* |x| ~< pi/4 */
 	ix &= 0x7fffffffffffffffLL;
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_tanl.c b/sysdeps/ieee754/ldbl-128ibm/s_tanl.c
index 9967d0c..66b8a06 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_tanl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_tanl.c
@@ -53,9 +53,11 @@ long double __tanl(long double x)
 {
 	long double y[2],z=0.0L;
 	int64_t n, ix;
+	double xhi;
 
     /* High word of x. */
-	GET_LDOUBLE_MSW64(ix,x);
+	xhi = ldbl_high (x);
+	EXTRACT_WORDS64 (ix, xhi);
 
     /* |x| ~< pi/4 */
 	ix &= 0x7fffffffffffffffLL;
diff --git a/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c b/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
index e008ed0..1c82577 100644
--- a/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
+++ b/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
@@ -35,14 +35,14 @@ static const union {
 long double
 __logbl (long double x)
 {
-  double xh, xl;
+  double xh;
   double ret;
 
   if (__builtin_expect (x == 0.0L, 0))
     /* Raise FE_DIVBYZERO and return -HUGE_VAL[LF].  */
     return -1.0L / __builtin_fabsl (x);
 
-  ldbl_unpack (x, &xh, &xl);
+  xh = ldbl_high (x);
   /* ret = x & 0x7ff0000000000000;  */
   asm (
     "xxland %x0,%x1,%x2\n"
@@ -58,9 +58,9 @@ __logbl (long double x)
     {
       /* POSIX specifies that denormal number is treated as
          though it were normalized.  */
-      int64_t lx, hx;
+      int64_t hx;
 
-      GET_LDOUBLE_WORDS64 (hx, lx, x);
+      EXTRACT_WORDS64 (hx, xh);
       return (long double) (-1023 - (__builtin_clzll (hx) - 12));
     }
   /* Test to avoid logb_downward (0.0) == -0.0.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1157aba3b996ce427cc4099d0572dcc2b630fb76

commit 1157aba3b996ce427cc4099d0572dcc2b630fb76
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:21:58 2013 +0930

    PowerPC floating point little-endian [1 of 15]
    http://sourceware.org/ml/libc-alpha/2013-08/msg00081.html
    
    This is the first of a series of patches to ban ieee854_long_double
    and the ieee854_long_double macros when using IBM long double.  union
    ieee854_long_double just isn't correct for IBM long double, especially
    when little-endian, and pretending it is OK has allowed a number of
    bugs to remain undetected in sysdeps/ieee754/ldbl-128ibm/.
    
    This changes the few places in generic code that use it.
    
    	* stdio-common/printf_size.c (__printf_size): Don't use
    	union ieee854_long_double in fpnum union.
    	* stdio-common/printf_fphex.c (__printf_fphex): Likewise.  Use
    	signbit macro to retrieve sign from long double.
    	* stdio-common/printf_fp.c (___printf_fp): Use signbit macro to
    	retrieve sign from long double.
    	* sysdeps/ieee754/ldbl-128ibm/printf_fphex.c: Adjust for fpnum change.
    	* sysdeps/ieee754/ldbl-128/printf_fphex.c: Likewise.
    	* sysdeps/ieee754/ldbl-96/printf_fphex.c: Likewise.
    	* sysdeps/x86_64/fpu/printf_fphex.c: Likewise.
    	* math/test-misc.c (main): Don't use union ieee854_long_double.
    ports/
    	* sysdeps/ia64/fpu/printf_fphex.c: Adjust for fpnum change.

diff --git a/ChangeLog b/ChangeLog
index 55abf03..8bc1698 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,19 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	* stdio-common/printf_size.c (__printf_size): Don't use
+	union ieee854_long_double in fpnum union.
+	* stdio-common/printf_fphex.c (__printf_fphex): Likewise.  Use
+	signbit macro to retrieve sign from long double.
+	* stdio-common/printf_fp.c (___printf_fp): Use signbit macro to
+	retrieve sign from long double.
+	* sysdeps/ieee754/ldbl-128ibm/printf_fphex.c: Adjust for fpnum change.
+	* sysdeps/ieee754/ldbl-128/printf_fphex.c: Likewise.
+	* sysdeps/ieee754/ldbl-96/printf_fphex.c: Likewise.
+	* sysdeps/x86_64/fpu/printf_fphex.c: Likewise.
+	* math/test-misc.c (main): Don't use union ieee854_long_double.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	[BZ #15680]
 	* sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c: Comment fix.
 	* sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
diff --git a/math/test-misc.c b/math/test-misc.c
index 27d673b..f5276eb 100644
--- a/math/test-misc.c
+++ b/math/test-misc.c
@@ -722,300 +722,161 @@ main (void)
 
 #ifndef NO_LONG_DOUBLE
   {
-    union ieee854_long_double v1;
-    union ieee854_long_double v2;
-    long double ld;
+    long double v1, v2;
 
-    v1.d = ld = LDBL_MIN;
-    if (fpclassify (ld) != FP_NORMAL)
+    v1 = LDBL_MIN;
+    if (fpclassify (v1) != FP_NORMAL)
       {
-	printf ("fpclassify (LDBL_MIN) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (LDBL_MIN) failed: %d (%La)\n",
+		fpclassify (v1), v1);
 	result = 1;
       }
-    ld = nextafterl (ld, LDBL_MIN / 2.0);
-    if (fpclassify (ld) != FP_SUBNORMAL)
+    v2 = nextafterl (v1, LDBL_MIN / 2.0);
+    if (fpclassify (v2) != FP_SUBNORMAL)
       {
 	printf ("fpclassify (LDBL_MIN-epsilon) failed: %d (%La)\n",
-		fpclassify (ld), ld);
+		fpclassify (v2), v2);
 	result = 1;
       }
-    v2.d = ld = nextafterl (ld, LDBL_MIN);
-    if (fpclassify (ld) != FP_NORMAL)
+    v2 = nextafterl (v2, LDBL_MIN);
+    if (fpclassify (v2) != FP_NORMAL)
       {
 	printf ("fpclassify (LDBL_MIN-epsilon+epsilon) failed: %d (%La)\n",
-		fpclassify (ld), ld);
+		fpclassify (v2), v2);
 	result = 1;
       }
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
+    if (v1 != v2)
       {
-	printf ("LDBL_MIN: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("LDBL_MIN: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("LDBL_MIN: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
-      {
-	printf ("LDBL_MIN: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("LDBL_MIN-epsilon+epsilon != LDBL_MIN: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = -LDBL_MIN;
-    if (fpclassify (ld) != FP_NORMAL)
+    v1 = -LDBL_MIN;
+    if (fpclassify (v1) != FP_NORMAL)
       {
-	printf ("fpclassify (-LDBL_MIN) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (-LDBL_MIN) failed: %d (%La)\n",
+		fpclassify (v1), v1);
 	result = 1;
       }
-    ld = nextafterl (ld, -LDBL_MIN / 2.0);
-    if (fpclassify (ld) != FP_SUBNORMAL)
+    v2 = nextafterl (v1, -LDBL_MIN / 2.0);
+    if (fpclassify (v2) != FP_SUBNORMAL)
       {
 	printf ("fpclassify (-LDBL_MIN-epsilon) failed: %d (%La)\n",
-		fpclassify (ld), ld);
+		fpclassify (v2), v2);
 	result = 1;
       }
-    v2.d = ld = nextafterl (ld, -LDBL_MIN);
-    if (fpclassify (ld) != FP_NORMAL)
+    v2 = nextafterl (v2, -LDBL_MIN);
+    if (fpclassify (v2) != FP_NORMAL)
       {
 	printf ("fpclassify (-LDBL_MIN-epsilon+epsilon) failed: %d (%La)\n",
-		fpclassify (ld), ld);
+		fpclassify (v2), v2);
 	result = 1;
       }
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("-LDBL_MIN: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("-LDBL_MIN: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
+    if (v1 != v2)
       {
-	printf ("-LDBL_MIN: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
-      {
-	printf ("-LDBL_MIN: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("-LDBL_MIN-epsilon+epsilon != -LDBL_MIN: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    ld = LDBL_MAX;
-    if (fpclassify (ld) != FP_NORMAL)
+    v1 = LDBL_MAX;
+    if (fpclassify (v1) != FP_NORMAL)
       {
-	printf ("fpclassify (LDBL_MAX) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (LDBL_MAX) failed: %d (%La)\n",
+		fpclassify (v1), v1);
 	result = 1;
       }
-    ld = nextafterl (ld, INFINITY);
-    if (fpclassify (ld) != FP_INFINITE)
+    v2 = nextafterl (v1, INFINITY);
+    if (fpclassify (v2) != FP_INFINITE)
       {
-	printf ("fpclassify (LDBL_MAX+epsilon) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (LDBL_MAX+epsilon) failed: %d (%La)\n",
+		fpclassify (v2), v2);
 	result = 1;
       }
 
-    ld = -LDBL_MAX;
-    if (fpclassify (ld) != FP_NORMAL)
+    v1 = -LDBL_MAX;
+    if (fpclassify (v1) != FP_NORMAL)
       {
-	printf ("fpclassify (-LDBL_MAX) failed: %d\n", fpclassify (ld));
+	printf ("fpclassify (-LDBL_MAX) failed: %d (%La)\n",
+		fpclassify (v1), v1);
 	result = 1;
       }
-    ld = nextafterl (ld, -INFINITY);
-    if (fpclassify (ld) != FP_INFINITE)
+    v2 = nextafterl (v1, -INFINITY);
+    if (fpclassify (v2) != FP_INFINITE)
       {
-	printf ("fpclassify (-LDBL_MAX-epsilon) failed: %d\n",
-		fpclassify (ld));
+	printf ("fpclassify (-LDBL_MAX-epsilon) failed: %d (%La)\n",
+		fpclassify (v2), v2);
 	result = 1;
       }
 
-    v1.d = ld = 0.0625;
-    ld = nextafterl (ld, 0.0);
-    v2.d = ld = nextafterl (ld, 1.0);
+    v1 = 0.0625;
+    v2 = nextafterl (v1, 0.0);
+    v2 = nextafterl (v2, 1.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("0.0625L down: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("0.0625L down: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("0.0625L down: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
+    if (v1 != v2)
       {
-	printf ("0.0625L down: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("0.0625L-epsilon+epsilon != 0.0625L: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = 0.0625;
-    ld = nextafterl (ld, 1.0);
-    v2.d = ld = nextafterl (ld, 0.0);
+    v1 = 0.0625;
+    v2 = nextafterl (v1, 1.0);
+    v2 = nextafterl (v2, 0.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("0.0625L up: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("0.0625L up: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("0.0625L up: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
+    if (v1 != v2)
       {
-	printf ("0.0625L up: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("0.0625L+epsilon-epsilon != 0.0625L: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = -0.0625;
-    ld = nextafterl (ld, 0.0);
-    v2.d = ld = nextafterl (ld, -1.0);
+    v1 = -0.0625;
+    v2 = nextafterl (v1, 0.0);
+    v2 = nextafterl (v2, -1.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
+    if (v1 != v2)
       {
-	printf ("-0.0625L up: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("-0.0625L up: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("-0.0625L up: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
-      {
-	printf ("-0.0625L up: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("-0.0625L+epsilon-epsilon != -0.0625L: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = -0.0625;
-    ld = nextafterl (ld, -1.0);
-    v2.d = ld = nextafterl (ld, 0.0);
+    v1 = -0.0625;
+    v2 = nextafterl (v1, -1.0);
+    v2 = nextafterl (v2, 0.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("-0.0625L down: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("-0.0625L down: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
+    if (v1 != v2)
       {
-	printf ("-0.0625L down: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
-	result = 1;
-      }
-    if (v1.ieee.negative != v2.ieee.negative)
-      {
-	printf ("-0.0625L down: negative differs: %d vs %d\n",
-		v1.ieee.negative, v2.ieee.negative);
+	printf ("-0.0625L-epsilon+epsilon != -0.0625L: %La vs %La\n", v2, v1);
 	result = 1;
       }
 
-    v1.d = ld = 0.0;
-    ld = nextafterl (ld, 1.0);
-    v2.d = nextafterl (ld, -1.0);
+    v1 = 0.0;
+    v2 = nextafterl (v1, 1.0);
+    v2 = nextafterl (v2, -1.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("0.0L up: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
-      {
-	printf ("0.0L up: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
+    if (v1 != v2)
       {
-	printf ("0.0L up: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
+	printf ("0.0+epsilon-epsilon != 0.0L: %La vs %La\n", v2, v1);
 	result = 1;
       }
-    if (0 != v2.ieee.negative)
+    if (signbit (v2))
       {
-	printf ("0.0L up: negative differs: 0 vs %d\n",
-		v2.ieee.negative);
+	printf ("0.0+epsilon-epsilon is negative\n");
 	result = 1;
       }
 
-    v1.d = ld = 0.0;
-    ld = nextafterl (ld, -1.0);
-    v2.d = nextafterl (ld, 1.0);
+    v1 = 0.0;
+    v2 = nextafterl (v1, -1.0);
+    v2 = nextafterl (v2, 1.0);
 
-    if (v1.ieee.mantissa0 != v2.ieee.mantissa0)
-      {
-	printf ("0.0L down: mantissa0 differs: %8x vs %8x\n",
-		v1.ieee.mantissa0, v2.ieee.mantissa0);
-	result = 1;
-      }
-    if (v1.ieee.mantissa1 != v2.ieee.mantissa1)
+    if (v1 != v2)
       {
-	printf ("0.0L down: mantissa1 differs: %8x vs %8x\n",
-		v1.ieee.mantissa1, v2.ieee.mantissa1);
-	result = 1;
-      }
-    if (v1.ieee.exponent != v2.ieee.exponent)
-      {
-	printf ("0.0L down: exponent differs: %4x vs %4x\n",
-		v1.ieee.exponent, v2.ieee.exponent);
+	printf ("0.0-epsilon+epsilon != 0.0L: %La vs %La\n", v2, v1);
 	result = 1;
       }
-    if (1 != v2.ieee.negative)
+    if (!signbit (v2))
       {
-	printf ("0.0L down: negative differs: 1 vs %d\n",
-		v2.ieee.negative);
+	printf ("0.0-epsilon+epsilon is positive\n");
 	result = 1;
       }
 
diff --git a/ports/ChangeLog b/ports/ChangeLog
index fcb58cc..5edce77 100644
--- a/ports/ChangeLog
+++ b/ports/ChangeLog
@@ -1,3 +1,7 @@
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/ia64/fpu/printf_fphex.c: Adjust for fpnum change.
+
 2013-01-02  Joseph Myers  <joseph@codesourcery.com>
 
 	* README: Update copyright dates in example.
diff --git a/ports/sysdeps/ia64/fpu/printf_fphex.c b/ports/sysdeps/ia64/fpu/printf_fphex.c
index ca02142..0698cda 100644
--- a/ports/sysdeps/ia64/fpu/printf_fphex.c
+++ b/ports/sysdeps/ia64/fpu/printf_fphex.c
@@ -25,9 +25,11 @@ do {									      \
       /* The "strange" 80 bit format on ia64 has an explicit		      \
 	 leading digit in the 64 bit mantissa.  */			      \
       unsigned long long int num;					      \
+      union ieee854_long_double u;					      \
+      u.d = fpnum.ldbl;							      \
 									      \
-      num = (((unsigned long long int) fpnum.ldbl.ieee.mantissa0) << 32	      \
-	     | fpnum.ldbl.ieee.mantissa1);				      \
+      num = (((unsigned long long int) u.ieee.mantissa0) << 32		      \
+	     | u.ieee.mantissa1);					      \
 									      \
       zero_mantissa = num == 0;						      \
 									      \
@@ -49,8 +51,8 @@ do {									      \
 									      \
       /* We have 3 bits from the mantissa in the leading nibble.	      \
 	 Therefore we are here using `IEEE854_LONG_DOUBLE_BIAS + 3'.  */      \
-      exponent = fpnum.ldbl.ieee.exponent;				      \
-									     \
+      exponent = u.ieee.exponent;					      \
+									      \
       if (exponent == 0)						      \
 	{								      \
 	  if (zero_mantissa)						      \
diff --git a/stdio-common/printf_fp.c b/stdio-common/printf_fp.c
index e20eab6..2b93e6c 100644
--- a/stdio-common/printf_fp.c
+++ b/stdio-common/printf_fp.c
@@ -332,8 +332,7 @@ ___printf_fp (FILE *fp,
       int res;
       if (__isnanl (fpnum.ldbl))
 	{
-	  union ieee854_long_double u = { .d = fpnum.ldbl };
-	  is_neg = u.ieee.negative != 0;
+	  is_neg = signbit (fpnum.ldbl);
 	  if (isupper (info->spec))
 	    {
 	      special = "NAN";
diff --git a/stdio-common/printf_fphex.c b/stdio-common/printf_fphex.c
index 3da2eec..50b6fbf 100644
--- a/stdio-common/printf_fphex.c
+++ b/stdio-common/printf_fphex.c
@@ -93,7 +93,7 @@ __printf_fphex (FILE *fp,
   union
     {
       union ieee754_double dbl;
-      union ieee854_long_double ldbl;
+      long double ldbl;
     }
   fpnum;
 
@@ -162,12 +162,11 @@ __printf_fphex (FILE *fp,
 #ifndef __NO_LONG_DOUBLE_MATH
   if (info->is_long_double && sizeof (long double) > sizeof (double))
     {
-      fpnum.ldbl.d = *(const long double *) args[0];
+      fpnum.ldbl = *(const long double *) args[0];
 
       /* Check for special values: not a number or infinity.  */
-      if (__isnanl (fpnum.ldbl.d))
+      if (__isnanl (fpnum.ldbl))
 	{
-	  negative = fpnum.ldbl.ieee.negative != 0;
 	  if (isupper (info->spec))
 	    {
 	      special = "NAN";
@@ -181,8 +180,7 @@ __printf_fphex (FILE *fp,
 	}
       else
 	{
-	  int res = __isinfl (fpnum.ldbl.d);
-	  if (res)
+	  if (__isinfl (fpnum.ldbl))
 	    {
 	      if (isupper (info->spec))
 		{
@@ -194,11 +192,9 @@ __printf_fphex (FILE *fp,
 		  special = "inf";
 		  wspecial = L"inf";
 		}
-	      negative = res < 0;
 	    }
-	  else
-	    negative = signbit (fpnum.ldbl.d);
 	}
+      negative = signbit (fpnum.ldbl);
     }
   else
 #endif	/* no long double */
diff --git a/stdio-common/printf_size.c b/stdio-common/printf_size.c
index 2c496e5..dfb3a53 100644
--- a/stdio-common/printf_size.c
+++ b/stdio-common/printf_size.c
@@ -103,7 +103,7 @@ __printf_size (FILE *fp, const struct printf_info *info,
   union
     {
       union ieee754_double dbl;
-      union ieee854_long_double ldbl;
+      long double ldbl;
     }
   fpnum;
   const void *ptr = &fpnum;
@@ -123,25 +123,25 @@ __printf_size (FILE *fp, const struct printf_info *info,
 #ifndef __NO_LONG_DOUBLE_MATH
   if (info->is_long_double && sizeof (long double) > sizeof (double))
     {
-      fpnum.ldbl.d = *(const long double *) args[0];
+      fpnum.ldbl = *(const long double *) args[0];
 
       /* Check for special values: not a number or infinity.  */
-      if (__isnanl (fpnum.ldbl.d))
+      if (__isnanl (fpnum.ldbl))
 	{
 	  special = "nan";
 	  wspecial = L"nan";
 	  // fpnum_sign = 0;	Already zero
 	}
-      else if ((res = __isinfl (fpnum.ldbl.d)))
+      else if ((res = __isinfl (fpnum.ldbl)))
 	{
 	  fpnum_sign = res;
 	  special = "inf";
 	  wspecial = L"inf";
 	}
       else
-	while (fpnum.ldbl.d >= divisor && tag[1] != '\0')
+	while (fpnum.ldbl >= divisor && tag[1] != '\0')
 	  {
-	    fpnum.ldbl.d /= divisor;
+	    fpnum.ldbl /= divisor;
 	    ++tag;
 	  }
     }
diff --git a/sysdeps/ieee754/ldbl-128/printf_fphex.c b/sysdeps/ieee754/ldbl-128/printf_fphex.c
index c9e09a4..e82228a 100644
--- a/sysdeps/ieee754/ldbl-128/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-128/printf_fphex.c
@@ -24,13 +24,15 @@ do {									      \
 	 digits we use only the implicit digits for the number before	      \
 	 the decimal point.  */						      \
       unsigned long long int num0, num1;				      \
+      union ieee854_long_double u;					      \
+      u.d = fpnum.ldbl;							      \
 									      \
       assert (sizeof (long double) == 16);				      \
 									      \
-      num0 = (((unsigned long long int) fpnum.ldbl.ieee.mantissa0) << 32      \
-	     | fpnum.ldbl.ieee.mantissa1);				      \
-      num1 = (((unsigned long long int) fpnum.ldbl.ieee.mantissa2) << 32      \
-	     | fpnum.ldbl.ieee.mantissa3);				      \
+      num0 = (((unsigned long long int) u.ieee.mantissa0) << 32		      \
+	     | u.ieee.mantissa1);					      \
+      num1 = (((unsigned long long int) u.ieee.mantissa2) << 32		      \
+	     | u.ieee.mantissa3);					      \
 									      \
       zero_mantissa = (num0|num1) == 0;					      \
 									      \
@@ -75,9 +77,9 @@ do {									      \
 	  *--wnumstr = L'0';						      \
 	}								      \
 									      \
-      leading = fpnum.ldbl.ieee.exponent == 0 ? '0' : '1';		      \
+      leading = u.ieee.exponent == 0 ? '0' : '1';			      \
 									      \
-      exponent = fpnum.ldbl.ieee.exponent;				      \
+      exponent = u.ieee.exponent;					      \
 									      \
       if (exponent == 0)						      \
 	{								      \
diff --git a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
index 453c2be..e0ec422 100644
--- a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
@@ -27,14 +27,14 @@ do {									      \
       unsigned long long hi, lo;					      \
       int ediff;							      \
       union ibm_extended_long_double u;					      \
-      u.ld = fpnum.ldbl.d;						      \
+      u.ld = fpnum.ldbl;						      \
 									      \
       assert (sizeof (long double) == 16);				      \
 									      \
       lo = ((long long)u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;  \
       hi = ((long long)u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;  \
       lo <<= 7; /* pre-shift lo to match ieee854.  */			      \
-      /* If the lower double is not a denomal or zero then set the hidden     \
+      /* If the lower double is not a denormal or zero then set the hidden    \
 	 53rd bit.  */							      \
       if (u.d[1].ieee.exponent != 0)					      \
 	lo |= (1ULL << (52 + 7));					      \
diff --git a/sysdeps/ieee754/ldbl-96/printf_fphex.c b/sysdeps/ieee754/ldbl-96/printf_fphex.c
index f356a48..715c93b 100644
--- a/sysdeps/ieee754/ldbl-96/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-96/printf_fphex.c
@@ -25,11 +25,13 @@ do {									      \
       /* The "strange" 80 bit format on ix86 and m68k has an explicit	      \
 	 leading digit in the 64 bit mantissa.  */			      \
       unsigned long long int num;					      \
+      union ieee854_long_double u;					      \
+      u.d = fpnum.ldbl;							      \
 									      \
       assert (sizeof (long double) == 12);				      \
 									      \
-      num = (((unsigned long long int) fpnum.ldbl.ieee.mantissa0) << 32	      \
-	     | fpnum.ldbl.ieee.mantissa1);				      \
+      num = (((unsigned long long int) u.ieee.mantissa0) << 32		      \
+	     | u.ieee.mantissa1);					      \
 									      \
       zero_mantissa = num == 0;						      \
 									      \
@@ -62,7 +64,7 @@ do {									      \
 									      \
       /* We have 3 bits from the mantissa in the leading nibble.	      \
 	 Therefore we are here using `IEEE854_LONG_DOUBLE_BIAS + 3'.  */      \
-      exponent = fpnum.ldbl.ieee.exponent;				      \
+      exponent = u.ieee.exponent;					      \
 									      \
       if (exponent == 0)						      \
 	{								      \
diff --git a/sysdeps/x86_64/fpu/printf_fphex.c b/sysdeps/x86_64/fpu/printf_fphex.c
index c85d1f7..be55f9c 100644
--- a/sysdeps/x86_64/fpu/printf_fphex.c
+++ b/sysdeps/x86_64/fpu/printf_fphex.c
@@ -25,10 +25,11 @@ do {									      \
       /* The "strange" 80 bit format on ix86 and m68k has an explicit	      \
 	 leading digit in the 64 bit mantissa.  */			      \
       unsigned long long int num;					      \
+      union ieee854_long_double u;					      \
+      u.d = fpnum.ldbl;							      \
 									      \
-									      \
-      num = (((unsigned long long int) fpnum.ldbl.ieee.mantissa0) << 32	      \
-	     | fpnum.ldbl.ieee.mantissa1);				      \
+      num = (((unsigned long long int) u.ieee.mantissa0) << 32		      \
+	     | u.ieee.mantissa1);					      \
 									      \
       zero_mantissa = num == 0;						      \
 									      \
@@ -61,7 +62,7 @@ do {									      \
 									      \
       /* We have 3 bits from the mantissa in the leading nibble.	      \
 	 Therefore we are here using `IEEE854_LONG_DOUBLE_BIAS + 3'.  */      \
-      exponent = fpnum.ldbl.ieee.exponent;				      \
+      exponent = u.ieee.exponent;					      \
 									      \
       if (exponent == 0)						      \
 	{								      \

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4f36342de3e83319bdf70e4c0442a5d0df33334f

commit 4f36342de3e83319bdf70e4c0442a5d0df33334f
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:19:44 2013 +0930

    Fix for [BZ #15680] IBM long double inaccuracy
    http://sourceware.org/ml/libc-alpha/2013-06/msg00919.html
    
    I discovered a number of places where denormals and other corner cases
    were being handled wrongly.
    
    - printf_fphex.c: Testing for the low double exponent being zero is
    unnecessary.  If the difference in exponents is less than 53 then the
    high double exponent must be nearing the low end of its range, and the
    low double exponent hit rock bottom.
    
    - ldbl2mpn.c: A denormal (ie. exponent of zero) value is treated as
    if the exponent was one, so shift mantissa left by one.  Code handling
    normalisation of the low double mantissa lacked a test for shift count
    greater than bits in type being shifted, and lacked anything to handle
    the case where the difference in exponents is less than 53 as in
    printf_fphex.c.
    
    - math_ldbl.h (ldbl_extract_mantissa): Same as above, but worse, with
    code testing for exponent > 1 for some reason, probably a typo for >= 1.
    
    - math_ldbl.h (ldbl_insert_mantissa): Round the high double as per
    mpn2ldbl.c (hi is odd or explicit mantissas non-zero) so that the
    number we return won't change when applying ldbl_canonicalize().
    Add missing overflow checks and normalisation of high mantissa.
    Correct misleading comment: "The hidden bit of the lo mantissa is
    zero" is not always true as can be seen from the code rounding the hi
    mantissa.  Also by inspection, lzcount can never be less than zero so
    remove that test.  Lastly, masking bitfields to their widths can be
    left to the compiler.
    
    - mpn2ldbl.c: The overflow checks here on rounding of high double were
    just plain wrong.  Incrementing the exponent must be accompanied by a
    shift right of the mantissa to keep the value unchanged.  Above notes
    for ldbl_insert_mantissa are also relevant.
    
    	[BZ #15680]
    	* sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c: Comment fix.
    	* sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
    	(PRINT_FPHEX_LONG_DOUBLE): Tidy code by moving -53 into ediff
    	calculation.  Remove unnecessary test for denormal exponent.
    	* sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c (__mpn_extract_long_double):
    	Correct handling of denormals.  Avoid undefined shift behaviour.
    	Correct normalisation of low mantissa when low double is denormal.
    	* sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
    	(ldbl_extract_mantissa): Likewise.  Comment.  Use uint64_t* for hi64.
    	(ldbl_insert_mantissa): Make both hi64 and lo64 parms uint64_t.
    	Correct normalisation of low mantissa.  Test for overflow of high
    	mantissa and normalise.
    	(ldbl_nearbyint): Use more readable constant for two52.
    	* sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
    	(__mpn_construct_long_double): Fix test for overflow of high
    	mantissa and correct normalisation.  Avoid undefined shift.

diff --git a/ChangeLog b/ChangeLog
index 272795b..55abf03 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,25 @@
 2013-10-04  Alan Modra  <amodra@gmail.com>
 
+	[BZ #15680]
+	* sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c: Comment fix.
+	* sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
+	(PRINT_FPHEX_LONG_DOUBLE): Tidy code by moving -53 into ediff
+	calculation.  Remove unnecessary test for denormal exponent.
+	* sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c (__mpn_extract_long_double):
+	Correct handling of denormals.  Avoid undefined shift behaviour.
+	Correct normalisation of low mantissa when low double is denormal.
+	* sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
+	(ldbl_extract_mantissa): Likewise.  Comment.  Use uint64_t* for hi64.
+	(ldbl_insert_mantissa): Make both hi64 and lo64 parms uint64_t.
+	Correct normalisation of low mantissa.  Test for overflow of high
+	mantissa and normalise.
+	(ldbl_nearbyint): Use more readable constant for two52.
+	* sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
+	(__mpn_construct_long_double): Fix test for overflow of high
+	mantissa and correct normalisation.  Avoid undefined shift.
+
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
 	* sysdeps/ieee754/ldbl-128ibm/ieee754.h
 	(union ibm_extended_long_double): Define as an array of ieee754_double.
 	(IBM_EXTENDED_LONG_DOUBLE_BIAS): Delete.
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c b/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
index 6a72d6a..8885def 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_rem_pio2l.c
@@ -243,7 +243,7 @@ int32_t __ieee754_rem_pio2l(long double x, long double *y)
      We split the 113 bits of the mantissa into 5 24bit integers
      stored in a double array.  */
   /* Make the IBM extended format 105 bit mantissa look like the ieee854 112
-     bit mantissa so the next operatation will give the correct result.  */
+     bit mantissa so the next operation will give the correct result.  */
   ldbl_extract_mantissa (&ixd, &lxd, &exp, x);
   exp = exp - 23;
   /* This is faster than doing this in floating point, because we
diff --git a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
index 5149ba1..e46fde7 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
+++ b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
@@ -36,6 +36,7 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
   union ibm_extended_long_double u;
   unsigned long long hi, lo;
   int ediff;
+
   u.ld = value;
 
   *is_neg = u.d[0].ieee.negative;
@@ -43,27 +44,36 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
 
   lo = ((long long) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
   hi = ((long long) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
-  /* If the lower double is not a denomal or zero then set the hidden
+
+  /* If the lower double is not a denormal or zero then set the hidden
      53rd bit.  */
-  if (u.d[1].ieee.exponent > 0)
-    {
-      lo |= 1LL << 52;
+  if (u.d[1].ieee.exponent != 0)
+    lo |= 1ULL << 52;
+  else
+    lo = lo << 1;
 
-      /* The lower double is normalized separately from the upper.  We may
-	 need to adjust the lower manitissa to reflect this.  */
-      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;
-      if (ediff > 53)
-	lo = lo >> (ediff-53);
+  /* The lower double is normalized separately from the upper.  We may
+     need to adjust the lower mantissa to reflect this.  */
+  ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent - 53;
+  if (ediff > 0)
+    {
+      if (ediff < 64)
+	lo = lo >> ediff;
+      else
+	lo = 0;
     }
+  else if (ediff < 0)
+    lo = lo << -ediff;
+
   /* The high double may be rounded and the low double reflects the
      difference between the long double and the rounded high double
      value.  This is indicated by a differnce between the signs of the
      high and low doubles.  */
-  if ((u.d[0].ieee.negative != u.d[1].ieee.negative)
-      && ((u.d[1].ieee.exponent != 0) && (lo != 0L)))
+  if (u.d[0].ieee.negative != u.d[1].ieee.negative
+      && lo != 0)
     {
       lo = (1ULL << 53) - lo;
-      if (hi == 0LL)
+      if (hi == 0)
 	{
 	  /* we have a borrow from the hidden bit, so shift left 1.  */
 	  hi = 0x0ffffffffffffeLL | (lo >> 51);
diff --git a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
index 3036f14..4bb49c8 100644
--- a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
+++ b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
@@ -6,6 +6,10 @@
 #include <ieee754.h>
 #include <stdint.h>
 
+/* To suit our callers we return *hi64 and *lo64 as if they came from
+   an ieee854 112 bit mantissa, that is, 48 bits in *hi64 (plus one
+   implicit bit) and 64 bits in *lo64.  */
+
 static inline void
 ldbl_extract_mantissa (int64_t *hi64, uint64_t *lo64, int *exp, long double x)
 {
@@ -14,77 +18,119 @@ ldbl_extract_mantissa (int64_t *hi64, uint64_t *lo64, int *exp, long double x)
      the number before the decimal point and the second implicit bit
      as bit 53 of the mantissa.  */
   uint64_t hi, lo;
-  int ediff;
   union ibm_extended_long_double u;
+
   u.ld = x;
   *exp = u.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
 
   lo = ((uint64_t) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
   hi = ((uint64_t) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
-  /* If the lower double is not a denomal or zero then set the hidden
-     53rd bit.  */
-  if (u.d[1].ieee.exponent > 0x001)
-    {
-      lo |= (1ULL << 52);
-      lo = lo << 7; /* pre-shift lo to match ieee854.  */
-      /* The lower double is normalized separately from the upper.  We
-	 may need to adjust the lower manitissa to reflect this.  */
-      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;
-      if (ediff > 53)
-	lo = lo >> (ediff-53);
-      hi |= (1ULL << 52);
-    }
 
-  if ((u.d[0].ieee.negative != u.d[1].ieee.negative)
-      && ((u.d[1].ieee.exponent != 0) && (lo != 0LL)))
+  if (u.d[0].ieee.exponent != 0)
     {
-      hi--;
-      lo = (1ULL << 60) - lo;
-      if (hi < (1ULL << 52))
+      int ediff;
+
+      /* If not a denormal or zero then we have an implicit 53rd bit.  */
+      hi |= (uint64_t) 1 << 52;
+
+      if (u.d[1].ieee.exponent != 0)
+	lo |= (uint64_t) 1 << 52;
+      else
+	/* A denormal is to be interpreted as having a biased exponent
+	   of 1.  */
+	lo = lo << 1;
+
+      /* We are going to shift 4 bits out of hi later, because we only
+	 want 48 bits in *hi64.  That means we want 60 bits in lo, but
+	 we currently only have 53.  Shift the value up.  */
+      lo = lo << 7;
+
+      /* The lower double is normalized separately from the upper.
+	 We may need to adjust the lower mantissa to reflect this.
+	 The difference between the exponents can be larger than 53
+	 when the low double is much less than 1ULP of the upper
+	 (in which case there are significant bits, all 0's or all
+	 1's, between the two significands).  The difference between
+	 the exponents can be less than 53 when the upper double
+	 exponent is nearing its minimum value (in which case the low
+	 double is denormal, i.e. has an exponent of zero).  */
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent - 53;
+      if (ediff > 0)
 	{
-	  /* we have a borrow from the hidden bit, so shift left 1.  */
-	  hi = (hi << 1) | (lo >> 59);
-	  lo = 0xfffffffffffffffLL & (lo << 1);
-	  *exp = *exp - 1;
+	  if (ediff < 64)
+	    lo = lo >> ediff;
+	  else
+	    lo = 0;
+	}
+      else if (ediff < 0)
+	lo = lo << -ediff;
+
+      if (u.d[0].ieee.negative != u.d[1].ieee.negative
+	  && lo != 0)
+	{
+	  hi--;
+	  lo = ((uint64_t) 1 << 60) - lo;
+	  if (hi < (uint64_t) 1 << 52)
+	    {
+	      /* We have a borrow from the hidden bit, so shift left 1.  */
+	      hi = (hi << 1) | (lo >> 59);
+	      lo = (((uint64_t) 1 << 60) - 1) & (lo << 1);
+	      *exp = *exp - 1;
+	    }
 	}
     }
+  else
+    /* If the larger magnitude double is denormal then the smaller
+       one must be zero.  */
+    hi = hi << 1;
+
   *lo64 = (hi << 60) | lo;
   *hi64 = hi >> 4;
 }
 
 static inline long double
-ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
+ldbl_insert_mantissa (int sign, int exp, int64_t hi64, uint64_t lo64)
 {
   union ibm_extended_long_double u;
-  unsigned long hidden2, lzcount;
-  unsigned long long hi, lo;
+  int expnt2;
+  uint64_t hi, lo;
 
   u.d[0].ieee.negative = sign;
   u.d[1].ieee.negative = sign;
   u.d[0].ieee.exponent = exp + IEEE754_DOUBLE_BIAS;
-  u.d[1].ieee.exponent = exp-53 + IEEE754_DOUBLE_BIAS;
+  u.d[1].ieee.exponent = 0;
+  expnt2 = exp - 53 + IEEE754_DOUBLE_BIAS;
+
   /* Expect 113 bits (112 bits + hidden) right justified in two longs.
      The low order 53 bits (52 + hidden) go into the lower double */
-  lo = (lo64 >> 7)& ((1ULL << 53) - 1);
-  hidden2 = (lo64 >> 59) &  1ULL;
+  lo = (lo64 >> 7) & (((uint64_t) 1 << 53) - 1);
   /* The high order 53 bits (52 + hidden) go into the upper double */
-  hi = (lo64 >> 60) & ((1ULL << 11) - 1);
-  hi |= (hi64 << 4);
+  hi = lo64 >> 60;
+  hi |= hi64 << 4;
 
-  if (lo != 0LL)
+  if (lo != 0)
     {
-      /* hidden2 bit of low double controls rounding of the high double.
-	 If hidden2 is '1' then round up hi and adjust lo (2nd mantissa)
+      int lzcount;
+
+      /* hidden bit of low double controls rounding of the high double.
+	 If hidden is '1' and either the explicit mantissa is non-zero
+	 or hi is odd, then round up hi and adjust lo (2nd mantissa)
 	 plus change the sign of the low double to compensate.  */
-      if (hidden2)
+      if ((lo & ((uint64_t) 1 << 52)) != 0
+	  && ((hi & 1) != 0 || (lo & (((uint64_t) 1 << 52) - 1)) != 0))
 	{
 	  hi++;
+	  if ((hi & ((uint64_t) 1 << 53)) != 0)
+	    {
+	      hi = hi >> 1;
+	      u.d[0].ieee.exponent++;
+	    }
 	  u.d[1].ieee.negative = !sign;
-	  lo = (1ULL << 53) - lo;
+	  lo = ((uint64_t) 1 << 53) - lo;
 	}
-      /* The hidden bit of the lo mantissa is zero so we need to
-	 normalize the it for the low double.  Shift it left until the
-	 hidden bit is '1' then adjust the 2nd exponent accordingly.  */
+
+      /* Normalize the low double.  Shift the mantissa left until
+	 the hidden bit is '1' and adjust the exponent accordingly.  */
 
       if (sizeof (lo) == sizeof (long))
 	lzcount = __builtin_clzl (lo);
@@ -92,34 +138,30 @@ ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
 	lzcount = __builtin_clzl ((long) (lo >> 32));
       else
 	lzcount = __builtin_clzl ((long) lo) + 32;
-      lzcount = lzcount - 11;
-      if (lzcount > 0)
+      lzcount = lzcount - (64 - 53);
+      lo <<= lzcount;
+      expnt2 -= lzcount;
+
+      if (expnt2 >= 1)
+	/* Not denormal.  */
+	u.d[1].ieee.exponent = expnt2;
+      else
 	{
-	  int expnt2 = u.d[1].ieee.exponent - lzcount;
-	  if (expnt2 >= 1)
-	    {
-	      /* Not denormal.  Normalize and set low exponent.  */
-	      lo = lo << lzcount;
-	      u.d[1].ieee.exponent = expnt2;
-	    }
+	  /* Is denormal.  Note that biased exponent of 0 is treated
+	     as if it was 1, hence the extra shift.  */
+	  if (expnt2 > -53)
+	    lo >>= 1 - expnt2;
 	  else
-	    {
-	      /* Is denormal.  */
-	      lo = lo << (lzcount + expnt2);
-	      u.d[1].ieee.exponent = 0;
-	    }
+	    lo = 0;
 	}
     }
   else
-    {
-      u.d[1].ieee.negative = 0;
-      u.d[1].ieee.exponent = 0;
-    }
+    u.d[1].ieee.negative = 0;
 
-  u.d[1].ieee.mantissa1 = lo & ((1ULL << 32) - 1);
-  u.d[1].ieee.mantissa0 = (lo >> 32) & ((1ULL << 20) - 1);
-  u.d[0].ieee.mantissa1 = hi & ((1ULL << 32) - 1);
-  u.d[0].ieee.mantissa0 = (hi >> 32) & ((1ULL << 20) - 1);
+  u.d[1].ieee.mantissa1 = lo;
+  u.d[1].ieee.mantissa0 = lo >> 32;
+  u.d[0].ieee.mantissa1 = hi;
+  u.d[0].ieee.mantissa0 = hi >> 32;
   return u.ld;
 }
 
@@ -163,13 +205,13 @@ ldbl_canonicalize (double *a, double *aa)
   *aa = xl;
 }
 
-/* Simple inline nearbyint (double) function .
+/* Simple inline nearbyint (double) function.
    Only works in the default rounding mode
    but is useful in long double rounding functions.  */
 static inline double
 ldbl_nearbyint (double a)
 {
-  double two52 = 0x10000000000000LL;
+  double two52 = 0x1p52;
 
   if (__builtin_expect ((__builtin_fabs (a) < two52), 1))
     {
diff --git a/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c b/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
index c3e42f2..c96852d 100644
--- a/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
@@ -69,9 +69,9 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
       else
 	lzcount = __builtin_clzl ((long) val) + 32;
       if (hi)
-	lzcount = lzcount - 11;
+	lzcount = lzcount - (64 - 53);
       else
-	lzcount = lzcount + 42;
+	lzcount = lzcount + 53 - (64 - 53);
 
       if (lzcount > u.d[0].ieee.exponent)
 	{
@@ -97,29 +97,27 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
 	}
     }
 
-  if (lo != 0L)
+  if (lo != 0)
     {
-      /* hidden2 bit of low double controls rounding of the high double.
-	 If hidden2 is '1' and either the explicit mantissa is non-zero
+      /* hidden bit of low double controls rounding of the high double.
+	 If hidden is '1' and either the explicit mantissa is non-zero
 	 or hi is odd, then round up hi and adjust lo (2nd mantissa)
 	 plus change the sign of the low double to compensate.  */
       if ((lo & (1LL << 52)) != 0
-	  && ((hi & 1) != 0 || (lo & ((1LL << 52) - 1))))
+	  && ((hi & 1) != 0 || (lo & ((1LL << 52) - 1)) != 0))
 	{
 	  hi++;
-	  if ((hi & ((1LL << 52) - 1)) == 0)
+	  if ((hi & (1LL << 53)) != 0)
 	    {
-	      if ((hi & (1LL << 53)) != 0)
-		hi -= 1LL << 52;
+	      hi >>= 1;
 	      u.d[0].ieee.exponent++;
 	    }
 	  u.d[1].ieee.negative = !sign;
 	  lo = (1LL << 53) - lo;
 	}
 
-      /* The hidden bit of the lo mantissa is zero so we need to normalize
-	 it for the low double.  Shift it left until the hidden bit is '1'
-	 then adjust the 2nd exponent accordingly.  */
+      /* Normalize the low double.  Shift the mantissa left until
+	 the hidden bit is '1' and adjust the exponent accordingly.  */
 
       if (sizeof (lo) == sizeof (long))
 	lzcount = __builtin_clzl (lo);
@@ -127,24 +125,24 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
 	lzcount = __builtin_clzl ((long) (lo >> 32));
       else
 	lzcount = __builtin_clzl ((long) lo) + 32;
-      lzcount = lzcount - 11;
-      if (lzcount > 0)
-	{
-	  lo = lo << lzcount;
-	  exponent2 = exponent2 - lzcount;
-	}
+      lzcount = lzcount - (64 - 53);
+      lo <<= lzcount;
+      exponent2 -= lzcount;
+
       if (exponent2 > 0)
 	u.d[1].ieee.exponent = exponent2;
-      else
+      else if (exponent2 > -53)
 	lo >>= 1 - exponent2;
+      else
+	lo = 0;
     }
   else
     u.d[1].ieee.negative = 0;
 
-  u.d[1].ieee.mantissa1 = lo & 0xffffffffLL;
-  u.d[1].ieee.mantissa0 = (lo >> 32) & 0xfffff;
-  u.d[0].ieee.mantissa1 = hi & 0xffffffffLL;
-  u.d[0].ieee.mantissa0 = (hi >> 32) & ((1LL << (LDBL_MANT_DIG - 86)) - 1);
+  u.d[1].ieee.mantissa1 = lo;
+  u.d[1].ieee.mantissa0 = lo >> 32;
+  u.d[0].ieee.mantissa1 = hi;
+  u.d[0].ieee.mantissa0 = hi >> 32;
 
   return u.ld;
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
index 3fe8333..453c2be 100644
--- a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
@@ -42,15 +42,15 @@ do {									      \
 	lo <<= 1;							      \
       /* The lower double is normalized separately from the upper.  We	      \
 	 may need to adjust the lower manitissa to reflect this.  */	      \
-      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;		      \
-      if (ediff > 53 + 63)						      \
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent - 53;		      \
+      if (ediff > 63)							      \
 	lo = 0;								      \
-      else if (ediff > 53)						      \
-	lo = lo >> (ediff - 53);					      \
-      else if (u.d[1].ieee.exponent == 0 && ediff < 53)			      \
-	lo = lo << (53 - ediff);					      \
+      else if (ediff > 0)						      \
+	lo = lo >> ediff;						      \
+      else if (ediff < 0)						      \
+	lo = lo << -ediff;						      \
       if (u.d[0].ieee.negative != u.d[1].ieee.negative			      \
-	  && (u.d[1].ieee.exponent != 0 || lo != 0L))			      \
+	  && lo != 0)							      \
 	{								      \
 	  lo = (1ULL << 60) - lo;					      \
 	  if (hi == 0L)							      \

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5d8d74696a867006ada0ec1586a255a4093422ba

commit 5d8d74696a867006ada0ec1586a255a4093422ba
Author: Alan Modra <amodra@gmail.com>
Date:   Sat Aug 17 18:12:56 2013 +0930

    IBM long double mechanical changes to support little-endian
    http://sourceware.org/ml/libc-alpha/2013-07/msg00001.html
    
    This patch starts the process of supporting powerpc64 little-endian
    long double in glibc.  IBM long double is an array of two ieee
    doubles, so making union ibm_extended_long_double reflect this fact is
    the correct way to access fields of the doubles.
    
    	* sysdeps/ieee754/ldbl-128ibm/ieee754.h
    	(union ibm_extended_long_double): Define as an array of ieee754_double.
    	(IBM_EXTENDED_LONG_DOUBLE_BIAS): Delete.
    	* sysdeps/ieee754/ldbl-128ibm/printf_fphex.c: Update all references
    	to ibm_extended_long_double and IBM_EXTENDED_LONG_DOUBLE_BIAS.
    	* sysdeps/ieee754/ldbl-128ibm/e_exp10l.c: Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/e_expl.c: Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c: Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/math_ldbl.h: Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c: Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c: Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/strtold_l.c: Likewise.
    	* sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c: Likewise.

diff --git a/ChangeLog b/ChangeLog
index 9d04543..272795b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/ieee754/ldbl-128ibm/ieee754.h
+	(union ibm_extended_long_double): Define as an array of ieee754_double.
+	(IBM_EXTENDED_LONG_DOUBLE_BIAS): Delete.
+	* sysdeps/ieee754/ldbl-128ibm/printf_fphex.c: Update all references
+	to ibm_extended_long_double and IBM_EXTENDED_LONG_DOUBLE_BIAS.
+	* sysdeps/ieee754/ldbl-128ibm/e_exp10l.c: Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/e_expl.c: Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c: Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/math_ldbl.h: Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c: Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c: Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/strtold_l.c: Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c: Likewise.
+
 2013-09-09  Allan McRae  <allan@archlinux.org>
 
 	[BZ #15893]
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_exp10l.c b/sysdeps/ieee754/ldbl-128ibm/e_exp10l.c
index 1eaf2fe..49121ca 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_exp10l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_exp10l.c
@@ -36,9 +36,9 @@ __ieee754_exp10l (long double arg)
   else if (arg > LDBL_MAX_10_EXP + 1)
     return LDBL_MAX * LDBL_MAX;
 
-  u.d = arg;
-  arg_high = u.dd[0];
-  arg_low = u.dd[1];
+  u.ld = arg;
+  arg_high = u.d[0].d;
+  arg_low = u.d[1].d;
   exp_high = arg_high * log10_high;
   exp_low = arg_high * log10_low + arg_low * M_LN10l;
   return __ieee754_expl (exp_high) * __ieee754_expl (exp_low);
diff --git a/sysdeps/ieee754/ldbl-128ibm/e_expl.c b/sysdeps/ieee754/ldbl-128ibm/e_expl.c
index b599f36..9d3d1e3 100644
--- a/sysdeps/ieee754/ldbl-128ibm/e_expl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/e_expl.c
@@ -162,39 +162,39 @@ __ieee754_expl (long double x)
       x = x + xl;
 
       /* Compute ex2 = 2^n_0 e^(argtable[tval1]) e^(argtable[tval2]).  */
-      ex2_u.d = __expl_table[T_EXPL_RES1 + tval1]
-		* __expl_table[T_EXPL_RES2 + tval2];
+      ex2_u.ld = (__expl_table[T_EXPL_RES1 + tval1]
+		  * __expl_table[T_EXPL_RES2 + tval2]);
       n_i = (int)n;
       /* 'unsafe' is 1 iff n_1 != 0.  */
       unsafe = fabsl(n_i) >= -LDBL_MIN_EXP - 1;
-      ex2_u.ieee.exponent += n_i >> unsafe;
+      ex2_u.d[0].ieee.exponent += n_i >> unsafe;
       /* Fortunately, there are no subnormal lowpart doubles in
 	 __expl_table, only normal values and zeros.
 	 But after scaling it can be subnormal.  */
-      exponent2 = ex2_u.ieee.exponent2 + (n_i >> unsafe);
-      if (ex2_u.ieee.exponent2 == 0)
-	/* assert ((ex2_u.ieee.mantissa2|ex2_u.ieee.mantissa3) == 0) */;
+      exponent2 = ex2_u.d[1].ieee.exponent + (n_i >> unsafe);
+      if (ex2_u.d[1].ieee.exponent == 0)
+	/* assert ((ex2_u.d[1].ieee.mantissa0|ex2_u.d[1].ieee.mantissa1) == 0) */;
       else if (exponent2 > 0)
-	ex2_u.ieee.exponent2 = exponent2;
+	ex2_u.d[1].ieee.exponent = exponent2;
       else if (exponent2 <= -54)
 	{
-	  ex2_u.ieee.exponent2 = 0;
-	  ex2_u.ieee.mantissa2 = 0;
-	  ex2_u.ieee.mantissa3 = 0;
+	  ex2_u.d[1].ieee.exponent = 0;
+	  ex2_u.d[1].ieee.mantissa0 = 0;
+	  ex2_u.d[1].ieee.mantissa1 = 0;
 	}
       else
 	{
 	  static const double
 	    two54 = 1.80143985094819840000e+16, /* 4350000000000000 */
 	    twom54 = 5.55111512312578270212e-17; /* 3C90000000000000 */
-	  ex2_u.dd[1] *= two54;
-	  ex2_u.ieee.exponent2 += n_i >> unsafe;
-	  ex2_u.dd[1] *= twom54;
+	  ex2_u.d[1].d *= two54;
+	  ex2_u.d[1].ieee.exponent += n_i >> unsafe;
+	  ex2_u.d[1].d *= twom54;
 	}
 
       /* Compute scale = 2^n_1.  */
-      scale_u.d = 1.0L;
-      scale_u.ieee.exponent += n_i - (n_i >> unsafe);
+      scale_u.ld = 1.0L;
+      scale_u.d[0].ieee.exponent += n_i - (n_i >> unsafe);
 
       /* Approximate e^x2 - 1, using a seventh-degree polynomial,
 	 with maximum error in [-2^-16-2^-53,2^-16+2^-53]
@@ -204,7 +204,7 @@ __ieee754_expl (long double x)
       /* Return result.  */
       fesetenv (&oldenv);
 
-      result = x22 * ex2_u.d + ex2_u.d;
+      result = x22 * ex2_u.ld + ex2_u.ld;
 
       /* Now we can test whether the result is ultimate or if we are unsure.
 	 In the later case we should probably call a mpn based routine to give
@@ -238,7 +238,7 @@ __ieee754_expl (long double x)
       if (!unsafe)
 	return result;
       else
-	return result * scale_u.d;
+	return result * scale_u.ld;
     }
   /* Exceptional cases:  */
   else if (isless (x, himark))
diff --git a/sysdeps/ieee754/ldbl-128ibm/ieee754.h b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
index 9e94f53..0778b1f 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ieee754.h
+++ b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
@@ -179,49 +179,10 @@ union ieee854_long_double
 
 union ibm_extended_long_double
   {
-    long double d;
-    double dd[2];
-
-    /* This is the IBM extended format long double.  */
-    struct
-      { /* Big endian.  There is no other.  */
-
-	unsigned int negative:1;
-	unsigned int exponent:11;
-	/* Together Mantissa0-3 comprise the mantissa.  */
-	unsigned int mantissa0:20;
-	unsigned int mantissa1:32;
-
-	unsigned int negative2:1;
-	unsigned int exponent2:11;
-	/* There is an implied 1 here?  */
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa2:20;
-	unsigned int mantissa3:32;
-      } ieee;
-
-    /* This format makes it easier to see if a NaN is a signalling NaN.  */
-    struct
-      { /* Big endian.  There is no other.  */
-
-	unsigned int negative:1;
-	unsigned int exponent:11;
-	unsigned int quiet_nan:1;
-	/* Together Mantissa0-3 comprise the mantissa.  */
-	unsigned int mantissa0:19;
-	unsigned int mantissa1:32;
-
-	unsigned int negative2:1;
-	unsigned int exponent2:11;
-	/* There is an implied 1 here?  */
-	/* Together these comprise the mantissa.  */
-	unsigned int mantissa2:20;
-	unsigned int mantissa3:32;
-      } ieee_nan;
+    long double ld;
+    union ieee754_double d[2];
    };
 
-#define IBM_EXTENDED_LONG_DOUBLE_BIAS 0x3ff /* Added to exponent.  */
-
 __END_DECLS
 
 #endif /* ieee754.h */
diff --git a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
index 00e44b8..5149ba1 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
+++ b/sysdeps/ieee754/ldbl-128ibm/ldbl2mpn.c
@@ -36,22 +36,22 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
   union ibm_extended_long_double u;
   unsigned long long hi, lo;
   int ediff;
-  u.d = value;
+  u.ld = value;
 
-  *is_neg = u.ieee.negative;
-  *expt = (int) u.ieee.exponent - IBM_EXTENDED_LONG_DOUBLE_BIAS;
+  *is_neg = u.d[0].ieee.negative;
+  *expt = (int) u.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
 
-  lo = ((long long) u.ieee.mantissa2 << 32) | u.ieee.mantissa3;
-  hi = ((long long) u.ieee.mantissa0 << 32) | u.ieee.mantissa1;
+  lo = ((long long) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
+  hi = ((long long) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
   /* If the lower double is not a denomal or zero then set the hidden
      53rd bit.  */
-  if (u.ieee.exponent2 > 0)
+  if (u.d[1].ieee.exponent > 0)
     {
       lo |= 1LL << 52;
 
       /* The lower double is normalized separately from the upper.  We may
 	 need to adjust the lower manitissa to reflect this.  */
-      ediff = u.ieee.exponent - u.ieee.exponent2;
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;
       if (ediff > 53)
 	lo = lo >> (ediff-53);
     }
@@ -59,8 +59,8 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
      difference between the long double and the rounded high double
      value.  This is indicated by a differnce between the signs of the
      high and low doubles.  */
-  if ((u.ieee.negative != u.ieee.negative2)
-      && ((u.ieee.exponent2 != 0) && (lo != 0L)))
+  if ((u.d[0].ieee.negative != u.d[1].ieee.negative)
+      && ((u.d[1].ieee.exponent != 0) && (lo != 0L)))
     {
       lo = (1ULL << 53) - lo;
       if (hi == 0LL)
@@ -92,7 +92,7 @@ __mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size,
 #define NUM_LEADING_ZEROS (BITS_PER_MP_LIMB \
 			   - (LDBL_MANT_DIG - ((N - 1) * BITS_PER_MP_LIMB)))
 
-  if (u.ieee.exponent == 0)
+  if (u.d[0].ieee.exponent == 0)
     {
       /* A biased exponent of zero is a special case.
 	 Either it is a zero or it is a denormal number.  */
diff --git a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
index 046293e..3036f14 100644
--- a/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
+++ b/sysdeps/ieee754/ldbl-128ibm/math_ldbl.h
@@ -15,28 +15,28 @@ ldbl_extract_mantissa (int64_t *hi64, uint64_t *lo64, int *exp, long double x)
      as bit 53 of the mantissa.  */
   uint64_t hi, lo;
   int ediff;
-  union ibm_extended_long_double eldbl;
-  eldbl.d = x;
-  *exp = eldbl.ieee.exponent - IBM_EXTENDED_LONG_DOUBLE_BIAS;
+  union ibm_extended_long_double u;
+  u.ld = x;
+  *exp = u.d[0].ieee.exponent - IEEE754_DOUBLE_BIAS;
 
-  lo = ((int64_t)eldbl.ieee.mantissa2 << 32) | eldbl.ieee.mantissa3;
-  hi = ((int64_t)eldbl.ieee.mantissa0 << 32) | eldbl.ieee.mantissa1;
+  lo = ((uint64_t) u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;
+  hi = ((uint64_t) u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;
   /* If the lower double is not a denomal or zero then set the hidden
      53rd bit.  */
-  if (eldbl.ieee.exponent2 > 0x001)
+  if (u.d[1].ieee.exponent > 0x001)
     {
       lo |= (1ULL << 52);
       lo = lo << 7; /* pre-shift lo to match ieee854.  */
       /* The lower double is normalized separately from the upper.  We
 	 may need to adjust the lower manitissa to reflect this.  */
-      ediff = eldbl.ieee.exponent - eldbl.ieee.exponent2;
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;
       if (ediff > 53)
 	lo = lo >> (ediff-53);
       hi |= (1ULL << 52);
     }
 
-  if ((eldbl.ieee.negative != eldbl.ieee.negative2)
-      && ((eldbl.ieee.exponent2 != 0) && (lo != 0LL)))
+  if ((u.d[0].ieee.negative != u.d[1].ieee.negative)
+      && ((u.d[1].ieee.exponent != 0) && (lo != 0LL)))
     {
       hi--;
       lo = (1ULL << 60) - lo;
@@ -59,10 +59,10 @@ ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
   unsigned long hidden2, lzcount;
   unsigned long long hi, lo;
 
-  u.ieee.negative = sign;
-  u.ieee.negative2 = sign;
-  u.ieee.exponent = exp + IBM_EXTENDED_LONG_DOUBLE_BIAS;
-  u.ieee.exponent2 = exp-53 + IBM_EXTENDED_LONG_DOUBLE_BIAS;
+  u.d[0].ieee.negative = sign;
+  u.d[1].ieee.negative = sign;
+  u.d[0].ieee.exponent = exp + IEEE754_DOUBLE_BIAS;
+  u.d[1].ieee.exponent = exp-53 + IEEE754_DOUBLE_BIAS;
   /* Expect 113 bits (112 bits + hidden) right justified in two longs.
      The low order 53 bits (52 + hidden) go into the lower double */
   lo = (lo64 >> 7)& ((1ULL << 53) - 1);
@@ -79,7 +79,7 @@ ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
       if (hidden2)
 	{
 	  hi++;
-	  u.ieee.negative2 = !sign;
+	  u.d[1].ieee.negative = !sign;
 	  lo = (1ULL << 53) - lo;
 	}
       /* The hidden bit of the lo mantissa is zero so we need to
@@ -95,32 +95,32 @@ ldbl_insert_mantissa (int sign, int exp, int64_t hi64, u_int64_t lo64)
       lzcount = lzcount - 11;
       if (lzcount > 0)
 	{
-	  int expnt2 = u.ieee.exponent2 - lzcount;
+	  int expnt2 = u.d[1].ieee.exponent - lzcount;
 	  if (expnt2 >= 1)
 	    {
 	      /* Not denormal.  Normalize and set low exponent.  */
 	      lo = lo << lzcount;
-	      u.ieee.exponent2 = expnt2;
+	      u.d[1].ieee.exponent = expnt2;
 	    }
 	  else
 	    {
 	      /* Is denormal.  */
 	      lo = lo << (lzcount + expnt2);
-	      u.ieee.exponent2 = 0;
+	      u.d[1].ieee.exponent = 0;
 	    }
 	}
     }
   else
     {
-      u.ieee.negative2 = 0;
-      u.ieee.exponent2 = 0;
+      u.d[1].ieee.negative = 0;
+      u.d[1].ieee.exponent = 0;
     }
 
-  u.ieee.mantissa3 = lo & ((1ULL << 32) - 1);
-  u.ieee.mantissa2 = (lo >> 32) & ((1ULL << 20) - 1);
-  u.ieee.mantissa1 = hi & ((1ULL << 32) - 1);
-  u.ieee.mantissa0 = (hi >> 32) & ((1ULL << 20) - 1);
-  return u.d;
+  u.d[1].ieee.mantissa1 = lo & ((1ULL << 32) - 1);
+  u.d[1].ieee.mantissa0 = (lo >> 32) & ((1ULL << 20) - 1);
+  u.d[0].ieee.mantissa1 = hi & ((1ULL << 32) - 1);
+  u.d[0].ieee.mantissa0 = (hi >> 32) & ((1ULL << 20) - 1);
+  return u.ld;
 }
 
 /* Handy utility functions to pack/unpack/cononicalize and find the nearbyint
@@ -129,18 +129,18 @@ static inline long double
 default_ldbl_pack (double a, double aa)
 {
   union ibm_extended_long_double u;
-  u.dd[0] = a;
-  u.dd[1] = aa;
-  return u.d;
+  u.d[0].d = a;
+  u.d[1].d = aa;
+  return u.ld;
 }
 
 static inline void
 default_ldbl_unpack (long double l, double *a, double *aa)
 {
   union ibm_extended_long_double u;
-  u.d = l;
-  *a = u.dd[0];
-  *aa = u.dd[1];
+  u.ld = l;
+  *a = u.d[0].d;
+  *aa = u.d[1].d;
 }
 
 #ifndef ldbl_pack
diff --git a/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c b/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
index 3df42c5..c3e42f2 100644
--- a/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/mpn2ldbl.c
@@ -33,11 +33,11 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
   unsigned long long hi, lo;
   int exponent2;
 
-  u.ieee.negative = sign;
-  u.ieee.negative2 = sign;
-  u.ieee.exponent = expt + IBM_EXTENDED_LONG_DOUBLE_BIAS;
-  u.ieee.exponent2 = 0;
-  exponent2 = expt - 53 + IBM_EXTENDED_LONG_DOUBLE_BIAS;
+  u.d[0].ieee.negative = sign;
+  u.d[1].ieee.negative = sign;
+  u.d[0].ieee.exponent = expt + IEEE754_DOUBLE_BIAS;
+  u.d[1].ieee.exponent = 0;
+  exponent2 = expt - 53 + IEEE754_DOUBLE_BIAS;
 
 #if BITS_PER_MP_LIMB == 32
   /* The low order 53 bits (52 + hidden) go into the lower double */
@@ -73,15 +73,15 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
       else
 	lzcount = lzcount + 42;
 
-      if (lzcount > u.ieee.exponent)
+      if (lzcount > u.d[0].ieee.exponent)
 	{
-	  lzcount = u.ieee.exponent;
-	  u.ieee.exponent = 0;
+	  lzcount = u.d[0].ieee.exponent;
+	  u.d[0].ieee.exponent = 0;
 	  exponent2 -= lzcount;
 	}
       else
 	{
-	  u.ieee.exponent -= (lzcount - 1);
+	  u.d[0].ieee.exponent -= (lzcount - 1);
 	  exponent2 -= (lzcount - 1);
 	}
 
@@ -111,9 +111,9 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
 	    {
 	      if ((hi & (1LL << 53)) != 0)
 		hi -= 1LL << 52;
-	      u.ieee.exponent++;
+	      u.d[0].ieee.exponent++;
 	    }
-	  u.ieee.negative2 = !sign;
+	  u.d[1].ieee.negative = !sign;
 	  lo = (1LL << 53) - lo;
 	}
 
@@ -134,17 +134,17 @@ __mpn_construct_long_double (mp_srcptr frac_ptr, int expt, int sign)
 	  exponent2 = exponent2 - lzcount;
 	}
       if (exponent2 > 0)
-	u.ieee.exponent2 = exponent2;
+	u.d[1].ieee.exponent = exponent2;
       else
 	lo >>= 1 - exponent2;
     }
   else
-    u.ieee.negative2 = 0;
+    u.d[1].ieee.negative = 0;
 
-  u.ieee.mantissa3 = lo & 0xffffffffLL;
-  u.ieee.mantissa2 = (lo >> 32) & 0xfffff;
-  u.ieee.mantissa1 = hi & 0xffffffffLL;
-  u.ieee.mantissa0 = (hi >> 32) & ((1LL << (LDBL_MANT_DIG - 86)) - 1);
+  u.d[1].ieee.mantissa1 = lo & 0xffffffffLL;
+  u.d[1].ieee.mantissa0 = (lo >> 32) & 0xfffff;
+  u.d[0].ieee.mantissa1 = hi & 0xffffffffLL;
+  u.d[0].ieee.mantissa0 = (hi >> 32) & ((1LL << (LDBL_MANT_DIG - 86)) - 1);
 
-  return u.d;
+  return u.ld;
 }
diff --git a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
index 247dc20..3fe8333 100644
--- a/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
+++ b/sysdeps/ieee754/ldbl-128ibm/printf_fphex.c
@@ -26,31 +26,31 @@ do {									      \
       unsigned long long int num0, num1;				      \
       unsigned long long hi, lo;					      \
       int ediff;							      \
-      union ibm_extended_long_double eldbl;				      \
-      eldbl.d = fpnum.ldbl.d;						      \
+      union ibm_extended_long_double u;					      \
+      u.ld = fpnum.ldbl.d;						      \
 									      \
       assert (sizeof (long double) == 16);				      \
 									      \
-      lo = ((long long)eldbl.ieee.mantissa2 << 32) | eldbl.ieee.mantissa3;    \
-      hi = ((long long)eldbl.ieee.mantissa0 << 32) | eldbl.ieee.mantissa1;    \
+      lo = ((long long)u.d[1].ieee.mantissa0 << 32) | u.d[1].ieee.mantissa1;  \
+      hi = ((long long)u.d[0].ieee.mantissa0 << 32) | u.d[0].ieee.mantissa1;  \
       lo <<= 7; /* pre-shift lo to match ieee854.  */			      \
       /* If the lower double is not a denomal or zero then set the hidden     \
 	 53rd bit.  */							      \
-      if (eldbl.ieee.exponent2 != 0)					      \
+      if (u.d[1].ieee.exponent != 0)					      \
 	lo |= (1ULL << (52 + 7));					      \
       else								      \
 	lo <<= 1;							      \
       /* The lower double is normalized separately from the upper.  We	      \
 	 may need to adjust the lower manitissa to reflect this.  */	      \
-      ediff = eldbl.ieee.exponent - eldbl.ieee.exponent2;		      \
+      ediff = u.d[0].ieee.exponent - u.d[1].ieee.exponent;		      \
       if (ediff > 53 + 63)						      \
 	lo = 0;								      \
       else if (ediff > 53)						      \
 	lo = lo >> (ediff - 53);					      \
-      else if (eldbl.ieee.exponent2 == 0 && ediff < 53)			      \
+      else if (u.d[1].ieee.exponent == 0 && ediff < 53)			      \
 	lo = lo << (53 - ediff);					      \
-      if (eldbl.ieee.negative != eldbl.ieee.negative2			      \
-	  && (eldbl.ieee.exponent2 != 0 || lo != 0L))			      \
+      if (u.d[0].ieee.negative != u.d[1].ieee.negative			      \
+	  && (u.d[1].ieee.exponent != 0 || lo != 0L))			      \
 	{								      \
 	  lo = (1ULL << 60) - lo;					      \
 	  if (hi == 0L)							      \
@@ -58,7 +58,7 @@ do {									      \
 	      /* we have a borrow from the hidden bit, so shift left 1.  */   \
 	      hi = 0xffffffffffffeLL | (lo >> 59);			      \
 	      lo = 0xfffffffffffffffLL & (lo << 1);			      \
-	      eldbl.ieee.exponent--;					      \
+	      u.d[0].ieee.exponent--;					      \
 	    }								      \
 	  else								      \
 	    hi--;							      \
@@ -109,9 +109,9 @@ do {									      \
 	  *--wnumstr = L'0';						      \
 	}								      \
 									      \
-      leading = eldbl.ieee.exponent == 0 ? '0' : '1';			      \
+      leading = u.d[0].ieee.exponent == 0 ? '0' : '1';			      \
 									      \
-      exponent = eldbl.ieee.exponent;					      \
+      exponent = u.d[0].ieee.exponent;					      \
 									      \
       if (exponent == 0)						      \
 	{								      \
@@ -121,18 +121,18 @@ do {									      \
 	    {								      \
 	      /* This is a denormalized number.  */			      \
 	      expnegative = 1;						      \
-	      exponent = IBM_EXTENDED_LONG_DOUBLE_BIAS - 1;		      \
+	      exponent = IEEE754_DOUBLE_BIAS - 1;			      \
 	    }								      \
 	}								      \
-      else if (exponent >= IBM_EXTENDED_LONG_DOUBLE_BIAS)		      \
+      else if (exponent >= IEEE754_DOUBLE_BIAS)				      \
 	{								      \
 	  expnegative = 0;						      \
-	  exponent -= IBM_EXTENDED_LONG_DOUBLE_BIAS;			      \
+	  exponent -= IEEE754_DOUBLE_BIAS;				      \
 	}								      \
       else								      \
 	{								      \
 	  expnegative = 1;						      \
-	  exponent = -(exponent - IBM_EXTENDED_LONG_DOUBLE_BIAS);	      \
+	  exponent = -(exponent - IEEE754_DOUBLE_BIAS);			      \
 	}								      \
 } while (0)
 
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c b/sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c
index bfcd110..92ced52 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_nearbyintl.c
@@ -34,11 +34,11 @@ __nearbyintl (long double x)
   fenv_t env;
   static const long double TWO52 = 4503599627370496.0L;
   union ibm_extended_long_double u;
-  u.d = x;
+  u.ld = x;
 
-  if (fabs (u.dd[0]) < TWO52)
+  if (fabs (u.d[0].d) < TWO52)
     {
-      double high = u.dd[0];
+      double high = u.d[0].d;
       feholdexcept (&env);
       if (high > 0.0)
 	{
@@ -52,13 +52,13 @@ __nearbyintl (long double x)
 	  high += TWO52;
           if (high == 0.0) high = -0.0;
 	}
-      u.dd[0] = high;
-      u.dd[1] = 0.0;
-      math_force_eval (u.dd[0]);
-      math_force_eval (u.dd[1]);
+      u.d[0].d = high;
+      u.d[1].d = 0.0;
+      math_force_eval (u.d[0]);
+      math_force_eval (u.d[1]);
       fesetenv (&env);
     }
-  else if (fabs (u.dd[1]) < TWO52 && u.dd[1] != 0.0)
+  else if (fabs (u.d[1].d) < TWO52 && u.d[1].d != 0.0)
     {
       double high, low, tau;
       /* In this case we have to round the low double and handle any
@@ -67,57 +67,57 @@ __nearbyintl (long double x)
          may already be rounded and the low double may have the
          opposite sign to compensate.  */
       feholdexcept (&env);
-      if (u.dd[0] > 0.0)
+      if (u.d[0].d > 0.0)
 	{
-	  if (u.dd[1] > 0.0)
+	  if (u.d[1].d > 0.0)
 	    {
 	      /* If the high/low doubles are the same sign then simply
 	         round the low double.  */
-	      high = u.dd[0];
-	      low = u.dd[1];
+	      high = u.d[0].d;
+	      low = u.d[1].d;
 	    }
-	  else if (u.dd[1] < 0.0)
+	  else if (u.d[1].d < 0.0)
 	    {
 	      /* Else the high double is pre rounded and we need to
 	         adjust for that.  */
 
-	      tau = __nextafter (u.dd[0], 0.0);
-	      tau = (u.dd[0] - tau) * 2.0;
-	      high = u.dd[0] - tau;
-	      low = u.dd[1] + tau;
+	      tau = __nextafter (u.d[0].d, 0.0);
+	      tau = (u.d[0].d - tau) * 2.0;
+	      high = u.d[0].d - tau;
+	      low = u.d[1].d + tau;
 	    }
 	  low += TWO52;
 	  low -= TWO52;
 	}
-      else if (u.dd[0] < 0.0)
+      else if (u.d[0].d < 0.0)
 	{
-	  if (u.dd[1] < 0.0)
+	  if (u.d[1].d < 0.0)
 	    {
 	      /* If the high/low doubles are the same sign then simply
 	         round the low double.  */
-	      high = u.dd[0];
-	      low = u.dd[1];
+	      high = u.d[0].d;
+	      low = u.d[1].d;
 	    }
-	  else if (u.dd[1] > 0.0)
+	  else if (u.d[1].d > 0.0)
 	    {
 	      /* Else the high double is pre rounded and we need to
 	         adjust for that.  */
-	      tau = __nextafter (u.dd[0], 0.0);
-	      tau = (u.dd[0] - tau) * 2.0;
-	      high = u.dd[0] - tau;
-	      low = u.dd[1] + tau;
+	      tau = __nextafter (u.d[0].d, 0.0);
+	      tau = (u.d[0].d - tau) * 2.0;
+	      high = u.d[0].d - tau;
+	      low = u.d[1].d + tau;
 	    }
 	  low = TWO52 - low;
 	  low = -(low - TWO52);
 	}
-      u.dd[0] = high + low;
-      u.dd[1] = high - u.dd[0] + low;
-      math_force_eval (u.dd[0]);
-      math_force_eval (u.dd[1]);
+      u.d[0].d = high + low;
+      u.d[1].d = high - u.d[0].d + low;
+      math_force_eval (u.d[0]);
+      math_force_eval (u.d[1]);
       fesetenv (&env);
     }
 
-  return u.d;
+  return u.ld;
 }
 
 long_double_symbol (libm, __nearbyintl, nearbyintl);
diff --git a/sysdeps/ieee754/ldbl-128ibm/strtold_l.c b/sysdeps/ieee754/ldbl-128ibm/strtold_l.c
index 04e3288..93a80c5 100644
--- a/sysdeps/ieee754/ldbl-128ibm/strtold_l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/strtold_l.c
@@ -43,11 +43,11 @@ libc_hidden_proto (STRTOF)
 #define FLOAT_HUGE_VAL	HUGE_VALL
 # define SET_MANTISSA(flt, mant) \
   do { union ibm_extended_long_double u;				      \
-       u.d = (flt);							      \
-       u.ieee_nan.mantissa0 = (mant) >> 32;				      \
-       u.ieee_nan.mantissa1 = (mant);					      \
-       if ((u.ieee.mantissa0 | u.ieee.mantissa1) != 0)			      \
-	 (flt) = u.d;							      \
+       u.ld = (flt);							      \
+       u.d[0].ieee_nan.mantissa0 = (mant) >> 32;			      \
+       u.d[0].ieee_nan.mantissa1 = (mant);				      \
+       if ((u.d[0].ieee.mantissa0 | u.d[0].ieee.mantissa1) != 0)	      \
+	 (flt) = u.ld;							      \
   } while (0)
 
 #include <strtod_l.c>
diff --git a/sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c b/sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c
index ed0d4a5..06dcf02 100644
--- a/sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/x2y2m1l.c
@@ -89,23 +89,23 @@ __x2y2m1l (long double x, long double y)
   double vals[12];
   SET_RESTORE_ROUND (FE_TONEAREST);
   union ibm_extended_long_double xu, yu;
-  xu.d = x;
-  yu.d = y;
-  if (fabs (xu.dd[1]) < 0x1p-500)
-    xu.dd[1] = 0.0;
-  if (fabs (yu.dd[1]) < 0x1p-500)
-    yu.dd[1] = 0.0;
-  mul_split (&vals[1], &vals[0], xu.dd[0], xu.dd[0]);
-  mul_split (&vals[3], &vals[2], xu.dd[0], xu.dd[1]);
+  xu.ld = x;
+  yu.ld = y;
+  if (fabs (xu.d[1].d) < 0x1p-500)
+    xu.d[1].d = 0.0;
+  if (fabs (yu.d[1].d) < 0x1p-500)
+    yu.d[1].d = 0.0;
+  mul_split (&vals[1], &vals[0], xu.d[0].d, xu.d[0].d);
+  mul_split (&vals[3], &vals[2], xu.d[0].d, xu.d[1].d);
   vals[2] *= 2.0;
   vals[3] *= 2.0;
-  mul_split (&vals[5], &vals[4], xu.dd[1], xu.dd[1]);
-  mul_split (&vals[7], &vals[6], yu.dd[0], yu.dd[0]);
-  mul_split (&vals[9], &vals[8], yu.dd[0], yu.dd[1]);
+  mul_split (&vals[5], &vals[4], xu.d[1].d, xu.d[1].d);
+  mul_split (&vals[7], &vals[6], yu.d[0].d, yu.d[0].d);
+  mul_split (&vals[9], &vals[8], yu.d[0].d, yu.d[1].d);
   vals[8] *= 2.0;
   vals[9] *= 2.0;
-  mul_split (&vals[11], &vals[10], yu.dd[1], yu.dd[1]);
-  if (xu.dd[0] >= 0.75)
+  mul_split (&vals[11], &vals[10], yu.d[1].d, yu.d[1].d);
+  if (xu.d[0].d >= 0.75)
     vals[1] -= 1.0;
   else
     {

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=00db055f5a63816c6c44e2cd7d17252140410fab

commit 00db055f5a63816c6c44e2cd7d17252140410fab
Author: Allan McRae <allan@archlinux.org>
Date:   Mon Sep 9 22:52:58 2013 +1000

    Fix memory leak in stdlib/isomac.c

diff --git a/ChangeLog b/ChangeLog
index 7d32373..9d04543 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2013-09-09  Allan McRae  <allan@archlinux.org>
 
+	[BZ #15893]
+	* stdlib/isomac.c (get_null_defines): Fix memory leak.
+
 	[BZ #15892]
 	* libio/memstream.c (open_memstream): Fix memory leak.
 	* libio/wmemstream.c (open_wmemstream): Likewise.
diff --git a/NEWS b/NEWS
index e879adb..b25af8e 100644
--- a/NEWS
+++ b/NEWS
@@ -33,7 +33,8 @@ Version 2.18
   15423, 15424, 15426, 15427, 15429, 15431, 15432, 15441, 15442, 15448,
   15465, 15480, 15485, 15488, 15490, 15492, 15493, 15497, 15506, 15522,
   15529, 15532, 15536, 15553, 15577, 15583, 15618, 15627, 15631, 15654,
-  15655, 15666, 15667, 15674, 15711, 15755, 15759, 15797, 15892, 15895.
+  15655, 15666, 15667, 15674, 15711, 15755, 15759, 15797, 15892, 15893,
+  15895.
 
 * CVE-2013-2207 Incorrectly granting access to another user's pseudo-terminal
   has been fixed by disabling the use of pt_chown (Bugzilla #15755).
diff --git a/stdlib/isomac.c b/stdlib/isomac.c
index 2c9009b..621b515 100644
--- a/stdlib/isomac.c
+++ b/stdlib/isomac.c
@@ -263,6 +263,7 @@ get_null_defines (void)
   if (system (command))
     {
       puts ("system() returned nonzero");
+      free (command);
       return NULL;
     }
   free (command);

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2140e39e13ce3def5dc0e1037f3d7d2be0ab1fd1

commit 2140e39e13ce3def5dc0e1037f3d7d2be0ab1fd1
Author: Allan McRae <allan@archlinux.org>
Date:   Mon Sep 9 22:50:41 2013 +1000

    Fix memory leaks in libio on allocation failure

diff --git a/ChangeLog b/ChangeLog
index 61494ca..7d32373 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2013-09-09  Allan McRae  <allan@archlinux.org>
+
+	[BZ #15892]
+	* libio/memstream.c (open_memstream): Fix memory leak.
+	* libio/wmemstream.c (open_wmemstream): Likewise.
+
+	[BZ #15895]
+	* nscd/netgroupcache.c: Fix nesting of ifdefs.
+
 2013-09-05  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc32/power7/memrchr.S (__memrchr): Fix invalid
diff --git a/NEWS b/NEWS
index a71d9fa..e879adb 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,8 @@ Version 2.18.1
 
 * The following bugs are resolved with this release:
 
-  14155, 14699, 15532, 15427, 15522, 15797, 15909, 15996, 16150.
+  14155, 14699, 15532, 15427, 15522, 15797, 15892, 15895, 15909, 15996,
+  16150.
 
 * CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
   to the d_name member of struct dirent, or omit the terminating NUL
@@ -21,18 +22,18 @@ Version 2.18
 
   2546, 2560, 5159, 6809, 7006, 10060, 10062, 10283, 10357, 10686, 11120,
   11561, 12310, 12387, 12492, 12515, 12723, 13550, 13889, 13951, 13988,
-  14142, 14176, 14200, 14256, 14280, 14293, 14317, 14327, 14478, 14496,
-  14582, 14686, 14812, 14888, 14894, 14907, 14908, 14909, 14920, 14952,
-  14964, 14981, 14982, 14985, 14991, 14994, 14996, 15000, 15003, 15006,
-  15007, 15014, 15020, 15022, 15023, 15036, 15054, 15055, 15062, 15078,
-  15084, 15085, 15086, 15100, 15160, 15214, 15221, 15232, 15234, 15283,
-  15285, 15287, 15304, 15305, 15307, 15309, 15327, 15330, 15335, 15336,
-  15337, 15339, 15342, 15346, 15359, 15361, 15366, 15380, 15381, 15394,
-  15395, 15405, 15406, 15409, 15416, 15418, 15419, 15423, 15424, 15426,
-  15429, 15431, 15432, 15441, 15442, 15448, 15465, 15480, 15485, 15488,
-  15490, 15492, 15493, 15497, 15506, 15529, 15536, 15553, 15577, 15583,
-  15618, 15627, 15631, 15654, 15655, 15666, 15667, 15674, 15711, 15755,
-  15759, 15985.
+  14142, 14155, 14176, 14200, 14256, 14280, 14293, 14317, 14327, 14478,
+  14496, 14582, 14686, 14699, 14812, 14888, 14894, 14907, 14908, 14909,
+  14920, 14952, 14964, 14981, 14982, 14985, 14991, 14994, 14996, 15000,
+  15003, 15006, 15007, 15014, 15020, 15022, 15023, 15036, 15054, 15055,
+  15062, 15078, 15084, 15085, 15086, 15100, 15160, 15214, 15221, 15232,
+  15234, 15283, 15285, 15287, 15304, 15305, 15307, 15309, 15327, 15330,
+  15331, 15335, 15336, 15337, 15339, 15342, 15346, 15359, 15361, 15366,
+  15380, 15381, 15394, 15395, 15405, 15406, 15409, 15416, 15418, 15419,
+  15423, 15424, 15426, 15427, 15429, 15431, 15432, 15441, 15442, 15448,
+  15465, 15480, 15485, 15488, 15490, 15492, 15493, 15497, 15506, 15522,
+  15529, 15532, 15536, 15553, 15577, 15583, 15618, 15627, 15631, 15654,
+  15655, 15666, 15667, 15674, 15711, 15755, 15759, 15797, 15892, 15895.
 
 * CVE-2013-2207 Incorrectly granting access to another user's pseudo-terminal
   has been fixed by disabling the use of pt_chown (Bugzilla #15755).
diff --git a/libio/memstream.c b/libio/memstream.c
index 34534e2..3cb1bd7 100644
--- a/libio/memstream.c
+++ b/libio/memstream.c
@@ -84,7 +84,10 @@ open_memstream (bufloc, sizeloc)
 
   buf = calloc (1, _IO_BUFSIZ);
   if (buf == NULL)
-    return NULL;
+    {
+      free (new_f);
+      return NULL;
+    }
   _IO_init (&new_f->fp._sf._sbf._f, 0);
   _IO_JUMPS ((struct _IO_FILE_plus *) &new_f->fp._sf._sbf) = &_IO_mem_jumps;
   _IO_str_init_static_internal (&new_f->fp._sf, buf, _IO_BUFSIZ, buf);
diff --git a/libio/wmemstream.c b/libio/wmemstream.c
index 65738d4..fd7fe44 100644
--- a/libio/wmemstream.c
+++ b/libio/wmemstream.c
@@ -85,8 +85,10 @@ open_wmemstream (bufloc, sizeloc)
 
   buf = calloc (1, _IO_BUFSIZ);
   if (buf == NULL)
-    return NULL;
-
+    {
+      free (new_f);
+      return NULL;
+    }
   _IO_no_init (&new_f->fp._sf._sbf._f, 0, 0, &new_f->wd, &_IO_wmem_jumps);
   _IO_fwide (&new_f->fp._sf._sbf._f, 1);
   _IO_wstr_init_static (&new_f->fp._sf._sbf._f, buf,

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9013984941a46d8c361299f531ea79527fba4d1c

commit 9013984941a46d8c361299f531ea79527fba4d1c
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Thu Sep 5 09:32:56 2013 -0500

    PowerPC: fix POWER7 memrchr for some large inputs

diff --git a/ChangeLog b/ChangeLog
index 5ff3fa4..61494ca 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2013-09-05  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+	* sysdeps/powerpc/powerpc32/power7/memrchr.S (__memrchr): Fix invalid
+	memory access for final bytes in some large inputs.
+	* sysdeps/powerpc/powerpc64/power7/memrchr.S (__memrchr): Likewise.
+
+2013-09-05  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
 	* string/test-memrchr.c: New file.
 	* string/test-memrchr-ifunc.c: New file.
 	* string/Makefile: Add new memrchr testcase.
diff --git a/sysdeps/powerpc/powerpc32/power7/memrchr.S b/sysdeps/powerpc/powerpc32/power7/memrchr.S
index d1e3fda..defd832 100644
--- a/sysdeps/powerpc/powerpc32/power7/memrchr.S
+++ b/sysdeps/powerpc/powerpc32/power7/memrchr.S
@@ -101,8 +101,8 @@ L(loop):
 	/* We're here because the counter reached 0, and that means we
 	   didn't have any matches for BYTE in the whole range.  Just return
 	   the original range.  */
-	addi	r9,r8,4
-	cmplw	cr6,r9,r7
+	addi	r8,r8,4
+	cmplw	cr6,r8,r7
 	bgt	cr6,L(loop_small)
 	b	L(null)
 
diff --git a/sysdeps/powerpc/powerpc64/power7/memrchr.S b/sysdeps/powerpc/powerpc64/power7/memrchr.S
index d24fbbb..c499952 100644
--- a/sysdeps/powerpc/powerpc64/power7/memrchr.S
+++ b/sysdeps/powerpc/powerpc64/power7/memrchr.S
@@ -102,8 +102,8 @@ L(loop):
 	/* We're here because the counter reached 0, and that means we
 	   didn't have any matches for BYTE in the whole range.  Just return
 	   the original range.  */
-	addi	r9,r8,8
-	cmpld	cr6,r9,r7
+	addi	r8,r8,8
+	cmpld	cr6,r8,r7
 	bgt	cr6,L(loop_small)
 	b	L(null)
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fd51899bbfe36a9649bca1d11f0f14f8d93d5898

commit fd51899bbfe36a9649bca1d11f0f14f8d93d5898
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Thu Aug 29 15:28:00 2013 -0300

    Add memrchr testcase

diff --git a/ChangeLog b/ChangeLog
index be77e26..5ff3fa4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
-<<<<<<< HEAD
-=======
+2013-09-05  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* string/test-memrchr.c: New file.
+	* string/test-memrchr-ifunc.c: New file.
+	* string/Makefile: Add new memrchr testcase.
+
 2013-09-03  Joseph Myers  <joseph@codesourcery.com>
 
 	[BZ #15427]
diff --git a/string/Makefile b/string/Makefile
index 0237edd..72d3e29 100644
--- a/string/Makefile
+++ b/string/Makefile
@@ -45,7 +45,7 @@ strop-tests	:= memchr memcmp memcpy memmove mempcpy memset memccpy	\
 		   stpcpy stpncpy strcat strchr strcmp strcpy strcspn	\
 		   strlen strncmp strncpy strpbrk strrchr strspn memmem	\
 		   strstr strcasestr strnlen strcasecmp strncasecmp	\
-		   strncat rawmemchr strchrnul bcopy bzero
+		   strncat rawmemchr strchrnul bcopy bzero memrchr
 tests		:= tester inl-tester noinl-tester testcopy test-ffs	\
 		   tst-strlen stratcliff tst-svc tst-inlcall		\
 		   bug-strncat1 bug-strspn1 bug-strpbrk1 tst-bswap	\
diff --git a/string/test-memrchr-ifunc.c b/string/test-memrchr-ifunc.c
new file mode 100644
index 0000000..100dedb
--- /dev/null
+++ b/string/test-memrchr-ifunc.c
@@ -0,0 +1,20 @@
+/* Test and measure IFUNC implementations of memrchr function.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_IFUNC 1
+#include "test-memrchr.c"
diff --git a/string/test-memrchr.c b/string/test-memrchr.c
new file mode 100644
index 0000000..a4fe811
--- /dev/null
+++ b/string/test-memrchr.c
@@ -0,0 +1,169 @@
+/* Test and measure memrchr functions.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Written by Jakub Jelinek <jakub@redhat.com>, 1999.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#define TEST_NAME "memrchr"
+#include "test-string.h"
+
+typedef char *(*proto_t) (const char *, int, size_t);
+char *simple_memrchr (const char *, int, size_t);
+
+IMPL (simple_memrchr, 0)
+IMPL (memrchr, 1)
+
+char *
+simple_memrchr (const char *s, int c, size_t n)
+{
+  s = s + n;
+  while (n--)
+    if (*--s == (char) c)
+      return (char *) s;
+  return NULL;
+}
+
+static void
+do_one_test (impl_t *impl, const char *s, int c, size_t n, char *exp_res)
+{
+  char *res = CALL (impl, s, c, n);
+  if (res != exp_res)
+    {
+      error (0, 0, "Wrong result in function %s %p %p", impl->name,
+	     res, exp_res);
+      ret = 1;
+      return;
+    }
+}
+
+static void
+do_test (size_t align, size_t pos, size_t len, int seek_char)
+{
+  size_t i;
+  char *result;
+
+  align &= 7;
+  if (align + len >= page_size)
+    return;
+
+  for (i = 0; i < len; ++i)
+    {
+      buf1[align + i] = 1 + 23 * i % 127;
+      if (buf1[align + i] == seek_char)
+        buf1[align + i] = seek_char + 1;
+    }
+  buf1[align + len] = 0;
+
+  if (pos < len)
+    {
+      buf1[align + pos] = seek_char;
+      buf1[align + len] = -seek_char;
+      result = (char *) (buf1 + align + pos);
+    }
+  else
+    {
+      result = NULL;
+      buf1[align + len] = seek_char;
+    }
+
+  FOR_EACH_IMPL (impl, 0)
+    do_one_test (impl, (char *) (buf1 + align), seek_char, len, result);
+}
+
+static void
+do_random_tests (void)
+{
+  size_t i, j, n, align, pos, len;
+  int seek_char;
+  char *result;
+  unsigned char *p = buf1 + page_size - 512;
+
+  for (n = 0; n < ITERATIONS; n++)
+    {
+      align = random () & 15;
+      pos = random () & 511;
+      if (pos + align >= 512)
+	pos = 511 - align - (random () & 7);
+      len = random () & 511;
+      if (pos >= len)
+	len = pos + (random () & 7);
+      if (len + align >= 512)
+        len = 512 - align - (random () & 7);
+      seek_char = random () & 255;
+      j = len + align + 64;
+      if (j > 512)
+        j = 512;
+
+      for (i = 0; i < j; i++)
+	{
+	  if (i == pos + align)
+	    p[i] = seek_char;
+	  else
+	    {
+	      p[i] = random () & 255;
+	      if (p[i] == seek_char)
+		p[i] = seek_char + 13;
+	    }
+	}
+
+      if (pos < len)
+	result = (char *) (p + pos + align);
+      else
+	result = NULL;
+
+      FOR_EACH_IMPL (impl, 1)
+	if (CALL (impl, (char *) (p + align), seek_char, len) != result)
+	  {
+	    error (0, 0, "Iteration %zd - wrong result in function %s (%zd, %d, %zd, %zd) %p != %p, p %p",
+		   n, impl->name, align, seek_char, len, pos,
+		   CALL (impl, (char *) (p + align), seek_char, len),
+		   result, p);
+	    ret = 1;
+	  }
+    }
+}
+
+int
+test_main (void)
+{
+  size_t i;
+
+  test_init ();
+
+  printf ("%20s", "");
+  FOR_EACH_IMPL (impl, 0)
+    printf ("\t%s", impl->name);
+  putchar ('\n');
+
+  for (i = 1; i < 8; ++i)
+    {
+      do_test (0, 16 << i, 2048, 23);
+      do_test (i, 64, 256, 23);
+      do_test (0, 16 << i, 2048, 0);
+      do_test (i, 64, 256, 0);
+    }
+  for (i = 1; i < 32; ++i)
+    {
+      do_test (0, i, i + 1, 23);
+      do_test (0, i, i + 1, 0);
+    }
+
+  do_random_tests ();
+  return ret;
+}
+
+#include "../test-skeleton.c"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f0f4db2172678512731d2a095bcf5abf0846d3a3

commit f0f4db2172678512731d2a095bcf5abf0846d3a3
Author: Joseph Myers <joseph@codesourcery.com>
Date:   Tue Sep 3 15:32:54 2013 +0000

    Fix lgammaf spurious underflow (bug 15427).

diff --git a/ChangeLog b/ChangeLog
index 443dcfe..be77e26 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,33 @@
+<<<<<<< HEAD
+=======
+2013-09-03  Joseph Myers  <joseph@codesourcery.com>
+
+	[BZ #15427]
+	* sysdeps/ieee754/flt-32/e_lgammaf_r.c (__ieee754_lgammaf_r): Use
+	2**-30 instead of 2**-70 as threshold for returning -log(|x|).
+	* math/libm-test.inc (lgamma_test_data): Add more tests.
+	* sysdeps/i386/fpu/libm-test-ulps: Update.
+	* sysdeps/x86_64/fpu/libm-test-ulps: Likewise.
+
+2013-09-03   Ondřej Bílka  <neleai@seznam.cz>
+
+	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: New file.
+	* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
+	Add ifunc.
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
+	Add strcmp-sse2-unaligned
+	* sysdeps/x86_64/multiarch/strcmp.S (strcmp): Add ifunc.
+
+2013-09-02  Mike Frysinger  <vapier@gentoo.org>
+
+	* Versions.def (libc): Add GLIBC_2.19.
+
+2013-09-02  Mike Frysinger  <vapier@gentoo.org>
+
+	* sysdeps/unix/sysv/linux/tst-fanotify.c: New test.
+	* sysdeps/unix/sysv/linux/Makefile (tests): Add tst-fanotify.
+
+>>>>>>> ffa3cd7... Fix lgammaf spurious underflow (bug 15427).
 2013-09-02  Joseph Myers  <joseph@codesourcery.com>
 
 	[BZ #14155]
diff --git a/NEWS b/NEWS
index 935b887..a71d9fa 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,7 @@ Version 2.18.1
 
 * The following bugs are resolved with this release:
 
-  14155, 14699, 15532, 15522, 15797, 15909, 15996, 16150.
+  14155, 14699, 15532, 15427, 15522, 15797, 15909, 15996, 16150.
 
 * CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
   to the d_name member of struct dirent, or omit the terminating NUL
diff --git a/math/libm-test.inc b/math/libm-test.inc
index 9d49255..7a11c90 100644
--- a/math/libm-test.inc
+++ b/math/libm-test.inc
@@ -9973,6 +9973,61 @@ static const struct test_f_f1_data lgamma_test_data[] =
     TEST_f_f1 (lgamma, -0.5, M_LOG_2_SQRT_PIl, -1),
     TEST_f_f1 (lgamma, 0.7L, 0.260867246531666514385732417016759578L, 1),
     TEST_f_f1 (lgamma, 1.2L, -0.853740900033158497197028392998854470e-1L, 1),
+
+    TEST_f_f1 (lgamma, 0x1p-5L, 3.4484891277979584796832693452686366085801e+00L, 1),
+    TEST_f_f1 (lgamma, -0x1p-5L, 3.4845895751341394376217526729956836492792e+00L, -1),
+    TEST_f_f1 (lgamma, 0x1p-10L, 6.9309089024194618895406190646600805357273e+00L, 1),
+    TEST_f_f1 (lgamma, -0x1p-10L, 6.9320362775113082175565786721095494761582e+00L, -1),
+    TEST_f_f1 (lgamma, 0x1p-15L, 1.0397190093941001762077888432721419773538e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-15L, 1.0397225324389321751118257981741350715545e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-20L, 1.3862943060723899573457963336920089012399e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-20L, 1.3862944161675408862049886226750366625112e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-25L, 1.7328679496796266133304874243201700664713e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-25L, 1.7328679531201000798551671833865469674673e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-30L, 2.0794415416260785304085859198055798098863e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-30L, 2.0794415417335933262374820960532606449975e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-40L, 2.7725887222397287402100277256545578941303e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-40L, 2.7725887222398337351278293820766115529596e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-50L, 3.4657359027997264958191108994508978906983e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-50L, 3.4657359027997265983532103151309975524744e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-60L, 4.1588830833596718564533272505187468598519e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-60L, 4.1588830833596718565534582069793719571779e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-64L, 4.4361419555836499802671564849429355013920e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-64L, 4.4361419555836499802734146697217245699749e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-70L, 4.8520302639196171659205759581386516869302e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-70L, 4.8520302639196171659206737422758202661268e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-100L, 6.9314718055994530941723212145817201464678e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-100L, 6.9314718055994530941723212145818112150422e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-126L, 8.7336544750553108986571247303730247577506e+01L, 1),
+    TEST_f_f1 (lgamma, -0x1p-126L, 8.7336544750553108986571247303730247577520e+01L, -1),
+    TEST_f_f1 (lgamma, 0x1p-149L, 1.0327892990343185110316758609726830864325e+02L, 1),
+    TEST_f_f1 (lgamma, -0x1p-149L, 1.0327892990343185110316758609726830864325e+02L, -1),
+#ifndef TEST_FLOAT
+    TEST_f_f1 (lgamma, 0x1p-200L, 1.3862943611198906188344642429163531361510e+02L, 1),
+    TEST_f_f1 (lgamma, -0x1p-200L, 1.3862943611198906188344642429163531361510e+02L, -1),
+    TEST_f_f1 (lgamma, 0x1p-500L, 3.4657359027997265470861606072908828403775e+02L, 1),
+    TEST_f_f1 (lgamma, -0x1p-500L, 3.4657359027997265470861606072908828403775e+02L, -1),
+    TEST_f_f1 (lgamma, 0x1p-1000L, 6.9314718055994530941723212145817656807550e+02L, 1),
+    TEST_f_f1 (lgamma, -0x1p-1000L, 6.9314718055994530941723212145817656807550e+02L, -1),
+    TEST_f_f1 (lgamma, 0x1p-1022L, 7.0839641853226410622441122813025645257316e+02L, 1),
+    TEST_f_f1 (lgamma, -0x1p-1022L, 7.0839641853226410622441122813025645257316e+02L, -1),
+    TEST_f_f1 (lgamma, 0x1p-1074L, 7.4444007192138126231410729844608163411309e+02L, 1),
+    TEST_f_f1 (lgamma, -0x1p-1074L, 7.4444007192138126231410729844608163411309e+02L, -1),
+#endif
+#if defined TEST_LDOUBLE && LDBL_MIN_EXP <= -16381
+    TEST_f_f1 (lgamma, 0x1p-5000L, 3.4657359027997265470861606072908828403775e+03L, 1),
+    TEST_f_f1 (lgamma, -0x1p-5000L, 3.4657359027997265470861606072908828403775e+03L, -1),
+    TEST_f_f1 (lgamma, 0x1p-10000L, 6.9314718055994530941723212145817656807550e+03L, 1),
+    TEST_f_f1 (lgamma, -0x1p-10000L, 6.9314718055994530941723212145817656807550e+03L, -1),
+    TEST_f_f1 (lgamma, 0x1p-16382L, 1.1355137111933024058873096613727848538213e+04L, 1),
+    TEST_f_f1 (lgamma, -0x1p-16382L, 1.1355137111933024058873096613727848538213e+04L, -1),
+    TEST_f_f1 (lgamma, 0x1p-16445L, 1.1398805384308300613366382237379713662002e+04L, 1),
+    TEST_f_f1 (lgamma, -0x1p-16445L, 1.1398805384308300613366382237379713662002e+04L, -1),
+# if LDBL_MANT_DIG >= 113
+    TEST_f_f1 (lgamma, 0x1p-16494L, 1.1432769596155737933527826611331164313837e+04L, 1),
+    TEST_f_f1 (lgamma, -0x1p-16494L, 1.1432769596155737933527826611331164313837e+04L, -1),
+# endif
+#endif
   };
 
 static void
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index 8244863..4759aa9 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -5470,9 +5470,35 @@ double: 1
 idouble: 1
 ildouble: 1
 ldouble: 1
+Test "gamma (-0x1p-10)":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+Test "gamma (-0x1p-15)":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+Test "gamma (-0x1p-20)":
+double: 1
+idouble: 1
+Test "gamma (-0x1p-30)":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+Test "gamma (-0x1p-5)":
+double: 1
+idouble: 1
 Test "gamma (0.7)":
 float: 1
 ifloat: 1
+Test "gamma (0x1p-40)":
+ildouble: 1
+ldouble: 1
 Test "gamma (1.2)":
 double: 1
 float: 2
@@ -5723,9 +5749,35 @@ double: 1
 idouble: 1
 ildouble: 1
 ldouble: 1
+Test "lgamma (-0x1p-10)":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+ildouble: 1
+ldouble: 1
+Test "lgamma (-0x1p-15)":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+Test "lgamma (-0x1p-20)":
+double: 1
+idouble: 1
+Test "lgamma (-0x1p-30)":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+Test "lgamma (-0x1p-5)":
+double: 1
+idouble: 1
 Test "lgamma (0.7)":
 float: 1
 ifloat: 1
+Test "lgamma (0x1p-40)":
+ildouble: 1
+ldouble: 1
 Test "lgamma (1.2)":
 double: 1
 float: 2
diff --git a/sysdeps/ieee754/flt-32/e_lgammaf_r.c b/sysdeps/ieee754/flt-32/e_lgammaf_r.c
index 2e92269..0dba9af 100644
--- a/sysdeps/ieee754/flt-32/e_lgammaf_r.c
+++ b/sysdeps/ieee754/flt-32/e_lgammaf_r.c
@@ -150,8 +150,8 @@ __ieee754_lgammaf_r(float x, int *signgamp)
 	      *signgamp = -1;
 	    return one/fabsf(x);
 	  }
-	if(__builtin_expect(ix<0x1c800000, 0)) {
-	    /* |x|<2**-70, return -log(|x|) */
+	if(__builtin_expect(ix<0x30800000, 0)) {
+	    /* |x|<2**-30, return -log(|x|) */
 	    if(hx<0) {
 		*signgamp = -1;
 		return -__ieee754_logf(-x);
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 477eedc..6fbfa64 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -6222,11 +6222,39 @@ idouble: 1
 Test "gamma (-0.5)":
 ildouble: 1
 ldouble: 1
+Test "gamma (-0x1p-10)":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+Test "gamma (-0x1p-15)":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+Test "gamma (-0x1p-20)":
+double: 1
+idouble: 1
+Test "gamma (-0x1p-30)":
+ildouble: 1
+ldouble: 1
+Test "gamma (-0x1p-5)":
+double: 1
+idouble: 1
 Test "gamma (0.7)":
 double: 1
 float: 1
 idouble: 1
 ifloat: 1
+Test "gamma (0x1p-10)":
+float: 1
+ifloat: 1
+Test "gamma (0x1p-30)":
+double: 1
+idouble: 1
+Test "gamma (0x1p-40)":
+ildouble: 1
+ldouble: 1
 Test "gamma (1.2)":
 double: 1
 float: 2
@@ -6491,11 +6519,39 @@ ldouble: 2
 Test "lgamma (-0.5)":
 ildouble: 1
 ldouble: 1
+Test "lgamma (-0x1p-10)":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+Test "lgamma (-0x1p-15)":
+double: 1
+float: 1
+idouble: 1
+ifloat: 1
+Test "lgamma (-0x1p-20)":
+double: 1
+idouble: 1
+Test "lgamma (-0x1p-30)":
+ildouble: 1
+ldouble: 1
+Test "lgamma (-0x1p-5)":
+double: 1
+idouble: 1
 Test "lgamma (0.7)":
 double: 1
 float: 1
 idouble: 1
 ifloat: 1
+Test "lgamma (0x1p-10)":
+float: 1
+ifloat: 1
+Test "lgamma (0x1p-30)":
+double: 1
+idouble: 1
+Test "lgamma (0x1p-40)":
+ildouble: 1
+ldouble: 1
 Test "lgamma (1.2)":
 double: 1
 float: 2

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=184f9acfeb3bbcaad2404a2e238e4d8f71d220fe

commit 184f9acfeb3bbcaad2404a2e238e4d8f71d220fe
Author: Joseph Myers <joseph@codesourcery.com>
Date:   Mon Sep 2 14:51:24 2013 +0000

    Fix spurious jnf underflows (bug 14155).

diff --git a/ChangeLog b/ChangeLog
index 3921a8c..443dcfe 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2013-09-02  Joseph Myers  <joseph@codesourcery.com>
+
+	[BZ #14155]
+	* sysdeps/ieee754/flt-32/e_jnf.c (__ieee754_jnf): Use double for
+	intermediate calculations in recurrence.
+	(__ieee754_ynf): Likewise.
+	* math/libm-test.inc (jn_test_data): Do not allow spurious
+	underflow exception.  Add more tests.
+	(yn_test_data): Add more tests.
+	* sysdeps/i386/fpu/libm-test-ulps: Update.
+	* sysdeps/x86_64/fpu/libm-test-ulps: Likewise.
+
 2013-08-29  Thomas Schwinge  <thomas@codesourcery.com>
 
 	[BZ #15522] strtod ("nan(N)") returning a sNaN in some cases
diff --git a/NEWS b/NEWS
index acb6ef8..935b887 100644
--- a/NEWS
+++ b/NEWS
@@ -8,9 +8,8 @@ using `glibc' in the "product" field.
 Version 2.18.1
 
 * The following bugs are resolved with this release:
-
 
-  14699, 15532, 15522, 15797, 15909, 15996, 16150.
+  14155, 14699, 15532, 15522, 15797, 15909, 15996, 16150.
 
 * CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
   to the d_name member of struct dirent, or omit the terminating NUL
diff --git a/math/libm-test.inc b/math/libm-test.inc
index 7408c6b..9d49255 100644
--- a/math/libm-test.inc
+++ b/math/libm-test.inc
@@ -9908,8 +9908,14 @@ static const struct test_if_f_data jn_test_data[] =
     TEST_if_f (jn, 8, 2.4048255576957729L, 0.92165786705344923232879022467054148E-4L),
     TEST_if_f (jn, 9, 2.4048255576957729L, 0.12517270977961513005428966643852564E-4L),
 
-    /* Bug 14155: spurious exception may occur.  */
-    TEST_if_f (jn, 2, 0x1.ffff62p+99L, -4.43860668048170034334926693188979974489e-16L, UNDERFLOW_EXCEPTION_OK),
+    TEST_if_f (jn, 2, 0x1.ffff62p+99L, -4.43860668048170034334926693188979974489e-16L),
+    TEST_if_f (jn, 2, 0x1p127L, -6.0784021821505059176832624052765568656702e-20L),
+#ifndef TEST_FLOAT
+    TEST_if_f (jn, 2, 0x1p1023L, 1.5665258060609012834424478437196679802783e-155L),
+#endif
+#if defined TEST_LDOUBLE && LDBL_MAX_EXP >= 16384
+    TEST_if_f (jn, 2, 0x1p16383L, -9.5859502826270374691362975419147645151233e-2467L),
+#endif
   };
 
 static void
@@ -14526,6 +14532,15 @@ static const struct test_if_f_data yn_test_data[] =
     /* Check whether yn returns correct value for LDBL_MIN, DBL_MIN,
        and FLT_MIN.  See Bug 14173.  */
     TEST_if_f (yn, 10, min_value, minus_infty, OVERFLOW_EXCEPTION|ERRNO_ERANGE),
+
+    TEST_if_f (yn, 2, 0x1.ffff62p+99L, -5.5244413477397111790415387179517953221757e-16L),
+    TEST_if_f (yn, 2, 0x1p127L, 6.8569250690166637098111268958532649249771e-21L),
+#ifndef TEST_FLOAT
+    TEST_if_f (yn, 2, 0x1p1023L, -8.2687542933709649327986678723012001545638e-155L),
+#endif
+#if defined TEST_LDOUBLE && LDBL_MAX_EXP >= 16384
+    TEST_if_f (yn, 2, 0x1p16383L, 3.8895531955766020648617743624167352352217e-2467L),
+#endif
   };
 
 static void
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index 530dbd7..8244863 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -5635,9 +5635,9 @@ ildouble: 1
 ldouble: 1
 Test "jn (10, 10.0)":
 double: 1
-float: 1
+float: 2
 idouble: 1
-ifloat: 1
+ifloat: 2
 ildouble: 2
 ldouble: 2
 Test "jn (10, 2.0)":
@@ -5648,6 +5648,14 @@ float: 1
 ifloat: 1
 ildouble: 1
 ldouble: 1
+Test "jn (2, 0x1p1023)":
+double: 1
+idouble: 1
+Test "jn (2, 0x1p127)":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
 Test "jn (2, 2.4048255576957729)":
 double: 1
 float: 1
@@ -6844,6 +6852,14 @@ ifloat: 1
 Test "yn (10, 2.0)":
 float: 3
 ifloat: 3
+Test "yn (2, 0x1.ffff62p+99)":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+Test "yn (2, 0x1p127)":
+float: 2
+ifloat: 2
 Test "yn (3, 0.125)":
 ildouble: 1
 ldouble: 1
diff --git a/sysdeps/ieee754/flt-32/e_jnf.c b/sysdeps/ieee754/flt-32/e_jnf.c
index ad26d7e..5984d94 100644
--- a/sysdeps/ieee754/flt-32/e_jnf.c
+++ b/sysdeps/ieee754/flt-32/e_jnf.c
@@ -54,7 +54,7 @@ __ieee754_jnf(int n, float x)
 	    b = __ieee754_j1f(x);
 	    for(i=1;i<n;i++){
 		temp = b;
-		b = b*((float)(i+i)/x) - a; /* avoid underflow */
+		b = b*((double)(i+i)/x) - a; /* avoid underflow */
 		a = temp;
 	    }
 	} else {
@@ -196,7 +196,7 @@ __ieee754_ynf(int n, float x)
 	GET_FLOAT_WORD(ib,b);
 	for(i=1;i<n&&ib!=0xff800000;i++){
 	    temp = b;
-	    b = ((float)(i+i)/x)*b - a;
+	    b = ((double)(i+i)/x)*b - a;
 	    GET_FLOAT_WORD(ib,b);
 	    a = temp;
 	}
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index d02618a..477eedc 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -6403,6 +6403,11 @@ idouble: 2
 ifloat: 2
 ildouble: 1
 ldouble: 1
+Test "jn (2, 0x1p127)":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
 Test "jn (2, 2.4048255576957729)":
 double: 2
 float: 1
@@ -7728,6 +7733,16 @@ double: 3
 float: 1
 idouble: 3
 ifloat: 1
+Test "yn (2, 0x1.ffff62p+99)":
+double: 1
+idouble: 1
+ildouble: 1
+ldouble: 1
+Test "yn (2, 0x1p127)":
+double: 1
+float: 3
+idouble: 1
+ifloat: 3
 Test "yn (3, 0.125)":
 double: 1
 idouble: 1
@@ -8428,9 +8443,9 @@ ldouble: 2
 
 Function: "yn":
 double: 3
-float: 2
+float: 3
 idouble: 3
-ifloat: 2
+ifloat: 3
 ildouble: 4
 ldouble: 4
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9a2afcedaef354324ea08392a6191beb614e9359

commit 9a2afcedaef354324ea08392a6191beb614e9359
Author: Thomas Schwinge <thomas@codesourcery.com>
Date:   Thu May 23 18:00:10 2013 +0200

    [BZ #15522] strtod ("nan(N)") returning a sNaN in some cases

diff --git a/ChangeLog b/ChangeLog
index 649b282..3921a8c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,29 @@
+2013-08-29  Thomas Schwinge  <thomas@codesourcery.com>
+
+	[BZ #15522] strtod ("nan(N)") returning a sNaN in some cases
+
+	* stdlib/strtof_l.c (SET_MANTISSA): Rewrite.
+	* stdlib/strtod_l.c (SET_MANTISSA): Likewise.
+	* sysdeps/ieee754/ldbl-64-128/strtold_l.c (SET_MANTISSA):
+	Likewise.
+	* sysdeps/ieee754/ldbl-96/strtold_l.c (SET_MANTISSA): Likewise.
+	* sysdeps/ieee754/ldbl-128/strtold_l.c (SET_MANTISSA): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/strtold_l.c (SET_MANTISSA):
+	Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/ieee754.h
+	(ibm_extended_long_double): Add ieee_nan member.
+	* stdlib/tst-strtod6.c (test): New function, renamed from do_test.
+	(do_test): New function.
+
+	* math/basic-test.c (TEST_CONVERT): New macro, renamed from
+	TEST_TRUNC.
+	(convert_dfsf_test, convert_tfsf_test, convert_tfdf_test): New
+	functions, renamed from truncdfsf_test, trunctfsf_test,
+	trunctfdf_test.
+	(convert_sfdf_test, convert_sftf_test, convert_dftf_test): New
+	functions.
+	(do_test): Run all these.
+
 2013-08-23  Joseph Myers  <joseph@codesourcery.com>
 
 	[BZ #15532]
diff --git a/NEWS b/NEWS
index 55009cb..acb6ef8 100644
--- a/NEWS
+++ b/NEWS
@@ -8,8 +8,9 @@ using `glibc' in the "product" field.
 Version 2.18.1
 
 * The following bugs are resolved with this release:
+
 
-  14699, 15532, 15797, 15909, 15996, 16150.
+  14699, 15532, 15522, 15797, 15909, 15996, 16150.
 
 * CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
   to the d_name member of struct dirent, or omit the terminating NUL
diff --git a/stdlib/strtod_l.c b/stdlib/strtod_l.c
index 5b41e2b..8f60653 100644
--- a/stdlib/strtod_l.c
+++ b/stdlib/strtod_l.c
@@ -42,11 +42,10 @@ extern unsigned long long int ____strtoull_l_internal (const char *, char **,
 # define SET_MANTISSA(flt, mant) \
   do { union ieee754_double u;						      \
        u.d = (flt);							      \
-       if ((mant & 0xfffffffffffffULL) == 0)				      \
-	 mant = 0x8000000000000ULL;					      \
-       u.ieee.mantissa0 = ((mant) >> 32) & 0xfffff;			      \
-       u.ieee.mantissa1 = (mant) & 0xffffffff;				      \
-       (flt) = u.d;							      \
+       u.ieee_nan.mantissa0 = (mant) >> 32;				      \
+       u.ieee_nan.mantissa1 = (mant);					      \
+       if ((u.ieee.mantissa0 | u.ieee.mantissa1) != 0)			      \
+	 (flt) = u.d;							      \
   } while (0)
 #endif
 /* End of configuration part.  */
diff --git a/stdlib/strtof_l.c b/stdlib/strtof_l.c
index 6fb44bd..c4c1c1f 100644
--- a/stdlib/strtof_l.c
+++ b/stdlib/strtof_l.c
@@ -37,10 +37,9 @@ extern unsigned long long int ____strtoull_l_internal (const char *, char **,
 #define SET_MANTISSA(flt, mant) \
   do { union ieee754_float u;						      \
        u.f = (flt);							      \
-       if ((mant & 0x7fffff) == 0)					      \
-	 mant = 0x400000;						      \
-       u.ieee.mantissa = (mant) & 0x7fffff;				      \
-       (flt) = u.f;							      \
+       u.ieee_nan.mantissa = (mant);					      \
+       if (u.ieee.mantissa != 0)					      \
+	 (flt) = u.f;							      \
   } while (0)
 
 #include "strtod_l.c"
diff --git a/stdlib/tst-strtod6.c b/stdlib/tst-strtod6.c
index 1d87266..15e79fd 100644
--- a/stdlib/tst-strtod6.c
+++ b/stdlib/tst-strtod6.c
@@ -4,12 +4,13 @@
 #include <string.h>
 
 static int
-do_test (void)
+test (const char str[])
 {
-  static const char str[] = "NaN(blabla)something";
   char *endp;
   int result = 0;
 
+  puts (str);
+
   double d = strtod (str, &endp);
   if (!isnan (d))
     {
@@ -64,5 +65,24 @@ do_test (void)
   return result;
 }
 
+static int
+do_test (void)
+{
+  int result = 0;
+
+  result |= test ("NaN(blabla)something");
+  result |= test ("NaN(1234)something");
+  /* UINT32_MAX.  */
+  result |= test ("NaN(4294967295)something");
+  /* UINT64_MAX.  */
+  result |= test ("NaN(18446744073709551615)something");
+  /* The case of zero is special in that "something" has to be done to make the
+     mantissa different from zero, which would mean infinity instead of
+     NaN.  */
+  result |= test ("NaN(0)something");
+
+  return result;
+}
+
 #define TEST_FUNCTION do_test ()
 #include "../test-skeleton.c"
diff --git a/sysdeps/ieee754/ldbl-128/strtold_l.c b/sysdeps/ieee754/ldbl-128/strtold_l.c
index 8e0bc03..d3a1d1e 100644
--- a/sysdeps/ieee754/ldbl-128/strtold_l.c
+++ b/sysdeps/ieee754/ldbl-128/strtold_l.c
@@ -34,11 +34,13 @@
 #define SET_MANTISSA(flt, mant) \
   do { union ieee854_long_double u;					      \
        u.d = (flt);							      \
-       u.ieee.mantissa0 = 0x8000;					      \
-       u.ieee.mantissa1 = 0;						      \
-       u.ieee.mantissa2 = ((mant) >> 32);	      			      \
-       u.ieee.mantissa3 = (mant) & 0xffffffff;				      \
-       (flt) = u.d;							      \
+       u.ieee_nan.mantissa0 = 0;					      \
+       u.ieee_nan.mantissa1 = 0;					      \
+       u.ieee_nan.mantissa2 = (mant) >> 32;				      \
+       u.ieee_nan.mantissa3 = (mant);					      \
+       if ((u.ieee.mantissa0 | u.ieee.mantissa1				      \
+	    | u.ieee.mantissa2 | u.ieee.mantissa3) != 0)		      \
+	 (flt) = u.d;							      \
   } while (0)
 
 #include <strtod_l.c>
diff --git a/sysdeps/ieee754/ldbl-128ibm/ieee754.h b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
index e5644f5..9e94f53 100644
--- a/sysdeps/ieee754/ldbl-128ibm/ieee754.h
+++ b/sysdeps/ieee754/ldbl-128ibm/ieee754.h
@@ -199,6 +199,25 @@ union ibm_extended_long_double
 	unsigned int mantissa2:20;
 	unsigned int mantissa3:32;
       } ieee;
+
+    /* This format makes it easier to see if a NaN is a signalling NaN.  */
+    struct
+      { /* Big endian.  There is no other.  */
+
+	unsigned int negative:1;
+	unsigned int exponent:11;
+	unsigned int quiet_nan:1;
+	/* Together Mantissa0-3 comprise the mantissa.  */
+	unsigned int mantissa0:19;
+	unsigned int mantissa1:32;
+
+	unsigned int negative2:1;
+	unsigned int exponent2:11;
+	/* There is an implied 1 here?  */
+	/* Together these comprise the mantissa.  */
+	unsigned int mantissa2:20;
+	unsigned int mantissa3:32;
+      } ieee_nan;
    };
 
 #define IBM_EXTENDED_LONG_DOUBLE_BIAS 0x3ff /* Added to exponent.  */
diff --git a/sysdeps/ieee754/ldbl-128ibm/strtold_l.c b/sysdeps/ieee754/ldbl-128ibm/strtold_l.c
index 93415f0..04e3288 100644
--- a/sysdeps/ieee754/ldbl-128ibm/strtold_l.c
+++ b/sysdeps/ieee754/ldbl-128ibm/strtold_l.c
@@ -44,11 +44,10 @@ libc_hidden_proto (STRTOF)
 # define SET_MANTISSA(flt, mant) \
   do { union ibm_extended_long_double u;				      \
        u.d = (flt);							      \
-       if ((mant & 0xfffffffffffffULL) == 0)				      \
-	 mant = 0x8000000000000ULL;					      \
-       u.ieee.mantissa0 = ((mant) >> 32) & 0xfffff;			      \
-       u.ieee.mantissa1 = (mant) & 0xffffffff;				      \
-       (flt) = u.d;							      \
+       u.ieee_nan.mantissa0 = (mant) >> 32;				      \
+       u.ieee_nan.mantissa1 = (mant);					      \
+       if ((u.ieee.mantissa0 | u.ieee.mantissa1) != 0)			      \
+	 (flt) = u.d;							      \
   } while (0)
 
 #include <strtod_l.c>
diff --git a/sysdeps/ieee754/ldbl-64-128/strtold_l.c b/sysdeps/ieee754/ldbl-64-128/strtold_l.c
index 8182b2b..e9b33f2 100644
--- a/sysdeps/ieee754/ldbl-64-128/strtold_l.c
+++ b/sysdeps/ieee754/ldbl-64-128/strtold_l.c
@@ -44,11 +44,13 @@ libc_hidden_proto (STRTOF)
 #define SET_MANTISSA(flt, mant) \
   do { union ieee854_long_double u;					      \
        u.d = (flt);							      \
-       u.ieee.mantissa0 = 0x8000;					      \
-       u.ieee.mantissa1 = 0;						      \
-       u.ieee.mantissa2 = ((mant) >> 32);	      			      \
-       u.ieee.mantissa3 = (mant) & 0xffffffff;				      \
-       (flt) = u.d;							      \
+       u.ieee_nan.mantissa0 = 0;					      \
+       u.ieee_nan.mantissa1 = 0;					      \
+       u.ieee_nan.mantissa2 = (mant) >> 32;				      \
+       u.ieee_nan.mantissa3 = (mant);					      \
+       if ((u.ieee.mantissa0 | u.ieee.mantissa1				      \
+	    | u.ieee.mantissa2 | u.ieee.mantissa3) != 0)		      \
+	 (flt) = u.d;							      \
   } while (0)
 
 #include <strtod_l.c>
diff --git a/sysdeps/ieee754/ldbl-96/strtold_l.c b/sysdeps/ieee754/ldbl-96/strtold_l.c
index ded84f3..dccf98c 100644
--- a/sysdeps/ieee754/ldbl-96/strtold_l.c
+++ b/sysdeps/ieee754/ldbl-96/strtold_l.c
@@ -34,11 +34,10 @@
 #define SET_MANTISSA(flt, mant) \
   do { union ieee854_long_double u;					      \
        u.d = (flt);							      \
-       if ((mant & 0x7fffffffffffffffULL) == 0)				      \
-	 mant = 0x4000000000000000ULL;					      \
-       u.ieee.mantissa0 = (((mant) >> 32) & 0x7fffffff) | 0x80000000;	      \
-       u.ieee.mantissa1 = (mant) & 0xffffffff;				      \
-       (flt) = u.d;							      \
+       u.ieee_nan.mantissa0 = (mant) >> 32;				      \
+       u.ieee_nan.mantissa1 = (mant);					      \
+       if ((u.ieee.mantissa0 | u.ieee.mantissa1) != 0)			      \
+	 (flt) = u.d;							      \
   } while (0)
 
 #include <stdlib/strtod_l.c>

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=85248b2448e4614414aa30ae31ac479bdfd87696

commit 85248b2448e4614414aa30ae31ac479bdfd87696
Author: Joseph Myers <joseph@codesourcery.com>
Date:   Fri Aug 23 19:45:38 2013 +0000

    Fix cexp (NaN + i0) (bug 15532).

diff --git a/ChangeLog b/ChangeLog
index 346f699..649b282 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2013-08-23  Joseph Myers  <joseph@codesourcery.com>
+
+	[BZ #15532]
+	* math/s_cexp.c (__cexp): Return NaN + i0 for NaN + i0 argument.
+	* math/s_cexpf.c (__cexpf): Likewise.
+	* math/s_cexpl.c (__cexpl): Likewise.
+	* math/libm-test.inc (cexp_test_data): Correct expected return
+	value for NaN + i0.  Add another test.
+
 2013-08-21  Joseph Myers  <joseph@codesourcery.com>
 
 	[BZ #15797]
diff --git a/math/libm-test.inc b/math/libm-test.inc
index 8c1dcac..7408c6b 100644
--- a/math/libm-test.inc
+++ b/math/libm-test.inc
@@ -6193,7 +6193,8 @@ static const struct test_c_c_data cexp_test_data[] =
 
     TEST_c_c (cexp, plus_infty, qnan_value, plus_infty, qnan_value),
 
-    TEST_c_c (cexp, qnan_value, 0.0, qnan_value, qnan_value, INVALID_EXCEPTION_OK),
+    TEST_c_c (cexp, qnan_value, 0.0, qnan_value, 0.0),
+    TEST_c_c (cexp, qnan_value, minus_zero, qnan_value, minus_zero),
     TEST_c_c (cexp, qnan_value, 1.0, qnan_value, qnan_value, INVALID_EXCEPTION_OK),
 
     TEST_c_c (cexp, qnan_value, plus_infty, qnan_value, qnan_value, INVALID_EXCEPTION_OK),
diff --git a/math/s_cexp.c b/math/s_cexp.c
index 655e4e8..40e0e51 100644
--- a/math/s_cexp.c
+++ b/math/s_cexp.c
@@ -145,12 +145,18 @@ __cexp (__complex__ double x)
     }
   else
     {
-      /* If the real part is NaN the result is NaN + iNaN.  */
+      /* If the real part is NaN the result is NaN + iNaN unless the
+	 imaginary part is zero.  */
       __real__ retval = __nan ("");
-      __imag__ retval = __nan ("");
+      if (icls == FP_ZERO)
+	__imag__ retval = __imag__ x;
+      else
+	{
+	  __imag__ retval = __nan ("");
 
-      if (rcls != FP_NAN || icls != FP_NAN)
-	feraiseexcept (FE_INVALID);
+	  if (rcls != FP_NAN || icls != FP_NAN)
+	    feraiseexcept (FE_INVALID);
+	}
     }
 
   return retval;
diff --git a/math/s_cexpf.c b/math/s_cexpf.c
index fa942d3..7c42205 100644
--- a/math/s_cexpf.c
+++ b/math/s_cexpf.c
@@ -145,12 +145,18 @@ __cexpf (__complex__ float x)
     }
   else
     {
-      /* If the real part is NaN the result is NaN + iNaN.  */
+      /* If the real part is NaN the result is NaN + iNaN unless the
+	 imaginary part is zero.  */
       __real__ retval = __nanf ("");
-      __imag__ retval = __nanf ("");
+      if (icls == FP_ZERO)
+	__imag__ retval = __imag__ x;
+      else
+	{
+	  __imag__ retval = __nanf ("");
 
-      if (rcls != FP_NAN || icls != FP_NAN)
-	feraiseexcept (FE_INVALID);
+	  if (rcls != FP_NAN || icls != FP_NAN)
+	    feraiseexcept (FE_INVALID);
+	}
     }
 
   return retval;
diff --git a/math/s_cexpl.c b/math/s_cexpl.c
index d827bc3..0c35603 100644
--- a/math/s_cexpl.c
+++ b/math/s_cexpl.c
@@ -145,12 +145,18 @@ __cexpl (__complex__ long double x)
     }
   else
     {
-      /* If the real part is NaN the result is NaN + iNaN.  */
+      /* If the real part is NaN the result is NaN + iNaN unless the
+	 imaginary part is zero.  */
       __real__ retval = __nanl ("");
-      __imag__ retval = __nanl ("");
+      if (icls == FP_ZERO)
+	__imag__ retval = __imag__ x;
+      else
+	{
+	  __imag__ retval = __nanl ("");
 
-      if (rcls != FP_NAN || icls != FP_NAN)
-	feraiseexcept (FE_INVALID);
+	  if (rcls != FP_NAN || icls != FP_NAN)
+	    feraiseexcept (FE_INVALID);
+	}
     }
 
   return retval;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=72993206c61252fbb7cc3d605b420fb04f2f8c43

commit 72993206c61252fbb7cc3d605b420fb04f2f8c43
Author: Joseph Myers <joseph@codesourcery.com>
Date:   Wed Aug 21 19:56:48 2013 +0000

    Fix fdim handling of infinities (bug 15797).

diff --git a/ChangeLog b/ChangeLog
index 37617d8..346f699 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2013-08-21  Joseph Myers  <joseph@codesourcery.com>
+
+	[BZ #15797]
+	* math/s_fdim.c (__fdim): Check for infinite arguments if result
+	is infinite, not alongside NaN test.
+	* math/s_fdimf.c (__fdimf): Likewise.
+	* math/s_fdiml.c (__fdiml): Likewise.
+	* math/libm-test.inc (fdim_test_data): Add more tests.  Test that
+	errno is unchanged.
+
 2013-07-23  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	[BZ #15867]
diff --git a/NEWS b/NEWS
index ae05edc..55009cb 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,7 @@ Version 2.18.1
 
 * The following bugs are resolved with this release:
 
-  14699, 15532, 15909, 15996, 16150.
+  14699, 15532, 15797, 15909, 15996, 16150.
 
 * CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
   to the d_name member of struct dirent, or omit the terminating NUL
diff --git a/math/libm-test.inc b/math/libm-test.inc
index 3b382af..8c1dcac 100644
--- a/math/libm-test.inc
+++ b/math/libm-test.inc
@@ -8139,33 +8139,37 @@ fabs_test (void)
 
 static const struct test_ff_f_data fdim_test_data[] =
   {
-    TEST_ff_f (fdim, 0, 0, 0, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, 9, 0, 9, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, 0, 9, 0, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, -9, 0, 0, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, 0, -9, 9, NO_INEXACT_EXCEPTION),
-
-    TEST_ff_f (fdim, plus_infty, 9, plus_infty, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, plus_infty, -9, plus_infty, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, minus_infty, 9, 0, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, minus_infty, -9, 0, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, 9, minus_infty, plus_infty, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, -9, minus_infty, plus_infty, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, 9, plus_infty, 0, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, -9, plus_infty, 0, NO_INEXACT_EXCEPTION),
-
-    TEST_ff_f (fdim, 0, qnan_value, qnan_value, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, 9, qnan_value, qnan_value, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, -9, qnan_value, qnan_value, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, qnan_value, 9, qnan_value, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, qnan_value, -9, qnan_value, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, plus_infty, qnan_value, qnan_value, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, minus_infty, qnan_value, qnan_value, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, qnan_value, plus_infty, qnan_value, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, qnan_value, minus_infty, qnan_value, NO_INEXACT_EXCEPTION),
-    TEST_ff_f (fdim, qnan_value, qnan_value, qnan_value, NO_INEXACT_EXCEPTION),
-
-    TEST_ff_f (fdim, plus_infty, plus_infty, 0, NO_INEXACT_EXCEPTION),
+    TEST_ff_f (fdim, 0, 0, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, 9, 0, 9, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, 0, 9, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, -9, 0, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, 0, -9, 9, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+
+    TEST_ff_f (fdim, plus_infty, 9, plus_infty, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, plus_infty, -9, plus_infty, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, minus_infty, 9, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, minus_infty, -9, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, 9, minus_infty, plus_infty, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, -9, minus_infty, plus_infty, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, 9, plus_infty, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, -9, plus_infty, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+
+    TEST_ff_f (fdim, 0, qnan_value, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, 9, qnan_value, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, -9, qnan_value, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, qnan_value, 0, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, qnan_value, 9, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, qnan_value, -9, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, plus_infty, qnan_value, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, minus_infty, qnan_value, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, qnan_value, plus_infty, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, qnan_value, minus_infty, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, qnan_value, qnan_value, qnan_value, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+
+    TEST_ff_f (fdim, plus_infty, plus_infty, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, plus_infty, minus_infty, plus_infty, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, minus_infty, plus_infty, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fdim, minus_infty, minus_infty, 0, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
   };
 
 static void
diff --git a/math/s_fdim.c b/math/s_fdim.c
index 2f97948..f8fd804 100644
--- a/math/s_fdim.c
+++ b/math/s_fdim.c
@@ -26,16 +26,16 @@ __fdim (double x, double y)
   int clsx = fpclassify (x);
   int clsy = fpclassify (y);
 
-  if (clsx == FP_NAN || clsy == FP_NAN
-      || (y < 0 && clsx == FP_INFINITE && clsy == FP_INFINITE))
-    /* Raise invalid flag.  */
+  if (clsx == FP_NAN || clsy == FP_NAN)
+    /* Raise invalid flag for signaling but not quiet NaN.  */
     return x - y;
 
   if (x <= y)
     return 0.0;
 
   double r = x - y;
-  if (fpclassify (r) == FP_INFINITE)
+  if (fpclassify (r) == FP_INFINITE
+      && clsx != FP_INFINITE && clsy != FP_INFINITE)
     __set_errno (ERANGE);
 
   return r;
diff --git a/math/s_fdimf.c b/math/s_fdimf.c
index 03810b5..86efe6e 100644
--- a/math/s_fdimf.c
+++ b/math/s_fdimf.c
@@ -26,16 +26,16 @@ __fdimf (float x, float y)
   int clsx = fpclassify (x);
   int clsy = fpclassify (y);
 
-  if (clsx == FP_NAN || clsy == FP_NAN
-      || (y < 0 && clsx == FP_INFINITE && clsy == FP_INFINITE))
-    /* Raise invalid flag.  */
+  if (clsx == FP_NAN || clsy == FP_NAN)
+    /* Raise invalid flag for signaling but not quiet NaN.  */
     return x - y;
 
   if (x <= y)
     return 0.0f;
 
   float r = x - y;
-  if (fpclassify (r) == FP_INFINITE)
+  if (fpclassify (r) == FP_INFINITE
+      && clsx != FP_INFINITE && clsy != FP_INFINITE)
     __set_errno (ERANGE);
 
   return r;
diff --git a/math/s_fdiml.c b/math/s_fdiml.c
index 5604532..030fcc2 100644
--- a/math/s_fdiml.c
+++ b/math/s_fdiml.c
@@ -26,16 +26,16 @@ __fdiml (long double x, long double y)
   int clsx = fpclassify (x);
   int clsy = fpclassify (y);
 
-  if (clsx == FP_NAN || clsy == FP_NAN
-      || (y < 0 && clsx == FP_INFINITE && clsy == FP_INFINITE))
-    /* Raise invalid flag.  */
+  if (clsx == FP_NAN || clsy == FP_NAN)
+    /* Raise invalid flag for signaling but not quiet NaN.  */
     return x - y;
 
   if (x <= y)
     return 0.0f;
 
   long double r = x - y;
-  if (fpclassify (r) == FP_INFINITE)
+  if (fpclassify (r) == FP_INFINITE
+      && clsx != FP_INFINITE && clsy != FP_INFINITE)
     __set_errno (ERANGE);
 
   return r;

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a44cf86eab2f01195d414347f5d0b786562537f0

commit a44cf86eab2f01195d414347f5d0b786562537f0
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Aug 20 15:01:59 2013 -0500

    PowerPC: fix backtrace to handle signal trampolines
    
    This patch fixes backtrace for PPC32 and PPC64 to correctly handle
    signal trampolines. The 'debug/tst-backtrace6.c' test also checks
    SA_SIGINFO handling, which triggers a different vDSO symbol on PPC32.

diff --git a/ChangeLog b/ChangeLog
index 00f0c7b..37617d8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2013-07-23  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	[BZ #15867]
+	* sysdeps/powerpc/powerpc32/backtrace.c (__backtrace): Handle signal
+	trampoline stack frame information.
+	* sysdeps/powerpc/powerpc64/backtrace.c (__backtrace): Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
+	(__vdso_sigtramp_rt64): New variable: PPC64 signal trampoline.
+	(__vdso_sigtramp32): New variable: PPC32 signal trampoline.
+	(__vdso_sigtramp_rt32): New variable: PPC32 signal trampoline.
+	* sysdeps/unix/sysv/linux/powerpc/init-first.c
+	(_libc_vdso_platform_setup): Initialize the signal trampolines.
+	* debug/tst-backtrace5.c (fn): Add an option to set the sigaction
+	sa_flags value.
+	* debug/tst-backtrace6.c: New file: check backtrace for signal frames,
+	interrupting a syscall and set with option SA_SIGINFO.
+
 2013-08-20  Joseph Myers  <joseph@codesourcery.com>
 
 	[BZ #15531]
diff --git a/debug/Makefile b/debug/Makefile
index 779741f..13ee5c8 100644
--- a/debug/Makefile
+++ b/debug/Makefile
@@ -130,16 +130,18 @@ CFLAGS-tst-backtrace2.c += -funwind-tables
 CFLAGS-tst-backtrace3.c += -funwind-tables
 CFLAGS-tst-backtrace4.c += -funwind-tables
 CFLAGS-tst-backtrace5.c += -funwind-tables
+CFLAGS-tst-backtrace6.c += -funwind-tables
 LDFLAGS-tst-backtrace2 = -rdynamic
 LDFLAGS-tst-backtrace3 = -rdynamic
 LDFLAGS-tst-backtrace4 = -rdynamic
 LDFLAGS-tst-backtrace5 = -rdynamic
+LDFLAGS-tst-backtrace6 = -rdynamic
 
 tests = backtrace-tst tst-longjmp_chk tst-chk1 tst-chk2 tst-chk3 \
 	tst-lfschk1 tst-lfschk2 tst-lfschk3 test-strcpy_chk test-stpcpy_chk \
 	tst-chk4 tst-chk5 tst-chk6 tst-lfschk4 tst-lfschk5 tst-lfschk6 \
 	tst-longjmp_chk2 tst-backtrace2 tst-backtrace3 tst-backtrace4 \
-	tst-backtrace5
+	tst-backtrace5 tst-backtrace6
 
 tests-ifunc := $(stpcpy_chk strcpy_chk:%=test-%-ifunc)
 tests += $(tests-ifunc)
diff --git a/debug/tst-backtrace5.c b/debug/tst-backtrace5.c
index ca47437..51180c1 100644
--- a/debug/tst-backtrace5.c
+++ b/debug/tst-backtrace5.c
@@ -28,6 +28,10 @@
 
 #include "tst-backtrace.h"
 
+#ifndef SIGACTION_FLAGS
+# define SIGACTION_FLAGS 0
+#endif
+
 static int do_test (void);
 #define TEST_FUNCTION do_test ()
 #include "../test-skeleton.c"
@@ -91,7 +95,7 @@ handle_signal (int signum)
 }
 
 NO_INLINE int
-fn (int c)
+fn (int c, int flags)
 {
   pid_t parent_pid, child_pid;
   int pipefd[2];
@@ -100,12 +104,13 @@ fn (int c)
 
   if (c > 0)
     {
-      fn (c - 1);
+      fn (c - 1, flags);
       return x;
     }
 
   memset (&act, 0, sizeof (act));
   act.sa_handler = handle_signal;
+  act.sa_flags = flags;
   sigemptyset (&act.sa_mask);
   sigaction (SIGUSR1, &act, NULL);
   parent_pid = getpid ();
@@ -131,6 +136,6 @@ fn (int c)
 NO_INLINE static int
 do_test (void)
 {
-  fn (2);
+  fn (2, SIGACTION_FLAGS);
   return ret;
 }
diff --git a/debug/tst-backtrace6.c b/debug/tst-backtrace6.c
new file mode 100644
index 0000000..cd8dbcd
--- /dev/null
+++ b/debug/tst-backtrace6.c
@@ -0,0 +1,21 @@
+/* Test backtrace and backtrace_symbols for signal frames, where a
+   system call was interrupted by a signal.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define SIGACTION_FLAGS SA_SIGINFO
+#include <debug/tst-backtrace5.c>
diff --git a/sysdeps/powerpc/powerpc32/backtrace.c b/sysdeps/powerpc/powerpc32/backtrace.c
index b4b11dd..8d413e6 100644
--- a/sysdeps/powerpc/powerpc32/backtrace.c
+++ b/sysdeps/powerpc/powerpc32/backtrace.c
@@ -18,6 +18,9 @@
 
 #include <execinfo.h>
 #include <stddef.h>
+#include <string.h>
+#include <signal.h>
+#include <bits/libc-vdso.h>
 
 /* This is the stack layout we see with every stack frame.
    Note that every routine is required by the ABI to lay out the stack
@@ -35,6 +38,46 @@ struct layout
   void *return_address;
 };
 
+#define SIGNAL_FRAMESIZE 64
+
+/* Since the signal handler is just like any other function it needs to
+   save/restore its LR and it will save it into callers stack frame.
+   Since a signal handler doesn't have a caller, the kernel creates a
+   dummy frame to make it look like it has a caller.  */
+struct signal_frame_32 {
+  char               dummy[SIGNAL_FRAMESIZE];
+  struct sigcontext  sctx;
+  mcontext_t         mctx;
+  /* We don't care about the rest, since IP value is at 'mctx' field.  */
+};
+
+static inline int
+is_sigtramp_address (unsigned int nip)
+{
+#ifdef SHARED
+  if (nip == (unsigned int)__vdso_sigtramp32)
+    return 1;
+#endif
+  return 0;
+}
+
+struct rt_signal_frame_32 {
+  char               dummy[SIGNAL_FRAMESIZE + 16];
+  siginfo_t          info;
+  struct ucontext    uc;
+  /* We don't care about the rest, since IP value is at 'uc' field.  */
+};
+
+static inline int
+is_sigtramp_address_rt (unsigned int nip)
+{
+#ifdef SHARED
+  if (nip == (unsigned int)__vdso_sigtramp_rt32)
+    return 1;
+#endif
+  return 0;
+}
+
 int
 __backtrace (void **array, int size)
 {
@@ -50,7 +93,28 @@ __backtrace (void **array, int size)
   for (				count = 0;
        current != NULL && 	count < size;
        current = current->next, count++)
-    array[count] = current->return_address;
+    {
+      gregset_t *gregset = NULL;
+
+      array[count] = current->return_address;
+
+      /* Check if the symbol is the signal trampoline and get the interrupted
+       * symbol address from the trampoline saved area.  */
+      if (is_sigtramp_address ((unsigned int)current->return_address))
+	{
+	  struct signal_frame_32 *sigframe =
+	    (struct signal_frame_32*) current;
+          gregset = &sigframe->mctx.gregs;
+        }
+      else if (is_sigtramp_address_rt ((unsigned int)current->return_address))
+	{
+	  struct rt_signal_frame_32 *sigframe =
+            (struct rt_signal_frame_32*) current;
+          gregset = &sigframe->uc.uc_mcontext.uc_regs->gregs;
+        }
+      if (gregset)
+	array[++count] = (void*)((*gregset)[PT_NIP]);
+    }
 
   /* It's possible the second-last stack frame can't return
      (that is, it's __libc_start_main), in which case
diff --git a/sysdeps/powerpc/powerpc64/backtrace.c b/sysdeps/powerpc/powerpc64/backtrace.c
index 2d3e051..9b9a9f1 100644
--- a/sysdeps/powerpc/powerpc64/backtrace.c
+++ b/sysdeps/powerpc/powerpc64/backtrace.c
@@ -18,6 +18,9 @@
 
 #include <execinfo.h>
 #include <stddef.h>
+#include <string.h>
+#include <signal.h>
+#include <bits/libc-vdso.h>
 
 /* This is the stack layout we see with every stack frame.
    Note that every routine is required by the ABI to lay out the stack
@@ -38,6 +41,27 @@ struct layout
   void *return_address;
 };
 
+/* Since the signal handler is just like any other function it needs to
+   save/restore its LR and it will save it into the caller's stack frame.
+   Since a signal handler doesn't have a caller, the kernel creates a
+   dummy frame to make it look like it has a caller.  */
+struct signal_frame_64 {
+#define SIGNAL_FRAMESIZE 128
+  char            dummy[SIGNAL_FRAMESIZE];
+  struct ucontext uc;
+  /* We don't care about the rest, since the IP value is at 'uc' field.  */
+};
+
+static inline int
+is_sigtramp_address (unsigned long nip)
+{
+#ifdef SHARED
+  if (nip == (unsigned long)__vdso_sigtramp_rt64)
+    return 1;
+#endif
+  return 0;
+}
+
 int
 __backtrace (void **array, int size)
 {
@@ -53,7 +77,17 @@ __backtrace (void **array, int size)
   for (				count = 0;
        current != NULL && 	count < size;
        current = current->next, count++)
-    array[count] = current->return_address;
+    {
+      array[count] = current->return_address;
+
+      /* Check if the symbol is the signal trampoline and get the interrupted
+       * symbol address from the trampoline saved area.  */
+      if (is_sigtramp_address ((unsigned long)current->return_address))
+        {
+	  struct signal_frame_64 *sigframe = (struct signal_frame_64*) current;
+          array[++count] = (void*)sigframe->uc.uc_mcontext.gp_regs[PT_NIP];
+	}
+    }
 
   /* It's possible the second-last stack frame can't return
      (that is, it's __libc_start_main), in which case
diff --git a/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h b/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
index 8b195db..ba54de4 100644
--- a/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
+++ b/sysdeps/unix/sysv/linux/powerpc/bits/libc-vdso.h
@@ -34,6 +34,13 @@ extern void *__vdso_getcpu;
 
 extern void *__vdso_time;
 
+#if defined(__PPC64__) || defined(__powerpc64__)
+extern void *__vdso_sigtramp_rt64;
+#else
+extern void *__vdso_sigtramp32;
+extern void *__vdso_sigtramp_rt32;
+#endif
+
 /* This macro is needed for PPC64 to return a skeleton OPD entry of a vDSO
    symbol.  This works because _dl_vdso_vsym always return the function
    address, and no vDSO symbols use the TOC or chain pointers from the OPD
diff --git a/sysdeps/unix/sysv/linux/powerpc/init-first.c b/sysdeps/unix/sysv/linux/powerpc/init-first.c
index f6f05f0..061715f 100644
--- a/sysdeps/unix/sysv/linux/powerpc/init-first.c
+++ b/sysdeps/unix/sysv/linux/powerpc/init-first.c
@@ -29,6 +29,12 @@ void *__vdso_clock_getres;
 void *__vdso_get_tbfreq;
 void *__vdso_getcpu;
 void *__vdso_time;
+#if defined(__PPC64__) || defined(__powerpc64__)
+void *__vdso_sigtramp_rt64;
+#else
+void *__vdso_sigtramp32;
+void *__vdso_sigtramp_rt32;
+#endif
 
 static inline void
 _libc_vdso_platform_setup (void)
@@ -46,6 +52,16 @@ _libc_vdso_platform_setup (void)
   __vdso_getcpu = _dl_vdso_vsym ("__kernel_getcpu", &linux2615);
 
   __vdso_time = _dl_vdso_vsym ("__kernel_time", &linux2615);
+
+  /* PPC64 uses only one signal trampoline symbol, while PPC32 uses one
+     of two, depending on whether SA_SIGINFO is used
+     (__kernel_sigtramp_rt32) or not (__kernel_sigtramp32).  */
+#if defined(__PPC64__) || defined(__powerpc64__)
+  __vdso_sigtramp_rt64 = _dl_vdso_vsym ("__kernel_sigtramp_rt64", &linux2615);
+#else
+  __vdso_sigtramp32 = _dl_vdso_vsym ("__kernel_sigtramp32", &linux2615);
+  __vdso_sigtramp_rt32 = _dl_vdso_vsym ("__kernel_sigtramp_rt32", &linux2615);
+#endif
 }
 
 # define VDSO_SETUP _libc_vdso_platform_setup

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2eca8c3c19866af93c014a61badba27dd272a40c

commit 2eca8c3c19866af93c014a61badba27dd272a40c
Author: Joseph Myers <joseph@codesourcery.com>
Date:   Tue Aug 20 19:41:15 2013 +0000

    Fix cproj handling of (finite, NaN) arguments (bug 15531).

diff --git a/ChangeLog b/ChangeLog
index 2ded8ae..00f0c7b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2013-08-20  Joseph Myers  <joseph@codesourcery.com>
+
+	[BZ #15531]
+	* math/s_cproj.c (__cproj): Only return an infinity if one part of
+	argument is infinite.
+	* math/s_cprojf.c (__cprojf): Likewise.
+	* math/s_cprojl.c (__cprojl): Likewise.
+	* sysdeps/ieee754/ldbl-128ibm/s_cprojl.c (__cprojl): Likewise.
+	* math/libm-test.inc (cproj_test_data): Add more tests.
+
+	* sysdeps/unix/sysv/linux/mmap64.c: Include <string.h>.
+
+	* sysdeps/unix/sysv/linux/mmap64.c (__mmap64)
+	[MMAP2_PAGE_SHIFT == -1]: Use __getpagesize to determine page
+	size.  Use __ffs to determine corresponding shift.
+
 2013-08-20  Andreas Arnez  <arnez@linux.vnet.ibm.com>
 
 	* elf/setup-vdso.h (setup_vdso): Fix missing string termination.
diff --git a/NEWS b/NEWS
index d6bc721..ae05edc 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,7 @@ Version 2.18.1
 
 * The following bugs are resolved with this release:
 
-  14699, 15909, 15996, 16150.
+  14699, 15531, 15909, 15996, 16150.
 
 * CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
   to the d_name member of struct dirent, or omit the terminating NUL
diff --git a/math/libm-test.inc b/math/libm-test.inc
index 2324d4f..3b382af 100644
--- a/math/libm-test.inc
+++ b/math/libm-test.inc
@@ -7061,11 +7061,51 @@ static const struct test_c_c_data cproj_test_data[] =
 
     TEST_c_c (cproj, qnan_value, qnan_value, qnan_value, qnan_value, NO_INEXACT_EXCEPTION),
 
+    TEST_c_c (cproj, plus_zero, qnan_value, plus_zero, qnan_value, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, minus_zero, qnan_value, minus_zero, qnan_value, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, qnan_value, plus_zero, qnan_value, plus_zero, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, qnan_value, minus_zero, qnan_value, minus_zero, NO_INEXACT_EXCEPTION),
+
+    TEST_c_c (cproj, 1.0, qnan_value, 1.0, qnan_value, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, -1.0, qnan_value, -1.0, qnan_value, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, qnan_value, 1.0, qnan_value, 1.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, qnan_value, -1.0, qnan_value, -1.0, NO_INEXACT_EXCEPTION),
+
     TEST_c_c (cproj, plus_infty, plus_infty, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
     TEST_c_c (cproj, plus_infty, minus_infty, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
     TEST_c_c (cproj, minus_infty, plus_infty, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
     TEST_c_c (cproj, minus_infty, minus_infty, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
 
+    TEST_c_c (cproj, plus_infty, plus_zero, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, plus_infty, minus_zero, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, minus_infty, plus_zero, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, minus_infty, minus_zero, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+
+    TEST_c_c (cproj, plus_zero, plus_infty, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, plus_zero, minus_infty, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, minus_zero, plus_infty, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, minus_zero, minus_infty, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+
+    TEST_c_c (cproj, plus_infty, 1.0, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, plus_infty, -1.0, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, minus_infty, 1.0, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, minus_infty, -1.0, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+
+    TEST_c_c (cproj, 1.0, plus_infty, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, 1.0, minus_infty, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, -1.0, plus_infty, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, -1.0, minus_infty, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+
+    TEST_c_c (cproj, plus_infty, qnan_value, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, plus_infty, -qnan_value, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, minus_infty, qnan_value, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, minus_infty, -qnan_value, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+
+    TEST_c_c (cproj, qnan_value, plus_infty, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, qnan_value, minus_infty, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, -qnan_value, plus_infty, plus_infty, 0.0, NO_INEXACT_EXCEPTION),
+    TEST_c_c (cproj, -qnan_value, minus_infty, plus_infty, minus_zero, NO_INEXACT_EXCEPTION),
+
     TEST_c_c (cproj, 1.0, 0.0, 1.0, 0.0, NO_INEXACT_EXCEPTION),
     TEST_c_c (cproj, 2.0, 3.0, 2.0, 3.0, NO_INEXACT_EXCEPTION),
   };
diff --git a/math/s_cproj.c b/math/s_cproj.c
index c0be461..98f1a4c 100644
--- a/math/s_cproj.c
+++ b/math/s_cproj.c
@@ -24,9 +24,7 @@
 __complex__ double
 __cproj (__complex__ double x)
 {
-  if (isnan (__real__ x) && isnan (__imag__ x))
-    return x;
-  else if (!isfinite (__real__ x) || !isfinite (__imag__ x))
+  if (__isinf_ns (__real__ x) || __isinf_ns (__imag__ x))
     {
       __complex__ double res;
 
diff --git a/math/s_cprojf.c b/math/s_cprojf.c
index 188bbe3..e4dbc18 100644
--- a/math/s_cprojf.c
+++ b/math/s_cprojf.c
@@ -24,9 +24,7 @@
 __complex__ float
 __cprojf (__complex__ float x)
 {
-  if (isnan (__real__ x) && isnan (__imag__ x))
-    return x;
-  else if (!isfinite (__real__ x) || !isfinite (__imag__ x))
+  if (__isinf_nsf (__real__ x) || __isinf_nsf (__imag__ x))
     {
       __complex__ float res;
 
diff --git a/math/s_cprojl.c b/math/s_cprojl.c
index fbdf279..b564a83 100644
--- a/math/s_cprojl.c
+++ b/math/s_cprojl.c
@@ -24,9 +24,7 @@
 __complex__ long double
 __cprojl (__complex__ long double x)
 {
-  if (isnan (__real__ x) && isnan (__imag__ x))
-    return x;
-  else if (!isfinite (__real__ x) || !isfinite (__imag__ x))
+  if (__isinf_nsl (__real__ x) || __isinf_nsl (__imag__ x))
     {
       __complex__ long double res;
 
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_cprojl.c b/sysdeps/ieee754/ldbl-128ibm/s_cprojl.c
index 3b4af54..a344e92 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_cprojl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_cprojl.c
@@ -24,9 +24,7 @@
 __complex__ long double
 __cprojl (__complex__ long double x)
 {
-  if (isnan (__real__ x) && isnan (__imag__ x))
-    return x;
-  else if (!isfinite (__real__ x) || !isfinite (__imag__ x))
+  if (__isinf_nsl (__real__ x) || __isinf_nsl (__imag__ x))
     {
       __complex__ long double res;
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1d3c75f2a03be1a9686203ee2b32b0e5fb5e847b

commit 1d3c75f2a03be1a9686203ee2b32b0e5fb5e847b
Author: Andreas Arnez <arnez@linux.vnet.ibm.com>
Date:   Tue Aug 20 14:03:04 2013 +0200

    * elf/setup-vdso.h (setup_vdso): Fix missing string termination.

diff --git a/ChangeLog b/ChangeLog
index cdcbd3d..2ded8ae 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2013-08-20  Andreas Arnez  <arnez@linux.vnet.ibm.com>
+
+	* elf/setup-vdso.h (setup_vdso): Fix missing string termination.
+
 2013-08-16  Florian Weimer  <fweimer@redhat.com>
 
 	[BZ #14699]
diff --git a/elf/setup-vdso.h b/elf/setup-vdso.h
index a98dfec..056d885 100644
--- a/elf/setup-vdso.h
+++ b/elf/setup-vdso.h
@@ -89,7 +89,7 @@ setup_vdso (struct link_map *main_map __attribute__ ((unused)),
 	     addresses in the vsyscall DSO pages in writev() calls.  */
 	  const char *dsoname = ((char *) D_PTR (l, l_info[DT_STRTAB])
 				 + l->l_info[DT_SONAME]->d_un.d_val);
-	  size_t len = strlen (dsoname);
+	  size_t len = strlen (dsoname) + 1;
 	  char *copy = malloc (len);
 	  if (copy == NULL)
 	    _dl_fatal_printf ("out of memory\n");

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b5f581e49c151ea062a747e3006b9b400ee94d83

commit b5f581e49c151ea062a747e3006b9b400ee94d83
Author: Florian Weimer <fweimer@redhat.com>
Date:   Fri Aug 16 09:38:52 2013 +0200

    CVE-2013-4237, BZ #14699: Buffer overflow in readdir_r
    
    	* sysdeps/posix/dirstream.h (struct __dirstream): Add errcode
    	member.
    	* sysdeps/posix/opendir.c (__alloc_dir): Initialize errcode
    	member.
    	* sysdeps/posix/rewinddir.c (rewinddir): Reset errcode member.
    	* sysdeps/posix/readdir_r.c (__READDIR_R): Enforce NAME_MAX limit.
    	Return delayed error code.  Remove GETDENTS_64BIT_ALIGNED
    	conditional.
    	* sysdeps/unix/sysv/linux/wordsize-64/readdir_r.c: Do not define
    	GETDENTS_64BIT_ALIGNED.
    	* sysdeps/unix/sysv/linux/i386/readdir64_r.c: Likewise.
    	* manual/filesys.texi (Reading/Closing Directory): Document
    	ENAMETOOLONG return value of readdir_r.  Recommend readdir more
    	strongly.
    	* manual/conf.texi (Limits for Files): Add portability note to
    	NAME_MAX, PATH_MAX.
    	(Pathconf): Add portability note for _PC_NAME_MAX, _PC_PATH_MAX.

diff --git a/ChangeLog b/ChangeLog
index 1e5efa7..cdcbd3d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,25 @@
+2013-08-16  Florian Weimer  <fweimer@redhat.com>
+
+	[BZ #14699]
+	CVE-2013-4237
+	* sysdeps/posix/dirstream.h (struct __dirstream): Add errcode
+	member.
+	* sysdeps/posix/opendir.c (__alloc_dir): Initialize errcode
+	member.
+	* sysdeps/posix/rewinddir.c (rewinddir): Reset errcode member.
+	* sysdeps/posix/readdir_r.c (__READDIR_R): Enforce NAME_MAX limit.
+	Return delayed error code.  Remove GETDENTS_64BIT_ALIGNED
+	conditional.
+	* sysdeps/unix/sysv/linux/wordsize-64/readdir_r.c: Do not define
+	GETDENTS_64BIT_ALIGNED.
+	* sysdeps/unix/sysv/linux/i386/readdir64_r.c: Likewise.
+	* manual/filesys.texi (Reading/Closing Directory): Document
+	ENAMETOOLONG return value of readdir_r.  Recommend readdir more
+	strongly.
+	* manual/conf.texi (Limits for Files): Add portability note to
+	NAME_MAX, PATH_MAX.
+	(Pathconf): Add portability note for _PC_NAME_MAX, _PC_PATH_MAX.
+
 2013-11-11  David S. Miller  <davem@davemloft.net>
 
 	[BZ #16150]
diff --git a/NEWS b/NEWS
index df97235..d6bc721 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,11 @@ Version 2.18.1
 
 * The following bugs are resolved with this release:
 
-  15909, 15996, 16150.
+  14699, 15909, 15996, 16150.
+
+* CVE-2013-4237 The readdir_r function could write more than NAME_MAX bytes
+  to the d_name member of struct dirent, or omit the terminating NUL
+  character.  (Bugzilla #14699).
 
 Version 2.18
 
diff --git a/manual/conf.texi b/manual/conf.texi
index 7eb8b36..c720063 100644
--- a/manual/conf.texi
+++ b/manual/conf.texi
@@ -1149,6 +1149,9 @@ typed ahead as input.  @xref{I/O Queues}.
 @deftypevr Macro int NAME_MAX
 The uniform system limit (if any) for the length of a file name component, not
 including the terminating null character.
+
+@strong{Portability Note:} On some systems, @theglibc{} defines
+@code{NAME_MAX}, but does not actually enforce this limit.
 @end deftypevr
 
 @comment limits.h
@@ -1157,6 +1160,9 @@ including the terminating null character.
 The uniform system limit (if any) for the length of an entire file name (that
 is, the argument given to system calls such as @code{open}), including the
 terminating null character.
+
+@strong{Portability Note:} @Theglibc{} does not enforce this limit
+even if @code{PATH_MAX} is defined.
 @end deftypevr
 
 @cindex limits, pipe buffer size
@@ -1476,6 +1482,9 @@ Inquire about the value of @code{POSIX_REC_MIN_XFER_SIZE}.
 Inquire about the value of @code{POSIX_REC_XFER_ALIGN}.
 @end table
 
+@strong{Portability Note:} On some systems, @theglibc{} does not
+enforce @code{_PC_NAME_MAX} or @code{_PC_PATH_MAX} limits.
+
 @node Utility Limits
 @section Utility Program Capacity Limits
 
diff --git a/manual/filesys.texi b/manual/filesys.texi
index 1df9cf2..814c210 100644
--- a/manual/filesys.texi
+++ b/manual/filesys.texi
@@ -444,9 +444,9 @@ symbols are declared in the header file @file{dirent.h}.
 @comment POSIX.1
 @deftypefun {struct dirent *} readdir (DIR *@var{dirstream})
 This function reads the next entry from the directory.  It normally
-returns a pointer to a structure containing information about the file.
-This structure is statically allocated and can be rewritten by a
-subsequent call.
+returns a pointer to a structure containing information about the
+file.  This structure is associated with the @var{dirstream} handle
+and can be rewritten by a subsequent call.
 
 @strong{Portability Note:} On some systems @code{readdir} may not
 return entries for @file{.} and @file{..}, even though these are always
@@ -461,19 +461,61 @@ conditions are defined for this function:
 The @var{dirstream} argument is not valid.
 @end table
 
-@code{readdir} is not thread safe.  Multiple threads using
-@code{readdir} on the same @var{dirstream} may overwrite the return
-value.  Use @code{readdir_r} when this is critical.
+To distinguish between an end-of-directory condition and an error, you
+must set @code{errno} to zero before calling @code{readdir}.  To avoid
+entering an infinite loop, you should stop reading from the directory
+after the first error.
+
+In POSIX.1-2008, @code{readdir} is not thread-safe.  In @theglibc{}
+implementation, it is safe to call @code{readdir} concurrently on
+different @var{dirstream}s, but multiple threads accessing the same
+@var{dirstream} result in undefined behavior.  @code{readdir_r} is a
+fully thread-safe alternative, but suffers from poor portability (see
+below).  It is recommended that you use @code{readdir}, with external
+locking if multiple threads access the same @var{dirstream}.
 @end deftypefun
 
 @comment dirent.h
 @comment GNU
 @deftypefun int readdir_r (DIR *@var{dirstream}, struct dirent *@var{entry}, struct dirent **@var{result})
-This function is the reentrant version of @code{readdir}.  Like
-@code{readdir} it returns the next entry from the directory.  But to
-prevent conflicts between simultaneously running threads the result is
-not stored in statically allocated memory.  Instead the argument
-@var{entry} points to a place to store the result.
+This function is a version of @code{readdir} which performs internal
+locking.  Like @code{readdir} it returns the next entry from the
+directory.  To prevent conflicts between simultaneously running
+threads the result is stored inside the @var{entry} object.
+
+@strong{Portability Note:} It is recommended to use @code{readdir}
+instead of @code{readdir_r} for the following reasons:
+
+@itemize @bullet
+@item
+On systems which do not define @code{NAME_MAX}, it may not be possible
+to use @code{readdir_r} safely because the caller does not specify the
+length of the buffer for the directory entry.
+
+@item
+On some systems, @code{readdir_r} cannot read directory entries with
+very long names.  If such a name is encountered, @theglibc{}
+implementation of @code{readdir_r} returns with an error code of
+@code{ENAMETOOLONG} after the final directory entry has been read.  On
+other systems, @code{readdir_r} may return successfully, but the
+@code{d_name} member may not be NUL-terminated or may be truncated.
+
+@item
+POSIX.1-2008 does not guarantee that @code{readdir} is thread-safe,
+even when access to the same @var{dirstream} is serialized.  But in
+current implementations (including @theglibc{}), it is safe to call
+@code{readdir} concurrently on different @var{dirstream}s, so there is
+no need to use @code{readdir_r} in most multi-threaded programs.  In
+the rare case that multiple threads need to read from the same
+@var{dirstream}, it is still better to use @code{readdir} and external
+synchronization.
+
+@item
+It is expected that future versions of POSIX will obsolete
+@code{readdir_r} and mandate the level of thread safety for
+@code{readdir} which is provided by @theglibc{} and other
+implementations today.
+@end itemize
 
 Normally @code{readdir_r} returns zero and sets @code{*@var{result}}
 to @var{entry}.  If there are no more entries in the directory or an
@@ -481,15 +523,6 @@ error is detected, @code{readdir_r} sets @code{*@var{result}} to a
 null pointer and returns a nonzero error code, also stored in
 @code{errno}, as described for @code{readdir}.
 
-@strong{Portability Note:} On some systems @code{readdir_r} may not
-return a NUL terminated string for the file name, even when there is no
-@code{d_reclen} field in @code{struct dirent} and the file
-name is the maximum allowed size.  Modern systems all have the
-@code{d_reclen} field, and on old systems multi-threading is not
-critical.  In any case there is no such problem with the @code{readdir}
-function, so that even on systems without the @code{d_reclen} member one
-could use multiple threads by using external locking.
-
 It is also important to look at the definition of the @code{struct
 dirent} type.  Simply passing a pointer to an object of this type for
 the second parameter of @code{readdir_r} might not be enough.  Some
diff --git a/sysdeps/posix/dirstream.h b/sysdeps/posix/dirstream.h
index a7a074d..8e8570d 100644
--- a/sysdeps/posix/dirstream.h
+++ b/sysdeps/posix/dirstream.h
@@ -39,6 +39,8 @@ struct __dirstream
 
     off_t filepos;		/* Position of next entry to read.  */
 
+    int errcode;		/* Delayed error code.  */
+
     /* Directory block.  */
     char data[0] __attribute__ ((aligned (__alignof__ (void*))));
   };
diff --git a/sysdeps/posix/opendir.c b/sysdeps/posix/opendir.c
index ddfc3a7..fc05b0f 100644
--- a/sysdeps/posix/opendir.c
+++ b/sysdeps/posix/opendir.c
@@ -231,6 +231,7 @@ __alloc_dir (int fd, bool close_fd, int flags, const struct stat64 *statp)
   dirp->size = 0;
   dirp->offset = 0;
   dirp->filepos = 0;
+  dirp->errcode = 0;
 
   return dirp;
 }
diff --git a/sysdeps/posix/readdir_r.c b/sysdeps/posix/readdir_r.c
index b5a8e2e..8ed5c3f 100644
--- a/sysdeps/posix/readdir_r.c
+++ b/sysdeps/posix/readdir_r.c
@@ -40,6 +40,7 @@ __READDIR_R (DIR *dirp, DIRENT_TYPE *entry, DIRENT_TYPE **result)
   DIRENT_TYPE *dp;
   size_t reclen;
   const int saved_errno = errno;
+  int ret;
 
   __libc_lock_lock (dirp->lock);
 
@@ -70,10 +71,10 @@ __READDIR_R (DIR *dirp, DIRENT_TYPE *entry, DIRENT_TYPE **result)
 		  bytes = 0;
 		  __set_errno (saved_errno);
 		}
+	      if (bytes < 0)
+		dirp->errcode = errno;
 
 	      dp = NULL;
-	      /* Reclen != 0 signals that an error occurred.  */
-	      reclen = bytes != 0;
 	      break;
 	    }
 	  dirp->size = (size_t) bytes;
@@ -106,29 +107,46 @@ __READDIR_R (DIR *dirp, DIRENT_TYPE *entry, DIRENT_TYPE **result)
       dirp->filepos += reclen;
 #endif
 
-      /* Skip deleted files.  */
+#ifdef NAME_MAX
+      if (reclen > offsetof (DIRENT_TYPE, d_name) + NAME_MAX + 1)
+	{
+	  /* The record is very long.  It could still fit into the
+	     caller-supplied buffer if we can skip padding at the
+	     end.  */
+	  size_t namelen = _D_EXACT_NAMLEN (dp);
+	  if (namelen <= NAME_MAX)
+	    reclen = offsetof (DIRENT_TYPE, d_name) + namelen + 1;
+	  else
+	    {
+	      /* The name is too long.  Ignore this file.  */
+	      dirp->errcode = ENAMETOOLONG;
+	      dp->d_ino = 0;
+	      continue;
+	    }
+	}
+#endif
+
+      /* Skip deleted and ignored files.  */
     }
   while (dp->d_ino == 0);
 
   if (dp != NULL)
     {
-#ifdef GETDENTS_64BIT_ALIGNED
-      /* The d_reclen value might include padding which is not part of
-	 the DIRENT_TYPE data structure.  */
-      reclen = MIN (reclen,
-		    offsetof (DIRENT_TYPE, d_name) + sizeof (dp->d_name));
-#endif
       *result = memcpy (entry, dp, reclen);
-#ifdef GETDENTS_64BIT_ALIGNED
+#ifdef _DIRENT_HAVE_D_RECLEN
       entry->d_reclen = reclen;
 #endif
+      ret = 0;
     }
   else
-    *result = NULL;
+    {
+      *result = NULL;
+      ret = dirp->errcode;
+    }
 
   __libc_lock_unlock (dirp->lock);
 
-  return dp != NULL ? 0 : reclen ? errno : 0;
+  return ret;
 }
 
 #ifdef __READDIR_R_ALIAS
diff --git a/sysdeps/posix/rewinddir.c b/sysdeps/posix/rewinddir.c
index 2935a8e..d4991ad 100644
--- a/sysdeps/posix/rewinddir.c
+++ b/sysdeps/posix/rewinddir.c
@@ -33,6 +33,7 @@ rewinddir (dirp)
   dirp->filepos = 0;
   dirp->offset = 0;
   dirp->size = 0;
+  dirp->errcode = 0;
 #ifndef NOT_IN_libc
   __libc_lock_unlock (dirp->lock);
 #endif
diff --git a/sysdeps/unix/sysv/linux/i386/readdir64_r.c b/sysdeps/unix/sysv/linux/i386/readdir64_r.c
index 8ebbcfd..a7d114e 100644
--- a/sysdeps/unix/sysv/linux/i386/readdir64_r.c
+++ b/sysdeps/unix/sysv/linux/i386/readdir64_r.c
@@ -18,7 +18,6 @@
 #define __READDIR_R __readdir64_r
 #define __GETDENTS __getdents64
 #define DIRENT_TYPE struct dirent64
-#define GETDENTS_64BIT_ALIGNED 1
 
 #include <sysdeps/posix/readdir_r.c>
 
diff --git a/sysdeps/unix/sysv/linux/wordsize-64/readdir_r.c b/sysdeps/unix/sysv/linux/wordsize-64/readdir_r.c
index 5ed8e95..290f2c8 100644
--- a/sysdeps/unix/sysv/linux/wordsize-64/readdir_r.c
+++ b/sysdeps/unix/sysv/linux/wordsize-64/readdir_r.c
@@ -1,5 +1,4 @@
 #define readdir64_r __no_readdir64_r_decl
-#define GETDENTS_64BIT_ALIGNED 1
 #include <sysdeps/posix/readdir_r.c>
 #undef readdir64_r
 weak_alias (__readdir_r, readdir64_r)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=df2b535be8dcc810706211d89d26967e4df24504

commit df2b535be8dcc810706211d89d26967e4df24504
Author: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Date:   Fri Nov 15 07:44:20 2013 -0600

    Partially revert commit 2663b74f8103a2a8a46b4896439b7a452480fc7c
    
    This change is necessary in order to avoid the issue documented at
    http://sourceware.org/ml/libc-alpha/2013-05/msg00350.html.

diff --git a/localedata/locales/bo_CN b/localedata/locales/bo_CN
index aa8ff07..cd61856 100644
--- a/localedata/locales/bo_CN
+++ b/localedata/locales/bo_CN
@@ -144,8 +144,7 @@ END LC_MEASUREMENT
 
 LC_NAME
 % FIXME
-
-name_fmt  ""
+name_fmt	"FIXME"
 % name_gen	"FIXME"
 % name_miss	"FIXME"
 % name_mr	"FIXME"
diff --git a/localedata/locales/bo_IN b/localedata/locales/bo_IN
index 9e9c4ff..c90db17 100644
--- a/localedata/locales/bo_IN
+++ b/localedata/locales/bo_IN
@@ -70,7 +70,7 @@ END LC_MEASUREMENT
 
 LC_NAME
 % FIXME
-name_fmt	""
+name_fmt	"FIXME"
 % name_gen	"FIXME"
 % name_miss	"FIXME"
 % name_mr	"FIXME"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d19762a5fd946ef86fb2922eb5710aac8000ab25

commit d19762a5fd946ef86fb2922eb5710aac8000ab25
Author: Ryan S. Arnold <rsa@linux.vnet.ibm.com>
Date:   Fri Nov 15 07:42:33 2013 -0600

    Remove assert() if DT_RUNPATH and DT_RPATH flags are found in ld.so.

diff --git a/elf/get-dynamic-info.h b/elf/get-dynamic-info.h
index 3cc1073..1c30c44 100644
--- a/elf/get-dynamic-info.h
+++ b/elf/get-dynamic-info.h
@@ -130,8 +130,8 @@ elf_get_dynamic_info (struct link_map *l, ElfW(Dyn) *temp)
   assert (info[DT_FLAGS] == NULL
 	  || (info[DT_FLAGS]->d_un.d_val & ~DF_BIND_NOW) == 0);
   /* Flags must not be set for ld.so.  */
-  assert (info[DT_RUNPATH] == NULL);
-  assert (info[DT_RPATH] == NULL);
+  info[DT_RUNPATH] == NULL;
+  info[DT_RPATH] == NULL;
 #else
   if (info[DT_FLAGS] != NULL)
     {

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]