This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
GNU C Library master sources branch master updated. glibc-2.16-ports-merge-627-g351dc60

From: cmetcalf at sourceware dot org
To: glibc-cvs at sourceware dot org
Date: 6 Nov 2012 14:51:24 -0000
Subject: GNU C Library master sources branch master updated. glibc-2.16-ports-merge-627-g351dc60
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  351dc60c55467552753646c1f585c3fb54cb2b06 (commit)
       via  a0bce338e8e6f35e38183dfbcfc3c760ecd07159 (commit)
       via  fedff58953bf60c99498c018b6d787db117c8932 (commit)
       via  e7776fefa76a5815493b463049fd7ced67cb31a2 (commit)
       via  cd84016efe83d92ee3903fef37f79ca2bafb3985 (commit)
      from  82477c28f46c579a149a8333c07233e9f4e43408 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=351dc60c55467552753646c1f585c3fb54cb2b06

commit 351dc60c55467552753646c1f585c3fb54cb2b06
Author: Chris Metcalf <cmetcalf@tilera.com>
Date:   Tue Nov 6 09:43:58 2012 -0500

    tile: use atomic op to unlock pthread_spinlock_t
    
    Atomic ops are issued directly from the core, rather than
    potentially sitting in the write buffer, so they can improve the
    performance of other waiters.  In addition, if we didn't end
    up pulling a copy of the cache line where the lock is into cache,
    by using an atomic op we don't have to acquire the cache line
    before we can unlock.

diff --git a/ports/ChangeLog.tile b/ports/ChangeLog.tile
index 99e95db..06f24c2 100644
--- a/ports/ChangeLog.tile
+++ b/ports/ChangeLog.tile
@@ -1,3 +1,7 @@
+2012-11-06  Chris Metcalf  <cmetcalf@tilera.com>
+
+	* sysdeps/tile/nptl/pthread_spin_unlock.c: New file.
+
 2012-11-05  Chris Metcalf  <cmetcalf@tilera.com>
 
 	* sysdeps/tile/math_private.h: Provide additional no-op defines
diff --git a/ports/sysdeps/tile/nptl/pthread_spin_unlock.c b/ports/sysdeps/tile/nptl/pthread_spin_unlock.c
new file mode 100644
index 0000000..260f6fa
--- /dev/null
+++ b/ports/sysdeps/tile/nptl/pthread_spin_unlock.c
@@ -0,0 +1,33 @@
+/* pthread_spin_unlock -- unlock a spin lock.  Tile version.
+   Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "pthreadP.h"
+#include <atomic.h>
+
+int
+pthread_spin_unlock (pthread_spinlock_t *lock)
+{
+#ifdef __tilegx__
+  /* Use exchange() to bypass the write buffer. */
+  atomic_exchange_rel (lock, 0);
+#else
+  atomic_full_barrier ();
+  *lock = 0;
+#endif
+  return 0;
+}

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a0bce338e8e6f35e38183dfbcfc3c760ecd07159

commit a0bce338e8e6f35e38183dfbcfc3c760ecd07159
Author: Chris Metcalf <cmetcalf@tilera.com>
Date:   Mon Nov 5 11:36:09 2012 -0500

    Make ieee754 fma tolerate architectures without exception support.

diff --git a/ports/ChangeLog.tile b/ports/ChangeLog.tile
index fd722b3..99e95db 100644
--- a/ports/ChangeLog.tile
+++ b/ports/ChangeLog.tile
@@ -1,5 +1,8 @@
 2012-11-05  Chris Metcalf  <cmetcalf@tilera.com>
 
+	* sysdeps/tile/math_private.h: Provide additional no-op defines
+	for exception and rounding macros.
+
 	* sysdeps/tile/tilegx/Makefile: Generate Makefile fragment to determine
 	whether to build elf-init.c and gmon-start.c with -mcmodel=large.
 	* sysdeps/tile/crti.S: Support large memory model.
diff --git a/ports/sysdeps/tile/math_private.h b/ports/sysdeps/tile/math_private.h
index 858db4a..90dcc3f 100644
--- a/ports/sysdeps/tile/math_private.h
+++ b/ports/sysdeps/tile/math_private.h
@@ -1,13 +1,31 @@
 #ifndef _MATH_PRIVATE_H
 
+/* Internally, we suppress any use of exception or rounding other
+   than what is supported by the hardware.  This does mean that some
+   code will silently fail to report exceptions, set rounding mode
+   as expected, etc., but it allows math code to compile that otherwise
+   wouldn't (such as math/s_fma.c) and so is valuable.
+
+   We intentionally ignore the "exception" arguments of functions that
+   take an exception, since we can't even evaluate the argument
+   without causing a build failure.  The extra level of statement
+   expression wrapping avoids "statement with no effect" warnings.
+   Since the callers don't check for errors anyway, we just claim
+   success in every case.
+
+   The overrides for libc_ functions must happen before we include
+   the generic math_private.h, and the overrides for regular
+   <fenv.h> functions must happen afterwards, to avoid clashing with
+   the declarations of those functions.  */
+
+#define libc_fesetround(rnd)			({ 0; })
+#define libc_fetestexcept(exc)			({ 0; })
+#define libc_feholdexcept_setround(env, exc)	({ (void) (env); 0; })
+#define libc_feupdateenv_test(env, exc)		({ (void) (env); 0; })
+
 #include_next <math_private.h>
 
-/* We have no exception support, so feraiseexcept() must be a no-op.
-   And since we don't define FE_INVALID, FE_DIVBYZERO, etc., we
-   must ignore the argument of feraiseexcept() as well.  we return
-   "1" to indicate we failed to raise an exception, though none of
-   the callers in glibc actually care.  The extra level of statement
-   expression wrapping avoids "statement with no effect" warnings.  */
-#define feraiseexcept(excepts) ({ 1; })
+#define feraiseexcept(excepts)			({ 0; })
+#define feclearexcept(exc)			({ 0; })
 
 #endif

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=fedff58953bf60c99498c018b6d787db117c8932

commit fedff58953bf60c99498c018b6d787db117c8932
Author: Chris Metcalf <cmetcalf@tilera.com>
Date:   Mon Nov 5 10:43:40 2012 -0500

    tile: support very large shared objects
    
    With gcc 4.8 tilegx has support for -mcmodel=large, to tolerate very
    large shared objects.  This option changes the compiler output to
    not include direct jump instructions, which have a range of only
    2^30, i.e. +/- 512MB.  Instead the compiler marshals the target PCs
    into registers and then uses jump- or call-to-register instructions.
    
    For glibc, the upshot is that we need to arrange for a few functions
    to tolerate the possibility of a large range between the PC and
    the target.  In particular, the crti.S and start.S code needs
    to be able to reach from .init to the PLT, as does gmon-start.c.
    The elf-init.c code has the reverse problem, needing to call from
    libc_nonshared.a (linked at the end of shared objects) back to the
    _init section at the beginning.
    
    No other functions in *_nonshared.a need to be built this way, as
    they only call the PLT (or potentially each other), but all of that
    code is linked at the very end of the shared object.
    
    We don't build the standard -static archives with this option as the
    performance cost is high enough and the use case is rare enough that
    it doesn't seem worthwhile.  Instead, we would encourage developers
    who need the -static model with huge executables to build a private
    copy of glibc and configure it with -mcmodel=large.
    
    Note that libc.so et al don't need any changes; the only changes
    are for code that is statically linked into user code built with
    -mcmodel=large.
    
    For the assembly code, I just rewrote it so that it unconditionally
    uses the large model.  To be able to pass -mcmodel=large to
    csu/elf-init.c and csu/gmon-start.c, I need to check to see if the
    compiler supports that flag, since gcc 4.7 doesn't; I added the
    support by creating a small Makefile fragment that just runs the
    compiler to check.

diff --git a/ports/ChangeLog.tile b/ports/ChangeLog.tile
index 71aaf56..fd722b3 100644
--- a/ports/ChangeLog.tile
+++ b/ports/ChangeLog.tile
@@ -1,3 +1,10 @@
+2012-11-05  Chris Metcalf  <cmetcalf@tilera.com>
+
+	* sysdeps/tile/tilegx/Makefile: Generate Makefile fragment to determine
+	whether to build elf-init.c and gmon-start.c with -mcmodel=large.
+	* sysdeps/tile/crti.S: Support large memory model.
+	* sysdeps/tile/start.S: Likewise.
+
 2012-11-02  Chris Metcalf  <cmetcalf@tilera.com>
 
 	* sysdeps/tile/dl-runtime.c (_dl_after_load): Handle simulator
diff --git a/ports/sysdeps/tile/crti.S b/ports/sysdeps/tile/crti.S
index ccb4464..467816c 100644
--- a/ports/sysdeps/tile/crti.S
+++ b/ports/sysdeps/tile/crti.S
@@ -70,16 +70,17 @@ _init:
 #if PREINIT_FUNCTION_WEAK
 	lnk r2
 0:
-#ifdef __tilegx__
+# ifdef __tilegx__
+	moveli r1, hw2_last(_GLOBAL_OFFSET_TABLE_ - 0b)
 	{
-	 moveli r1, hw1_last(_GLOBAL_OFFSET_TABLE_ - 0b)
+	 shl16insli r1, r1, hw1(_GLOBAL_OFFSET_TABLE_ - 0b)
 	 moveli r0, hw1_last_got(PREINIT_FUNCTION)
 	}
 	{
 	 shl16insli r1, r1, hw0(_GLOBAL_OFFSET_TABLE_ - 0b)
 	 shl16insli r0, r0, hw0_got(PREINIT_FUNCTION)
 	}
-#else
+# else
 	{
 	 moveli r1, lo16(_GLOBAL_OFFSET_TABLE_ - 0b)
 	 moveli r0, got_lo16(PREINIT_FUNCTION)
@@ -88,13 +89,25 @@ _init:
 	 auli r1, r1, ha16(_GLOBAL_OFFSET_TABLE_ - 0b)
 	 auli r0, r0, got_ha16(PREINIT_FUNCTION)
 	}
-#endif
+# endif
 	ADD_PTR r0, r0, r1
 	ADD_PTR r0, r0, r2
 	LD_PTR r0, r0
 	BEQZ r0, .Lno_weak_fn
-#endif
+	jalr r0
+#elif defined(__tilegx__)
+	/* Since we are calling from the start of the object to the PLT,
+	   call by loading the full address into a register.  */
+	lnk r2
+0:
+	moveli r0, hw2_last_plt(PREINIT_FUNCTION - 0b)
+	shl16insli r0, r0, hw1_plt(PREINIT_FUNCTION - 0b)
+	shl16insli r0, r0, hw0_plt(PREINIT_FUNCTION - 0b)
+	add r0, r0, r2
+	jalr r0
+#else
 	jal plt(PREINIT_FUNCTION)
+#endif
 .Lno_weak_fn:
 
 	.section .fini,"ax",@progbits
diff --git a/ports/sysdeps/tile/start.S b/ports/sysdeps/tile/start.S
index 999bb53..54f015f 100644
--- a/ports/sysdeps/tile/start.S
+++ b/ports/sysdeps/tile/start.S
@@ -126,27 +126,37 @@ _start:
 	 moveli r0, hw2_last(main - .Lmy_pc)
 	}
 	{
-	 moveli r3, hw2_last(__libc_csu_init - .Lmy_pc)
 	 shl16insli r0, r0, hw1(main - .Lmy_pc)
+	 moveli r3, hw2_last(__libc_csu_init - .Lmy_pc)
 	}
 	{
-	 shl16insli r3, r3, hw1(__libc_csu_init - .Lmy_pc)
 	 shl16insli r0, r0, hw0(main - .Lmy_pc)
+	 shl16insli r3, r3, hw1(__libc_csu_init - .Lmy_pc)
 	}
 	{
+	 ADD_PTR r0, r0, r13
 	 shl16insli r3, r3, hw0(__libc_csu_init - .Lmy_pc)
+	}
+	{
+	 moveli r12, hw2_last_plt(__libc_start_main - .Lmy_pc)
+	 ADD_PTR r3, r3, r13
+	}
+	{
+	 shl16insli r12, r12, hw1_plt(__libc_start_main - .Lmy_pc)
 	 moveli r4, hw2_last(__libc_csu_fini - .Lmy_pc)
 	}
 	{
-	 ADD_PTR r0, r0, r13
+	 shl16insli r12, r12, hw0_plt(__libc_start_main - .Lmy_pc)
 	 shl16insli r4, r4, hw1(__libc_csu_fini - .Lmy_pc)
 	}
 	{
-	 ADD_PTR r3, r3, r13
+	 ADD_PTR r12, r12, r13
 	 shl16insli r4, r4, hw0(__libc_csu_fini - .Lmy_pc)
 	}
 	{
 	 ADD_PTR r4, r4, r13
+	 jalr r12
+	}
 #else
 	 addli r0, r13, lo16(main - .Lmy_pc)
 	}
@@ -160,13 +170,12 @@ _start:
 	}
 	{
 	 auli r4, r4, ha16(__libc_csu_fini - .Lmy_pc)
-
-#endif
-
 	 /* Call the user's main function, and exit with its value.
 	    But let the libc call main. */
 	 j plt(__libc_start_main)
 	}
+#endif
+
 	{
 	 /* Tell backtracer to give up (_start has no caller). */
 	 info INFO_OP_CANNOT_BACKTRACE
diff --git a/ports/sysdeps/tile/tilegx/Makefile b/ports/sysdeps/tile/tilegx/Makefile
new file mode 100644
index 0000000..d3a0e97
--- /dev/null
+++ b/ports/sysdeps/tile/tilegx/Makefile
@@ -0,0 +1,18 @@
+include $(common-objpfx)cflags-mcmodel-large.mk
+
+$(common-objpfx)cflags-mcmodel-large.mk: $(common-objpfx)config.make
+	mcmodel=no; \
+	$(CC) -S -o /dev/null -xc /dev/null -mcmodel=large && mcmodel=yes; \
+	echo "cflags-mcmodel-large = $$mcmodel" > $@
+
+ifeq ($(subdir),csu)
+ifeq (yes,$(cflags-mcmodel-large))
+# elf-init.c is in libc_nonshared.o (the end of the shared object) but
+# must reach the _init symbol at the very start of the shared object.
+CFLAGS-elf-init.c += -mcmodel=large
+
+# __gmon_start__ is at the very start of the shared object when linked
+# with profiling, but calls to libc.so via the PLT at the very end.
+CFLAGS-gmon-start.c += -mcmodel=large
+endif
+endif

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e7776fefa76a5815493b463049fd7ced67cb31a2

commit e7776fefa76a5815493b463049fd7ced67cb31a2
Author: Chris Metcalf <cmetcalf@tilera.com>
Date:   Fri Nov 2 13:49:42 2012 -0400

    tile: improve simulator notification for relative paths in dlopen
    
    Normally, the simulator is notified of absolute pathnames by the
    _dl_load_hook hook.  However, when a relative pathname is used, the
    simulator may not know that the relative path matches a path that
    it could figure out in the file system that it has access to.
    Instead we provide a simplified version of the realpath function
    so we can pass a plausible absolute pathname to the simulator.
    
    Since we're now doing more work at object load time, we also add
    a guard so we do no work at all if we're not running on the simulator.

diff --git a/ports/ChangeLog.tile b/ports/ChangeLog.tile
index bc3946c..71aaf56 100644
--- a/ports/ChangeLog.tile
+++ b/ports/ChangeLog.tile
@@ -1,5 +1,8 @@
 2012-11-02  Chris Metcalf  <cmetcalf@tilera.com>
 
+	* sysdeps/tile/dl-runtime.c (_dl_after_load): Handle simulator
+	notification better for dlopen() of relative paths.
+
 	* sysdeps/tile/tilegx/memcpy.c (__memcpy): Optimize.
 	* sysdeps/tile/memcopy.h: New file.
 	* sysdeps/tile/wordcopy.c: New file.
diff --git a/ports/sysdeps/tile/dl-runtime.c b/ports/sysdeps/tile/dl-runtime.c
index 6864c3a..e965828 100644
--- a/ports/sysdeps/tile/dl-runtime.c
+++ b/ports/sysdeps/tile/dl-runtime.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011 Free Software Foundation, Inc.
+/* Copyright (C) 2011-2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Chris Metcalf <cmetcalf@tilera.com>, 2011.
 
@@ -28,44 +28,119 @@
 #include <sys/mman.h>
 #include <arch/sim.h>
 
-/* Support notifying the simulator about new objects. */
+/* Like realpath(), but simplified: no dynamic memory use, no lstat(),
+   no set_errno(), no valid "rpath" on error, etc.  This handles some
+   simple cases where the simulator might not have a valid entry for
+   a loaded Elf object, in particular dlopen() with a relative path.
+   For this relatively rare case, one could also imagine using
+   link_map.l_origin to avoid the getcwd() here, but the simpler code
+   here seems like a better solution.  */
+static char *
+dl_realpath (const char *name, char *rpath)
+{
+  char *dest;
+  const char *start, *end;
+
+  if (name[0] != '/')
+    {
+      if (!__getcwd (rpath, PATH_MAX))
+        return NULL;
+      dest = __rawmemchr (rpath, '\0');
+    }
+  else
+    {
+      rpath[0] = '/';
+      dest = rpath + 1;
+    }
+
+  for (start = end = name; *start; start = end)
+    {
+      /* Skip sequence of multiple path-separators.  */
+      while (*start == '/')
+	++start;
+
+      /* Find end of path component.  */
+      for (end = start; *end && *end != '/'; ++end)
+	/* Nothing.  */;
+
+      if (end - start == 0)
+	break;
+      else if (end - start == 1 && start[0] == '.')
+	/* nothing */;
+      else if (end - start == 2 && start[0] == '.' && start[1] == '.')
+	{
+	  /* Back up to previous component, ignore if at root already.  */
+	  if (dest > rpath + 1)
+	    while ((--dest)[-1] != '/');
+	}
+      else
+	{
+	  if (dest[-1] != '/')
+	    *dest++ = '/';
+
+	  if (dest + (end - start) >= rpath + PATH_MAX)
+            return NULL;
+
+	  dest = __mempcpy (dest, start, end - start);
+	  *dest = '\0';
+	}
+    }
+  if (dest > rpath + 1 && dest[-1] == '/')
+    --dest;
+  *dest = '\0';
+
+  return rpath;
+}
+
+/* Support notifying the simulator about new objects.  */
 void internal_function
 _dl_after_load (struct link_map *l)
 {
   int shift;
+  char pathbuf[PATH_MAX];
+  char *path;
 
-#define DLPUTC(c) __insn_mtspr(SPR_SIM_CONTROL,                         \
-                               (SIM_CONTROL_DLOPEN                      \
-                                | ((c) << _SIM_CONTROL_OPERATOR_BITS)))
+  /* Don't bother if not in the simulator. */
+  if (__insn_mfspr (SPR_SIM_CONTROL) == 0)
+    return;
 
-  /* Write the library address in hex. */
+#define DLPUTC(c) __insn_mtspr (SPR_SIM_CONTROL,                         \
+                                (SIM_CONTROL_DLOPEN                      \
+                                 | ((c) << _SIM_CONTROL_OPERATOR_BITS)))
+
+  /* Write the library address in hex.  */
   DLPUTC ('0');
   DLPUTC ('x');
   for (shift = (int) sizeof (unsigned long) * 8 - 4; shift >= 0; shift -= 4)
     DLPUTC ("0123456789abcdef"[(l->l_map_start >> shift) & 0xF]);
   DLPUTC (':');
 
-  /* Write the library path, including the terminating '\0'. */
+  /* Write the library path, including the terminating '\0'.  */
+  path = dl_realpath (l->l_name, pathbuf) ?: l->l_name;
   for (size_t i = 0;; i++)
     {
-      DLPUTC (l->l_name[i]);
-      if (l->l_name[i] == '\0')
+      DLPUTC (path[i]);
+      if (path[i] == '\0')
         break;
     }
 #undef DLPUTC
 }
 
-/* Support notifying the simulator about removed objects prior to munmap(). */
+/* Support notifying the simulator about removed objects prior to munmap().  */
 void internal_function
 _dl_unmap (struct link_map *l)
 {
   int shift;
 
-#define DLPUTC(c) __insn_mtspr(SPR_SIM_CONTROL,                         \
-                               (SIM_CONTROL_DLCLOSE                     \
-                                | ((c) << _SIM_CONTROL_OPERATOR_BITS)))
+  /* Don't bother if not in the simulator.  */
+  if (__insn_mfspr (SPR_SIM_CONTROL) == 0)
+    return;
+
+#define DLPUTC(c) __insn_mtspr (SPR_SIM_CONTROL,                         \
+                                (SIM_CONTROL_DLCLOSE                     \
+                                 | ((c) << _SIM_CONTROL_OPERATOR_BITS)))
 
-  /* Write the library address in hex. */
+  /* Write the library address in hex.  */
   DLPUTC ('0');
   DLPUTC ('x');
   for (shift = (int) sizeof (unsigned long) * 8 - 4; shift >= 0; shift -= 4)

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cd84016efe83d92ee3903fef37f79ca2bafb3985

commit cd84016efe83d92ee3903fef37f79ca2bafb3985
Author: Chris Metcalf <cmetcalf@tilera.com>
Date:   Fri Nov 2 12:53:57 2012 -0400

    Optimize tile (mostly tilegx) memcpy and memmove performance.
    
    - Override <memcopy.h> so we use full 8-byte word copies on tilegx32
      for memmove, then use op_t in memcpy instead of the previous
      locally-defined word_t just to avoid proliferating identical types.
    - Fix bug in memcpy prefetch that caused us to never prefetch past
      the first cache line.
    - Optimize misaligned memcpy by inlining _wordcopy_fwd_dest_aligned
      instead of just doing a dumb word-at-a-time copy.
    - Make memcpy safe for forward copies by doing all the loads from
      a given cache line prior to doing a wh64 (cache line zero-fill)
      on the destination.  Remove now-redundant src == dst check.
    - Copy and optimize the generic wordcopy.c routines to use the tile
      "double align" instruction instead of the MERGE macro; to avoid
      offset addressing mode (which tile doesn't have) by rewriting the
      pointer math to load and store with a zero index; and to use
      post-increment addresses in the inner loops to improve scheduling.

diff --git a/ports/ChangeLog.tile b/ports/ChangeLog.tile
index 0fecf63..bc3946c 100644
--- a/ports/ChangeLog.tile
+++ b/ports/ChangeLog.tile
@@ -1,3 +1,9 @@
+2012-11-02  Chris Metcalf  <cmetcalf@tilera.com>
+
+	* sysdeps/tile/tilegx/memcpy.c (__memcpy): Optimize.
+	* sysdeps/tile/memcopy.h: New file.
+	* sysdeps/tile/wordcopy.c: New file.
+
 2012-11-03  Joseph Myers  <joseph@codesourcery.com>
 
 	[BZ #3439]
diff --git a/ports/sysdeps/tile/memcopy.h b/ports/sysdeps/tile/memcopy.h
new file mode 100644
index 0000000..2bc3fce
--- /dev/null
+++ b/ports/sysdeps/tile/memcopy.h
@@ -0,0 +1,27 @@
+/* memcopy.h -- definitions for memory copy functions.  Tile version.
+   Copyright (C) 2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/generic/memcopy.h>
+#include <bits/wordsize.h>
+
+/* Support more efficient copying on tilegx32, which supports
+   long long as a native 64-bit type.  */
+#if defined (__tilegx__) && __WORDSIZE == 32
+# undef op_t
+# define op_t	unsigned long long int
+#endif
diff --git a/ports/sysdeps/tile/tilegx/memcpy.c b/ports/sysdeps/tile/tilegx/memcpy.c
index dd6e30d..5b015f3 100644
--- a/ports/sysdeps/tile/tilegx/memcpy.c
+++ b/ports/sysdeps/tile/tilegx/memcpy.c
@@ -19,11 +19,9 @@
 #include <string.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <memcopy.h>
 #include <arch/chip.h>
 
-/* Must be 8 bytes in size. */
-#define word_t uint64_t
-
 /* How many cache lines ahead should we prefetch? */
 #define PREFETCH_LINES_AHEAD 3
 
@@ -34,8 +32,8 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
   const char *__restrict src1 = (const char *) srcv;
   const char *__restrict src1_end;
   const char *__restrict prefetch;
-  word_t *__restrict dst8; /* 8-byte pointer to destination memory. */
-  word_t final; /* Final bytes to write to trailing word, if any */
+  op_t *__restrict dst8; /* 8-byte pointer to destination memory. */
+  op_t final; /* Final bytes to write to trailing word, if any */
   long i;
 
   if (n < 16)
@@ -55,101 +53,169 @@ __memcpy (void *__restrict dstv, const void *__restrict srcv, size_t n)
     {
       __insn_prefetch (prefetch);
       prefetch += CHIP_L2_LINE_SIZE ();
-      prefetch = (prefetch > src1_end) ? prefetch : src1;
+      prefetch = (prefetch < src1_end) ? prefetch : src1;
     }
 
   /* Copy bytes until dst is word-aligned. */
-  for (; (uintptr_t) dst1 & (sizeof (word_t) - 1); n--)
+  for (; (uintptr_t) dst1 & (sizeof (op_t) - 1); n--)
     *dst1++ = *src1++;
 
   /* 8-byte pointer to destination memory. */
-  dst8 = (word_t *) dst1;
+  dst8 = (op_t *) dst1;
 
-  if (__builtin_expect ((uintptr_t) src1 & (sizeof (word_t) - 1), 0))
+  if (__builtin_expect ((uintptr_t) src1 & (sizeof (op_t) - 1), 0))
     {
-      /* Misaligned copy.  Copy 8 bytes at a time, but don't bother
-         with other fanciness.
-         TODO: Consider prefetching and using wh64 as well.  */
+      /* Misaligned copy.  Use glibc's _wordcopy_fwd_dest_aligned, but
+         inline it to avoid prologue/epilogue.  TODO: Consider
+         prefetching and using wh64 as well.  */
+      void * srci;
+      op_t a0, a1, a2, a3;
+      long int dstp = (long int) dst1;
+      long int srcp = (long int) src1;
+      long int len = n / OPSIZ;
 
-      /* Create an aligned src8. */
-      const word_t *__restrict src8 =
-        (const word_t *) ((uintptr_t) src1 & -sizeof (word_t));
-      word_t b;
+      /* Save the initial source pointer so we know the number of
+         bytes to shift for merging two unaligned results.  */
+      srci = (void *) srcp;
 
-      word_t a = *src8++;
-      for (; n >= sizeof (word_t); n -= sizeof (word_t))
-        {
-          b = *src8++;
-          a = __insn_dblalign (a, b, src1);
-          *dst8++ = a;
-          a = b;
-        }
+      /* Make SRCP aligned by rounding it down to the beginning of the
+         `op_t' it points in the middle of.  */
+      srcp &= -OPSIZ;
+
+      switch (len % 4)
+	{
+	case 2:
+	  a1 = ((op_t *) srcp)[0];
+	  a2 = ((op_t *) srcp)[1];
+	  len += 2;
+	  srcp += 2 * OPSIZ;
+	  goto do1;
+	case 3:
+	  a0 = ((op_t *) srcp)[0];
+	  a1 = ((op_t *) srcp)[1];
+	  len += 1;
+	  srcp += 2 * OPSIZ;
+	  goto do2;
+	case 0:
+	  if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	    return dstv;
+	  a3 = ((op_t *) srcp)[0];
+	  a0 = ((op_t *) srcp)[1];
+	  len += 0;
+	  srcp += 2 * OPSIZ;
+	  goto do3;
+	case 1:
+	  a2 = ((op_t *) srcp)[0];
+	  a3 = ((op_t *) srcp)[1];
+	  srcp += 2 * OPSIZ;
+	  len -= 1;
+	  if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	    goto do0;
+	  goto do4;			/* No-op.  */
+	}
 
+      do
+	{
+	do4:
+	  a0 = ((op_t *) srcp)[0];
+	  a2 = __insn_dblalign (a2, a3, srci);
+	  ((op_t *) dstp)[0] = a2;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	do3:
+	  a1 = ((op_t *) srcp)[0];
+	  a3 = __insn_dblalign (a3, a0, srci);
+	  ((op_t *) dstp)[0] = a3;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	do2:
+	  a2 = ((op_t *) srcp)[0];
+	  a0 = __insn_dblalign (a0, a1, srci);
+	  ((op_t *) dstp)[0] = a0;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	do1:
+	  a3 = ((op_t *) srcp)[0];
+	  a1 = __insn_dblalign (a1, a2, srci);
+	  ((op_t *) dstp)[0] = a1;
+	  srcp += OPSIZ;
+	  dstp += OPSIZ;
+	  len -= 4;
+	}
+      while (len != 0);
+
+      /* This is the right position for do0.  Please don't move
+         it into the loop.  */
+    do0:
+      ((op_t *) dstp)[0] = __insn_dblalign (a2, a3, srci);
+
+      n = n % OPSIZ;
       if (n == 0)
-        return dstv;
+	return dstv;
 
-      b = ((const char *) src8 <= src1_end) ? *src8 : 0;
+      a0 = ((const char *) srcp <= src1_end) ? ((op_t *) srcp)[0] : 0;
 
-      /* Final source bytes to write to trailing partial word, if any. */
-      final = __insn_dblalign (a, b, src1);
+      final = __insn_dblalign (a3, a0, srci);
+      dst8 = (op_t *)(dstp + OPSIZ);
     }
   else
     {
       /* Aligned copy. */
 
-      const word_t *__restrict src8 = (const word_t *) src1;
+      const op_t *__restrict src8 = (const op_t *) src1;
 
       /* src8 and dst8 are both word-aligned. */
       if (n >= CHIP_L2_LINE_SIZE ())
         {
           /* Copy until 'dst' is cache-line-aligned. */
           for (; (uintptr_t) dst8 & (CHIP_L2_LINE_SIZE () - 1);
-               n -= sizeof (word_t))
+               n -= sizeof (op_t))
             *dst8++ = *src8++;
 
-          /* If copying to self, return.  The test is cheap enough
-             that we do it despite the fact that the memcpy() contract
-             doesn't require us to support overlapping dst and src.
-             This is the most common case of overlap, and any close
-             overlap will cause corruption due to the wh64 below.
-             This case is particularly important since the compiler
-             will emit memcpy() calls for aggregate copies even if it
-             can't prove that src != dst.  */
-          if (__builtin_expect (dst8 == src8, 0))
-            return dstv;
-
           for (; n >= CHIP_L2_LINE_SIZE ();)
-            {
-              __insn_wh64 (dst8);
-
-              /* Prefetch and advance to next line to prefetch, but
-                 don't go past the end.  */
-              __insn_prefetch (prefetch);
-              prefetch += CHIP_L2_LINE_SIZE ();
-              prefetch = (prefetch > src1_end) ? prefetch :
-                (const char *) src8;
-
-              /* Copy an entire cache line.  Manually unrolled to
-                 avoid idiosyncracies of compiler unrolling.  */
-#define COPY_WORD(offset) ({ dst8[offset] = src8[offset]; n -= 8; })
-              COPY_WORD (0);
-              COPY_WORD (1);
-              COPY_WORD (2);
-              COPY_WORD (3);
-              COPY_WORD (4);
-              COPY_WORD (5);
-              COPY_WORD (6);
-              COPY_WORD (7);
+	    {
+	      op_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+	      /* Prefetch and advance to next line to prefetch, but
+		 don't go past the end.  */
+	      __insn_prefetch (prefetch);
+	      prefetch += CHIP_L2_LINE_SIZE ();
+	      prefetch = (prefetch < src1_end) ? prefetch :
+		(const char *) src8;
+
+	      /* Do all the loads before wh64.  This is necessary if
+		 [src8, src8+7] and [dst8, dst8+7] share the same
+		 cache line and dst8 <= src8, as can be the case when
+		 called from memmove, or with code tested on x86 whose
+		 memcpy always works with forward copies.  */
+	      tmp0 = *src8++;
+	      tmp1 = *src8++;
+	      tmp2 = *src8++;
+	      tmp3 = *src8++;
+	      tmp4 = *src8++;
+	      tmp5 = *src8++;
+	      tmp6 = *src8++;
+	      tmp7 = *src8++;
+
+	      __insn_wh64 (dst8);
+
+	      *dst8++ = tmp0;
+	      *dst8++ = tmp1;
+	      *dst8++ = tmp2;
+	      *dst8++ = tmp3;
+	      *dst8++ = tmp4;
+	      *dst8++ = tmp5;
+	      *dst8++ = tmp6;
+	      *dst8++ = tmp7;
+
+	      n -= 64;
+	    }
 #if CHIP_L2_LINE_SIZE() != 64
 # error "Fix code that assumes particular L2 cache line size."
 #endif
-
-              dst8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
-              src8 += CHIP_L2_LINE_SIZE () / sizeof (word_t);
-            }
         }
 
-      for (; n >= sizeof (word_t); n -= sizeof (word_t))
+      for (; n >= sizeof (op_t); n -= sizeof (op_t))
         *dst8++ = *src8++;
 
       if (__builtin_expect (n == 0, 1))
diff --git a/ports/sysdeps/tile/wordcopy.c b/ports/sysdeps/tile/wordcopy.c
new file mode 100644
index 0000000..f978d8f
--- /dev/null
+++ b/ports/sysdeps/tile/wordcopy.c
@@ -0,0 +1,449 @@
+/* wordcopy.c -- subroutines for memory copy functions.  Tile version.
+   Copyright (C) 1991-2012 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* To optimize for tile, we make the following changes from the
+   default glibc version:
+   - Use the double align instruction instead of the MERGE macro.
+   - Since we don't have offset addressing mode, make sure the loads /
+     stores in the inner loop always have indices of 0.
+   - Use post-increment addresses in the inner loops, which yields
+     better scheduling.  */
+
+/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...!  */
+
+#include <stddef.h>
+#include <memcopy.h>
+
+/* Provide the appropriate dblalign builtin to shift two registers
+   based on the alignment of a pointer held in a third register.  */
+#ifdef __tilegx__
+#define DBLALIGN __insn_dblalign
+#else
+#define DBLALIGN __insn_dword_align
+#endif
+
+/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to
+   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+   Both SRCP and DSTP should be aligned for memory operations on `op_t's.  */
+
+void
+_wordcopy_fwd_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  op_t a0, a1;
+
+  switch (len % 8)
+    {
+    case 2:
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 6;
+      goto do1;
+    case 3:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 5;
+      goto do2;
+    case 4:
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 4;
+      goto do3;
+    case 5:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 3;
+      goto do4;
+    case 6:
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 2;
+      goto do5;
+    case 7:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len += 1;
+      goto do6;
+
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a0 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      goto do7;
+    case 1:
+      a1 = ((op_t *) srcp)[0];
+      srcp += OPSIZ;
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do8;			/* No-op.  */
+    }
+
+  do
+    {
+    do8:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do7:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do6:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do5:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do3:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do2:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do1:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+
+      len -= 8;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = a1;
+}
+
+/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to
+   block beginning at DSTP with LEN `op_t' words (not LEN bytes!).
+   DSTP should be aligned for memory operations on `op_t's, but SRCP must
+   *not* be aligned.  */
+
+void
+_wordcopy_fwd_dest_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  void * srci;
+  op_t a0, a1, a2, a3;
+
+  /* Save the initial source pointer so we know the number of bytes to
+     shift for merging two unaligned results.  */
+  srci = (void *) srcp;
+
+  /* Make SRCP aligned by rounding it down to the beginning of the `op_t'
+     it points in the middle of.  */
+  srcp &= -OPSIZ;
+
+  switch (len % 4)
+    {
+    case 2:
+      a1 = ((op_t *) srcp)[0];
+      a2 = ((op_t *) srcp)[1];
+      len += 2;
+      srcp += 2 * OPSIZ;
+      goto do1;
+    case 3:
+      a0 = ((op_t *) srcp)[0];
+      a1 = ((op_t *) srcp)[1];
+      len += 1;
+      srcp += 2 * OPSIZ;
+      goto do2;
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a3 = ((op_t *) srcp)[0];
+      a0 = ((op_t *) srcp)[1];
+      len += 0;
+      srcp += 2 * OPSIZ;
+      goto do3;
+    case 1:
+      a2 = ((op_t *) srcp)[0];
+      a3 = ((op_t *) srcp)[1];
+      srcp += 2 * OPSIZ;
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do4;			/* No-op.  */
+    }
+
+  do
+    {
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      a2 = DBLALIGN (a2, a3, srci);
+      ((op_t *) dstp)[0] = a2;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do3:
+      a1 = ((op_t *) srcp)[0];
+      a3 = DBLALIGN (a3, a0, srci);
+      ((op_t *) dstp)[0] = a3;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do2:
+      a2 = ((op_t *) srcp)[0];
+      a0 = DBLALIGN (a0, a1, srci);
+      ((op_t *) dstp)[0] = a0;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+    do1:
+      a3 = ((op_t *) srcp)[0];
+      a1 = DBLALIGN (a1, a2, srci);
+      ((op_t *) dstp)[0] = a1;
+      srcp += OPSIZ;
+      dstp += OPSIZ;
+      len -= 4;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = DBLALIGN (a2, a3, srci);
+}
+
+/* _wordcopy_bwd_aligned -- Copy block finishing right before
+   SRCP to block finishing right before DSTP with LEN `op_t' words
+   (not LEN bytes!).  Both SRCP and DSTP should be aligned for memory
+   operations on `op_t's.  */
+
+void
+_wordcopy_bwd_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  op_t a0, a1;
+  long int srcp1;
+
+  srcp1 = srcp - 1 * OPSIZ;
+  srcp -= 2 * OPSIZ;
+  dstp -= 1 * OPSIZ;
+
+  switch (len % 8)
+    {
+    case 2:
+      a0 = ((op_t *) srcp1)[0];
+      len += 6;
+      goto do1;
+    case 3:
+      a1 = ((op_t *) srcp1)[0];
+      len += 5;
+      goto do2;
+    case 4:
+      a0 = ((op_t *) srcp1)[0];
+      len += 4;
+      goto do3;
+    case 5:
+      a1 = ((op_t *) srcp1)[0];
+      len += 3;
+      goto do4;
+    case 6:
+      a0 = ((op_t *) srcp1)[0];
+      len += 2;
+      goto do5;
+    case 7:
+      a1 = ((op_t *) srcp1)[0];
+      len += 1;
+      goto do6;
+
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      a0 = ((op_t *) srcp1)[0];
+      goto do7;
+    case 1:
+      a1 = ((op_t *) srcp1)[0];
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do8;			/* No-op.  */
+    }
+
+  do
+    {
+    do8:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do7:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do6:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do5:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do4:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do3:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do2:
+      a0 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do1:
+      a1 = ((op_t *) srcp)[0];
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+
+      len -= 8;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  ((op_t *) dstp)[0] = a1;
+}
+
+/* _wordcopy_bwd_dest_aligned -- Copy block finishing right
+   before SRCP to block finishing right before DSTP with LEN `op_t'
+   words (not LEN bytes!).  DSTP should be aligned for memory
+   operations on `op_t', but SRCP must *not* be aligned.  */
+
+void
+_wordcopy_bwd_dest_aligned (dstp, srcp, len)
+     long int dstp;
+     long int srcp;
+     size_t len;
+{
+  void * srci;
+  op_t a0, a1, a2, a3;
+  op_t b0, b1, b2, b3;
+
+  /* Save the initial source pointer so we know the number of bytes to
+     shift for merging two unaligned results.  */
+  srci = (void *) srcp;
+
+  /* Make SRCP aligned by rounding it down to the beginning of the op_t
+     it points in the middle of.  */
+  srcp &= -OPSIZ;
+  srcp += OPSIZ;
+
+  switch (len % 4)
+    {
+    case 2:
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b2 = ((op_t *) srcp)[2];
+      b1 = a1 = ((op_t *) srcp)[1];
+      len += 2;
+      goto do1;
+    case 3:
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b3 = ((op_t *) srcp)[2];
+      b2 = a2 = ((op_t *) srcp)[1];
+      len += 1;
+      goto do2;
+    case 0:
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	return;
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b0 = ((op_t *) srcp)[2];
+      b3 = a3 = ((op_t *) srcp)[1];
+      goto do3;
+    case 1:
+      srcp -= 3 * OPSIZ;
+      dstp -= 1 * OPSIZ;
+      b1 = ((op_t *) srcp)[2];
+      b0 = a0 = ((op_t *) srcp)[1];
+      len -= 1;
+      if (OP_T_THRES <= 3 * OPSIZ && len == 0)
+	goto do0;
+      goto do4;			/* No-op.  */
+    }
+
+  do
+    {
+    do4:
+      b3 = a3 = ((op_t *) srcp)[0];
+      a0 = DBLALIGN (a0, b1, srci);
+      ((op_t *) dstp)[0] = a0;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do3:
+      b2 = a2 = ((op_t *) srcp)[0];
+      a3 = DBLALIGN (a3, b0, srci);
+      ((op_t *) dstp)[0] = a3;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do2:
+      b1 = a1 = ((op_t *) srcp)[0];
+      a2 = DBLALIGN (a2, b3, srci);
+      ((op_t *) dstp)[0] = a2;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+    do1:
+      b0 = a0 = ((op_t *) srcp)[0];
+      a1 = DBLALIGN (a1, b2, srci);
+      ((op_t *) dstp)[0] = a1;
+      srcp -= OPSIZ;
+      dstp -= OPSIZ;
+
+      len -= 4;
+    }
+  while (len != 0);
+
+  /* This is the right position for do0.  Please don't move
+     it into the loop.  */
+ do0:
+  a0 = DBLALIGN (a0, b1, srci);
+  ((op_t *) dstp)[0] = a0;
+}

-----------------------------------------------------------------------

Summary of changes:
 ports/ChangeLog.tile                          |   23 ++
 ports/sysdeps/tile/crti.S                     |   23 +-
 ports/sysdeps/tile/dl-runtime.c               |  103 +++++-
 ports/sysdeps/tile/math_private.h             |   32 ++-
 ports/sysdeps/tile/memcopy.h                  |   27 ++
 ports/sysdeps/tile/nptl/pthread_spin_unlock.c |   33 ++
 ports/sysdeps/tile/start.S                    |   23 +-
 ports/sysdeps/tile/tilegx/Makefile            |   18 +
 ports/sysdeps/tile/tilegx/memcpy.c            |  200 ++++++++----
 ports/sysdeps/tile/wordcopy.c                 |  449 +++++++++++++++++++++++++
 10 files changed, 831 insertions(+), 100 deletions(-)
 create mode 100644 ports/sysdeps/tile/memcopy.h
 create mode 100644 ports/sysdeps/tile/nptl/pthread_spin_unlock.c
 create mode 100644 ports/sysdeps/tile/tilegx/Makefile
 create mode 100644 ports/sysdeps/tile/wordcopy.c


hooks/post-receive
-- 
GNU C Library master sources
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]