This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

PATCH: Update ia64 memory functions


Intel updated ia64 memory functions:

http://www3.intel.com/cd/software/products/asmo-na/eng/219884.htm

This patch ported them to glibc. The speedups on Montecito are

Average speedup for memcmp: 53.8154%
Average speedup for memcpy: 22.6089%
Average speedup for memmove: 381.785%
Average speedup for memset: 1.27461%


H.J.
----
2007-01-12  H.J. Lu  <hongjiu.lu@intel.com>

	* sysdeps/ia64/Makefile (sysdep_routines): Add
	memcopyD-large-al, memcopyD-large-ual, memcpy-a0-mt-array
	and serial-memmove.

	* sysdeps/ia64/memcmp.S: Replaced with the one contributed by
	Intel.
	* sysdeps/ia64/memcpy.S: Likewise.
	* sysdeps/ia64/memmove.S: Likewise.
	* sysdeps/ia64/memset.S: Likewise.

	* sysdeps/ia64/memcopyD-large-al.S: New. Contributed by Intel.
	* sysdeps/ia64/memcopyD-large-ual.S: Likewise.
	* sysdeps/ia64/memcpy-a0-mt-array.S: Likewise.
	* sysdeps/ia64/serial-memmove.S: Likewise.

--- sysdeps/ia64/Makefile.intel	2004-08-15 23:46:14.000000000 -0700
+++ sysdeps/ia64/Makefile	2006-11-30 11:08:05.000000000 -0800
@@ -22,3 +22,8 @@ sysdep-dl-routines += dl-symaddr dl-fptr
 sysdep_routines += $(sysdep-dl-routines)
 sysdep-rtld-routines += $(sysdep-dl-routines)
 endif
+
+ifeq ($(subdir),string)
+sysdep_routines += memcopyD-large-al memcopyD-large-ual \
+		   memcpy-a0-mt-array serial-memmove
+endif
--- sysdeps/ia64/memcmp.S.intel	2004-04-17 15:58:49.000000000 -0700
+++ sysdeps/ia64/memcmp.S	2006-12-01 09:08:27.000000000 -0800
@@ -1,165 +1,503 @@
-/* Optimized version of the standard memcmp() function.
-   This file is part of the GNU C Library.
-   Copyright (C) 2000, 2001, 2004 Free Software Foundation, Inc.
-   Contributed by Dan Pop <Dan.Pop@cern.ch>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* Return: the result of the comparison
-
-   Inputs:
-        in0:    dest (aka s1)
-        in1:    src  (aka s2)
-        in2:    byte count
-
-   In this form, it assumes little endian mode.  For big endian mode, the
-   the two shifts in .l2 must be inverted:
-
-	shl   	tmp1[0] = r[1 + MEMLAT], sh1   // tmp1 = w0 << sh1
-	shr.u   tmp2[0] = r[0 + MEMLAT], sh2   // tmp2 = w1 >> sh2
-
-   and all the mux1 instructions should be replaced by plain mov's.  */
+// memcmp - compare memory areas
+//
+// Copyright (c) 2004-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// Basically the code is divided into these parts:
+// 1)short - linear code for comparing <= 16 bytes
+// 2)long(size > 16), this one is further divided into:
+//   a)unaligned - first compares the leading unaligned bytes (< 8), then uses ld8 to find a pair of 8B words that differ,
+//     then branches to unaligned_cmp; if no such pair is found, the rest is done by branching to short
+//   b)aligned - uses ld8 to find a pair of 8B words that differ, then branches to aligned_cmp; if no such pair is found, the rest is done by branching to short
+//
+//
+//       Author: Boris Shurygin, Moscow
+//       Date:   December, 2004
+//
+// -- Begin  memcmp
 
 #include <sysdep.h>
 #undef ret
 
-#define OP_T_THRES 	16
-#define OPSIZ 		8
-#define MEMLAT		2
-
-#define start		r15
-#define saved_pr	r17
-#define saved_lc	r18
-#define dest		r19
-#define src		r20
-#define len		r21
-#define asrc		r22
-#define tmp		r23
-#define value1		r24
-#define value2		r25
-#define sh2		r28
-#define	sh1		r29
-#define loopcnt		r30
-
-ENTRY(memcmp)
+    .section .text
+	.proc  memcmp#
+    .align 32
+	.global memcmp#
+	 bcmp==memcmp
+	.weak bcmp#
 	.prologue
-	alloc 	r2 = ar.pfs, 3, 37, 0, 40
-
-	.rotr	r[MEMLAT + 2], q[MEMLAT + 5], tmp1[4], tmp2[4], val[2]
-	.rotp	p[MEMLAT + 4 + 1]
-
-	mov	ret0 = r0		// by default return value = 0
-	.save pr, saved_pr
-	mov	saved_pr = pr		// save the predicate registers
-	.save ar.lc, saved_lc
-        mov 	saved_lc = ar.lc	// save the loop counter
-	.body
-	mov 	dest = in0		// dest
-	mov 	src = in1		// src
-	mov	len = in2		// len
-	sub	tmp = r0, in0		// tmp = -dest
-	;;
-	and	loopcnt = 7, tmp		// loopcnt = -dest % 8
-	cmp.ge	p6, p0 = OP_T_THRES, len	// is len <= OP_T_THRES
-(p6)	br.cond.spnt	.cmpfew			// compare byte by byte
-	;;
-	cmp.eq	p6, p0 = loopcnt, r0
-(p6)	br.cond.sptk .dest_aligned
-	sub	len = len, loopcnt	// len -= -dest % 8
-	adds	loopcnt = -1, loopcnt	// --loopcnt
-	;;
-	mov	ar.lc = loopcnt
-.l1:					// copy -dest % 8 bytes
-	ld1	value1 = [src], 1	// value = *src++
-	ld1	value2 = [dest], 1
-	;;
-	cmp.ne	p6, p0 = value1, value2
-(p6)	br.cond.spnt .done
-	br.cloop.dptk .l1
-.dest_aligned:
-	and	sh1 = 7, src 		// sh1 = src % 8
-	and	tmp = -8, len   	// tmp = len & -OPSIZ
-	and	asrc = -8, src		// asrc = src & -OPSIZ  -- align src
-	shr.u	loopcnt = len, 3	// loopcnt = len / 8
-	and	len = 7, len ;;		// len = len % 8
-	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
-	adds	loopcnt = -1, loopcnt	// --loopcnt
-	mov     pr.rot = 1 << 16 ;;	// set rotating predicates
-	sub	sh2 = 64, sh1		// sh2 = 64 - sh1
-	mov	ar.lc = loopcnt		// set LC
-	cmp.eq  p6, p0 = sh1, r0 	// is the src aligned?
-(p6)    br.cond.sptk .src_aligned
-	add	src = src, tmp		// src += len & -OPSIZ
-	mov	ar.ec = MEMLAT + 4 + 1 	// four more passes needed
-	ld8	r[1] = [asrc], 8 ;;	// r[1] = w0
-	.align	32
-
-// We enter this loop with p6 cleared by the above comparison
-
-.l2:
-(p[0])		ld8	r[0] = [asrc], 8		// r[0] = w1
-(p[0])		ld8	q[0] = [dest], 8
-(p[MEMLAT])	shr.u	tmp1[0] = r[1 + MEMLAT], sh1	// tmp1 = w0 >> sh1
-(p[MEMLAT])	shl	tmp2[0] = r[0 + MEMLAT], sh2  	// tmp2 = w1 << sh2
-(p[MEMLAT+4])	cmp.ne	p6, p0 = q[MEMLAT + 4], val[1]
-(p[MEMLAT+3])	or	val[0] = tmp1[3], tmp2[3] 	// val = tmp1 | tmp2
-(p6)		br.cond.spnt .l2exit
-		br.ctop.sptk    .l2
-		br.cond.sptk .cmpfew
-.l3exit:
-	mux1	value1 = r[MEMLAT], @rev
-	mux1	value2 = q[MEMLAT], @rev
-	cmp.ne	p6, p0 = r0, r0	;;	// clear p6
-.l2exit:
-(p6)	mux1	value1 = val[1], @rev
-(p6)	mux1	value2 = q[MEMLAT + 4], @rev ;;
-	cmp.ltu	p6, p7 = value2, value1 ;;
-(p6)	mov	ret0 = -1
-(p7)	mov	ret0 = 1
-	mov     pr = saved_pr, -1    	// restore the predicate registers
-	mov 	ar.lc = saved_lc	// restore the loop counter
-	br.ret.sptk.many b0
-.src_aligned:
-	cmp.ne	p6, p0 = r0, r0		// clear p6
-	mov     ar.ec = MEMLAT + 1 ;;	// set EC
-.l3:
-(p[0])		ld8	r[0] = [src], 8
-(p[0])		ld8	q[0] = [dest], 8
-(p[MEMLAT])	cmp.ne	p6, p0 = r[MEMLAT], q[MEMLAT]
-(p6)		br.cond.spnt .l3exit
-		br.ctop.dptk .l3 ;;
-.cmpfew:
-	cmp.eq	p6, p0 = len, r0	// is len == 0 ?
-	adds	len = -1, len		// --len;
-(p6)	br.cond.spnt	.restore_and_exit ;;
-	mov	ar.lc = len
-.l4:
-	ld1	value1 = [src], 1
-	ld1	value2 = [dest], 1
-	;;
-	cmp.ne	p6, p0 = value1, value2
-(p6)	br.cond.spnt	.done
-	br.cloop.dptk	.l4 ;;
-.done:
-(p6)	sub	ret0 = value2, value1	// don't execute it if falling thru
-.restore_and_exit:
-	mov     pr = saved_pr, -1    	// restore the predicate registers
-	mov 	ar.lc = saved_lc	// restore the loop counter
-	br.ret.sptk.many b0
-END(memcmp)
+memcmp: // entry: r32/r33 = the two buffers, r34 = byte count; result is returned in r8
+{.mii      
+           mov r19=r32 // r19 = running pointer into the first buffer
+           mov.i r18=ar.lc // save ar.lc in r18 (every exit path restores it)
+           mov r20=r33 // r20 = running pointer into the second buffer
+}
+{.mib       
+           mov r21=r34 // r21 = bytes left to compare
+           cmp.lt p6,p0=16,r34 // p6 = (16 < count), signed compare
+     (p6)  br.cond.spnt.many long;;//branch to long
+}
+{.mii       
+           mov r8=r0 // default result: 0 (buffers equal)
+           mov r15=pr // save predicates in r15
+           nop.i  0
+}
+short://short compare: unrolled byte-by-byte compare of the r21 bytes at r19 / r20
+{.mib
+            cmp.eq p6,p0=r21,r0  //length == 0?
+            add  r16=r19,r21 // r16 = one past the last byte of the first buffer
+       (p6) br.cond.spnt.few restore_exit;; // nothing to compare: return with r8 as already set
+}
+
+ {   .mmi
+	cmp.le  p6,p7=2,r21		            // 1: p6 = at least two bytes to compare, p7 = fewer
+	ld1     r24=[r19],1                 // 1: r24 = byte from the first buffer
+	cmp.eq	p8,p14=0,r0		            // 1: p8 = 1, p14 = 0
+ }
+ {   .mmi
+	ld1     r25=[r20],1                 // 1: r25 = byte from the second buffer
+	add	r17=-1,r16		                // 1: r17 = address of the last byte of the first buffer
+	nop.i	0 ;;		                // 1
+ }
+ {   .mmi
+  (p8)  cmp.ne.unc      p15,p0=r25,r24  // 2: p15 = the (r24, r25) pair differs
+  (p6)  ld1     r27=[r20],1             // 2: next pair goes to (r26, r27)
+        cmp.lt  p8,p9=r19,r17           // 2: p8 = r19 still below the last byte, p9 = not
+ }
+ {   .mbb
+  (p6)  ld1     r26=[r19],1             // 2
+  (p15) br.cond.dptk    diff_exit       // 2: mismatch in (r24, r25)
+  (p7)  br.cond.dptk    restore_exit ;; // 2: no more bytes, buffers equal
+ }
+ {   .mmi
+  (p6)  cmp.ne.unc      p14,p0=r26,r27  // 3: p14 = the (r26, r27) pair differs
+  (p8)  ld1     r25=[r20],1             // 3: pairs alternate between (r24, r25) and (r26, r27)
+        cmp.lt  p6,p7=r19,r17           // 3
+ }
+ {   .mbb
+  (p8)  ld1     r24=[r19],1             // 3
+  (p14) br.cond.dptk    diff_exit_dup   // 3: mismatch in (r26, r27)
+  (p9)  br.cond.dptk    restore_exit ;;	// 3
+ }
+ {   .mmi
+  (p8)  cmp.ne.unc      p15,p0=r24,r25  // 4
+  (p6)  ld1     r27=[r20],1             // 4
+        cmp.lt  p8,p9=r19,r17           // 4
+ }
+ {   .mbb
+  (p6)  ld1     r26=[r19],1             // 4
+  (p15) br.cond.dptk    diff_exit       // 4
+  (p7)  br.cond.dptk    restore_exit ;; // 4
+ }
+  {   .mmi
+  (p6)  cmp.ne.unc      p14,p0=r26,r27  // 5
+  (p8)  ld1     r25=[r20],1             // 5
+        cmp.lt  p6,p7=r19,r17           // 5
+ }
+ {   .mbb
+  (p8)  ld1     r24=[r19],1             // 5
+  (p14) br.cond.dptk    diff_exit_dup   // 5
+  (p9)  br.cond.dptk    restore_exit ;;	// 5
+ }
+ {   .mmi
+  (p8)  cmp.ne.unc      p15,p0=r24,r25  // 6
+  (p6)  ld1     r27=[r20],1             // 6
+        cmp.lt  p8,p9=r19,r17           // 6
+ }
+ {   .mbb
+  (p6)  ld1     r26=[r19],1             // 6
+  (p15) br.cond.dptk    diff_exit       // 6
+  (p7)  br.cond.dptk    restore_exit ;; // 6
+ }
+  {   .mmi
+  (p6)  cmp.ne.unc      p14,p0=r26,r27  // 7
+  (p8)  ld1     r25=[r20],1             // 7
+        cmp.lt  p6,p7=r19,r17           // 7
+ }
+ {   .mbb
+  (p8)  ld1     r24=[r19],1             // 7
+  (p14) br.cond.dptk    diff_exit_dup   // 7
+  (p9)  br.cond.dptk    restore_exit ;;	// 7
+ }
+ {   .mmi
+  (p8)  cmp.ne.unc      p15,p0=r24,r25  // 8
+  (p6)  ld1     r27=[r20],1             // 8
+        cmp.lt  p8,p9=r19,r17           // 8
+ }
+ {   .mbb
+  (p6)  ld1     r26=[r19],1             // 8
+  (p15) br.cond.dptk    diff_exit       // 8
+  (p7)  br.cond.dptk    restore_exit ;; // 8
+ }
+  {   .mmi
+  (p6)  cmp.ne.unc      p14,p0=r26,r27  // 9
+  (p8)  ld1     r25=[r20],1             // 9
+        cmp.lt  p6,p7=r19,r17           // 9
+ }
+ {   .mbb
+  (p8)  ld1     r24=[r19],1             // 9
+  (p14) br.cond.dptk    diff_exit_dup   // 9
+  (p9)  br.cond.dptk    restore_exit ;;	// 9
+ }
+ {   .mmi
+  (p8)  cmp.ne.unc      p15,p0=r24,r25  // 10
+  (p6)  ld1     r27=[r20],1             // 10
+        cmp.lt  p8,p9=r19,r17           // 10
+ }
+ {   .mbb
+  (p6)  ld1     r26=[r19],1             // 10
+  (p15) br.cond.dptk    diff_exit       // 10
+  (p7)  br.cond.dptk    restore_exit ;; // 10
+ }
+  {   .mmi
+  (p6)  cmp.ne.unc      p14,p0=r26,r27  // 11
+  (p8)  ld1     r25=[r20],1             // 11
+        cmp.lt  p6,p7=r19,r17           // 11
+ }
+ {   .mbb
+  (p8)  ld1     r24=[r19],1             // 11
+  (p14) br.cond.dptk    diff_exit_dup   // 11
+  (p9)  br.cond.dptk    restore_exit ;;	// 11
+ }
+ {   .mmi
+  (p8)  cmp.ne.unc      p15,p0=r24,r25  // 12
+  (p6)  ld1     r27=[r20],1             // 12
+        cmp.lt  p8,p9=r19,r17           // 12
+ }
+ {   .mbb
+  (p6)  ld1     r26=[r19],1             // 12
+  (p15) br.cond.dptk    diff_exit       // 12
+  (p7)  br.cond.dptk    restore_exit ;; // 12
+ }
+  {   .mmi
+  (p6)  cmp.ne.unc      p14,p0=r26,r27  // 13
+  (p8)  ld1     r25=[r20],1             // 13
+        cmp.lt  p6,p7=r19,r17           // 13
+ }
+ {   .mbb
+  (p8)  ld1     r24=[r19],1             // 13
+  (p14) br.cond.dptk    diff_exit_dup   // 13
+  (p9)  br.cond.dptk    restore_exit ;;	// 13
+ }
+ {   .mmi
+  (p8)  cmp.ne.unc      p15,p0=r24,r25  // 14
+  (p6)  ld1     r27=[r20],1             // 14
+        cmp.lt  p8,p9=r19,r17           // 14
+ }
+ {   .mbb
+  (p6)  ld1     r26=[r19],1             // 14
+  (p15) br.cond.dptk    diff_exit       // 14
+  (p7)  br.cond.dptk    restore_exit ;; // 14
+ }
+  {   .mmi
+  (p6)  cmp.ne.unc      p14,p0=r26,r27  // 15
+  (p8)  ld1     r25=[r20],1             // 15
+        cmp.lt  p6,p7=r19,r17           // 15
+ }
+ {   .mbb
+  (p8)  ld1     r24=[r19],1             // 15
+  (p14) br.cond.dptk    diff_exit_dup   // 15
+  (p9)  br.cond.dptk    restore_exit ;;	// 15
+ }
+ {   .mmi
+  (p8)  cmp.ne.unc      p15,p0=r24,r25  // 16
+  (p6)  ld1     r27=[r20],1             // 16
+        cmp.lt  p8,p9=r19,r17           // 16
+ }
+ {   .mbb
+  (p6)  ld1     r26=[r19],1             // 16
+  (p15) br.cond.dptk    diff_exit       // 16
+  (p7)  br.cond.dptk    restore_exit ;; // 16
+ }
+  {   .mmi
+  (p6)  cmp.ne.unc      p14,p0=r26,r27  // 17
+  (p8)  ld1     r25=[r20],1             // 17
+        cmp.lt  p6,p7=r19,r17           // 17
+ }
+ {   .mbb
+  (p8)  ld1     r24=[r19],1             // 17: last pair loaded by the unrolled sequence
+  (p14) br.cond.dptk    diff_exit_dup   // 17
+  (p9)  br.cond.dptk    restore_exit ;;	// 17: otherwise control falls through into the code that follows
+ }
+diff_exit:        // return r24 - r25 (first-buffer byte minus second-buffer byte)
+{.mii
+        sub r8=r24,r25 // r8 = return value
+        mov pr=r15,0x10000   // restore rotating predicates
+        mov.i ar.lc=r18 // restore the saved loop counter
+}
+{.mfb
+        nop.m 0
+        nop.f 0
+        br.ret.sptk.many b0;;
+}
+diff_exit_dup:        // same as diff_exit, for the byte pair held in r26 / r27
+{.mii
+        sub r8=r26,r27 // r8 = return value
+        mov pr=r15,0x10000  // restore rotating predicates
+        mov.i ar.lc=r18 // restore the saved loop counter
+}
+{.mfb
+        nop.m 0
+        nop.f 0
+        br.ret.sptk.many b0;;
+}
+restore_exit:        // return with r8 left as it is; only restores saved state
+{.mii
+           nop.m 0
+           mov pr=r15,0x10000   // restore rotating predicates
+           mov.i ar.lc=r18 // restore the saved loop counter
+}
+{.mfb
+           nop.m 0
+           nop.f 0
+           br.ret.sptk.many b0;;
+}
+long: // reached from the entry when count > 16: choose the aligned or unaligned 8-byte loop
+{.mii       
+            alloc r2=ar.pfs,3,37,0,40 // 3 inputs, 37 locals, 0 outputs, 40 rotating registers
+            mov r15=pr // save predicates in r15
+			and r28=7,r19 // r28 = first pointer's offset within its 8-byte word
+}
+{.mii
+            and r29=7,r20 // r29 = second pointer's offset within its 8-byte word
+            mov.i r18=ar.lc // save ar.lc in r18
+			mov r8=r0 // default result: 0
+            ;;
+}
+{.mib
+           nop.m  0
+           shr.u r22=r21,3 //needed in aligned case, we'll recalculate r22 for unaligned later
+           nop.b  0
+}
+{.mii       
+            cmp.eq p7,p0=r28,r0 //check if first address is 8-bytes aligned
+            cmp.ne p6,p0=r0,r0 // p6 = 0
+			sub r30=8,r28;; // r30 = bytes from r19 up to the next 8-byte boundary
+}
+{.mib       
+      (p7)  cmp.eq p6,p0=r29,r0 //check if second address is 8-bytes aligned
+            add r30=-1,r30 // r30 - 1 is the ar.lc value used by pre_loop
+      (p6)  br.cond.sptk.many aligned;;//both aligned, use ld8 SWP loop
+}
+{.mib
+            nop.m  0
+            mov.i ar.lc=r30
+	(p7)    br.cond.sptk.many unaligned;; // first pointer already aligned, second is not
+}
+pre_loop://compare first n bytes (n = 8 - (r19 & 7), so 1<=n<=7 here), so r19 is aligned after this
+{.mmi
+            ld1 r24=[r19],1   
+            ld1 r25=[r20],1
+            add r21=-1,r21;; // one byte fewer left to compare
+}
+{.mbb
+            cmp.eq p0,p10=r24,r25 // p10 = the two bytes differ
+      (p10) br.cond.spnt.few diff_exit//<memcmp+528>
+            br.cloop.dptk.few pre_loop//<memcmp+96>
+}
+{.mib       
+           nop.m 0
+           cmp.gt p6,p0=16,r21 // p6 = fewer than 16 bytes left
+     (p6)  br.cond.spnt.many short;;//branch to short
+}
+unaligned: // setup for the loop below; r19 is 8-byte aligned on both paths that reach here
+{.mii
+            and r23=-8,r20  //aligned pointer
+			and r28=7,r20   //shift amount calculation
+			shr.u r22=r21,3;;//for LC
+}
+{.mii
+            and r29=7,r20
+            shl r28=r28,3 // byte offset -> bit count
+            adds r22=-2,r22;; // ar.lc value = (r21 / 8) - 2
+}
+{.mmi
+			add r29=-8,r29 // r29 = (r20 & 7) - 8
+			nop.m 0
+			nop.i 0	;;
+}
+{.mii       
+            sub r30=64,r28 // complementary shift count
+            mov pr.rot=0x10000 // p16 = 1, all other rotating predicates = 0
+            mov.i ar.ec=7;;          
+}
+{.mii       
+            ld8 r33=[r23],8 // preload the first aligned word of the second buffer
+            add r20=r20,r29
+            mov.i ar.lc=r22;;
+}
+unaligned_loop:        // rotating-register loop: 8 bytes per iteration, second-buffer word rebuilt from two aligned loads
+{.mmi
+      (p16) ld8 r32=[r23],8 // next aligned word of the second buffer
+      (p16) ld8 r36=[r19],8 // next word of the first buffer
+      (p18) shr.u r43=r35,r28 // part taken from the earlier aligned word
+}
+{.mii
+      (p21) or r51=r46,r50 // merge the two shifted parts (r43 / r47 after rotation)
+      (p18) shl r47=r34,r30 // part taken from the later aligned word
+      (p22) cmp.eq p0,p6=r42,r52 // p6 = first-buffer word differs from the merged second-buffer word
+}
+{.mbb
+      (p22) add r21=-8,r21 // 8 more bytes accounted for
+      (p6)  br.cond.spnt.few unaligned_cmp
+            br.ctop.sptk.few unaligned_loop;;
+}
+{.mmb
+            add r20=r23,r29 // r20 = r23 + r29 before the remaining bytes are handed to short
+            nop.m 0
+            br.sptk.many short;;
+}
+aligned: // setup for aligned_loop; both pointers are 8-byte aligned
+{.mii
+            cmp.eq p0,p6=r0,r0 // p6 = 0
+            adds r22=-1,r22//calculate LC
+			mov pr.rot=0x10000;; // p16 = 1, all other rotating predicates = 0
+}
+{.mii
+            nop.m  0
+            mov.i ar.lc=r22
+            mov.i ar.ec=3;;			
+}
+aligned_loop:  // rotating-register loop: compares one 8-byte word per iteration
+{.mmi 
+      (p16) ld8 r32=[r19],8 // word from the first buffer
+      (p16) ld8 r36=[r20],8 // word from the second buffer
+      (p18) cmp.eq p0,p6=r34,r38 // p6 = the words loaded two iterations earlier differ
+}
+{.mbb
+      (p18) add r21=-8,r21 // 8 more bytes accounted for
+       (p6) br.cond.spnt.few aligned_cmp
+            br.ctop.dptk.few aligned_loop;;
+}
+{.mfb
+            nop.m 0
+            nop.f 0
+            br.sptk.many short;; // no differing word found: the remaining bytes go to short
+}
+unaligned_cmp: // r42 (first buffer) and r52 (second buffer) differ: derive the return value from them
+{.mii
+           nop.m  0
+           mux1 r25=r52,@rev // byte-reverse the second-buffer word
+           mux1 r24=r42,@rev;; // byte-reverse the first-buffer word
+}
+{.mii
+            cmp.ltu p10,p11=r24,r25 // p10 = reversed first word is the smaller one (unsigned)
+            mov pr=r15,0x10000  // restore rotating predicates
+            mov.i ar.lc=r18;;		   
+}
+{.mii      
+      (p11) sub r24=r24,r25 // r24 = larger - smaller
+      (p10) sub r24=r25,r24
+		    nop.i 0;;
+}
+{.mii      
+            nop.m  0
+			shr.u r25=r24,32;; // r25 = upper 32 bits of the difference
+			cmp.ne p6,p7 = r25,r0;; // p6 = upper half is non-zero
+}
+{.mii
+            nop.m  0
+       (p6) shr.u r26=r25,16
+	   (p7) shr.u r26=r24,16;;
+}
+{.mmi
+       (p7) mov r25=r24 // upper half was zero: continue with the lower half
+            cmp.ne p8,p9 = r26,r0
+			nop.i  0;;
+}
+{.mii
+            nop.m  0
+       (p8) shr.u r24=r26,8
+	   (p9) shr.u r24=r25,8;;
+}
+{.mmi
+       (p9) mov r26=r25
+            cmp.ne p6,p7 = r24,r0
+			nop.i  0;;
+}
+{.mii
+      (p6)  mov r8=r24 // r8 = most significant non-zero byte of the difference
+      (p7)  mov r8=r26
+            nop.i  0;;
+}
+{.mib
+    (p10)   sub r8=r0,r8 // negate when the first word compared lower
+            nop.i  0
+            br.ret.sptk.many b0;;
+}
+
+aligned_cmp: // r34 (first buffer) and r38 (second buffer) differ: derive the return value from them
+{.mii
+            nop.m  0
+			mux1 r24=r34,@rev // byte-reverse the first-buffer word
+            mux1 r25=r38,@rev;; // byte-reverse the second-buffer word
+}
+{.mii
+            cmp.ltu p10,p11=r24,r25 // p10 = reversed first word is the smaller one (unsigned)
+            mov pr=r15,0x10000  // restore rotating predicates
+            mov.i ar.lc=r18;;		   
+}
+{.mii      
+      (p11) sub r24=r24,r25 // r24 = larger - smaller
+      (p10) sub r24=r25,r24
+		    nop.i 0;;
+}
+{.mii      
+            nop.m  0
+			shr.u r25=r24,32;; // r25 = upper 32 bits of the difference
+			cmp.ne p6,p7 = r25,r0;; // p6 = upper half is non-zero
+}
+{.mii
+            nop.m  0
+       (p6) shr.u r26=r25,16
+	   (p7) shr.u r26=r24,16;;
+}
+{.mmi
+       (p7) mov r25=r24 // upper half was zero: continue with the lower half
+            cmp.ne p8,p9 = r26,r0
+			nop.i  0;;
+}
+{.mii
+            nop.m  0
+       (p8) shr.u r24=r26,8
+	   (p9) shr.u r24=r25,8;;
+}
+{.mmi
+       (p9) mov r26=r25
+            cmp.ne p6,p7 = r24,r0
+			nop.i  0;;
+}
+{.mii
+      (p6)  mov r8=r24 // r8 = most significant non-zero byte of the difference
+      (p7)  mov r8=r26
+            nop.i  0;;
+}
+{.mib
+    (p10)   sub r8=r0,r8 // negate when the first word compared lower
+            nop.i  0
+            br.ret.sptk.many b0;;
+}
+
+// end of memcmp
+        .endp  memcmp#
+// End
 
-weak_alias (memcmp, bcmp)
 libc_hidden_builtin_def (memcmp)
--- sysdeps/ia64/memcopyD-large-al.S.intel	2006-12-01 09:24:41.000000000 -0800
+++ sysdeps/ia64/memcopyD-large-al.S	2006-11-30 11:03:41.000000000 -0800
@@ -0,0 +1,316 @@
+// memmove:     copy a counted number of bytes.
+//
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// copy length greater or equal to 8 using descending method (decreasing address).
+// To use only if data are aligned on 8 Byte boundaries.
+// This version supports interleaved accesses to limit bank conflict
+//
+//       Author: Steve Skedzielewski, JT Acquaviva
+//       Date:   May, 2002
+//
+
+// -- Begin  ?0_memcpyD 
+	.section .text
+	.proc  ?0_memcpyD# 
+	.align 32 
+	.global ?0_memcpyD# 
+?0_memcpyD: // r32 = dst, r33 = src, r34 = length; r8 returns dst
+ {   .mib                                   // ---> cycle 0
+// bundle for quick exit in case of length equal to 0
+	cmp.eq  p13,p0=r34,r0 
+        mov     r8=r32
+ (p13)  br.ret.sptk.many	b0
+ } 
+// computing the base address for src and destination
+ { .mmi
+	and	r10=7,r34				// r10 = length mod 8 (tail bytes)
+	add	r25=r33,r34	// r25 = src + length
+	add	r24=r32,r34	// r24 = dst + length
+        ;;
+ }
+ {   .mmi                                   // ---> cycle 1
+	alloc 	r56=ar.pfs,3,26,3,24 // 3 inputs, 26 locals, 3 outputs, 24 rotating registers
+        cmp4.le     p6,p0 = 2,r10
+        tbit.z.unc  p0,p9 = r34,0  // check if the length is odd
+ }
+// bunch of comparisons for short length copy
+// short copy: duplication of address for handling 2 streams of ls and st
+ {   .mmi                                   
+        cmp4.le p7 = 4,r10
+        add     r42=-1,r25
+        add     r43=-2,r25
+        ;;
+ }
+ { .mmi                             // ---> cycle 2
+        cmp4.le p8 = 6,r10
+        add     r40=-1,r24
+        add     r41=-2,r24
+ }
+ { .mmi                             
+  (p9) ld1      r44=[r42], -2
+  (p6) ld1      r45=[r43], -2
+       and      r14=-8,r33  // aligned source address 
+       ;;
+ }
+ // short copy: start to move byte around !
+ { .mmi                                 // ---> cycle 3	
+  (p6) ld1      r46=[r42], -2
+  (p7) ld1      r47=[r43], -2
+       and      r15=-8,r32  // aligned destination address 
+ }
+ {
+ (p9)  st1      [r40]=r44, -2
+ (p6)  st1      [r41]=r45, -2
+       sub      r63=r34,r10   // testing length to know if everything is done
+       ;;
+ }
+ { .mmi                             // --> cycle 4
+ (p7)  ld1      r44=[r42], -2
+ (p8)  ld1      r45=[r43], -2
+       cmp.ne   p11 = 0,r0    // p11 is initialized to 0 (false)
+ }
+ { .mmi                             
+ (p6)  st1     [r40]=r46, -2
+ (p7)  st1     [r41]=r47, -2
+       cmp.ne   p12 = 0,r0    // p12 is initialized to 0 (false)
+       ;;
+ }
+ { .mmi                           // --> cycle 5
+ (p8)  ld1     r46=[r42], -2
+       nop.m   0
+       cmp.eq.unc   p13=r0,r63 // p13 = no whole 8-byte word is left
+ }
+ { .mmi
+  (p7) st1     [r40]=r44, -2
+  (p8) st1     [r41]=r45, -2
+       cmp.ne.unc   p11,p12=r33,r14 // if src not aligned, p11 is set to 1, else p12 is set to 1
+       ;;
+ }
+ { .mib                            // --> cycle 6
+ (p8)   st1    [r40]=r46, -2
+ (p12)  cmp.ne      p11=r32,r15    // if dest not aligned, p11 is set to 1.
+ (p13)	br.ret.spnt.many	b0  // if everything is done, just branch out
+ }
+//
+// end of short copy
+//
+// If there are more bytes to copy, continue. First of all the short copy was used to 
+// copy the 'tail' (i.e. length mod. 8) therefore we need to decrease the amount 
+// of bytes to copy by the tail length.
+//
+ { .mmi
+        and r34=-8,r34  // decrease the length by what was already done
+        nop.m   0
+        nop.i   0
+ }
+ {   .mib     // if p11 is set then an address is unaligned, branch to the corresponding routine
+	add	r26=-8,r25	// rewind 8 Byte address for src + length address (used later)
+	add	r29=-8,r24      // rewind 8 Byte address for dst + length address (used later)
+ (p11)  br.cond.spnt    ?0_memcopyDu#
+        ;;
+ }
+ {   .mmi 
+        and     r3=-8,r26      // aligned src to the word base address
+        and     r2=-8,r29      // aligned dst to the word base address
+	shr.u	r25=r63,3      // word_count is length >> 3
+        ;; 
+ } 
+ {   .mmi 
+        nop.m   0
+	add     r63= -128, r2     // base address for prefetch write stream 
+	shr.u	r28=r25,3         // count is word_count >> 3  (i.e. r63 >> 6)
+ } 
+//
+//  start prefetch, then after branch if short case 
+//
+ {   .mmi
+	add     r62= -256, r3    // base address for prefetch read stream 
+	add	r23=   -8, r3       
+        and     r10=    7,r25 // r10 = word_count mod 8, copied by the prolog below
+        ;;
+ }
+ {   .mmi 
+	add	r18=-8,r2     
+	add	r24=-1,r28		// ar.lc value for the unrolled loop (count - 1)
+	mov	r58=ar.lc			// save ar.lc in r58
+        ;; 
+ } 
+// Prolog copy: start to move byte around !
+// The idea is to proceed to up to 7 8-byte chunk copy, all the remainder (if any)
+// will be done by an unrolled copy-loop. This is equivalent to start by the tail of the
+// unrolled loop.
+//
+ { .mmi
+       cmp4.le     p6,p0 = 2,r10
+       cmp4.le     p7,p0 = 4,r10
+       tbit.z.unc  p10,p9 = r10,0  // check if the word count is odd (p9) or even (p10)
+       ;;
+ }
+ {   .mmi
+ (p9) ld8      r44=[r3], -16
+ (p6) ld8      r45=[r23], -16
+      cmp4.le p8,p0 = 6,r10
+      ;;
+ }
+ { .mmi                                 
+ (p6) ld8      r46=[r3], -16
+ (p7) ld8      r47=[r23], -16
+      mov      r59=pr			// save predicates in r59
+ }
+ { .mmi
+ (p9) st8      [r2]=r44, -16
+ (p6) st8      [r18]=r45, -16
+      nop.i    0
+      ;;
+ }
+ { .mmi                             
+ (p7) ld8      r44=[r3], -16
+ (p8) ld8      r45=[r23], -16
+      sub      r25=r25,r10   // decrease the length of the to do work
+ }
+ { .mmi
+  (p6) st8     [r2]=r46, -16
+  (p7) st8     [r18]=r47, -16
+       nop.i   0
+       ;;
+ }
+ { .mmi                           
+ (p8)  ld8     r46=[r3], -16
+       nop.m   0
+       cmp.eq  p13,p0=r25,r0
+ }
+ { .mmi
+ (p7) st8     [r2]=r44, -16
+ (p8) st8     [r18]=r45, -16
+      nop.i   0
+      ;;
+ }
+ { .mib
+ (p8)   st8    [r2]=r46, -16
+        nop.i  0
+ (p13)  br.ret.dptk.many        b0  // if everything is done, just branch out
+         ;;
+ }
+//
+// end of the copy prolog
+//
+// Now the 8-times unrolled copy loop can be used.
+//
+ {   .mmi 
+       lfetch  [r62],-64 // prefetch read stream 
+       lfetch.excl  [r63],-128 // prefetch write stream 
+       mov	ar.ec=3					
+        ;;
+ }
+// spacing the addresses to avoid bank conflict in the core loop
+// this operation is legal due to overlapping of accesses: load
+// are done 2 cache line ahead preventing any store to overwrite
+// source data.
+//
+// check p9 to know which of the duplicate pointers was used last
+ {   .mmi   
+ (p10)  add     r23=-8,r23   // now, r3 and r23 are 16 Byte away to limit bank conflict
+ (p10)  add     r18=-8,r18    // now, r2 and r18 are 16 Byte away to limit bank conflict
+        mov     ar.lc=r24			
+ }
+ {   .mmi   
+ (p9) add     r3=r0,r23   //  r23 was the last, switch both pointers
+ (p9) add     r2=r0,r18   //  r18 was the last, switch both pointers
+      nop.i   0
+      ;;
+ }
+ {   .mmi 
+ (p9) add     r23=-16,r3   // now, r3 and r23 are 16 Byte away to limit bank conflict
+ (p9) add     r18=-16,r2   // now, r2 and r18 are 16 Byte away to limit bank conflict
+      mov     pr.rot=0x10000				 
+      ;;
+ }
+.b1_11: // rotating-register copy loop: 8 ld8/st8 pairs (64 bytes) per iteration on two interleaved pointer pairs
+ {   .mmi 
+  (p16)	ld8	r32=[r3],-8
+  (p16)	ld8	r35=[r23],-8
+	nop.i	0				 
+ } 
+ {   .mmi 
+  (p18)	st8	[r2]=r34,-8
+  (p18)	st8	[r18]=r37,-8
+	tbit.z.unc  p0,p8=r62,6 // p8 = bit 6 of the read-prefetch address is set
+        ;;				 
+ } 
+ {   .mmi 
+  (p16)	ld8	r38=[r3],-24
+  (p16)	ld8	r41=[r23],-24
+	nop.i	0				 
+ } 
+ {   .mmi 
+  (p18)	st8	[r2]=r40,-24
+  (p18)	st8	[r18]=r43,-24
+	nop.i	0				 
+        ;;				 
+ } 
+ {   .mmi 
+  (p16)	ld8	r44=[r3],-8
+  (p16)	ld8	r47=[r23],-8
+	nop.i	0				 
+ } 
+ {   .mmi 
+  (p18)	st8	[r2]=r46,-8
+  (p18)	st8	[r18]=r49,-8
+	nop.i	0 
+        ;;				 
+ } 
+ {   .mmi 
+       lfetch  [r62],-64 // prefetch read stream 
+  (p8) lfetch.excl  [r63],-128 // prefetch write stream (only on iterations where p8 is set)
+       nop.i   0
+       ;; 
+ } 
+ {   .mmi 
+  (p16)	ld8	r50=[r3],-24
+  (p16)	ld8	r53=[r23],-24
+ } 
+ {   .mmb 
+  (p18)	st8	[r2]=r52,-24
+  (p18)	st8	[r18]=r55,-24
+        br.ctop.sptk	.b1_11 
+        ;;
+ } 
+ {  .mii 
+        nop.m   0
+        mov     ar.lc=r58 // restore the saved loop counter
+        mov     pr=r59 // restore the saved predicates; NOTE(review): no mask operand is given here - confirm the assembler accepts this form
+        ;;
+ }
+ { .mib
+        nop.m   0
+        nop.i   0
+        br.ret.sptk.many        b0  // just branch out
+ }
+//
+// End of memcopyD !
+//
+	.endp  ?0_memcpyD# 
+  	.type	?0_memcopyDu#,@function 
+  	.global ?0_memcopyDu# 
+// End 
--- sysdeps/ia64/memcopyD-large-ual.S.intel	2006-12-01 09:24:41.000000000 -0800
+++ sysdeps/ia64/memcopyD-large-ual.S	2006-11-30 09:12:55.000000000 -0800
@@ -0,0 +1,426 @@
+// memmove:     copy a counted number of bytes.
+// 
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// Routine specialized for copying a large (length > 8 bytes) chunk of
+// memory by decreasing address. Unaligned boundaries.
+//
+//       Author: Steve Skedzielewski, JT Acquaviva
+//       Date:   February, 2002
+//
+
+ 	.section .text
+ 	.proc  ?0_memcopyDu#	// copy by decreasing address, unaligned boundaries; NOTE(review): r32/r33/r34 look like dst/src/len - confirm against the caller
+        .align 32
+        .global ?0_memcopyDu#
+?0_memcopyDu:
+ {   .mii
+	alloc	r3=ar.pfs,3,34,0,16			 // 3 inputs (r32-r34), 34 locals, 0 outputs, 16 rotating (r32-r47)
+	mov	r62=pr					 // r62 = saved predicates, restored at exit
+	add	r21=r33,r34				// r21 = r33 + r34
+ }
+ {   .mmi
+	mov	r8=r32					 // r8 = r32
+	add	r65=0,r0				 // r65 = 0
+	add	r31=0,r32	// r31 = copy of r32, moved back into r8 at exit
+        ;;
+ }
+ {   .mii
+	sub	r20=8,r21				 
+	sxt4	r23=r65					 
+	and	r19=-8,r21				 // r19 = r21 rounded down to a multiple of 8
+ }
+ {   .mmf
+	add	r22=r8,r34				 // r22 = r32 + r34
+	mov	r66=ar.unat				 // r66 = saved ar.unat, restored at exit
+	nop.f	0 ;;				
+ }
+ {   .mmi
+	cmp.ne.unc	p0,p8=r19,r21			 // p8: r21 is already a multiple of 8
+	and	r29=7,r20				 
+	mov	r63=ar.lc				 // r63 = saved ar.lc, restored in the tail code
+ }
+ {   .mmi
+	shladd	r39=r23,3,r0				 
+	sub	r9=8,r22				 
+	and	r2=-8,r22 ;;				 // r2 = r22 rounded down to a multiple of 8
+ }
+ {   .mii
+  (p8)	add	r19=-8,r19				 
+	and	r3=7,r9					 
+	cmp.ne.unc	p0,p9=r2,r22			 // p9: r22 is already a multiple of 8
+ }
+ {   .mmf
+	shladd	r14=r29,3,r0				 // r14 = r29 * 8 (byte offset turned into a bit count)
+	nop.m	0				
+	nop.f	0 ;;				
+ }
+ {   .mmi
+	ld8	r11=[r19]				 
+	cmp.leu.unc	p7,p6=r29,r3			 // p7: r29 <= r3, p6: r29 > r3 (unsigned)
+	shladd	r28=r3,3,r0				 // r28 = r3 * 8 (byte offset turned into a bit count)
+ }
+ {   .mmi
+	add	r15=r34,r3				 
+	add	r26=-8,r19				 
+  (p9)	add	r2=-8,r2 ;;				 
+ }
+ {   .mmi
+	ld8	r29=[r2]				 
+	sub	r25=64,r28				 
+	shr.u	r60=r15,3				 // r60 = r15 / 8
+ }
+ {   .mmi
+  (p7)	sub	r52=r28,r14				 
+	add	r10=-8,r2				 
+  (p6)	sub	r27=r28,r14 ;;				 
+ }
+ {   .mmi
+	shladd	r19=r60,3,r0				 
+  (p7)	sub	r21=64,r52				 
+	shl	r16=r11,r14				 
+ }
+ {   .mmi
+  (p7)	add	r60=-1,r60				 
+	mov	r3=r26					 
+  (p6)	add	r52=64,r27 ;;				 
+ }
+ {   .mmi
+	add	r58=-8,r26				 
+	add	r22=-24,r26				 
+	shr.u	r20=r29,r25				 
+ }
+ {   .mmi
+	add	r45=-32,r26				 
+	add	r24=-48,r26				 
+	sub	r64=r15,r19 ;;				 // r64 = r15 mod 8 (0..7); selects the tail stores at .b1_10
+ }
+ {   .mmi
+	shladd	r61=r64,3,r0				 // r61 = r64 * 8
+	add	r23=-56,r26				 
+	shr.u	r15=r60,3				 // r15 = r60 / 8: trip count of the unrolled loop .b1_9
+ }
+ {   .mmi
+	add	r29=-40,r26				 
+	add	r19=-16,r26				 
+	shr.u	r9=r16,r28 ;;				 
+ }
+ {   .mmi
+	ld8.s	r17=[r3]				 // speculative load, checked by chk.s at .b1_10
+	sub	r59=64,r61				 
+	shl	r27=r20,r25				 
+ }
+ {   .mmi
+	cmp.leu.unc	p15,p0=r15,r0			 // p15: r15 == 0, no full 64-byte iteration
+	sub	r26=r60,r39				 
+  (p7)	shl	r18=r11,r21 
+        ;;				 
+ }
+ {   .mmi
+	or	r25=r9,r27 ;;				 
+  (p7)	st8	[r2]=r25				 
+  (p7)	mov	r2=r10					 
+ }
+ {   .mmi
+  (p7)	add	r10=-8,r10				 
+        add     r67=-64,r3   // prefetch read stream
+  (p6)	mov	r18=r25 
+        ;;
+ }
+ {   .mmi
+	add	r20=-40,r2				
+	add	r11=-24,r2				 
+	add	r28=-16,r2				 
+ }
+ {   .mmi
+	add	r16=-48,r2				 
+	add	r25=-32,r2				 
+	add	r14=-56,r2 
+       ;;				 
+ }
+ {   .mib
+        add     r68=-64,r29  // prefetch write stream
+	add	r21=8,r2				 
+  (p15)	br.cond.dpnt	.b1_8 	// skip the unrolled loop
+        ;;
+// 
+// prolog of the core loop. This loop should take benefit from prefetch
+//
+ }
+ {   .mmi
+        lfetch  [r67], -64        // first read prefetch
+        lfetch.excl  [r68], -128  // first write prefetch
+	mov	pr.rot=0x10000				 // rotating predicates: only p16 set
+ }
+ {   .mmi
+	mov	r30=r45					 
+	mov	r33=r65					 
+	nop.i   0
+        ;;
+ }
+ {   .mmi
+        lfetch  [r67], -64        // second read prefetch
+        lfetch.excl  [r68], -128  // second write prefetch
+	add	r17=-1,r15				 // r17 = r15 - 1, becomes ar.lc below
+ }
+ {   .mii
+        nop.m   0
+	mov	ar.ec=2					 // epilog count 2: loop body uses stages p16 and p17
+	mov	r8=r52 ;;				 
+ }
+ {   .mmi
+        lfetch  [r67], -64        // third read prefetch
+        lfetch.excl  [r68], -128  // third write prefetch
+	sub	r27=64,r52				 
+ }
+ {   .mii
+	nop.m	0				
+	mov	ar.lc=r17				 // loop count = r15 - 1
+	nop.i	0 ;;				
+ }
+.b1_9:	// unrolled loop: eight ld8 and eight st8 per iteration, every pointer post-incremented by -64
+ {   .mmi
+  (p16)	ld8	r49=[r3],-64				 
+  (p16)	ld8	r36=[r58],-64				 
+  (p17)	shr.u	r48=r21,r8				 
+ }
+ {   .mmi
+  (p17)	or	r54=r17,r37				 
+  (p17)	st8	[r28]=r9,-64				 
+  (p17)	shl	r34=r21,r27 ;;				 
+ }
+ {   .mmi
+  (p16)	ld8	r37=[r22],-64				 
+  (p16)	ld8	r57=[r19],-64				 
+  (p17)	shl	r18=r39,r27				 
+ }
+ {   .mmi
+  (p17)	or	r56=r15,r35				 
+  (p17)	st8	[r11]=r54,-64				 
+  (p17)	shr.u	r55=r39,r8 ;;				 
+ }
+ {   .mmi
+  (p16)	ld8	r54=[r30],-64				 
+  (p16)	ld8	r35=[r29],-64				 
+  (p16)	shr.u	r53=r49,r8				 
+ }
+ {   .mmi
+  (p17)	or	r50=r26,r38				 
+  (p17)	st8	[r25]=r56,-64				 
+  (p16)	shl	r39=r49,r27 ;;				 
+ }
+ {   .mmi
+  (p16)	ld8	r21=[r24],-64				 
+  (p16)	ld8	r38=[r23],-64				 
+  (p16)	shr.u	r52=r36,r8				 
+ }
+ {   .mmi
+  (p17)	or	r49=r48,r40				 
+  (p17)	st8	[r20]=r50,-64				 
+  (p16)	shl	r51=r36,r27 ;;				 
+ }
+ {   .mmi
+  (p9) lfetch.excl  [r68], -128  // write stream prefetch
+   lfetch  [r67], -64  // read stream prefetch
+  (p17)	or	r48=r55,r34				 
+ }
+ {   .mii
+  (p17)	st8	[r16]=r49,-64 
+  (p16)	shr.u	r50=r57,r8				
+  (p16)	shl	r36=r57,r27				 
+  ;;			 //12:107  107
+ }
+ { .mii
+  (p16)	or	r49=r53,r18				 
+  tbit.z.unc  p0,p9=r67,6  // test to make a write lfetch every 2 iteration
+  (p16)	shr.u	r17=r37,r8				 
+ }
+ {   .mii
+  (p17)	st8	[r14]=r48,-64				 
+  (p16)	shl	r34=r37,r27 ;;				 
+  (p16)	shr.u	r15=r54,r8				 
+ }
+ {   .mmi
+  (p16)	or	r48=r52,r39				 
+  (p16)	st8	[r2]=r49,-64				 
+  (p16)	shl	r37=r54,r27 ;;				 
+ }
+ {   .mii
+  (p16)	or	r9=r50,r51				 
+  (p16)	shr.u	r26=r35,r8				 
+  (p16)	shl	r39=r35,r27				 
+ }
+ {   .mmb
+  (p16)	st8	[r10]=r48,-64				 
+  (p16)	add	r32=1,r33				 // iteration counter carried through the rotating registers
+	br.ctop.sptk	.b1_9 ;;			 
+ }
+ {   .mii
+	mov	r65=r34					 // r65 = number of .b1_9 iterations executed
+        nop.i   0
+	mov	r52=r8					 
+ }
+ {   .bbb
+	nop.b	0					 
+	nop.b	0				
+	nop.b	0 ;;				
+ }
+ {   .mii
+	ld8.s	r17=[r3]				 // speculative load, checked by chk.s at .b1_10
+	sxt4	r23=r65					 
+	add	r21=8,r2 ;;				 
+ }
+ {   .mmi
+	shladd	r39=r23,3,r0 ;;				 // r39 = 8 * iterations = 8-byte words already handled by .b1_9
+	sub	r26=r60,r39				 // r26 = words left for the single-word loop
+	nop.i	0 ;;				
+ }
+.b1_8:	// setup for the one-word-per-iteration loop; skipped when r26 == 0
+ {   .mib
+	cmp.leu.unc	p14,p0=r26,r0			 // p14: r26 == 0
+	add	r20=-4,r21				 
+  (p14)	br.cond.dpnt	.b1_10 ;;			 
+// Block 26: prolog  Pred: 8     Succ: 11 
+// Freq 1.0e+001, Prob 1.00, Ipc 3.33
+ }
+ {   .mii
+	add	r15=-1,r26				// r15 = r26 - 1, becomes ar.lc below
+	mov	pr.rot=0x10000				 // rotating predicates: only p16 set
+        nop.i   0
+ }
+ {   .mfb
+	sub	r9=64,r52				 
+	nop.f	0				
+	clrrrb.pr	 ;;				 // reset the predicate rotation base
+ }
+ {   .mii
+        nop.m   0
+	mov	ar.ec=7					 // epilog count 7: loop body uses stages p16..p22
+	mov	r32=r18					 
+ }
+ {   .mmi
+	mov	r8=r52 ;;				 
+	nop.m	0				
+	mov	ar.lc=r15 ;;				 // loop count = r26 - 1
+ }
+.b1_11:	// pipelined loop: one ld8 and one st8 per iteration, both pointers post-incremented by -8
+ {   .mii
+  (p21)	or	r45=r41,r37				 
+  (p18)	shl	r33=r44,r9				 
+  (p18)	shr.u	r38=r44,r8				 
+ }
+ {   .mmb
+  (p16)	ld8	r42=[r3],-8				 
+  (p22)	st8	[r2]=r46,-8				 
+	br.ctop.sptk	.b1_11 ;;			 
+ }
+ {   .mii
+	mov	r18=r38					 
+        nop.i   0
+	mov	r52=r8 ;;				 
+ }
+ {   .mii
+	ld8.s	r17=[r3]				 // speculative load, checked by chk.s at .b1_10
+	add	r21=8,r2				 
+	nop.i	0 ;;				
+ }
+ {   .mmi
+	add	r20=-4,r21				 
+	nop.m	0				
+	nop.i	0 ;;				
+ }
+.b1_10:	// tail: st4/st2/st1 stores selected by r64 (0..7), then restore saved state and return
+ {   .mmi
+	cmp.geu.unc	p0,p13=r52,r61 ;;		 // p13: r52 < r61 (unsigned); the value loaded into r17 is needed
+	cmp.gtu.unc	p0,p12=4,r64			 // p12: r64 >= 4
+  (p13)	shr.u	r19=r17,r52 ;;				 
+ }
+ {   .mii
+  (p13)	chk.s	r17,.b1_66				 // if the speculative ld8.s of r17 was deferred, recover at .b1_66
+  (p12)	tbit.z.unc	p0,p10=r64,0			 
+  (p12)	mov	r21=r20 ;;				 
+ }
+.b1_67:	// rejoin point for the chk.s recovery code
+ {   .mii
+	add	r14=-2,r21				 
+  (p12)	tbit.z.unc	p0,p9=r64,1			 
+  (p13)	or	r18=r18,r19 ;;				 
+ }
+ {   .mii
+	nop.m	0				
+	tbit.z.unc	p0,p8=r64,1 ;;			 // p8: bit 1 of r64 set
+  (p8)	tbit.z.unc	p0,p7=r64,0			 
+ }
+ {   .mii
+  (p8)	mov	r21=r14					 
+	shr.u	r11=r18,r59 ;;				 
+	tbit.z.unc	p0,p6=r64,0 ;;			 // p6: bit 0 of r64 set
+ }
+ {   .mii
+  (p6)	add	r10=-1,r21				 
+	mov	ar.lc=r63				 // restore ar.lc
+	nop.i	0 ;;				
+ }
+ {   .mmi
+	mov	r9=r11 ;;				 // r9 keeps the unshifted tail data
+	nop.m	0				
+  (p10)	shr.u	r11=r11,8 ;;				 
+ }
+ {   .mii
+	nop.m	0				
+  (p9)	shr.u	r11=r11,16				 
+	nop.i	0 ;;				
+ }
+ {   .mii
+  (p12)	st4	[r20]=r11				 
+  (p12)	mov	r11=r9 ;;				 
+  (p7)	shr.u	r11=r11,8 ;;				 
+ }
+ {   .mii
+  (p8)	st2	[r14]=r11				 
+  (p8)	mov	r11=r9					 
+	nop.i	0 ;;				
+ }
+ {   .mii
+  (p6)	st1	[r10]=r11				 
+	mov	pr=r62,0x1003e				 // restore the predicates selected by mask 0x1003e
+	mov     r8=r31	// return value = original r32
+ }
+ {   .mmb
+	nop.m	0				
+	mov	ar.unat=r66				 // restore ar.unat
+	br.ret.sptk.many	b0 ;;			 
+ }
+.b1_66:	// chk.s recovery: reload r17 non-speculatively, redo the dependent work, rejoin at .b1_67
+ {   .mii
+	ld8	r17=[r3]				 
+  (p12)	tbit.z.unc	p0,p10=r64,0			 
+  (p12)	mov	r21=r20 ;;				 
+ }
+ {   .mib
+	nop.m	0				
+  (p13)	shr.u	r19=r17,r52				 
+	br.cond.sptk	.b1_67 
+        ;;
+ }
+	.endp ?0_memcopyDu#
+// End
--- sysdeps/ia64/memcpy-a0-mt-array.S.intel	2006-12-01 09:24:41.000000000 -0800
+++ sysdeps/ia64/memcpy-a0-mt-array.S	2006-11-30 09:12:55.000000000 -0800
@@ -0,0 +1,442 @@
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+//
+//       Author: Steve Skedzielewski
+//       Date:   January, 2002
+//         + changed by Boris Shurygin, Moscow
+//           October 2004
+//
+        .section .text
+// -- Begin  ?0_memcopyA: ascending copy; NOTE(review): r32/r33/r34 look like dst/src/len - confirm against the caller
+	.proc  ?0_memcopyA#
+	.align 32
+	.global ?0_memcopyA#
+	.prologue
+?0_memcopyA:
+ {   .mii
+	alloc	r3=ar.pfs,3,18,0,16			 //0:  5  358  (3 inputs, 18 locals, 0 outputs, 16 rotating)
+	add	r26=-8,r0				 //0: 37   19
+	nop.i	0 ;;				
+ }
+ {   .mmi
+	and	r24=7,r32				 //1: 18    7
+	and	r27=-8,r33				 //1: 37   17
+       .save pr,r52	// unwind info: the predicates (not ar.lc) are what the next instruction saves in r52
+	mov	r52=pr					 //1:  5  356
+ }
+ {   .mmi
+	and	r14=r33,r26				 //1: 37   20
+	mov	r51=r32					 //1:  5    3  (r51 = original r32, returned in r8)
+	and	r28=7,r33 ;;				 //1: 20    8
+ }
+ {   .mmi
+	add	r10=r34,r24				 //2: 29   13
+	ld8	r9=[r14]				 //2: 37   21
+       .save ar.lc,r50
+	mov	r50=ar.lc				 //2:  5    2
+ }
+	.body
+ {   .mmi
+	add	r25=7,r32				 //2: 22   11
+	shladd	r11=r28,3,r0				 //2: 20    9
+	cmp.leu.unc	p13,p12=r28,r24 ;;		 //2: 40  267
+ }
+ {   .mii
+  (p12)	cmp.eq.unc	p9,p8=r24,r0			 //3: 43  271
+	shr.u	r49=r10,3				 //3: 30   14
+	add	r48=8,r27				 //3: 37   18
+ }
+ {   .mfb
+  (p13)	cmp.eq.unc	p10,p0=r24,r0			 //3: 61  269
+	nop.f	0				
+  (p10)	br.cond.dpnt.many	.b1_58 ;;		 //3: 61   30
+// Block 2: collapsed  Pred: 0     Succ: 3 4  -G
+// Freq 7.5e-001, Prob 0.25
+ }
+ {   .mii
+  (p8)	ld8	r22=[r48]				 //0: 44   36
+  (p8)	sub	r21=64,r11				 //0: 45   38
+	and	r20=-8,r25				 //0: 22  306
+ }
+ {   .mmb
+	shladd	r27=r24,3,r0				 //0: 21  305
+	shladd	r26=r49,3,r0				 //0: 31  307
+  (p9)	br.cond.dpnt.many	.b1_3 ;;		 //0: 43   34
+// Block 4: collapsed  Pred: 2     Succ: 57 8  -GO
+// Freq 5.6e-001, Prob 0.50
+ }
+ {   .mmi
+	sub	r25=8,r24				 //0: 63   51
+	add	r49=-1,r49				 //0: 62   50
+	shr.u	r19=r9,r11				 //0: 37  311
+ }
+ {   .mmi
+  (p8)	add	r48=8,r48				 //0: 44   37
+	sub	r24=r10,r26				 //0: 31  312
+  (p12)	sub	r18=r27,r11 ;;				 //0: 42   31
+ }
+ {   .mii
+	cmp.gtu.unc	p0,p6=4,r25			 //1: 66  266
+	tbit.z.unc	p7,p0=r25,0			 //1: 64   52
+  (p8)	shl	r17=r22,r21				 //1: 45   39
+ }
+ {   .mfb
+  (p12)	add	r3=64,r18				 //1: 42  315
+	nop.f	0				
+  (p7)	br.cond.dpnt.many	.b1_57 ;;		 //1: 64   53
+// Block 8:  Pred: 4     Succ: 7  -GO
+// Freq 2.5e-001, Prob 1.00
+ }
+ {   .mii
+  (p13)	sub	r3=r27,r11				 //0: 53  319
+  (p13)	sub	r16=64,r27				 //0: 55  317
+  (p8)	sub	r15=64,r3				 //0: 46  318
+ }
+ {   .mmi
+	add	r32=1,r51 ;;				 //0: 64   56
+.pred.rel "mutex", p8, p13
+.pred.rel "mutex", p13, p8
+  (p8)	or	r19=r19,r17				 //2: 45  323
+  (p13)	shr.u	r35=r19,r16				 //2: 55  324
+ }
+ {   .mmi
+	nop.m	0				
+	nop.m	0				
+  (p8)	shr.u	r35=r22,r15 ;;				 //2: 46  325
+ }
+ {   .mib
+	st1	[r51]=r19				 //3: 64   57
+	shr.u	r19=r19,8				 //3: 64   58
+	br.cond.sptk.many	.b1_7 ;;		 //3: 64  320
+// Block 57:  Pred: 4     Succ: 7  -GO
+// Freq 2.8e-001, Prob 1.00
+ }
+.b1_57: 
+ {   .mii
+.pred.rel "mutex", p8, p13
+  (p13)	sub	r3=r27,r11				 //0: 53  321
+  (p13)	sub	r16=64,r27				 //0: 55   27
+  (p8)	sub	r15=64,r3 ;;				 //0: 46   41
+.pred.rel "mutex", p8, p13
+ }
+ {   .mii
+  (p8)	or	r19=r19,r17				 //2: 45   40
+  (p8)	shr.u	r35=r22,r15				 //2: 46   42
+  (p13)	shr.u	r35=r19,r16 ;;				 //2: 55   28
+// Block 7:  Pred: 8 57     Succ: 9 10  -G
+// Freq 5.0e-001, Prob 0.50
+ }
+.b1_7: 
+ {   .mib
+	nop.m	0				
+	tbit.z.unc	p15,p0=r25,1			 //0: 65   54
+  (p15)	br.cond.dpnt.many	.b1_9 ;;		 //0: 65   55
+// Block 10:  Pred: 7     Succ: 9  -G
+// Freq 2.5e-001, Prob 1.00
+ }
+ {   .mii
+	st2	[r32]=r19,2				 //0: 65   61
+	shr.u	r19=r19,16				 //0: 65   63
+	nop.i	0 ;;				
+// Block 9: collapsed  Pred: 7 10     Succ: 5  -G
+// Freq 5.0e-001, Prob 1.00
+ }
+.b1_9: 
+ {   .mib
+  (p6)	st4	[r32]=r19				 //0: 66   64
+	nop.i	0				
+	br.cond.sptk.many	.b1_5 ;;		 //0: 66  274
+// Block 3:  Pred: 2     Succ: 5  -GO
+// Freq 2.5e-001, Prob 1.00
+ }
+.b1_3: 
+ {   .mii
+	sub	r24=r10,r26				 //0: 31  313
+	shr.u	r19=r9,r11				 //0: 37  309
+  (p12)	sub	r18=r27,r11				 //0: 42  310
+ }
+ {   .mmi
+  (p13)	sub	r3=r27,r11 ;;				 //0: 53   26
+  (p12)	add	r3=64,r18				 //1: 42   32
+	nop.i	0 ;;				
+ }
+ {   .mib
+	mov	r35=r19					 //3: 48   35
+	nop.i	0				
+	br.cond.sptk.many	.b1_5 ;;		 //3: 48  308
+// Block 58:  Pred: 0     Succ: 5  -GO
+// Freq 2.5e-001, Prob 1.00
+ }
+.b1_58: 
+ {   .mii
+	shladd	r26=r49,3,r0				 //0: 31   15
+	shladd	r27=r24,3,r0				 //0: 21   10
+	shr.u	r19=r9,r11				 //0: 37   22
+ }
+ {   .mmi
+	and	r20=-8,r25 ;;				 //0: 22   12
+	sub	r24=r10,r26				 //1: 31   16
+  (p13)	sub	r16=64,r27				 //1: 55  314
+ }
+ {   .mii
+	nop.m	0				
+  (p13)	sub	r3=r27,r11 ;;				 //1: 53  316
+  (p13)	shr.u	r35=r19,r16 ;;				 //3: 55  322
+// Block 5:  Pred: 9 3 58     Succ: 12 13  -G
+// Freq 1.0e+000, Prob 0.01
+ }
+.b1_5: 
+ {   .mmi
+	add	r2=8,r20				 //0: 89   46
+	mov	r10=r48					 //0: 89   44
+	shr.u	r14=r49,1				 //0: 89   47  (r14 = r49 / 2: trip count of the two-word loop)
+ }
+ {   .mmi
+	add	r11=8,r48				 //0: 89   43
+	add	r30=1,r0				 //0: 89   65
+	mov	r8=r20 ;;				 //0: 89   45
+ }
+ {   .mib
+	cmp.gt.unc	p14,p0=1,r14			 //1: 89   48  (p14: r14 < 1, skip the two-word loop)
+	shladd	r29=r14,1,r30				 //1: 89   66
+  (p14)	br.cond.dpnt.many	.b1_12 ;;		 //1: 89   49
+// Block 13: prolog  Pred: 5     Succ: 14  -O
+// Freq 2.5e-001, Prob 1.00
+ }
+ {   .mmi
+    add	r30=8000,r48	
+    add	r31=1400,r8				 //0: 70   75
+	mov	pr.rot=0x10000				 //0: 89  172
+ }
+ {   .mii
+	sub	r9=64,r3				 //0: 73   76
+	add	r23=-1,r14				 //0: 73   77
+	mov	ar.ec=3 ;;				 //1: 89  174  (three pipeline stages, p16..p18)
+ }
+ {   .mii
+	add	r14=460,r48				
+	mov	ar.lc=r23				 //2: 73   78
+	add	r15=5120,r48				
+}
+{   .mii
+    add	r34=1600,r48				 //0: 70   74
+	add	r33=1000,r20
+    shr.u   r27=r23,16  //How many Megs we have to copy?
+	;;	
+}{   .mib
+	cmp.lt p15,p0=8,r27 	// p15: r27 > 8
+	nop.i   0
+  (p15)	br.cond.spnt.many  unaligned_huge_loop	// use the loop variant with the extra prefetch streams
+;;		
+// Block 14: lentry lexit ltail collapsed pipelined  Pred: 14 13     Succ: 14 29  -S
+// Freq 5.0e+001, Prob 1.00
+}
+.b1_14:	// pipelined loop: two ld8 and two st8 (16 bytes) per iteration
+{   .mmi
+  (p16)	ld8	r37=[r11],16				 //0: 71   84
+  (p16)	ld8	r45=[r10],16				 //0: 71   79
+  (p17)	shr.u	r35=r38,r9				 //3: 73   88
+ }
+ {   .mmi
+  (p18)	or	r46=r43,r40				 //6: 72   86
+  (p18)	st8	[r8]=r44,16				 //6: 72   82
+  (p17)	shl	r42=r38,r3 ;;				 //3: 72   85
+ }
+ {   .mmi
+  (p18)	st8	[r2]=r46,16				 //7: 72   87
+  (p16)	lfetch.nt1	[r34]					 //1: 70   89
+	nop.i	0 ;;				
+ }
+ {   .mii
+  (p17)	or	r43=r41,r36				 //5: 72   81
+  (p16)	shl	r40=r45,r3				 //2: 72   80
+  (p16)	shr.u	r38=r45,r9				 //2: 73   83
+ }
+ {   .mmb
+  (p16)	add	r32=32,r34				 //2: 70   90
+  (p16) lfetch [r14],16
+    br.ctop.sptk	.b1_14 ;;			 //2: 89   97
+// Block 29: epilog  Pred: 14     Succ: 12  -O
+// Freq 5.0e-001, Prob 1.00
+ }
+// Block 29: epilog  Pred: 14     Succ: 12  -O
+// Freq 5.0e-001, Prob 1.00
+{   .mii
+	mov	r35=r37					 //0: 89  171
+	nop.i	0				
+	nop.i	0 ;;				
+// Block 12:  Pred: 5 29     Succ: 16 26  -G
+// Freq 1.0e+000, Prob 0.01
+ }
+.b1_12: 
+ {   .mii
+	shladd	r28=r29,3,r0				 //0: 89   67
+	cmp.gt.unc	p13,p0=r29,r49 ;;		 //0: 89   72  (p13: r29 > r49, no single words left)
+	add	r27=r48,r28				 //1: 89   70
+ }
+ {   .mmi
+	add	r26=r20,r28 ;;				 //1: 89   68
+	add	r14=-8,r26				 //2: 89   69
+	add	r15=-8,r27				 //2: 89   71
+ }
+ {   .mib
+	nop.m	0				
+	nop.i	0				
+  (p13)	br.cond.dpnt.many	.b1_16 ;;		 //2: 89   73
+// Block 26: prolog  Pred: 12     Succ: 17  -O
+// Freq 5.0e-003, Prob 1.00
+ }
+ {   .mii
+	sub	r16=64,r3				 //0: 73  130
+	mov	pr.rot=0x10000				 //0: 89  144
+	sub	r22=r49,r29				 //0: 73  131
+ }
+ {   .mfb
+	nop.m	0				
+	nop.f	0				
+	clrrrb.pr	 ;;				 //0: 89  145
+ }
+ {   .mii
+	mov	r32=r35					 //1: 89  142
+	mov	ar.ec=7 ;;				 //1: 89  146  (seven pipeline stages, p16..p22)
+	mov	ar.lc=r22 ;;				 //2: 73  132
+// Block 17: lentry lexit ltail collapsed pipelined  Pred: 26 17     Succ: 17 28  -S
+// Freq 1.0e+000, Prob 0.99
+ }
+.b1_17: 	// pipelined loop: one ld8 and one st8 (8 bytes) per iteration
+ {   .mii
+  (p21)	or	r45=r41,r37				 //5: 72  103
+  (p18)	shr.u	r33=r44,r16				 //2: 73  105
+  (p18)	shl	r38=r44,r3				 //2: 72  102
+ }
+ {   .mmb
+  (p16)	ld8	r42=[r15],8				 //0: 71  101
+  (p22)	st8	[r14]=r46,8				 //6: 72  104
+	br.ctop.sptk	.b1_17 ;;			 //0: 89  108
+// Block 28: epilog  Pred: 17     Succ: 16  -O
+// Freq 1.0e-002, Prob 1.00
+ }
+ {   .mii
+	mov	r35=r38					 //0: 89  143
+	nop.i	0				
+	nop.i	0 ;;				
+// Block 16:  Pred: 12 28     Succ: 18 19  -G
+// Freq 1.0e+000, Prob 0.50
+ }
+.b1_16: 	// tail: st4/st2/st1 stores selected by r24, then restore saved state and return
+ {   .mmi
+	shladd	r21=r49,3,r20				 //0: 84  109
+	shladd	r17=r24,3,r0				 //0: 80   98
+	tbit.z.unc	p0,p12=r24,0			 //0: 87  265  (p12: bit 0 of r24 set)
+ }
+ {   .mii
+	cmp.gtu.unc	p7,p8=4,r24			 //0: 85  110  (p7: r24 < 4, p8: r24 >= 4)
+	shladd	r20=r49,3,r48 ;;			 //0: 81  112
+	mov	ar.lc=r50				 //1: 88  126  (restore ar.lc)
+ }
+ {   .mib
+	cmp.geu.unc	p6,p0=r3,r17			 //1: 80   99
+	nop.i	0				
+  (p6)	br.cond.dpnt.many	.b1_18 ;;		 //1: 80  100
+// Block 19:  Pred: 16     Succ: 18  -G
+// Freq 5.0e-001, Prob 1.00
+ }
+ {   .mmi
+	ld8	r19=[r20] ;;				 //0: 81  113
+	nop.m	0				
+	shl	r18=r19,r3 ;;				 //2: 81  114
+ }
+ {   .mii
+	or	r35=r35,r18				 //5: 81  115
+	nop.i	0				
+	nop.i	0 ;;				
+// Block 18:  Pred: 16 19     Succ: 20 21  -G
+// Freq 1.0e+000, Prob 0.50
+ }
+.b1_18: 
+ {   .mib
+  (p8)	st4	[r21]=r35,4				 //0: 85  118
+	tbit.z.unc	p9,p10=r24,1			 //0: 86  116  (p10: bit 1 of r24 set)
+  (p7)	br.cond.dpnt.many	.b1_20 ;;		 //0: 85  111
+// Block 21:  Pred: 18     Succ: 20  -G
+// Freq 5.0e-001, Prob 1.00
+ }
+ {   .mii
+	nop.m	0				
+	shr.u	r35=r35,32				 //0: 85  120
+	nop.i	0 ;;				
+// Block 20:  Pred: 18 21     Succ: 22  -GO
+// Freq 1.0e+000, Prob 1.00
+ }
+.b1_20: 
+// Block 22: exit collapsed  Pred: 20     Succ:  -GO
+// Freq 1.0e+000, Prob 1.00
+ {   .mii
+  (p10)	st2	[r21]=r35,2				 //0: 86  123
+  (p10)	shr.u	r35=r35,16				 //0: 86  125
+	mov	r8=r51 ;;				 //0: 88  127  (return value = original r32)
+ }
+ {   .mib
+  (p12)	st1	[r21]=r35				 //1: 87  129
+	mov	pr=r52,0x1003e				 //1: 88  357  (restore the predicates saved in r52)
+	br.ret.sptk.many	b0 ;;			 //1: 88  128
+ }
+unaligned_huge_loop:	// same data flow as .b1_14, with additional lfetch streams on r31, r30 and r15
+ {.mmi
+      (p16)	ld8	r37=[r11],16				 
+      (p16)	ld8	r45=[r10],16				 
+      (p17)	shr.u	r35=r38,r9				 
+ }
+ {.mmi
+      (p18)	st8	[r8]=r44,16
+      (p18)	or	r46=r43,r40				 
+      (p17)	shl	r42=r38,r3 ;;				 
+ }
+ {.mmi
+     (p18)	st8	[r2]=r46,16	 					 
+     (p16)  lfetch.nt1	[r31],16					 //fetch line for writing
+            nop.i   0;;
+ }
+ {.mmi
+     (p16)  lfetch.nta	[r30],16					 
+            nop.m   0
+            nop.i   0;;
+ }
+ {.mmi
+      (p16) lfetch.nt1	[r15],16                 //fetch line to L2 for reading
+      (p16)	lfetch	[r14],16					 //fetch line to L1 for reading
+      (p16)	shr.u	r38=r45,r9
+ }
+ {.mib
+      (p17)	or r43=r41,r36		 
+      (p16)	shl	r40=r45,r3	 
+	        br.ctop.sptk	unaligned_huge_loop;;
+ }
+{   .mib
+	mov	r35=r37					 
+	nop.i	0				
+	br.cond.sptk.many .b1_12;;				// finish with the single-word loop and tail code
+ }
+
+// -- End  ?0_memcopyA
+	.endp  ?0_memcopyA#
+// End
--- sysdeps/ia64/memcpy.S.intel	2003-11-18 22:11:26.000000000 -0800
+++ sysdeps/ia64/memcpy.S	2006-11-30 11:29:19.000000000 -0800
@@ -1,436 +1,533 @@
-/* Optimized version of the standard memcpy() function.
-   This file is part of the GNU C Library.
-   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
-   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
-   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* Return: dest
-
-   Inputs:
-        in0:    dest
-        in1:    src
-        in2:    byte count
-
-   An assembly implementation of the algorithm used by the generic C
-   version from glibc.  The case when source and sest are aligned is
-   treated separately, for extra performance.
-
-   In this form, memcpy assumes little endian mode.  For big endian mode,
-   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
-   and the order of r[MEMLAT] and r[MEMLAT+1] must be reverted in the
-   shrp instruction.  */
-
-#define USE_LFETCH
-#define USE_FLP
+// memcpy:     copy a counted number of bytes.
+//
+// Copyright (c) 2000-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// The copy is performed in ascending order (increasing addresses). The code is divided
+// into 6 parts.
+// 1/ First part is straight line code for short copy (i.e. length < 8 Byte)
+// 2/ Second part is testing alignment, if accesses are not aligned on word boundaries
+// the unaligned version is jumped to.
+// 3/ Third part is medium length copy (less than 64 Byte); this part is also used as
+// an epilog for the unrolled loops (parts 4, 5).
+// 4/ Fourth part: used for copying more than 64 bytes, but less than 1K.
+// To do this we use the unrolled SWP loop.
+// 5/ Fifth part: aligned large loops: uses floating loadpair8/store8 to keep
+// data in the L2 cache (avoid moving into L1); also using lfetches.
+// 6/ Sixth part: for very huge moves: > 8 mbytes.  Similar to part 4 but
+// with an additional lfetch.nta to get the data closer.
+//
+//       Author: Steve Skedzielewski, JT Acquaviva 
+//       Date:   February, 2002
+//         + changed by Boris Shurygin, Moscow
+//           September 2004
+ 
 #include <sysdep.h>
 #undef ret
 
-#define LFETCH_DIST     500
-
-#define ALIGN_UNROLL_no   4 // no. of elements
-#define ALIGN_UNROLL_sh	  2 // (shift amount)
-
-#define MEMLAT	8
-#define Nrot	((4*(MEMLAT+2) + 7) & ~7)
-
-#define OP_T_THRES 	16
-#define OPSIZ 		8
-
-#define loopcnt		r14
-#define elemcnt		r15
-#define saved_pr	r16
-#define saved_lc	r17
-#define adest		r18
-#define dest		r19
-#define asrc		r20
-#define src		r21
-#define len		r22
-#define tmp2		r23
-#define tmp3		r24
-#define	tmp4		r25
-#define ptable		r26
-#define ploop56		r27
-#define	loopaddr	r28
-#define	sh1		r29
-#define ptr1		r30
-#define ptr2		r31
-
-#define movi0 		mov
-
-#define p_scr		p6
-#define p_xtr		p7
-#define p_nxtr		p8
-#define p_few		p9
-
-#if defined(USE_FLP)
-#define load		ldf8
-#define store		stf8
-#define tempreg		f6
-#define the_r		fr
-#define the_s		fs
-#define the_t		ft
-#define the_q		fq
-#define the_w		fw
-#define the_x		fx
-#define the_y		fy
-#define the_z		fz
-#elif defined(USE_INT)
-#define load		ld8
-#define store		st8
-#define tempreg		tmp2
-#define the_r		r
-#define the_s		s
-#define the_t		t
-#define the_q		q
-#define the_w		w
-#define the_x		x
-#define the_y		y
-#define the_z		z
-#endif
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
-/* Manually force proper loop-alignment.  Note: be sure to
-   double-check the code-layout after making any changes to
-   this routine! */
-# define ALIGN(n)	{ nop 0 }
-#else
-# define ALIGN(n)	.align n
-#endif
-
-#if defined(USE_LFETCH)
-#define LOOP(shift)						\
-		ALIGN(32);					\
-.loop##shift##:							\
-{ .mmb								\
-(p[0])	ld8.nt1	r[0] = [asrc], 8 ;				\
-(p[0])	lfetch.nt1 [ptr1], 16 ;					\
-	nop.b 0 ;						\
-} { .mib							\
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;				\
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
- 	nop.b 0 ;;						\
- } { .mmb							\
-(p[0])	ld8.nt1	s[0] = [asrc], 8 ;				\
-(p[0])	lfetch.nt1	[ptr2], 16 ;				\
-	nop.b 0 ;						\
-} { .mib							\
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;				\
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
-	br.ctop.sptk.many .loop##shift 				\
-;; }								\
-{ .mib								\
-	br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */  \
-}
-#else
-#define LOOP(shift)						\
-		ALIGN(32);					\
-.loop##shift##:							\
-{ .mmb								\
-(p[0])	ld8.nt1	r[0] = [asrc], 8 ;				\
-	nop.b 0 ;						\
-} { .mib							\
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ;				\
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ;		\
- 	nop.b 0 ;;						\
- } { .mmb							\
-(p[0])	ld8.nt1	s[0] = [asrc], 8 ;				\
-	nop.b 0 ;						\
-} { .mib							\
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ;				\
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ;		\
-	br.ctop.sptk.many .loop##shift 				\
-;; }								\
-{ .mib								\
-	br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */  \
-}
-#endif
-
-
-ENTRY(memcpy)
-{ .mmi
+        .section .text
+	.proc  memcpy#
+        .align 32
+	.global memcpy#
 	.prologue
-	alloc 	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
-	.rotr	r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
-	.rotp	p[MEMLAT+2]
-	.rotf	fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
-	mov	ret0 = in0		// return tmp2 = dest
-	.save   pr, saved_pr
-	movi0	saved_pr = pr		// save the predicate registers
-} { .mmi
-	and	tmp4 = 7, in0 		// check if destination is aligned
-	mov 	dest = in0		// dest
-	mov 	src = in1		// src
-;; }
-{ .mii
-	cmp.eq	p_scr, p0 = in2, r0	// if (len == 0)
-	.save   ar.lc, saved_lc
-        movi0 	saved_lc = ar.lc	// save the loop counter
-	.body
-	cmp.ge	p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRESH
-} { .mbb
-	mov	len = in2		// len
-(p_scr)	br.cond.dpnt.few .restore_and_exit // 	Branch no. 1: return dest
-(p_few) br.cond.dpnt.many .copy_bytes	// Branch no. 2: copy byte by byte
-;; }
-{ .mmi
-#if defined(USE_LFETCH)
-	lfetch.nt1 [dest]		//
-	lfetch.nt1 [src]		//
-#endif
-	shr.u	elemcnt = len, 3	// elemcnt = len / 8
-} { .mib
-	cmp.eq	p_scr, p0 = tmp4, r0	// is destination aligned?
-	sub	loopcnt = 7, tmp4	//
-(p_scr) br.cond.dptk.many .dest_aligned
-;; }
-{ .mmi
-	ld1	tmp2 = [src], 1		//
-	sub	len = len, loopcnt, 1	// reduce len
-	movi0	ar.lc = loopcnt		//
-} { .mib
-	cmp.ne  p_scr, p0 = 0, loopcnt	// avoid loading beyond end-point
-;; }
+memcpy:
+ { .mmi                       // -----------------> cycle 0
+	mov    r8=r32         // save dest pointer for return value
+	cmp.eq  p8,p0  = 0,r34         // is length 0 ?
+	cmp.le.unc	p9,p0=8,r34    // length > 8 ?
+ }
+ { .mib                       
+	and	r3=7,r33      // use to compute alignement
+	and	r2=7,r32      // use to compute alignement
+  (p8)  br.ret.sptk.many	b0       // ---> if length is 0 branch out
+        ;;
+ }
+ { .mib                       // -----------------> cycle 1
+        add    r22=1,r32  // second dst pointer  (for short copy)
+        add    r23=1,r33  // second src pointer  (for short copy)
+  (p9)	br.cond.dpnt	.b1_2    	 // ---> if length > 7 goto large memcopy		
+ }
+// start short copy 
+short:
+{ .mmi                       
+        cmp.le p8,p9   = 2,r34    
+        cmp.le p10,p11 = 4,r34
+        tbit.z.unc p0,p14=r34,0  // check if the length is odd
+        ;;
+ }
+ { .mmi                       // -----------------> cycle 2
+(p14)   ld1 r14=[r33],2       // load byte 0
+(p8)    ld1 r15=[r23],2       // load byte 1
+        cmp.le p12,p13 = 6,r34    
+        ;;
+ }
+ { .mmi                     // -----------------> cycle 4
+(p14)   st1 [r32]=r14,2     // store byte 0
+(p8)    st1 [r22]=r15,2     // store byte 1
+        nop.i    0
+ }
+ { .mmb
+(p8)    ld1 r16=[r33],2     // load byte 2
+(p10)   ld1 r17=[r23],2     // load byte 3
+(p9)    br.ret.dpnt b0      //  if length <= 1 everything is done branch out
+        ;;
+ }
+ { .mmb                     // -----------------> cycle 5
+(p8)    st1 [r32]=r16,2     // store byte 2
+(p10)   st1 [r22]=r17,2     // store byte 3
+(p11)    br.ret.dpnt b0     //  if length <= 3 everything is done branch out
+ }
+ { .mmi
+(p10)   ld1 r14=[r33],2     // load byte 4
+(p12)   ld1 r15=[r23],2     // load byte 5
+        nop.i    0
+        ;;
+ }
+ { .mmb                      // -----------------> cycle 6
+(p10)   st1 [r32]=r14,2     // store byte 4
+(p12)   st1 [r22]=r15,2     // store byte 5
+(p13)   br.ret.dptk b0      // if length <= 5 everything is done branch out
+ }
+ { .mii
+(p12)   ld1 r16=[r33],2       // load byte 6
+        nop.i   0
+        nop.i   0
+        ;;
+ }
+ { .mmb                      // -----------------> cycle 7
+(p12)   st1 [r32]=r16,2      // store byte 6
+        nop.m   0
+        br.ret.dptk b0       // done all cases
+		;;       
+ }
+// end of short memcopy
+//
+// large memcopy: branch to huge loop if necessary, else remain
+// on straight code.
+// Definition of BIG, i.e. size used by the unrolled loop
+        BIG=0x40
+.b1_2:
+{ .mmi                       
+        cmp.ne.unc	p7,p6=r2,r3                    
+        sub     r15=8,r2        // for dst 
+        sub     r16=8,r3        // for src
+}
+{ .mii                       
+        sub     r28=r3,r2
+		tbit.nz p8,p0 = r3,0    // src byte-aligned?
+        mov     ar.ec = 3		// Only needed by align_huge
+        ;;
+}
+{.mii
+ (p6)  cmp.ne.unc	p6=r2,r0
+       .save ar.lc,r11
+       mov	r11=ar.lc
+       nop.i  0
+}
+{.mbb
+       mov r17=1
+ (p6)  br.cond.dptk unaligned 
+ (p7)  br.cond.dptk ?0_memcopyA#;;  // --> unaligned branch to it
+}
 
-.l0:	// ---------------------------- // L0: Align src on 8-byte boundary
+//
+// Medium length, aligned copy. Used also as an epilog for loops
+//
+aligned:
+{   .mii                         // --->  cycle 0
+        cmp.leu p10= 0x40, r34           
+        cmp.leu p8 = 0x10, r34
+        cmp.le  p9 = 0x18, r34          
+}
+{ .mib
+        cmp.le  p7= 0x8, r34
+        add     r28 = 8, r33            //second source pointer
+ (p10)  br.cond.dpnt    aligned_huge    
+        ;;
+}
+{ .mmi                                  // ---> cycle 1
+(p7)    ld8     r17 = [r33],16     
+(p8)    ld8     r18 = [r28],16          
+        cmp.le  p11 = 0x28,r34          
+}
 { .mmi
-	st1	[dest] = tmp2, 1	//
-(p_scr)	ld1	tmp2 = [src], 1		//
-} { .mib
-	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
-	add	loopcnt = -1, loopcnt
-	br.cloop.dptk.few .l0		//
-;; }
-
-.dest_aligned:
+        cmp.le  p10 = 0x20, r34         
+        cmp.le  p12 = 0x30, r34        
+        cmp.le  p13 = 0x38, r34         
+        ;;
+}
+{ .mmi                                 // --> cycle 2
+ (p9)   ld8     r19 = [r33],16          
+ (p10)  ld8     r20 = [r28],16        
+        tbit.nz.unc p6  = r34, 3        // count >= 8 mod 16?
+        ;;
+}
+{ .mmi                                // ---> cycle 3
+ (p11)  ld8     r21 = [r33],16       
+ (p12)  ld8     r22 = [r28],16          
+        tbit.nz.unc p15 = r34, 1        
+}
 { .mmi
-	and	tmp4 = 7, src		// ready for alignment check
-	shr.u	elemcnt = len, 3	// elemcnt = len / 8
-;; }
+        add     r27 = 8, r32            // second destination pointer
+        nop.m   0
+        nop.i   0
+        ;;
+}
+{ .mmi                                // ---> cycle 4
+ (p13)  ld8     r23 = [r33],16          
+ (p7)   st8     [r32] = r17,16          
+        tbit.nz.unc p14 = r34, 2        
+        ;;
+}
+{  .mmi                                  // ---> cycle 5
+ (p8)   st8     [r27] = r18,16          
+ (p9)   st8     [r32] = r19,16          
+ (p6)   add     r33 = 0, r28            // r33 did the last ld8, r28 has next
+        ;;
+}
+{  .mmi                                   // ---> cycle 6
+ (p10)  st8     [r27] = r20,16          
+ (p14)  ld4     r24 = [r33],4           
+        tbit.nz.unc p7  = r34, 0        
+        ;;
+}
+{ .mmi                                   // ---> cycle 7
+ (p11)  st8     [r32] = r21,16          
+ (p15)  ld2     r25 = [r33],2           
+        nop.i   0
+        ;;
+} { .mmi                                   // ---> cycle 8
+ (p12)  st8     [r27] = r22,16          
+ (p7)   ld1     r26 = [r33]             
+        nop.i   0
+        ;;
+} { .mmi                                   // ---> cycle 9
+ (p13)  st8     [r32] = r23,16          
+        ;;
+ (p6)   mov     r32 = r27               // r32 did the last st8, r27 has next
+        ;;
+} { .mmi                                   // ---> cycle 11
+ (p14)  st4     [r32] = r24,4           
+        ;;
+ (p15)  st2     [r32] = r25,2           
+        ;;
+} { .mib                                   // ---> cycle 13
+ (p7)   st1     [r32] = r26             
+        br.ret.sptk     b0              
+        ;;
+}
+// Here we deal with sizes that are greater than or equal to 64 bytes.
+// Lengths below r17<<10 (commented as 1K) use huge_loop (ld8 pairs); otherwise we branch to aligned_very_huge, which first checks whether the source address is 16-byte aligned (if not, it copies the first 8 bytes and continues).
+// There, if the length is > 8 Megs we use ultra_huge_loop; if not, we use very_huge_loop.
+// These two loops are very much alike, but ultra_huge_loop also uses lfetch.nta to bring lines from memory to L3.
+// They both use the ldfp8 instruction in order to occupy only one memory port (M0 or M1) for loading 16 bytes and use the other for lfetch.
+//
+// Long copies benefit from prefetch
+//
+aligned_huge:
+{ .mii
+        alloc	r31=ar.pfs,3,29,0,32
+	shl r17=r17,10   //1K
+        .save	pr,r22
+	mov     r22 = pr
+}
+{ .mii
+        mov     r29 = r33
+        shr     r23 = r34,6
+        add     r27 = 16, r32 // one bank away
+        ;;
+}{ .mmi
+        add     r25 = 512, r33   // read stream
+        add     r23 = -1,r23
+      	add     r28 = 16 , r33 // one bank away
+}
 { .mib
-	cmp.ne	p_scr, p0 = tmp4, r0	// is source also aligned
-	tbit.nz p_xtr, p_nxtr = src, 3	// prepare a separate move if src
-} { .mib				// is not 16B aligned
-	add	ptr2 = LFETCH_DIST, dest	// prefetch address
-	add	ptr1 = LFETCH_DIST, src
-(p_scr) br.cond.dptk.many .src_not_aligned
-;; }
-
-// The optimal case, when dest, and src are aligned
-
-.both_aligned:
-{ .mmi
-	.pred.rel "mutex",p_xtr,p_nxtr
-(p_xtr)	cmp.gt  p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
-(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt  // Need only N to qualify
-	movi0	pr.rot = 1 << 16	// set rotating predicates
-} { .mib
-(p_scr) br.cond.dpnt.many .copy_full_words
-;; }
-
+        cmp.le p12=r17,r34 
+        mov     r26 = r32
+ (p12)  br.cond.spnt.many aligned_very_huge // branch to copying very huge amounts (length >= r17, nominally 1K)
+        ;;
+}
+{ .mii
+	lfetch      [r25], 64
+    mov     ar.lc = r23
+    nop.i   0
+} {  .mii
+        add     r24 = 512, r32   // write stream
+        mov     r31 = r34
+        mov     pr.rot = 0x10000
+        ;;
+}
+huge_loop: // Accesses are interleaved to avoid bank conflict
+	.body
+{  .mmi
+        lfetch      [r25], 64
+ (p8)   lfetch.excl [r24], 128
+        ;;
+}
 { .mmi
-(p_xtr)	load	tempreg = [src], 8
-(p_xtr) add 	elemcnt = -1, elemcnt
-	movi0	ar.ec = MEMLAT + 1	// set the epilog counter
-;; }
+ (p16)  ld8     r35 = [r29], 8
+ (p16)  ld8     r38 = [r28], 8
+ (p16)  add     r31 = -64, r31
+}
 { .mmi
-(p_xtr) add	len = -8, len		//
-	add 	asrc = 16, src 		// one bank apart (for USE_INT)
-	shr.u	loopcnt = elemcnt, ALIGN_UNROLL_sh  // cater for unrolling
-;;}
+ (p18)  st8     [r26] = r37, 8
+ (p18)  st8     [r27] = r40, 8
+        ;;
+}
 { .mmi
-	add	loopcnt = -1, loopcnt
-(p_xtr)	store	[dest] = tempreg, 8	// copy the "extra" word
-	nop.i	0
-;; }
-{ .mib
-	add	adest = 16, dest
-	movi0	ar.lc = loopcnt 	// set the loop counter
-;; }
-
-#ifdef  GAS_ALIGN_BREAKS_UNWIND_INFO
-	{ nop 0 }
-#else
-	.align	32
-#endif
-#if defined(USE_FLP)
-.l1: // ------------------------------- // L1: Everything a multiple of 8
+ (p16)  ld8     r41 = [r29], 24
+ (p16)  ld8     r44 = [r28], 24
+        cmp.le  p9 = 8, r31
+}
 { .mmi
-#if defined(USE_LFETCH)
-(p[0])	lfetch.nt1 [ptr2],32
-#endif
-(p[0])	ldfp8	the_r[0],the_q[0] = [src], 16
-(p[0])	add	len = -32, len
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
-(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
-;; }
+ (p18)  st8     [r26] = r43, 24
+ (p18)  st8     [r27] = r46, 24
+        cmp.le  p10=1,r31
+        ;;
+}
 { .mmi
-#if defined(USE_LFETCH)
-(p[0])	lfetch.nt1 [ptr1],32
-#endif
-(p[0])	ldfp8	the_s[0], the_t[0] = [src], 16
-} {.mmb
-(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
-	br.ctop.dptk.many .l1
-;; }
-#elif defined(USE_INT)
-.l1: // ------------------------------- // L1: Everything a multiple of 8
+ (p16)  ld8     r47 = [r29], 8
+ (p16)  ld8     r50 = [r28], 8
+ (p10)  cmp.gt  p10 = 8, r31
+}
 { .mmi
-(p[0])	load	the_r[0] = [src], 8
-(p[0])	load	the_q[0] = [asrc], 8
-(p[0])	add	len = -32, len
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
-(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
-;; }
+ (p18)  st8     [r26] = r49, 8
+ (p18)  st8     [r27] = r52, 8
+	cmp.ne	p11=0,r31
+        ;;
+}
 { .mmi
-(p[0])	load	the_s[0]  = [src], 24
-(p[0])	load	the_t[0] = [asrc], 24
-} {.mmb
-(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
-#if defined(USE_LFETCH)
-;; }
+ (p16)  ld8     r53 = [r29], 24
+ (p16)  ld8     r56 = [r28], 24
+        tbit.z.unc  p0,p8=r25,6   // set the parameter for write pf every 2 iteration
+}
 { .mmb
-(p[0])	lfetch.nt1 [ptr2],32
-(p[0])	lfetch.nt1 [ptr1],32
-#endif
-	br.ctop.dptk.many .l1
-;; }
-#endif
-
-.copy_full_words:
-{ .mib
-	cmp.gt	p_scr, p0 = 8, len	//
-	shr.u	elemcnt = len, 3	//
-(p_scr) br.cond.dpnt.many .copy_bytes
-;; }
-{ .mii
-	load	tempreg = [src], 8
-	add	loopcnt = -1, elemcnt	//
-;; }
-{ .mii
-	cmp.ne	p_scr, p0 = 0, loopcnt	//
-	mov	ar.lc = loopcnt		//
-;; }
-
-.l2: // ------------------------------- // L2: Max 4 words copied separately
-{ .mmi
-	store	[dest] = tempreg, 8
-(p_scr)	load	tempreg = [src], 8	//
-	add	len = -8, len
-} { .mib
-	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
-	add	loopcnt = -1, loopcnt
-	br.cloop.dptk.few  .l2
-;; }
-
-.copy_bytes:
-{ .mib
-	cmp.eq	p_scr, p0 = len, r0	// is len == 0 ?
-	add	loopcnt = -1, len	// len--;
-(p_scr)	br.cond.spnt	.restore_and_exit
-;; }
-{ .mii
-	ld1	tmp2 = [src], 1
-	movi0	ar.lc = loopcnt
-	cmp.ne	p_scr, p0 = 0, loopcnt	// avoid load beyond end-point
-;; }
-
-.l3: // ------------------------------- // L3: Final byte move
-{ .mmi
-	st1	[dest] = tmp2, 1
-(p_scr)	ld1	tmp2 = [src], 1
-} { .mib
-	cmp.lt	p_scr, p0 = 1, loopcnt	// avoid load beyond end-point
-	add	loopcnt = -1, loopcnt
-	br.cloop.dptk.few  .l3
-;; }
+ (p18)  st8     [r26] = r55, 24
+ (p18)  st8     [r27] = r58, 24
+        br.ctop.dptk    huge_loop
+        ;;
+} { .mmi
+        mov     r33 = r29
+        mov     r34 = r31
+        mov     pr = r22,0x10000        // restore rotating predicates
+        ;;
+} { .mii
+        mov     r32 = r26
+        mov     ar.lc = r11
+        cmp.le  p9 = 8, r31
+}
+{ .mbb
+        cmp.le  p10=1,r31
+ (p11)	br.cond.dptk aligned            // use pred code for 1 <= n < 64
+        br.ret.sptk     b0
+        ;;
+}
+aligned_very_huge:
+{ .mii
+	    alloc	r29=ar.pfs,3,37,0,40
+        tbit.nz  p8,p0=r33,3
+	    shr.u   r17=r34,3  //length in 8-byte blocks
+}
+{.mmi
+        add r30=1,r0
+        add r28=1280,r33
+		adds r31 = 448,r33
+		;;
+}
+{.mmi 
+  (p8)  ldf8 f6=[r33],8
+  (p8)  adds r17=-1,r17
+  (p8)  adds r34=-8,r34
+}
+{.mii
+       // add r24=8000,r0  //- suggested for advance distance for src lfetch.nt1 in very_huge_loop
+        add r27=4000,r28
+		shl r30=r30,23  //8Megs
+        
+		;;
+}
 
-.restore_and_exit:
-{ .mmi
-	movi0	pr = saved_pr, -1	// restore the predicate registers
-;; }
-{ .mib
-	movi0	ar.lc = saved_lc	// restore the loop counter
-	br.ret.sptk.many b0
-;; }
+{.mii
+  (p8)  stf8 [r32]=f6,8
+        add r25=4000,r0  //advance distance for src lfetch.nta in ultra_huge_loop
+	  	shr.u r17=r17,2
+        ;;
+}
+{.mii
+        adds r17=-1,r17
+        adds r18=16,r32// 16 bytes away to avoid bank conflict
+        nop.i  0
+}
+{.mii
+       add r21=0,r34
+       mov.i ar.ec=9
+	   cmp.lt p10,p0=r30,r34 //length > 8Megs
+	   ;;
+}
+{.mii
+        mov r19=r32
+        mov r20=r33
+		mov.i ar.lc=r17
+}
+{.mib
+        adds r30=448,r32
+        mov pr.rot = 0x10000
+  (p10) br.cond.dptk.many ultra_huge_loop
+		;;
+}
+very_huge_loop:
+{.mmi
+      (p16) lfetch.nt1 [r31],32
+      (p16) ldfp8 f32,f41=[r20],16
+      (p16) adds r21=-32,r21
+}
+{.mmb 
+      (p24) stf8 [r19]=f40,8
+      (p24) stf8 [r18]=f58,8
+            nop.b 0x0;;
+}
+{.mmi 
+      (p16) lfetch.nt1 [r30],32
+      (p16) ldfp8 f50,f59=[r20],16
+//            tbit.z	p11,p0=r20,6
+}
+{.mmb
+      (p24) stf8 [r19]=f49,24
+      (p24) stf8 [r18]=f67,24
+            br.ctop.dptk.many very_huge_loop;;
+}
+{.mmi
+        mov r32=r19
+		mov r33=r20
+		mov r34=r21
+		;;
+}
+{ .mii
+        cmp.ne	p11,p0=0,r21
+ 		mov 	pr = r22,0x10000        // restore rotating predicates
+        mov.i   ar.lc = r11
+        
+}
+{ .mbb
+        nop.m   0
+ (p11)	br.cond.dptk aligned            // use pred code for 1 <= n < 64
+        br.ret.sptk     b0
+        ;;
+}
 
+//Used to copy more than 8Megs of data
+//Uses lfetch.nta to move data to L3 and lfetch.nt1 to move lines from L3 to L2
+//Copies 32 bytes per iteration
+ultra_huge_loop:
+{.mmi
+      (p10) lfetch.nt1 [r31],128
+      (p16) ldfp8 f32,f41=[r20],16
+      (p16) adds r21=-32,r21
+}
+{.mmi 
+      (p24) stf8 [r19]=f40,8
+      (p24) stf8 [r18]=f58,8
+            cmp.ge	p11,p0=r25,r26;;
+}
+{.mmi 
+      (p11) lfetch.nta [r28],128
+      (p16) lfetch.nt1 [r30],32
+            sub r27=r31,r20 
+	        ;;
+}
+{.mmi 
+            cmp.ge	p10,p0=r25,r27
+      (p16) ldfp8 f50,f59=[r20],16
+            sub r26=r28,r31
+}
+{.mmb
+      (p24) stf8 [r19]=f49,24
+      (p24) stf8 [r18]=f67,24
+            br.ctop.dptk.many ultra_huge_loop;;
+}
+{.mmi
+        mov r32=r19
+		mov r33=r20
+		mov r34=r21
+		;;
+}
+{ .mii
+        cmp.ne	p11,p0=0,r21
+ 		mov 	pr = r22,0x10000        // restore rotating predicates
+        mov.i   ar.lc = r11
+        
+}
+{ .mbb
+        nop.m   0
+ (p11)	br.cond.dptk aligned            // use pred code for 1 <= n < 64
+        br.ret.sptk     b0
+        ;;
+}
 
-.src_not_aligned:
-{ .mmi
-	cmp.gt	p_scr, p0 = 16, len
-	and	sh1 = 7, src 		// sh1 = src % 8
-	shr.u	loopcnt = len, 4	// element-cnt = len / 16
-} { .mib
-	add	tmp4 = @ltoff(.table), gp
-	add 	tmp3 = @ltoff(.loop56), gp
-(p_scr)	br.cond.dpnt.many .copy_bytes	// do byte by byte if too few
-;; }
-{ .mmi
-	and	asrc = -8, src		// asrc = (-8) -- align src for loop
-	add 	loopcnt = -1, loopcnt	// loopcnt--
-	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
-} { .mmi
-	ld8	ptable = [tmp4]		// ptable = &table
-	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
-	and	tmp2 = -16, len		// tmp2 = len & -OPSIZ
-;; }
-{ .mmi
-	add	tmp3 = ptable, sh1	// tmp3 = &table + sh1
-	add	src = src, tmp2		// src += len & (-16)
-	movi0	ar.lc = loopcnt		// set LC
-;; }
-{ .mmi
-	ld8	tmp4 = [tmp3]		// tmp4 = loop offset
-	sub	len = len, tmp2		// len -= len & (-16)
-	movi0	ar.ec = MEMLAT + 2 	// one more pass needed
-;; }
+// Used when src and dst alignments are the same.
+// Copies up to 7 bytes by conditionally copying 1, 2 and 4 bytes, depending on the value of the last three bits of the address.
+unaligned:
+{ .mii
+        cmp.ne	p14,p0= r28,r0    
+        tbit.nz p9,p0 = r16,1  //src
+        tbit.nz p10,p0 = r16,2  //src
+}
+{ .mii
+ (p8)   ld1 r17 = [r33],1      
+        ;;
+}
+{ .mii
+ (p9)   ld2 r18 = [r33],2      
+   		;;
+}
+{ .mii
+ (p10)  ld4 r19 = [r33],4      
+        ;;
+}
 { .mmi
-	ld8	s[1] = [asrc], 8	// preload
-	sub	loopaddr = ploop56,tmp4	// loopadd = &loop56 - loop offset
-	movi0   pr.rot = 1 << 16	// set rotating predicates
-;; }
+(p8)   st1 [r32] = r17,1
+(p8)   add r34 = -1,r34
+        nop.i   0
+		;;
+}
+{ .mii
+(p9)   st2 [r32] = r18,2
+(p9)   add r34 = -2,r34
+        nop.i   0
+        ;;
+}
 { .mib
-	nop.m	0
-	movi0	b6 = loopaddr
-	br	b6			// jump to the appropriate loop
-;; }
+(p10)   st4 [r32] = r19,4
+(p10)   add r34 = -4,r34
+        br.cond.dpnt    aligned
+        ;;
+}
+{ .mib                                   
+        br.ret.sptk     b0              
+        ;;
+}
+// end of memcpy
+        .endp  memcpy#
+        .type   ?0_memcopyA,@function
+        .global ?0_memcopyA#
+// End
 
-	LOOP(8)
-	LOOP(16)
-	LOOP(24)
-	LOOP(32)
-	LOOP(40)
-	LOOP(48)
-	LOOP(56)
-END(memcpy)
 libc_hidden_builtin_def (memcpy)
-
-	.rodata
-	.align 8
-.table:
-	data8	0			// dummy entry
-	data8 	.loop56 - .loop8
-	data8 	.loop56 - .loop16
-	data8 	.loop56 - .loop24
-	data8	.loop56 - .loop32
-	data8	.loop56 - .loop40
-	data8	.loop56 - .loop48
-	data8	.loop56 - .loop56
--- sysdeps/ia64/memmove.S.intel	2003-11-18 22:11:48.000000000 -0800
+++ sysdeps/ia64/memmove.S	2006-11-30 12:42:32.000000000 -0800
@@ -1,251 +1,113 @@
-/* Optimized version of the standard memmove() function.
-   This file is part of the GNU C Library.
-   Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
-   Contributed by Dan Pop <Dan.Pop@cern.ch>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* Return: dest
-
-   Inputs:
-        in0:    dest
-        in1:    src
-        in2:    byte count
-
-   The core of the function is the memcpy implementation used in memcpy.S.
-   When bytes have to be copied backwards, only the easy case, when
-   all arguments are multiples of 8, is optimised.
-
-   In this form, it assumes little endian mode.  For big endian mode,
-   sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
-   or the UM.be bit should be cleared at the beginning and set at the end.  */
+// memmove:	copy a counted number of bytes. 
+//
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// The first step is to test whether the 2 memory regions overlap.
+// Depending on the result, the copy is performed in an ascending (increasing
+// addresses) or descending (decreasing addresses) way.
+//	
+// The main loop uses address interleaving to avoid, or at least limit, bank conflicts.
+//
+//       Author: Steve Skedzielewski, JT Acquaviva
+//       Date:   February, 2002
+// 
 
 #include <sysdep.h>
 #undef ret
 
-#define OP_T_THRES 	16
-#define OPSIZ 		 8
-
-#define adest		r15
-#define saved_pr	r17
-#define saved_lc	r18
-#define dest		r19
-#define src		r20
-#define len		r21
-#define asrc		r22
-#define tmp2		r23
-#define tmp3		r24
-#define	tmp4		r25
-#define ptable		r26
-#define ploop56		r27
-#define	loopaddr	r28
-#define	sh1		r29
-#define loopcnt		r30
-#define	value		r31
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
-# define ALIGN(n)	{ nop 0 }
-#else
-# define ALIGN(n)	.align n
+#ifdef IS_IN_rtld
+#undef HIDDEN_JUMPTARGET
+#define HIDDEN_JUMPTARGET(name) name
 #endif
 
-#define LOOP(shift)							\
-		ALIGN(32);						\
-.loop##shift##:								\
-(p[0])		ld8	r[0] = [asrc], 8 ;	/* w1 */		\
-(p[MEMLAT+1])	st8	[dest] = value, 8 ;				\
-(p[MEMLAT])	shrp	value = r[MEMLAT], r[MEMLAT+1], shift ;		\
-		nop.b	0 ;						\
-		nop.b	0 ;						\
-		br.ctop.sptk .loop##shift ;				\
-		br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
-
-#define MEMLAT	21
-#define Nrot	(((2*MEMLAT+3) + 7) & ~7)
-
-ENTRY(memmove)
-	.prologue
-	alloc 	r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
-	.rotr	r[MEMLAT + 2], q[MEMLAT + 1]
-	.rotp	p[MEMLAT + 2]
-	mov	ret0 = in0		// return value = dest
-	.save pr, saved_pr
-	mov	saved_pr = pr		// save the predicate registers
-	.save ar.lc, saved_lc
-        mov 	saved_lc = ar.lc	// save the loop counter
-	.body
-	or	tmp3 = in0, in1 ;;	// tmp3 = dest | src
-	or	tmp3 = tmp3, in2	// tmp3 = dest | src | len
-	mov 	dest = in0		// dest
-	mov 	src = in1		// src
-	mov	len = in2		// len
-	sub	tmp2 = r0, in0		// tmp2 = -dest
-	cmp.eq	p6, p0 = in2, r0	// if (len == 0)
-(p6)	br.cond.spnt .restore_and_exit;;// 	return dest;
-	and	tmp4 = 7, tmp3 		// tmp4 = (dest | src | len) & 7
-	cmp.le	p6, p0 = dest, src	// if dest <= src it's always safe
-(p6)	br.cond.spnt .forward		// to copy forward
-	add	tmp3 = src, len;;
-	cmp.lt	p6, p0 = dest, tmp3	// if dest > src && dest < src + len
-(p6)	br.cond.spnt .backward		// we have to copy backward
-
-.forward:
-	shr.u	loopcnt = len, 4 ;;	// loopcnt = len / 16
-	cmp.ne	p6, p0 = tmp4, r0	// if ((dest | src | len) & 7 != 0)
-(p6)	br.cond.sptk .next		//	goto next;
-
-// The optimal case, when dest, src and len are all multiples of 8
-
-	and	tmp3 = 0xf, len
-	mov	pr.rot = 1 << 16	// set rotating predicates
-	mov	ar.ec = MEMLAT + 1 ;;	// set the epilog counter
-	cmp.ne	p6, p0 = tmp3, r0	// do we have to copy an extra word?
-	adds	loopcnt = -1, loopcnt;;	// --loopcnt
-(p6)	ld8	value = [src], 8;;
-(p6)	st8	[dest] = value, 8	// copy the "odd" word
-	mov	ar.lc = loopcnt 	// set the loop counter
-	cmp.eq	p6, p0 = 8, len
-(p6)	br.cond.spnt .restore_and_exit;;// the one-word special case
-	adds	adest = 8, dest		// set adest one word ahead of dest
-	adds	asrc = 8, src ;;	// set asrc one word ahead of src
-	nop.b	0			// get the "golden" alignment for
-	nop.b	0			// the next loop
-.l0:
-(p[0])		ld8	r[0] = [src], 16
-(p[0])		ld8	q[0] = [asrc], 16
-(p[MEMLAT])	st8	[dest] = r[MEMLAT], 16
-(p[MEMLAT])	st8	[adest] = q[MEMLAT], 16
-		br.ctop.dptk .l0 ;;
-
-	mov	pr = saved_pr, -1	// restore the predicate registers
-	mov	ar.lc = saved_lc	// restore the loop counter
-	br.ret.sptk.many b0
-.next:
-	cmp.ge	p6, p0 = OP_T_THRES, len	// is len <= OP_T_THRES
-	and	loopcnt = 7, tmp2 		// loopcnt = -dest % 8
-(p6)	br.cond.spnt	.cpyfew			// copy byte by byte
+	.section .text
+	.proc  memmove#
+	.align 32
+	.global memmove#
+memmove:
+ { .mmi				// -----------------> cycle 0
+	mov	r8=r32		// save dest pointer for return value
+	cmp.eq	p8=0,r34	// is length 0 ?
+        cmp.gtu p7=r32,r33	//  dst  > src ?
+ }
+ { .mib                       
+        add     r14=r34,r33
+	cmp.leu     p9=8,r34	// length >= 8 ?
+  (p8)  br.ret.dpnt.many	b0       // ---> if length is 0 branch out
+        ;;
+ }
+// Determine whether we need memcpy with ascending or descending addresses.
+// If both comparisons are true (dst > src and src + len > dst), jump to memcpyD.
+ { .mmi				// -----------------> cycle 1
+  (p7)  cmp.gtu.unc p8=r14,r32	//   (src + len) > dst ?
+        cmp.le p6=3,r34
+        add    r23=2,r33	// second src pointer  (for short copy)
+ } { .mbb
+        add	r22=2,r32	// second dst pointer  (for short copy)
+  (p8)  br.cond.dpnt	?0_memcpyD#  
+  (p9)	br.cond.dpnt	HIDDEN_JUMPTARGET (memcpy) // ---> if length > 7 goto large memcopy
 	;;
-	cmp.eq	p6, p0 = loopcnt, r0
-(p6)	br.cond.sptk	.dest_aligned
-	sub	len = len, loopcnt	// len -= -dest % 8
-	adds	loopcnt = -1, loopcnt	// --loopcnt
+ }
+// start short copy
+/// move the first byte to load the first level cache
+ { .mmi				// -----------------> cycle 2
+	ld1 r16=[r33],1		// load byte 1
+        ;;
+	st1 [r32]=r16,1		// store byte 1
+        cmp.le p8=5,r34
 	;;
-	mov	ar.lc = loopcnt
-.l1:					// copy -dest % 8 bytes
-	ld1	value = [src], 1	// value = *src++
+ } { .mmi			// -----------------> cycle 3
+(p6)    ld1 r14=[r33],2		// load byte 2
+(p6)    ld1 r15=[r23],2		// load byte 3
+	cmp.le p12,p13=7,r34
 	;;
-	st1	[dest] = value, 1	// *dest++ = value
-	br.cloop.dptk .l1
-.dest_aligned:
-	and	sh1 = 7, src 		// sh1 = src % 8
-	and	tmp2 = -8, len   	// tmp2 = len & -OPSIZ
-	and	asrc = -8, src		// asrc = src & -OPSIZ  -- align src
-	shr.u	loopcnt = len, 3	// loopcnt = len / 8
-	and	len = 7, len;;		// len = len % 8
-	adds	loopcnt = -1, loopcnt	// --loopcnt
-	addl	tmp4 = @ltoff(.table), gp
-	addl	tmp3 = @ltoff(.loop56), gp
-	mov     ar.ec = MEMLAT + 1	// set EC
-	mov     pr.rot = 1 << 16;;	// set rotating predicates
-	mov	ar.lc = loopcnt		// set LC
-	cmp.eq  p6, p0 = sh1, r0 	// is the src aligned?
-(p6)    br.cond.sptk .src_aligned
-	add	src = src, tmp2		// src += len & -OPSIZ
-	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
-	ld8	ploop56 = [tmp3]	// ploop56 = &loop56
-	ld8	ptable = [tmp4];;	// ptable = &table
-	add	tmp3 = ptable, sh1;;	// tmp3 = &table + sh1
-	mov	ar.ec = MEMLAT + 1 + 1 // one more pass needed
-	ld8	tmp4 = [tmp3];;		// tmp4 = loop offset
-	sub	loopaddr = ploop56,tmp4	// loopadd = &loop56 - loop offset
-	ld8	r[1] = [asrc], 8;;	// w0
-	mov	b6 = loopaddr;;
-	br	b6			// jump to the appropriate loop
-
-	LOOP(8)
-	LOOP(16)
-	LOOP(24)
-	LOOP(32)
-	LOOP(40)
-	LOOP(48)
-	LOOP(56)
-
-.src_aligned:
-.l3:
-(p[0])		ld8	r[0] = [src], 8
-(p[MEMLAT])	st8	[dest] = r[MEMLAT], 8
-		br.ctop.dptk .l3
-.cpyfew:
-	cmp.eq	p6, p0 = len, r0	// is len == 0 ?
-	adds	len = -1, len		// --len;
-(p6)	br.cond.spnt	.restore_and_exit ;;
-	mov	ar.lc = len
-.l4:
-	ld1	value = [src], 1
+ } { .mmi
+(p8)	ld1 r16=[r33],2		// load byte 4
+(p8)	ld1 r17=[r23],2		// load byte 5
+(p13)	tbit.z	p12=r34,0	// move last byte if len=7 or len is even
+ } { .mmi			// -----------------> cycle 4
+(p6)	st1 [r32]=r14,2		// store byte 2
+(p6)    st1 [r22]=r15,2		// store byte 3
+	cmp.le p14=7,r34
 	;;
-	st1	[dest] = value, 1
-	br.cloop.dptk	.l4 ;;
-.restore_and_exit:
-	mov     pr = saved_pr, -1    	// restore the predicate registers
-	mov 	ar.lc = saved_lc	// restore the loop counter
-	br.ret.sptk.many b0
-
-// In the case of a backward copy, optimise only the case when everything
-// is a multiple of 8, otherwise copy byte by byte.  The backward copy is
-// used only when the blocks are overlapping and dest > src.
-
-.backward:
-	shr.u	loopcnt = len, 3	// loopcnt = len / 8
-	add	src = src, len		// src points one byte past the end
-	add	dest = dest, len ;; 	// dest points one byte past the end
-	mov	ar.ec = MEMLAT + 1	// set the epilog counter
-	mov	pr.rot = 1 << 16	// set rotating predicates
-	adds	loopcnt = -1, loopcnt	// --loopcnt
-	cmp.ne	p6, p0 = tmp4, r0	// if ((dest | src | len) & 7 != 0)
-(p6)	br.cond.sptk .bytecopy ;;	// copy byte by byte backward
-	adds	src = -8, src		// src points to the last word
-	adds	dest = -8, dest 	// dest points to the last word
-	mov	ar.lc = loopcnt;;	// set the loop counter
-.l5:
-(p[0])		ld8	r[0] = [src], -8
-(p[MEMLAT])	st8	[dest] = r[MEMLAT], -8
-		br.ctop.dptk .l5
-		br.cond.sptk .restore_and_exit
-.bytecopy:
-	adds	src = -1, src		// src points to the last byte
-	adds	dest = -1, dest		// dest points to the last byte
-	adds	loopcnt = -1, len;;	// loopcnt = len - 1
-	mov	ar.lc = loopcnt;;	// set the loop counter
-.l6:
-(p[0])		ld1	r[0] = [src], -1
-(p[MEMLAT])	st1	[dest] = r[MEMLAT], -1
-		br.ctop.dptk .l6
-		br.cond.sptk .restore_and_exit
-END(memmove)
-
-	.rodata
-	.align 8
-.table:
-	data8	0			// dummy entry
-	data8 	.loop56 - .loop8
-	data8 	.loop56 - .loop16
-	data8 	.loop56 - .loop24
-	data8	.loop56 - .loop32
-	data8	.loop56 - .loop40
-	data8	.loop56 - .loop48
-	data8	.loop56 - .loop56
+ } { .mmi			// -----------------> cycle 5
+(p12)   ld1 r14=[r33],2		// load byte 2, 4, or 6
+(p14)   ld1 r15=[r23],2		// load byte 7
+ } { .mmi
+(p8)	st1 [r32]=r16,2		// store byte 4
+(p8)	st1 [r22]=r17,2		// store byte 5
+        ;;
+ } { .mmb			// -----------------> cycle 6
+(p12)   st1 [r32]=r14,2		// store byte 2, 4, or 6
+(p14)   st1 [r22]=r15,2		// store byte 7
+	br.ret.dptk b0
+ }
+// end of short memcopy
+// -- End  memmove
+	.endp  memmove#
+	.type	HIDDEN_JUMPTARGET (memcpy),@function
+	.global HIDDEN_JUMPTARGET (memcpy)
+	.type	?0_memcpyD#,@function
+	.global ?0_memcpyD#
+// End
 
 libc_hidden_builtin_def (memmove)
--- sysdeps/ia64/memset.S.intel	2003-11-18 22:11:26.000000000 -0800
+++ sysdeps/ia64/memset.S	2006-11-30 11:29:24.000000000 -0800
@@ -1,400 +1,388 @@
-/* Optimized version of the standard memset() function.
-   This file is part of the GNU C Library.
-   Copyright (C) 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
-   Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
-   Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* Return: dest
-
-   Inputs:
-        in0:    dest
-        in1:    value
-        in2:    count
-
-   The algorithm is fairly straightforward: set byte by byte until we
-   we get to a 16B-aligned address, then loop on 128 B chunks using an
-   early store as prefetching, then loop on 32B chucks, then clear remaining
-   words, finally clear remaining bytes.
-   Since a stf.spill f0 can store 16B in one go, we use this instruction
-   to get peak speed when value = 0.  */
+// memset:	function to set a number of bytes to a char value
+//
+// Copyright (c) 2000-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+//       Author: Steve Skedzielewski
+//       Date:   June, 2000
+//       Date:   February, 2002 - update
+// 
 
 #include <sysdep.h>
 #undef ret
 
-#define dest		in0
-#define value		in1
-#define	cnt		in2
-
-#define tmp		r31
-#define save_lc		r30
-#define ptr0		r29
-#define ptr1		r28
-#define ptr2		r27
-#define ptr3		r26
-#define ptr9 		r24
-#define	loopcnt		r23
-#define linecnt		r22
-#define bytecnt		r21
-
-#define fvalue		f6
-
-// This routine uses only scratch predicate registers (p6 - p15)
-#define p_scr		p6			// default register for same-cycle branches
-#define p_nz		p7
-#define p_zr		p8
-#define p_unalgn	p9
-#define p_y		p11
-#define p_n		p12
-#define p_yy		p13
-#define p_nn		p14
-
-#define movi0		mov
-
-#define MIN1		15
-#define MIN1P1HALF	8
-#define LINE_SIZE	128
-#define LSIZE_SH        7			// shift amount
-#define PREF_AHEAD	8
-
-#define USE_FLP
-#if defined(USE_INT)
-#define store		st8
-#define myval           value
-#elif defined(USE_FLP)
-#define store		stf8
-#define myval		fvalue
-#endif
-
-.align	64
-ENTRY(memset)
-{ .mmi
+	.section .text
+// -- Begin  memset
+	.proc  memset#
+	.align 32
+// Replicate the value into all bytes using mmx broadcast
+// Fall through to aligned short (<16 bytes) code
+// live out:	r21 (alignment), r31(replicated c),
+//		r32(s), r33(c), r34(n)
+	.global memset#
 	.prologue
-	alloc	tmp = ar.pfs, 3, 0, 0, 0
-	lfetch.nt1 [dest]
-	.save   ar.lc, save_lc
-	movi0	save_lc = ar.lc
+memset:
+	mov	r8=r32			// Return value
+	cmp.le	p14=16,r34
+	and	r22=0xF,r32		// Spec test for 16-byte boundary
+	and	r21=7,r32		// Spec test for 8-byte boundary
+	mux1	r31=r33,@brcst		// Replicate byte value
+ (p14)	br.cond.dpnt	Not_short
+	;;
+// Handle short values quickly
+	cmp.ne	p15=0,r21		// If zero, skip alignment
+	cmp.le	p11,p10=8,r34		// Spec test for st8 safety
+	tbit.nz	p13,p12=r32,0		// Spec test for st1 alignment
+	cmp.ge	p14=0,r34		// Spec test for early exit
+ (p14)	br.ret.dpnt	b0
+ (p15)	br.cond.dpnt	Align_short
+	;;
+// We're aligned and p11/p10 is set/clear if we need to do the st8
+// Use complementary predicates to allow length tests in parallel with store
+Short:
+{ .mmi
+	.pred.rel "mutex",p10,p11
+ (p11)	st8	[r32]=r31,8
+ (p11)	cmp.le	p13,p12=12,r34
+ (p10)	cmp.le	p13,p12=4,r34
 } { .mmi
-	.body
-	mov	ret0 = dest		// return value
-	cmp.ne	p_nz, p_zr = value, r0	// use stf.spill if value is zero
-	cmp.eq	p_scr, p0 = cnt, r0
-;; }
-{ .mmi
-	and	ptr2 = -(MIN1+1), dest	// aligned address
-	and	tmp = MIN1, dest	// prepare to check for alignment
-	tbit.nz p_y, p_n = dest, 0	// Do we have an odd address? (M_B_U)
-} { .mib
-	mov	ptr1 = dest
-	mux1	value = value, @brcst	// create 8 identical bytes in word
-(p_scr)	br.ret.dpnt.many rp		// return immediately if count = 0
-;; }
-{ .mib
-	cmp.ne	p_unalgn, p0 = tmp, r0
-} { .mib				// NB: # of bytes to move is 1 higher
-	sub	bytecnt = (MIN1+1), tmp	//     than loopcnt
-	cmp.gt	p_scr, p0 = 16, cnt		// is it a minimalistic task?
-(p_scr)	br.cond.dptk.many .move_bytes_unaligned	// go move just a few (M_B_U)
-;; }
-{ .mmi
-(p_unalgn) add	ptr1 = (MIN1+1), ptr2		// after alignment
-(p_unalgn) add	ptr2 = MIN1P1HALF, ptr2		// after alignment
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3	// should we do a st8 ?
-;; }
-{ .mib
-(p_y)	add	cnt = -8, cnt
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2	// should we do a st4 ?
-} { .mib
-(p_y)	st8	[ptr2] = value, -4
-(p_n)	add	ptr2 = 4, ptr2
-;; }
-{ .mib
-(p_yy)	add	cnt = -4, cnt
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1	// should we do a st2 ?
+	.pred.rel "mutex",p12,p13
+ (p11)	add	r34=-8,r34
+	;;
+ (p13)	st4	[r32]=r31,4
+ (p13)	cmp.le	p11,p10=6,r34
+} { .mii
+ (p12)	cmp.le	p11,p10=2,r34
+	.pred.rel "mutex",p10,p11
+ (p13)	add	r34=-4,r34
+	;;
+ (p11)	cmp.le	p13=3,r34
+} { .mii
+ (p11)	st2	[r32]=r31,2
+ (p10)	cmp.le	p13=1,r34
+	;;
 } { .mib
-(p_yy)	st4	[ptr2] = value, -2
-(p_nn)	add	ptr2 = 2, ptr2
-;; }
-{ .mmi
-	mov	tmp = LINE_SIZE+1		// for compare
-(p_y)	add	cnt = -2, cnt
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0	// should we do a st1 ?
+ (p13)	st1	[r32]=r31
+	br.ret.sptk	b0
+	;;
+}
+// Align, while taking care not to exceed length
+// Similar to aligned code above, but adds an alignment test to length test
+Align_short:
+{ .mmi
+	.pred.rel "mutex",p12,p13
+ (p13)	st1	[r32]=r33,1
+ (p13)	cmp.le	p11,p10=3,r34
+ (p12)	cmp.le	p11,p10=2,r34
+} { .mii
+ (p13)	add	r34=-1,r34
+	;;
+ (p11)	tbit.nz	p11,p10=r32,1		// length is OK, are we on 2-byte boundary?
+	;;
 } { .mmi
-	setf.sig fvalue=value			// transfer value to FLP side
-(p_y)	st2	[ptr2] = value, -1
-(p_n)	add	ptr2 = 1, ptr2
-;; }
-
-{ .mmi
-(p_yy)	st1	[ptr2] = value
-  	cmp.gt	p_scr, p0 = tmp, cnt		// is it a minimalistic task?
-} { .mbb
-(p_yy)	add	cnt = -1, cnt
-(p_scr)	br.cond.dpnt.many .fraction_of_line	// go move just a few
-;; }
-
-{ .mib
-	nop.m 0
-	shr.u	linecnt = cnt, LSIZE_SH
-(p_zr)	br.cond.dptk.many .l1b			// Jump to use stf.spill
-;; }
-
-#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
-	.align 32 // -------- //  L1A: store ahead into cache lines; fill later
-#endif
-{ .mmi
-	and	tmp = -(LINE_SIZE), cnt		// compute end of range
-	mov	ptr9 = ptr1			// used for prefetching
-	and	cnt = (LINE_SIZE-1), cnt	// remainder
+	.pred.rel "mutex",p10,p11
+ (p11)	st2	[r32]=r31,2
+ (p10)	cmp.le	p13,p12=4,r34
+ (p11)	cmp.le	p13,p12=6,r34
 } { .mmi
-	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
-	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
-;; }
-{ .mmi
-(p_scr)	add	loopcnt = -1, linecnt		// start of stores
-	add	ptr2 = 8, ptr1			// (beyond prefetch stores)
-	add	ptr1 = tmp, ptr1		// first address beyond total
-;; }						// range
-{ .mmi
-	add	tmp = -1, linecnt		// next loop count
-	movi0	ar.lc = loopcnt
-;; }
-.pref_l1a:
-{ .mib
-	store [ptr9] = myval, 128	// Do stores one cache line apart
-	nop.i	0
-	br.cloop.dptk.few .pref_l1a
-;; }
-{ .mmi
-	add	ptr0 = 16, ptr2		// Two stores in parallel
-	movi0	ar.lc = tmp
-;; }
-.l1ax:
- { .mmi
-	store [ptr2] = myval, 8
-	store [ptr0] = myval, 8
- ;; }
- { .mmi
-	store [ptr2] = myval, 24
-	store [ptr0] = myval, 24
- ;; }
- { .mmi
-	store [ptr2] = myval, 8
-	store [ptr0] = myval, 8
- ;; }
- { .mmi
-	store [ptr2] = myval, 24
-	store [ptr0] = myval, 24
- ;; }
- { .mmi
-	store [ptr2] = myval, 8
-	store [ptr0] = myval, 8
- ;; }
- { .mmi
-	store [ptr2] = myval, 24
-	store [ptr0] = myval, 24
- ;; }
- { .mmi
-	store [ptr2] = myval, 8
-	store [ptr0] = myval, 32
- 	cmp.lt	p_scr, p0 = ptr9, ptr1		// do we need more prefetching?
- ;; }
-{ .mmb
-	store [ptr2] = myval, 24
-(p_scr)	store [ptr9] = myval, 128
-	br.cloop.dptk.few .l1ax
-;; }
-{ .mbb
-	cmp.le  p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr) br.cond.dpnt.many  .fraction_of_line	// Branch no. 2
-	br.cond.dpnt.many  .move_bytes_from_alignment	// Branch no. 3
-;; }
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
-	{ nop 0 }
-#else
-	.align 32
-#endif
-.l1b:	// ------------------ //  L1B: store ahead into cache lines; fill later
-{ .mmi
-	and	tmp = -(LINE_SIZE), cnt		// compute end of range
-	mov	ptr9 = ptr1			// used for prefetching
-	and	cnt = (LINE_SIZE-1), cnt	// remainder
+ (p11)	add	r34=-2,r34
+	;;
+ (p13)	tbit.nz	p13,p12=r32,2
+	;;
 } { .mmi
-	mov	loopcnt = PREF_AHEAD-1		// default prefetch loop
-	cmp.gt	p_scr, p0 = PREF_AHEAD, linecnt	// check against actual value
-;; }
-{ .mmi
-(p_scr)	add	loopcnt = -1, linecnt
-	add	ptr2 = 16, ptr1	// start of stores (beyond prefetch stores)
-	add	ptr1 = tmp, ptr1	// first address beyond total range
-;; }
-{ .mmi
-	add	tmp = -1, linecnt	// next loop count
-	movi0	ar.lc = loopcnt
-;; }
-.pref_l1b:
-{ .mib
-	stf.spill [ptr9] = f0, 128	// Do stores one cache line apart
-	nop.i   0
-	br.cloop.dptk.few .pref_l1b
-;; }
-{ .mmi
-	add	ptr0 = 16, ptr2		// Two stores in parallel
-	movi0	ar.lc = tmp
-;; }
-.l1bx:
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
-	stf.spill [ptr2] = f0, 32
-	stf.spill [ptr0] = f0, 64
- 	cmp.lt	p_scr, p0 = ptr9, ptr1	// do we need more prefetching?
- ;; }
-{ .mmb
-	stf.spill [ptr2] = f0, 32
-(p_scr)	stf.spill [ptr9] = f0, 128
-	br.cloop.dptk.few .l1bx
-;; }
-{ .mib
-	cmp.gt  p_scr, p0 = 8, cnt	// just a few bytes left ?
-(p_scr)	br.cond.dpnt.many  .move_bytes_from_alignment
-;; }
-
-.fraction_of_line:
-{ .mib
-	add	ptr2 = 16, ptr1
-	shr.u	loopcnt = cnt, 5   	// loopcnt = cnt / 32
-;; }
-{ .mib
-	cmp.eq	p_scr, p0 = loopcnt, r0
-	add	loopcnt = -1, loopcnt
-(p_scr)	br.cond.dpnt.many store_words
-;; }
-{ .mib
-	and	cnt = 0x1f, cnt		// compute the remaining cnt
-	movi0   ar.lc = loopcnt
-;; }
-#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
-	.align 32
-#endif
-.l2:	// ---------------------------- //  L2A:  store 32B in 2 cycles
-{ .mmb
-	store	[ptr1] = myval, 8
-	store	[ptr2] = myval, 8
-;; } { .mmb
-	store	[ptr1] = myval, 24
-	store	[ptr2] = myval, 24
-	br.cloop.dptk.many .l2
-;; }
-store_words:
-{ .mib
-	cmp.gt	p_scr, p0 = 8, cnt		// just a few bytes left ?
-(p_scr)	br.cond.dpnt.many .move_bytes_from_alignment	// Branch
-;; }
-
-{ .mmi
-	store	[ptr1] = myval, 8		// store
-	cmp.le	p_y, p_n = 16, cnt		//
-	add	cnt = -8, cnt			// subtract
-;; }
-{ .mmi
-(p_y)	store	[ptr1] = myval, 8		// store
-(p_y)	cmp.le.unc p_yy, p_nn = 16, cnt		//
-(p_y)	add	cnt = -8, cnt			// subtract
-;; }
-{ .mmi						// store
-(p_yy)	store	[ptr1] = myval, 8		//
-(p_yy)	add	cnt = -8, cnt			// subtract
-;; }
-
-.move_bytes_from_alignment:
-{ .mib
-	cmp.eq	p_scr, p0 = cnt, r0
-	tbit.nz.unc p_y, p0 = cnt, 2	// should we terminate with a st4 ?
-(p_scr)	br.cond.dpnt.few .restore_and_exit
-;; }
-{ .mib
-(p_y)	st4	[ptr1] = value, 4
-	tbit.nz.unc p_yy, p0 = cnt, 1	// should we terminate with a st2 ?
-;; }
-{ .mib
-(p_yy)	st2	[ptr1] = value, 2
-	tbit.nz.unc p_y, p0 = cnt, 0
-;; }
-
-{ .mib
-(p_y)	st1	[ptr1] = value
-;; }
-.restore_and_exit:
-{ .mib
-	nop.m	0
-	movi0	ar.lc = save_lc
-	br.ret.sptk.many rp
-;; }
+	.pred.rel "mutex",p12,p13
+ (p13)	st4	[r32]=r31,4
+ (p12)	cmp.le	p11,p10=8,r34
+ (p13)	cmp.le	p11,p10=12,r34
+} { .mib
+ (p13)	add	r34=-4,r34
+ 	br.cond.sptk	Short
+	;;
+}	
+// Code for lengths >= 16
+// If we're not on a 16-byte boundary, move to one
+// live out: r31 (replicated c), r33(unsigned c), r32(s), r34(unsigned n)
+Not_short:
+	cmp.ne	p15=0,r22		//0: Low 4 bits zero?
+	cmp.ne	p11,p10=0,r33
+	tbit.nz	p13,p12=r32,0		// Spec test for st1 alignment
+  (p15)	br.cond.dpnt	Align_long
+	;;
+// OK, it's long, it's aligned to a 16-byte boundary.
+// If r33 is not zero, skip to st8 code, otherwise fall into spill f0 version
+Is_aligned:
+	cmp.ne	p14=0,r33		// Check value of fill character
+	add	r16=128,r32	// prefetch pointer
+	.save	ar.lc,r11
+	mov	r11=ar.lc
+	mov	r24=r34
+  (p14)	br.cond.dpnt	Nonzero
+	;;
+//
+// Version when memset is clearing memory
+//
+	.body
+	add	r17=16,r32	// second spill pointer
+	cmp.le	p13=32,r34	// Spec for first set of spills
+	cmp.ge	p14=127,r34
+	and	r24=127,r34
+	mov	r21=144		// = 128+16, length needed for second prefetch
+ (p14)	br.cond.dpnt		Zero_medium
+//
+/// Enter loop code when length is at least 128
+/// Prefetch each line with a spill
+///
+	stf.spill	[r32]=f0,32
+	;;
+	cmp.le		p9=r21,r34
+	shr.u		r22=r34,7	// line size is 128
+	add		r21=128,r21	// next prefetch safe length
+	;;
+ (p9)	stf.spill	[r16]=f0,128
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	add		r22=-1,r22	// Loop count
+	;;
+	mov		ar.lc=r22
+ (p9)	stf.spill	[r16]=f0,128
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	;;
+ (p9)	stf.spill	[r16]=f0,128
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	;;
+ (p9)	stf.spill	[r16]=f0,128
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	;;
+ (p9)	stf.spill	[r16]=f0,128
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	;;
+// Counted loop storing 128 bytes/iteration,
+/// with out-of-order spills causing line prefetch
+// live out:	r11(ar.lc), r17(s+16), r23(128-n&15), r24(n&15), r32(s)
+//              r33(replicated c), r34(n), p13(n&15>32)
+Zero_loop:
+ (p9)	stf.spill	[r16]=f0,128
+	stf.spill	[r17]=f0,32
+	cmp.le		p9=r21,r34
+	;;
+	stf.spill	[r32]=f0,32
+	stf.spill	[r17]=f0,32
+	add		r21=128,r21	// next prefetch safe length
+	;;
+	stf.spill	[r32]=f0,32
+	stf.spill	[r17]=f0,32
+	cmp.le		p13=32,r24
+	;;
+	stf.spill	[r32]=f0,64
+	stf.spill	[r17]=f0,32
+	br.cloop.sptk	Zero_loop
+	;;
+	add		r32=-32,r32
+	;;
+Zero_medium:
+ (p13)	stf.spill	[r32]=f0,32	// Redundant if entered from loop path
+ (p13)	stf.spill	[r17]=f0,32
+	cmp.le		p12=64,r24
+	;;
+ (p12)	stf.spill	[r32]=f0,32
+ (p12)	stf.spill	[r17]=f0,32
+	cmp.le		p11=96,r24
+	;;
+ (p11)	stf.spill	[r32]=f0,32
+ (p11)	stf.spill	[r17]=f0,32
+	tbit.nz		p10=r24,4
+	;;
+ (p10)	stf.spill	[r32]=f0,16
+	tbit.nz		p9=r24,3
+	;;
+ (p9)	st8		[r32]=r0,8
+	tbit.nz		p13=r24,2
+	;;
+// 
+// Clean up any partial word stores.
+//	
+	tbit.nz		p12=r24,1
+ (p13)	st4		[r32]=r0,4
+	;;
+ (p12)	st2		[r32]=r0,2
+	tbit.nz		p11=r24,0
+	;;
+ (p11)	st1		[r32]=r0,1
+	mov		ar.lc=r11
+	br.ret.sptk.many	b0
+	;;
+//
+// Fill character is not zero
+// Now that p is aligned to a 16-byte boundary
+//     use straight-line code for n<MINIMUM_LONG (64), a loop otherwise
+// live out:	r8 (return value, original value of r32)
+//		p14 (n>=MINIMUM_LONG)
+//
+Nonzero:
+	MINIMUM_LONG=0x40
+	add	r17=8,r32		//0: second pointer
+	mov	r21=136		// = 128+8, length needed for second prefetch
+	add	r22=64,r34	// May need extra 1/2 iteration
+	cmp.le	p13=16,r34	// Spec for use when loop is skipped
+	cmp.gt	p14=MINIMUM_LONG,r34
+ (p14)	br.cond.dpnt	Nonzero_medium
+	;;
+//
+/// Enter loop code when length is at least MINIMUM_LONG (64)
+/// Prefetch each line with a st8
+///
+	st8		[r32]=r31,16
+	cmp.le		p9=r21,r34
+	shr.u		r22=r22,7	// line size is 128
+	add		r21=128,r21	// next prefetch safe length
+	;;
+ (p9)	st8		[r16]=r31,128
+	add		r22=-1,r22	// Loop count
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	;;
+	mov		ar.lc=r22
+ (p9)	st8		[r16]=r31,128
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	;;
+ (p9)	st8		[r16]=r31,128
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	;;
+ (p9)	st8		[r16]=r31,128
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	;;
+ (p9)	st8		[r16]=r31,128
+	cmp.le		p9=r21,r34
+	add		r21=128,r21	// next prefetch safe length
+	;;
+// Counted loop storing up to 128 bytes/iteration (second 64 bytes only under p10),
+/// with out-of-order st8 stores causing line prefetch
+// live out:	r11(ar.lc), r17(s+16), r23(128-n&15), r24(n&15), r32(s)
+//              r33(replicated c), r34(n), p13(n&15>32)
+Nonzero_loop:
+ (p9)	st8		[r16]=r31,128
+	st8		[r17]=r31,16
+	cmp.lt		p10,p11=127,r24	// should we store the last 64?
+	;;
+	st8		[r32]=r31,16
+	st8		[r17]=r31,16
+ (p10)	add		r24=-128,r24	// Update count of remaining bytes
+	;;
+	st8		[r32]=r31,16
+	st8		[r17]=r31,16
+ (p11)	add		r24=-64,r24	// Update count of remaining bytes
+	;;
+	st8		[r32]=r31,16
+	st8		[r17]=r31,16
+	cmp.le		p9=r21,r34	// Compare prefetch offset with length
+	;;
+ (p10)	st8		[r32]=r31,16
+ (p10)	st8		[r17]=r31,16
+	add		r21=128,r21	// next prefetch-safe length
+	;;
+ (p10)	st8		[r32]=r31,16
+ (p10)	st8		[r17]=r31,16
+	cmp.le		p13=16,r24	// Spec for epilog
+	;;
+ (p10)	st8		[r32]=r31,16
+ (p10)	st8		[r17]=r31,16
+// (p10)	cmp.lt.unc	p11,p12=64,r24	// p11 true if we need another iter
+	;;
+//  {.mmi
+ (p10)	st8		[r32]=r31,32
+ (p10)	st8		[r17]=r31,16
+//} {.mib
+//	.pred.rel "mutex",p11,p12
+// (p11)	add		r32=32,r32	// skip the bytes stored out-of-order
+// (p12)	add		r32=16,r32	// prepare for epilogue
+	br.cloop.sptk	Nonzero_loop
+	;;
+//}
+ (p10)	add	r32=-16,r32
+	;;
+// Short memsets are done with predicated straightline code
+// live out:	r8 (return value, original value of r32)
+Nonzero_medium:
+ (p13)	st8	[r32]=r31,16
+ (p13)	st8	[r17]=r31,16
+	cmp.le	p12=0x20,r24		//0: 32 <= n?
+	;;
+ (p12)	st8	[r32]=r31,16
+ (p12)	st8	[r17]=r31,16
+	cmp.le	p11=0x30,r24		//0: 48 <= n?
+	;;
+ (p11)	st8	[r32]=r31,16
+ (p11)	st8	[r17]=r31,16
+	tbit.nz	p10=r24,3
+	;;
+ (p10)	st8	[r32]=r31,8
+	tbit.nz	p9=r24,2
+	;;
+// 
+// Clean up any partial word stores.
+//	
+	tbit.nz	p8=r24,1
+ (p9)	st4	[r32]=r31,4
+	;;
+ (p8)	st2	[r32]=r31,2
+	tbit.nz	p7=r24,0
+	;;
+ (p7)	st1	[r32]=r31,1
+	mov	ar.lc=r11
+	br.ret.sptk.many	b0
+	;;
+Align_long:
+ (p13)	st1	[r32]=r33,1
+ (p13)	add	r34=-1,r34
+	;;
+	tbit.nz	p13=r32,1
+	;;
+ (p13)	st2	[r32]=r31,2
+ (p13)	add	r34=-2,r34
+	;;
+	tbit.nz	p13=r32,2
+	;;
+ (p13)	st4	[r32]=r31,4
+ (p13)	add	r34=-4,r34
+	;;
+	tbit.nz	p13,p12=r32,3
+	;;
+ (p13)	st8	[r32]=r31,8
+ (p13)	add	r34=-8,r34
+	;;
+ 	cmp.le	p11,p10=8,r34		// Spec for entry to Short
+ 	cmp.le	p13,p12=16,r34	
+ (p12)	br.cond.dpnt	Short
+	br.cond.dptk	Is_aligned
+	;;
+//
+// -- End  memset
+	.endp  memset#
+// End
 
-.move_bytes_unaligned:
-{ .mmi
-       .pred.rel "mutex",p_y, p_n
-       .pred.rel "mutex",p_yy, p_nn
-(p_n)	cmp.le  p_yy, p_nn = 4, cnt
-(p_y)	cmp.le  p_yy, p_nn = 5, cnt
-(p_n)	add	ptr2 = 2, ptr1
-} { .mmi
-(p_y)	add	ptr2 = 3, ptr1
-(p_y)	st1	[ptr1] = value, 1	// fill 1 (odd-aligned) byte
-(p_y)	add	cnt = -1, cnt		// [15, 14 (or less) left]
-;; }
-{ .mmi
-(p_yy)	cmp.le.unc p_y, p0 = 8, cnt
-	add	ptr3 = ptr1, cnt	// prepare last store
-	movi0	ar.lc = save_lc
-} { .mmi
-(p_yy)	st2	[ptr1] = value, 4	// fill 2 (aligned) bytes
-(p_yy)	st2	[ptr2] = value, 4	// fill 2 (aligned) bytes
-(p_yy)	add	cnt = -4, cnt		// [11, 10 (o less) left]
-;; }
-{ .mmi
-(p_y)	cmp.le.unc p_yy, p0 = 8, cnt
-	add	ptr3 = -1, ptr3		// last store
-	tbit.nz p_scr, p0 = cnt, 1	// will there be a st2 at the end ?
-} { .mmi
-(p_y)	st2	[ptr1] = value, 4	// fill 2 (aligned) bytes
-(p_y)	st2	[ptr2] = value, 4	// fill 2 (aligned) bytes
-(p_y)	add	cnt = -4, cnt		// [7, 6 (or less) left]
-;; }
-{ .mmi
-(p_yy)	st2	[ptr1] = value, 4	// fill 2 (aligned) bytes
-(p_yy)	st2	[ptr2] = value, 4	// fill 2 (aligned) bytes
-					// [3, 2 (or less) left]
-	tbit.nz p_y, p0 = cnt, 0	// will there be a st1 at the end ?
-} { .mmi
-(p_yy)	add	cnt = -4, cnt
-;; }
-{ .mmb
-(p_scr)	st2	[ptr1] = value		// fill 2 (aligned) bytes
-(p_y)	st1	[ptr3] = value		// fill last byte (using ptr3)
-	br.ret.sptk.many rp
-;; }
-END(memset)
 libc_hidden_builtin_def (memset)
--- sysdeps/ia64/serial-memmove.S.intel	2006-12-01 09:24:41.000000000 -0800
+++ sysdeps/ia64/serial-memmove.S	2006-11-30 12:39:45.000000000 -0800
@@ -0,0 +1,518 @@
+// ?1__serial_memmove:	
+// 
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+//  This routine tests two memory zones selected as source / destination
+//  for a copy loop.
+//  1/ if there is no overlap, i.e. (source + length) < destination, call memcpy
+//  2/ if destination < source a Write-After-Read dependency is assumed,
+// 	 therefore branch to memmove.
+//  3/ if source > (destination + length),
+//	 again there is no overlap and memcpy can be safely called
+//  4/ Otherwise we have an overlap and a Read-After-Write dependency.
+//	 Therefore the copy loop is done explicitly using the ld size given
+//       by the fourth argument
+//  
+//  NOTE: To avoid versioning for aligned and unaligned access, coalescing
+//        is not done in the unrolled loops
+//
+//       Author: Steve Skedzielewski, JT Acquaviva
+//       Date:   July, 2002
+// 
+
+#include <sysdep.h>
+#undef ret
+
+#ifdef IS_IN_rtld
+#undef HIDDEN_JUMPTARGET
+#define HIDDEN_JUMPTARGET(name) name
+#endif
+
+	.section .text
+	.proc  ?1__serial_memmove#
+	.align 32
+	.global ?1__serial_memmove#
+// Arguments: r32 is dest, r33 is src, r34 is length in bytes, r35 is element size.
+// Length is in Bytes to allow a faster call to memcpy than if the length were
+// given in number of elements
+//
+?1__serial_memmove: 
+	.prologue
+ { .mmi				// -----------------> cycle 0
+        add     r15=r34,r33       // r15= src + length
+        add     r16=r34,r32       // r16= dst + length
+        cmp.gtu p9=r33,r32	  // dest  < src ? ie. no overlapp, or a WAR dependency
+} { .mib
+	mov	r8=r32		// save dest pointer for return value
+	cmp.gtu	p8=r35,r34	// is length <= object size ?
+  (p8)  br.ret.dpnt.many	b0       // ---> if length is <= object size branch out
+        ;;
+ }
+ { .mmi 
+        sub r28=r32,r33   // distance is destination - source in Byte
+        cmp.gtu p10,p11=r32,r15   // is src + length < dest ? i.e no overlapp
+	.save	ar.lc, r31
+	mov r31 = ar.lc   // saving the loop iteration counter
+        ;;
+ }
+	.body
+ { .mmb
+ // If either the previous comparison (src + length < dest) or the following one
+ // (dst + length < src) is true, there is no overlap and it is safe to branch to memcpy
+        cmp.eq  p6=1,r35  // various compare for code versionning
+  (p11) cmp.gtu    p10,p0=r33,r16    // is dst + length < src ? i.e no overlapp
+  (p10) br.cond.dpnt	HIDDEN_JUMPTARGET (memcpy)   //  no overlapp, branch to memcpy
+        ;;
+ }
+ { .mib
+        add     r23=r33,r35       // source duplication
+        add     r22=r32,r35       // dest duplication
+  (p9)  br.cond.dpnt   HIDDEN_JUMPTARGET (memmove)  //  good overlap (WAR) branch to memmove
+        ;;
+ }
+ { .mmi // various compare for code versionning
+        cmp.eq  p7=2,r35
+        cmp.eq  p8=4,r35
+        cmp.eq  p9=16,r35 // notice, we do not test size of 8, which is the DEFAULT case
+ }
+//
+// Note for length < 4: 
+// Despite unrolling, it seems that size is never tested to know whether we can
+// jump to the unrolled case, i.e. length < 4 seems not to be handled.
+// In fact it is done implicitly:
+//  1/ if there is no or a 'good' dependency, memcpy or memmove is going to be
+// called and is going to handle this right.
+//  2/ if there is a bad dependency, the distance will always be < 4 therefore
+// the unrolled case (unrolled 4 times) will never be called and it will always
+// jump to the serial loop which is ok for short length.
+
+//
+// branch to the loop corresponding to element size
+//
+ { .bbb
+  (p6)  br.cond.dpnt .copy_loop_size_1
+  (p7)  br.cond.dpnt .copy_loop_size_2
+  (p8)  br.cond.dpnt .copy_loop_size_4
+        ;;
+ }
+ { .mib
+// For a size of 16 we just proceed as for a size of 8. This is functionally safe and
+// there is no easy optimization to add for size 16. Performance of size 8 is already ok.
+// Since this is the last case, if we reach this point this is an unconditional branch.
+// But the pointer duplication is predicated and done with 8 Byte objects and not 16.
+// Therefore, instead of using r35 we use the immediate value 8.
+  (p9)  add r22=8,r32   // dest duplication
+  (p9)  add r23=8,r33   // source duplication
+        br.cond.dpnt .copy_loop_size_8 // by DEFAULT branch to size of 8
+        ;;
+ }
+
+// If we have not yet branched to memcpy or memmove there is a RAW dependency,
+// so we need to perform an iteration per iteration copy loop. An optimization is to compute the distance of the dependency.
+// If the distance is at least 4 times the size of the element we unroll by 4,
+//	 else we do the one element per iteration copy loop.
+// To do this, we compute the distance in Bytes of the dependency,
+//       then we divide it by the size of the element (using shift right).
+// 
+
+.copy_loop_size_1:
+// These 2 bundles depend on element size. r11 is length in Bytes shifted right by 2 to get the number of
+// iterations for a 4 times unrolled loop with 1 Byte elements. r14 is the number of iterations
+// in elements of 1 Byte which is going to be used by the epilog, a priori epilog size = loop size
+ { .mmi          
+        cmp.gtu p13=4,r28  // check distance, if distance < 4 there is dependency branch to serial
+        add     r17=-1,r34 // number of iteration for the serial loop
+        shr.u   r11=r34,2  // number of iteration+1 for the unrolled loop
+        ;;
+ }
+ { .mib  // Check if unrolled version is ok or not
+        add     r10=-1,r11               // exact number of iteration for the unrolled loop
+  (p13) mov     ar.lc=r17                // if serial, set loop counter to r17
+  (p13) br.cond.dptk .serial_loop_size_1 // if serial jump to serial part
+        ;;
+ }
+ { .mmi
+        nop.m   0
+        and     r14=3,r34    // look at the epilog size for the unrolled loop
+        mov     ar.lc=r10    // set the loop counter for unrolled loop
+        ;;
+ }
+.copy_loop_unrolled_size_1:
+ { .mmi
+        ld1     r24=[r33],2
+        ld1     r25=[r23],2
+        cmp.leu   p6=1,r14 // since loop is unrolled 4 time, epilog length is at most 3
+        ;;
+ }
+ { .mmi
+        ld1     r26=[r33],2
+        ld1     r27=[r23],2
+        cmp.leu   p7=2,r14
+ } { .mmi
+        st1     [r32]=r24,2
+        st1     [r22]=r25,2
+        cmp.leu   p8=3,r14
+        ;;
+ } { .mmb
+        st1     [r32]=r26,2
+        st1     [r22]=r27,2
+        br.cloop.sptk   .copy_loop_unrolled_size_1
+        ;;
+ } { .mmi
+  (p6)  ld1 r24=[r33],2
+  (p7)  ld1 r25=[r23]
+        nop.i   0
+        ;;
+ }
+ { .mmi
+  (p8)  ld1 r26=[r33]
+  (p6)  st1 [r32]=r24,2
+        nop.i    0
+ }
+ { .mmi
+  (p7)  st1 [r22]=r25
+        nop.m   0
+        nop.i   0
+        ;;
+ }
+ { .mib
+  (p8)  st1 [r32]=r26
+        mov     ar.lc=r31                // restoring loop counter
+        br.ret.dpnt.many	b0       // end of program return
+        ;;
+ }
+.serial_loop_size_1:
+ { .mmi
+        ld1     r3=[r33],1
+        ;;
+        st1     [r32]=r3,1
+        nop.i   0
+ }
+ { .mib
+        nop.m    0
+        nop.i    0
+        br.cloop.sptk   .serial_loop_size_1
+        ;;
+ }
+// end of copy loop 
+ { .mib
+        nop.m    0
+        mov     ar.lc=r31                // restoring loop counter
+        br.ret.dpnt.many	b0       // end of program return
+        ;;
+ }
+//
+// End of copy for elements of size 1 Byte
+//
+
+.copy_loop_size_2:
+// These 2 bundles depend on element size. r11 is length in Bytes shifted right by 3 to get the number of
+// iterations for a 4 times unrolled loop with 2 Byte elements. r14 is the number of iterations
+// in elements of 2 Bytes which is going to be used by the epilog, a priori epilog size = loop size
+ { .mii
+        nop.m   0
+        shr.u   r11=r34,3    // number of iteration for the unrolled loop 
+        ;;
+        shr.u   r17=r34,1    // number of iteration for the serial loop
+        ;;
+ }
+ { .mmi          
+        and     r14=3,r17    // epilog length 
+        add     r10=-1,r11   // unrolled loop exact counter
+        shr.u   r28=r28,1    // distance in element, where element size is 8 Byte
+        ;;
+ }
+ { .mmi
+        cmp.gtu p13=4,r28  // check distance, if distance < 4 there is dependency branch to epilog
+        add     r17=-1,r17   // serial loop exact counter
+        mov     ar.lc=r10    // by default set the loop counter to the unrolled loop
+        ;;
+ }
+ { .mib  
+  (p13) mov     ar.lc=r17
+  (p13) br.cond.dptk .serial_loop_size_2
+ }
+.copy_loop_unrolled_size_2:
+ { .mmi
+        ld2     r24=[r33],4
+        ld2     r25=[r23],4
+        cmp.leu   p6=1,r14 // since loop is unrolled 4 time, epilog length is at most 3
+        ;;
+ } { .mmi
+        ld2     r26=[r33],4
+        ld2     r27=[r23],4
+        cmp.leu   p7=2,r14
+ } { .mmi
+        st2     [r32]=r24,4
+        st2     [r22]=r25,4
+        cmp.leu   p8=3,r14
+        ;;
+ } { .mmb
+        st2     [r32]=r26,4
+        st2     [r22]=r27,4
+        br.cloop.sptk   .copy_loop_unrolled_size_2
+        ;;
+ } { .mmi
+  (p6)  ld2 r24=[r33],4
+  (p7)  ld2 r25=[r23]
+        nop.i   0
+        ;;
+ }
+ { .mmi
+  (p8)  ld2 r26=[r33]
+  (p6)  st2 [r32]=r24,4
+        nop.i    0
+ }
+ { .mmi
+  (p7)  st2 [r22]=r25
+        nop.m   0
+        nop.i   0
+        ;;
+ }
+ { .mib
+  (p8)  st2 [r32]=r26
+        mov     ar.lc=r31                // restoring loop counter
+        br.ret.dpnt.many	b0       // end of program return
+        ;;
+ }
+.serial_loop_size_2:
+ { .mmi
+        ld2     r3=[r33],2
+        ;;
+        st2     [r32]=r3,2
+        nop.i   0
+ }
+ { .mib
+        nop.m    0
+        nop.i    0
+        br.cloop.sptk   .serial_loop_size_2
+        ;;
+ }
+// end of copy loop 
+ { .mib
+        nop.m    0
+        mov     ar.lc=r31                // restoring loop counter
+        br.ret.dpnt.many	b0       // end of program return
+        ;;
+ }
+//
+// End of copy for 2 Byte elements
+//
+
+.copy_loop_size_4:
+// These 2 bundles depend on element size. r11 is length in Bytes shifted right by 4 to get the number of
+// iterations for a 4 times unrolled loop with 4 Byte elements. r14 is the number of iterations
+// in elements of 4 Bytes which is going to be used by the epilog, a priori epilog size = loop size
+ { .mii
+        nop.m   0
+        shr.u   r11=r34,4  
+        ;;
+        shr.u   r17=r34,2  
+        ;;
+ }
+ { .mmi          
+        add     r10=-1,r11    // unrolled loop counter
+        and     r14=3,r17     // looks at the last 2 bits for epilog length
+        shr.u   r28=r28,2     // distance in element, where element size is 4 Byte
+        ;;
+ }
+ { .mmi
+      
+        cmp.gtu p13=4,r28  // check distance, if distance < 4 there is dependency branch to epilog
+        add     r17=-1,r17         // serial loop counter
+        mov     ar.lc=r10
+        ;;
+ }
+ { .mib  // if we reach this bundle it means than we can unroll
+  (p13) mov     ar.lc=r17
+  (p13) br.cond.dptk .serial_loop_size_4
+        ;;
+ }
+.copy_loop_unrolled_size_4:  // 4 x 4-byte elements per iteration; loads via r33 and r23, stores via r32 and r22. NOTE(review): r23/r22 are set up outside this excerpt, presumably one element past r33/r32 - confirm
+ { .mmi
+        ld4     r24=[r33],8  // load 4 bytes through r33, then r33 += 8
+        ld4     r25=[r23],8  // load 4 bytes through r23, then r23 += 8
+        cmp.leu   p6=1,r14  // p6 = (r14 >= 1); the loop is unrolled 4 times, so the epilog copies at most 3 elements
+        ;;
+ } { .mmi
+        ld4     r26=[r33],8  // second load through r33
+        ld4     r27=[r23],8  // second load through r23
+        cmp.leu   p7=2,r14   // p7 = (r14 >= 2)
+ } { .mmi
+        st4     [r32]=r24,8  // store the first r33 value through r32, then r32 += 8
+        st4     [r22]=r25,8  // store the first r23 value through r22, then r22 += 8
+        cmp.leu   p8=3,r14   // p8 = (r14 >= 3)
+        ;;
+ } { .mmb
+        st4     [r32]=r26,8  // store the second r33 value through r32
+        st4     [r22]=r27,8  // store the second r23 value through r22
+        br.cloop.sptk   .copy_loop_unrolled_size_4  // loop again while ar.lc != 0 (ar.lc is decremented on each taken branch)
+        ;;
+ } { .mmi
+  (p6)  ld4 r24=[r33],8      // epilog, r14 >= 1: load a leftover element through r33, then r33 += 8
+  (p7)  ld4 r25=[r23]        // epilog, r14 >= 2: load a leftover element through r23
+        nop.i   0
+        ;;
+ }
+ { .mmi
+  (p8)  ld4 r26=[r33]        // epilog, r14 >= 3: load a leftover element through the advanced r33
+  (p6)  st4 [r32]=r24,8      // store the p6 element through r32, then r32 += 8
+        nop.i    0
+ }
+ { .mmi
+  (p7)  st4 [r22]=r25        // store the p7 element through r22
+        nop.m   0
+        nop.i   0
+        ;;
+ }
+ { .mib
+  (p8)  st4 [r32]=r26        // store the p8 element through the advanced r32
+        mov     ar.lc=r31                // restore ar.lc from r31 (the save into r31 is outside this excerpt)
+        br.ret.dpnt.many	b0       // return to the caller through b0
+        ;;
+ }
+.serial_loop_size_4:  // copies one 4-byte element per iteration; ar.lc was loaded from r17 (elements - 1) before the branch to this label
+ {   .mmi
+       ld4     r3=[r33],4    // load one element from the source, then r33 += 4
+       ;;
+       st4     [r32]=r3,4    // store it to the destination, then r32 += 4 (the stop above orders the store after the load)
+       nop.i   0
+ }
+ { .mib
+       nop.m    0
+       nop.i    0
+       br.cloop.sptk   .serial_loop_size_4  // loop again while ar.lc != 0
+       ;;
+ }
+// end of the serial copy loop: restore ar.lc and return
+{    .mib
+       nop.m    0
+       mov     ar.lc=r31         // restore ar.lc from r31 (the save into r31 is outside this excerpt)
+       br.ret.dpnt.many	b0       // return to the caller through b0
+       ;;
+}
+//
+// End of copy for elements of size 4 Byte.
+//
+
+.copy_loop_size_8:
+// The next two bundles depend on the element size. r11 is the length in bytes (r34) shifted right by 5,
+// i.e. the number of iterations of the 4 times unrolled loop over 8 Byte elements. r14 = (r34 >> 3) & 3
+// is the number of leftover 8 Byte elements copied by the epilog, which uses the same element size as the loop.
+ { .mii
+        nop.m   0
+        shr.u r11=r34,5      // r11 = length / 32 = unrolled iterations (4 elements of 8 bytes each)
+        ;;
+        shr.u r17=r34,3      // r17 = length / 8 = total number of 8-byte elements
+        ;;
+ }
+ { .mmi          
+        add     r10=-1,r11   // unrolled loop counter (iterations - 1, as br.cloop runs the body ar.lc + 1 times)
+        and     r14=3,r17    // looks at the last 2 bits for epilog length
+        shr.u   r28=r28,3    // distance in elements, where element size is 8 Bytes
+        ;;
+ }
+ { .mmi
+        cmp.gtu p13=4,r28  // p13 = (4 > r28, unsigned): distance below 4 elements, so take the serial loop instead of the unrolled one
+        add     r17=-1,r17   // serial loop counter (elements - 1)
+        mov     ar.lc=r10    // by default set the loop counter to the unrolled loop
+        ;;
+ }
+ { .mib  // both instructions execute only when p13 is set; otherwise control falls through to the unrolled loop
+  (p13) mov     ar.lc=r17    // serial case only: replace the unrolled counter with the serial counter
+  (p13) br.cond.dptk .serial_loop_size_8  // serial case only: skip the unrolled loop
+        ;;
+ }
+.copy_loop_unrolled_size_8:  // 4 x 8-byte elements per iteration; loads via r33 and r23, stores via r32 and r22. NOTE(review): r23/r22 are set up outside this excerpt, presumably one element past r33/r32 - confirm
+ { .mmi
+        ld8     r24=[r33],16  // load 8 bytes through r33, then r33 += 16
+        ld8     r25=[r23],16  // load 8 bytes through r23, then r23 += 16
+        cmp.leu   p6=1,r14  // p6 = (r14 >= 1); the loop is unrolled 4 times, so the epilog copies at most 3 elements
+        ;;
+ } { .mmi
+        ld8     r26=[r33],16  // second load through r33
+        ld8     r27=[r23],16  // second load through r23
+        cmp.leu   p7=2,r14    // p7 = (r14 >= 2)
+ } { .mmi
+        st8     [r32]=r24,16  // store the first r33 value through r32, then r32 += 16
+        st8     [r22]=r25,16  // store the first r23 value through r22, then r22 += 16
+        cmp.leu   p8=3,r14    // p8 = (r14 >= 3)
+        ;;
+ } { .mmb
+        st8     [r32]=r26,16  // store the second r33 value through r32
+        st8     [r22]=r27,16  // store the second r23 value through r22
+        br.cloop.sptk   .copy_loop_unrolled_size_8  // loop again while ar.lc != 0 (ar.lc is decremented on each taken branch)
+        ;;
+ } { .mmi
+  (p6)  ld8 r24=[r33],16      // epilog, r14 >= 1: load a leftover element through r33, then r33 += 16
+  (p7)  ld8 r25=[r23]         // epilog, r14 >= 2: load a leftover element through r23
+        nop.i   0
+        ;;
+ }
+ { .mmi
+  (p8)  ld8 r26=[r33]         // epilog, r14 >= 3: load a leftover element through the advanced r33
+  (p6)  st8 [r32]=r24,16      // store the p6 element through r32, then r32 += 16
+        nop.i    0
+ }
+ { .mmi
+  (p7)  st8 [r22]=r25         // store the p7 element through r22
+        nop.m   0
+        nop.i   0
+        ;;
+ }
+ { .mib
+  (p8)  st8 [r32]=r26         // store the p8 element through the advanced r32
+        mov     ar.lc=r31                // restore ar.lc from r31 (the save into r31 is outside this excerpt)
+        br.ret.dpnt.many	b0       // return to the caller through b0
+        ;;
+ }
+.serial_loop_size_8:  // copies one 8-byte element per iteration; ar.lc was loaded from r17 (elements - 1) before the branch to this label
+ { .mmi
+        ld8     r3=[r33],8   // load one element from the source, then r33 += 8
+        ;;
+        st8     [r32]=r3,8   // store it to the destination, then r32 += 8 (the stop above orders the store after the load)
+        nop.i   0
+ }
+ { .mib
+        nop.m    0
+        nop.i    0
+        br.cloop.sptk   .serial_loop_size_8  // loop again while ar.lc != 0
+        ;;
+ }
+// end of the serial copy loop: restore ar.lc and return
+ { .mib
+        nop.m    0
+        mov     ar.lc=r31                // restore ar.lc from r31 (the save into r31 is outside this excerpt)
+        br.ret.dpnt.many	b0       // return to the caller through b0
+        ;;
+ }
+//
+// End of copy for element of 8 Bytes.
+//
+
+// -- End of procedure ?1__serial_memmove
+	.endp  ?1__serial_memmove#  // closes the procedure; the matching .proc is outside this excerpt
+	.type	HIDDEN_JUMPTARGET (memcpy),@function  // mark the memcpy jump-target symbol as a function; the macro is defined outside this excerpt
+	.global HIDDEN_JUMPTARGET (memcpy)  // give that symbol global binding. NOTE(review): presumably referenced earlier in this file - confirm
+	.type	HIDDEN_JUMPTARGET (memmove),@function  // same for the memmove jump-target symbol
+	.global HIDDEN_JUMPTARGET (memmove)  // give that symbol global binding
+// End of file


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]