This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
PATCH: Update ia64 memory functions
- From: "H. J. Lu" <hjl at lucon dot org>
- To: GNU C Library <libc-alpha at sources dot redhat dot com>
- Date: Fri, 12 Jan 2007 14:16:23 -0800
- Subject: PATCH: Update ia64 memory functions
Intel updated ia64 memory functions:
http://www3.intel.com/cd/software/products/asmo-na/eng/219884.htm
This patch ports them to glibc. The speedups measured on Montecito are:
Average speedup for memcmp: 53.8154%
Average speedup for memcpy: 22.6089%
Average speedup for memmove: 381.785%
Average speedup for memset: 1.27461%
H.J.
----
2007-01-12 H.J. Lu <hongjiu.lu@intel.com>
* sysdeps/ia64/Makefile (sysdep_routines): Add
memcopyD-large-al, memcopyD-large-ual, memcpy-a0-mt-array
and serial-memmove.
* sysdeps/ia64/memcmp.S: Replaced with the one contributed by
Intel.
* sysdeps/ia64/memcpy.S: Likewise.
* sysdeps/ia64/memmove.S: Likewise.
* sysdeps/ia64/memset.S: Likewise.
* sysdeps/ia64/memcopyD-large-al.S: New. Contributed by Intel.
* sysdeps/ia64/memcopyD-large-ual.S: Likewise.
* sysdeps/ia64/memcpy-a0-mt-array.S: Likewise.
* sysdeps/ia64/serial-memmove.S: Likewise.
--- sysdeps/ia64/Makefile.intel 2004-08-15 23:46:14.000000000 -0700
+++ sysdeps/ia64/Makefile 2006-11-30 11:08:05.000000000 -0800
@@ -22,3 +22,8 @@ sysdep-dl-routines += dl-symaddr dl-fptr
sysdep_routines += $(sysdep-dl-routines)
sysdep-rtld-routines += $(sysdep-dl-routines)
endif
+
+ifeq ($(subdir),string)
+sysdep_routines += memcopyD-large-al memcopyD-large-ual \
+ memcpy-a0-mt-array serial-memmove
+endif
--- sysdeps/ia64/memcmp.S.intel 2004-04-17 15:58:49.000000000 -0700
+++ sysdeps/ia64/memcmp.S 2006-12-01 09:08:27.000000000 -0800
@@ -1,165 +1,503 @@
-/* Optimized version of the standard memcmp() function.
- This file is part of the GNU C Library.
- Copyright (C) 2000, 2001, 2004 Free Software Foundation, Inc.
- Contributed by Dan Pop <Dan.Pop@cern.ch>.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-/* Return: the result of the comparison
-
- Inputs:
- in0: dest (aka s1)
- in1: src (aka s2)
- in2: byte count
-
- In this form, it assumes little endian mode. For big endian mode, the
- the two shifts in .l2 must be inverted:
-
- shl tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 << sh1
- shr.u tmp2[0] = r[0 + MEMLAT], sh2 // tmp2 = w1 >> sh2
-
- and all the mux1 instructions should be replaced by plain mov's. */
+// memcmp - compare memory areas
+//
+// Copyright (c) 2004-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// Basically the code is divided into these parts:
+// 1) short - linear code for comparing < 16 bytes
+// 2) long (size > 16), which is further divided into:
+//    a) unaligned - first compares the leading unaligned bytes (< 8), then uses ld8 to find a pair of 8-byte words that differ
+//       and branches to unaligned_cmp; if no such pair is found, the rest is done by branching to short
+//    b) aligned - uses ld8 to find a pair of 8-byte words that differ, then branches to aligned_cmp; if no such pair is found, the rest is done by branching to short
+//
+//
+// Author: Boris Shurygin, Moscow
+// Date: December, 2004
+//
+// -- Begin memcmp
#include <sysdep.h>
#undef ret
-#define OP_T_THRES 16
-#define OPSIZ 8
-#define MEMLAT 2
-
-#define start r15
-#define saved_pr r17
-#define saved_lc r18
-#define dest r19
-#define src r20
-#define len r21
-#define asrc r22
-#define tmp r23
-#define value1 r24
-#define value2 r25
-#define sh2 r28
-#define sh1 r29
-#define loopcnt r30
-
-ENTRY(memcmp)
+ .section .text
+ .proc memcmp#
+ .align 32
+ .global memcmp#
+ bcmp==memcmp
+ .weak bcmp#
.prologue
- alloc r2 = ar.pfs, 3, 37, 0, 40
-
- .rotr r[MEMLAT + 2], q[MEMLAT + 5], tmp1[4], tmp2[4], val[2]
- .rotp p[MEMLAT + 4 + 1]
-
- mov ret0 = r0 // by default return value = 0
- .save pr, saved_pr
- mov saved_pr = pr // save the predicate registers
- .save ar.lc, saved_lc
- mov saved_lc = ar.lc // save the loop counter
- .body
- mov dest = in0 // dest
- mov src = in1 // src
- mov len = in2 // len
- sub tmp = r0, in0 // tmp = -dest
- ;;
- and loopcnt = 7, tmp // loopcnt = -dest % 8
- cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
-(p6) br.cond.spnt .cmpfew // compare byte by byte
- ;;
- cmp.eq p6, p0 = loopcnt, r0
-(p6) br.cond.sptk .dest_aligned
- sub len = len, loopcnt // len -= -dest % 8
- adds loopcnt = -1, loopcnt // --loopcnt
- ;;
- mov ar.lc = loopcnt
-.l1: // copy -dest % 8 bytes
- ld1 value1 = [src], 1 // value = *src++
- ld1 value2 = [dest], 1
- ;;
- cmp.ne p6, p0 = value1, value2
-(p6) br.cond.spnt .done
- br.cloop.dptk .l1
-.dest_aligned:
- and sh1 = 7, src // sh1 = src % 8
- and tmp = -8, len // tmp = len & -OPSIZ
- and asrc = -8, src // asrc = src & -OPSIZ -- align src
- shr.u loopcnt = len, 3 // loopcnt = len / 8
- and len = 7, len ;; // len = len % 8
- shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
- adds loopcnt = -1, loopcnt // --loopcnt
- mov pr.rot = 1 << 16 ;; // set rotating predicates
- sub sh2 = 64, sh1 // sh2 = 64 - sh1
- mov ar.lc = loopcnt // set LC
- cmp.eq p6, p0 = sh1, r0 // is the src aligned?
-(p6) br.cond.sptk .src_aligned
- add src = src, tmp // src += len & -OPSIZ
- mov ar.ec = MEMLAT + 4 + 1 // four more passes needed
- ld8 r[1] = [asrc], 8 ;; // r[1] = w0
- .align 32
-
-// We enter this loop with p6 cleared by the above comparison
-
-.l2:
-(p[0]) ld8 r[0] = [asrc], 8 // r[0] = w1
-(p[0]) ld8 q[0] = [dest], 8
-(p[MEMLAT]) shr.u tmp1[0] = r[1 + MEMLAT], sh1 // tmp1 = w0 >> sh1
-(p[MEMLAT]) shl tmp2[0] = r[0 + MEMLAT], sh2 // tmp2 = w1 << sh2
-(p[MEMLAT+4]) cmp.ne p6, p0 = q[MEMLAT + 4], val[1]
-(p[MEMLAT+3]) or val[0] = tmp1[3], tmp2[3] // val = tmp1 | tmp2
-(p6) br.cond.spnt .l2exit
- br.ctop.sptk .l2
- br.cond.sptk .cmpfew
-.l3exit:
- mux1 value1 = r[MEMLAT], @rev
- mux1 value2 = q[MEMLAT], @rev
- cmp.ne p6, p0 = r0, r0 ;; // clear p6
-.l2exit:
-(p6) mux1 value1 = val[1], @rev
-(p6) mux1 value2 = q[MEMLAT + 4], @rev ;;
- cmp.ltu p6, p7 = value2, value1 ;;
-(p6) mov ret0 = -1
-(p7) mov ret0 = 1
- mov pr = saved_pr, -1 // restore the predicate registers
- mov ar.lc = saved_lc // restore the loop counter
- br.ret.sptk.many b0
-.src_aligned:
- cmp.ne p6, p0 = r0, r0 // clear p6
- mov ar.ec = MEMLAT + 1 ;; // set EC
-.l3:
-(p[0]) ld8 r[0] = [src], 8
-(p[0]) ld8 q[0] = [dest], 8
-(p[MEMLAT]) cmp.ne p6, p0 = r[MEMLAT], q[MEMLAT]
-(p6) br.cond.spnt .l3exit
- br.ctop.dptk .l3 ;;
-.cmpfew:
- cmp.eq p6, p0 = len, r0 // is len == 0 ?
- adds len = -1, len // --len;
-(p6) br.cond.spnt .restore_and_exit ;;
- mov ar.lc = len
-.l4:
- ld1 value1 = [src], 1
- ld1 value2 = [dest], 1
- ;;
- cmp.ne p6, p0 = value1, value2
-(p6) br.cond.spnt .done
- br.cloop.dptk .l4 ;;
-.done:
-(p6) sub ret0 = value2, value1 // don't execute it if falling thru
-.restore_and_exit:
- mov pr = saved_pr, -1 // restore the predicate registers
- mov ar.lc = saved_lc // restore the loop counter
- br.ret.sptk.many b0
-END(memcmp)
+memcmp:
+{.mii
+ mov r19=r32
+ mov.i r18=ar.lc
+ mov r20=r33
+}
+{.mib
+ mov r21=r34
+ cmp.lt p6,p0=16,r34
+ (p6) br.cond.spnt.many long;;//branch to long
+}
+{.mii
+ mov r8=r0
+ mov r15=pr
+ nop.i 0
+}
+short://short compare
+{.mib
+ cmp.eq p6,p0=r21,r0 //length == 0?
+ add r16=r19,r21
+ (p6) br.cond.spnt.few restore_exit;;
+}
+
+ { .mmi
+ cmp.le p6,p7=2,r21 // 1
+ ld1 r24=[r19],1 // 1
+ cmp.eq p8,p14=0,r0 // 1
+ }
+ { .mmi
+ ld1 r25=[r20],1 // 1
+ add r17=-1,r16 // 1
+ nop.i 0 ;; // 1
+ }
+ { .mmi
+ (p8) cmp.ne.unc p15,p0=r25,r24 // 2
+ (p6) ld1 r27=[r20],1 // 2
+ cmp.lt p8,p9=r19,r17 // 2
+ }
+ { .mbb
+ (p6) ld1 r26=[r19],1 // 2
+ (p15) br.cond.dptk diff_exit // 2
+ (p7) br.cond.dptk restore_exit ;; // 2
+ }
+ { .mmi
+ (p6) cmp.ne.unc p14,p0=r26,r27 // 3
+ (p8) ld1 r25=[r20],1 // 3
+ cmp.lt p6,p7=r19,r17 // 3
+ }
+ { .mbb
+ (p8) ld1 r24=[r19],1 // 3
+ (p14) br.cond.dptk diff_exit_dup // 3
+ (p9) br.cond.dptk restore_exit ;; // 3
+ }
+ { .mmi
+ (p8) cmp.ne.unc p15,p0=r24,r25 // 4
+ (p6) ld1 r27=[r20],1 // 4
+ cmp.lt p8,p9=r19,r17 // 4
+ }
+ { .mbb
+ (p6) ld1 r26=[r19],1 // 4
+ (p15) br.cond.dptk diff_exit // 4
+ (p7) br.cond.dptk restore_exit ;; // 4
+ }
+ { .mmi
+ (p6) cmp.ne.unc p14,p0=r26,r27 // 5
+ (p8) ld1 r25=[r20],1 // 5
+ cmp.lt p6,p7=r19,r17 // 5
+ }
+ { .mbb
+ (p8) ld1 r24=[r19],1 // 5
+ (p14) br.cond.dptk diff_exit_dup // 5
+ (p9) br.cond.dptk restore_exit ;; // 5
+ }
+ { .mmi
+ (p8) cmp.ne.unc p15,p0=r24,r25 // 6
+ (p6) ld1 r27=[r20],1 // 6
+ cmp.lt p8,p9=r19,r17 // 6
+ }
+ { .mbb
+ (p6) ld1 r26=[r19],1 // 6
+ (p15) br.cond.dptk diff_exit // 6
+ (p7) br.cond.dptk restore_exit ;; // 6
+ }
+ { .mmi
+ (p6) cmp.ne.unc p14,p0=r26,r27 // 7
+ (p8) ld1 r25=[r20],1 // 7
+ cmp.lt p6,p7=r19,r17 // 7
+ }
+ { .mbb
+ (p8) ld1 r24=[r19],1 // 7
+ (p14) br.cond.dptk diff_exit_dup // 7
+ (p9) br.cond.dptk restore_exit ;; // 7
+ }
+ { .mmi
+ (p8) cmp.ne.unc p15,p0=r24,r25 // 8
+ (p6) ld1 r27=[r20],1 // 8
+ cmp.lt p8,p9=r19,r17 // 8
+ }
+ { .mbb
+ (p6) ld1 r26=[r19],1 // 8
+ (p15) br.cond.dptk diff_exit // 8
+ (p7) br.cond.dptk restore_exit ;; // 8
+ }
+ { .mmi
+ (p6) cmp.ne.unc p14,p0=r26,r27 // 9
+ (p8) ld1 r25=[r20],1 // 9
+ cmp.lt p6,p7=r19,r17 // 9
+ }
+ { .mbb
+ (p8) ld1 r24=[r19],1 // 9
+ (p14) br.cond.dptk diff_exit_dup // 9
+ (p9) br.cond.dptk restore_exit ;; // 9
+ }
+ { .mmi
+ (p8) cmp.ne.unc p15,p0=r24,r25 // 10
+ (p6) ld1 r27=[r20],1 // 10
+ cmp.lt p8,p9=r19,r17 // 10
+ }
+ { .mbb
+ (p6) ld1 r26=[r19],1 // 10
+ (p15) br.cond.dptk diff_exit // 10
+ (p7) br.cond.dptk restore_exit ;; // 10
+ }
+ { .mmi
+ (p6) cmp.ne.unc p14,p0=r26,r27 // 11
+ (p8) ld1 r25=[r20],1 // 11
+ cmp.lt p6,p7=r19,r17 // 11
+ }
+ { .mbb
+ (p8) ld1 r24=[r19],1 // 11
+ (p14) br.cond.dptk diff_exit_dup // 11
+ (p9) br.cond.dptk restore_exit ;; // 11
+ }
+ { .mmi
+ (p8) cmp.ne.unc p15,p0=r24,r25 // 12
+ (p6) ld1 r27=[r20],1 // 12
+ cmp.lt p8,p9=r19,r17 // 12
+ }
+ { .mbb
+ (p6) ld1 r26=[r19],1 // 12
+ (p15) br.cond.dptk diff_exit // 12
+ (p7) br.cond.dptk restore_exit ;; // 12
+ }
+ { .mmi
+ (p6) cmp.ne.unc p14,p0=r26,r27 // 13
+ (p8) ld1 r25=[r20],1 // 13
+ cmp.lt p6,p7=r19,r17 // 13
+ }
+ { .mbb
+ (p8) ld1 r24=[r19],1 // 13
+ (p14) br.cond.dptk diff_exit_dup // 13
+ (p9) br.cond.dptk restore_exit ;; // 13
+ }
+ { .mmi
+ (p8) cmp.ne.unc p15,p0=r24,r25 // 14
+ (p6) ld1 r27=[r20],1 // 14
+ cmp.lt p8,p9=r19,r17 // 14
+ }
+ { .mbb
+ (p6) ld1 r26=[r19],1 // 14
+ (p15) br.cond.dptk diff_exit // 14
+ (p7) br.cond.dptk restore_exit ;; // 14
+ }
+ { .mmi
+ (p6) cmp.ne.unc p14,p0=r26,r27 // 15
+ (p8) ld1 r25=[r20],1 // 15
+ cmp.lt p6,p7=r19,r17 // 15
+ }
+ { .mbb
+ (p8) ld1 r24=[r19],1 // 15
+ (p14) br.cond.dptk diff_exit_dup // 15
+ (p9) br.cond.dptk restore_exit ;; // 15
+ }
+ { .mmi
+ (p8) cmp.ne.unc p15,p0=r24,r25 // 16
+ (p6) ld1 r27=[r20],1 // 16
+ cmp.lt p8,p9=r19,r17 // 16
+ }
+ { .mbb
+ (p6) ld1 r26=[r19],1 // 16
+ (p15) br.cond.dptk diff_exit // 16
+ (p7) br.cond.dptk restore_exit ;; // 16
+ }
+ { .mmi
+ (p6) cmp.ne.unc p14,p0=r26,r27 // 17
+ (p8) ld1 r25=[r20],1 // 17
+ cmp.lt p6,p7=r19,r17 // 17
+ }
+ { .mbb
+ (p8) ld1 r24=[r19],1 // 17
+ (p14) br.cond.dptk diff_exit_dup // 17
+ (p9) br.cond.dptk restore_exit ;; // 17
+ }
+diff_exit:
+{.mii
+ sub r8=r24,r25
+ mov pr=r15,0x10000 // restore rotating predicates
+ mov.i ar.lc=r18
+}
+{.mfb
+ nop.m 0
+ nop.f 0
+ br.ret.sptk.many b0;;
+}
+diff_exit_dup:
+{.mii
+ sub r8=r26,r27
+ mov pr=r15,0x10000 // restore rotating predicates
+ mov.i ar.lc=r18
+}
+{.mfb
+ nop.m 0
+ nop.f 0
+ br.ret.sptk.many b0;;
+}
+restore_exit:
+{.mii
+ nop.m 0
+ mov pr=r15,0x10000 // restore rotating predicates
+ mov.i ar.lc=r18
+}
+{.mfb
+ nop.m 0
+ nop.f 0
+ br.ret.sptk.many b0;;
+}
+long:
+{.mii
+ alloc r2=ar.pfs,3,37,0,40
+ mov r15=pr
+ and r28=7,r19
+}
+{.mii
+ and r29=7,r20
+ mov.i r18=ar.lc
+ mov r8=r0
+ ;;
+}
+{.mib
+ nop.m 0
+ shr.u r22=r21,3 //needed in aligned case, we'll recalculate r22 for unaligned later
+ nop.b 0
+}
+{.mii
+ cmp.eq p7,p0=r28,r0 //check if first address is 8-bytes aligned
+ cmp.ne p6,p0=r0,r0
+ sub r30=8,r28;;
+}
+{.mib
+ (p7) cmp.eq p6,p0=r29,r0 //check if second address is 8-bytes aligned
+ add r30=-1,r30
+ (p6) br.cond.sptk.many aligned;;//both aligned, use ld8 SWP loop
+}
+{.mib
+ nop.m 0
+ mov.i ar.lc=r30
+ (p7) br.cond.sptk.many unaligned;;
+}
+pre_loop://compare first n bytes (1<=n<=8), so r19 is aligned after this
+{.mmi
+ ld1 r24=[r19],1
+ ld1 r25=[r20],1
+ add r21=-1,r21;;
+}
+{.mbb
+ cmp.eq p0,p10=r24,r25
+ (p10) br.cond.spnt.few diff_exit//<memcmp+528>
+ br.cloop.dptk.few pre_loop//<memcmp+96>
+}
+{.mib
+ nop.m 0
+ cmp.gt p6,p0=16,r21
+ (p6) br.cond.spnt.many short;;//branch to short
+}
+unaligned:
+{.mii
+ and r23=-8,r20 //aligned pointer
+ and r28=7,r20 //shift amount calculation
+ shr.u r22=r21,3;;//for LC
+}
+{.mii
+ and r29=7,r20
+ shl r28=r28,3
+ adds r22=-2,r22;;
+}
+{.mmi
+ add r29=-8,r29
+ nop.m 0
+ nop.i 0 ;;
+}
+{.mii
+ sub r30=64,r28
+ mov pr.rot=0x10000
+ mov.i ar.ec=7;;
+}
+{.mii
+ ld8 r33=[r23],8
+ add r20=r20,r29
+ mov.i ar.lc=r22;;
+}
+unaligned_loop:
+{.mmi
+ (p16) ld8 r32=[r23],8
+ (p16) ld8 r36=[r19],8
+ (p18) shr.u r43=r35,r28
+}
+{.mii
+ (p21) or r51=r46,r50
+ (p18) shl r47=r34,r30
+ (p22) cmp.eq p0,p6=r42,r52
+}
+{.mbb
+ (p22) add r21=-8,r21
+ (p6) br.cond.spnt.few unaligned_cmp
+ br.ctop.sptk.few unaligned_loop;;
+}
+{.mmb
+ add r20=r23,r29
+ nop.m 0
+ br.sptk.many short;;
+}
+aligned:
+{.mii
+ cmp.eq p0,p6=r0,r0
+ adds r22=-1,r22//calculate LC
+ mov pr.rot=0x10000;;
+}
+{.mii
+ nop.m 0
+ mov.i ar.lc=r22
+ mov.i ar.ec=3;;
+}
+aligned_loop:
+{.mmi
+ (p16) ld8 r32=[r19],8
+ (p16) ld8 r36=[r20],8
+ (p18) cmp.eq p0,p6=r34,r38
+}
+{.mbb
+ (p18) add r21=-8,r21
+ (p6) br.cond.spnt.few aligned_cmp
+ br.ctop.dptk.few aligned_loop;;
+}
+{.mfb
+ nop.m 0
+ nop.f 0
+ br.sptk.many short;;
+}
+unaligned_cmp:
+{.mii
+ nop.m 0
+ mux1 r25=r52,@rev
+ mux1 r24=r42,@rev;;
+}
+{.mii
+ cmp.ltu p10,p11=r24,r25
+ mov pr=r15,0x10000 // restore rotating predicates
+ mov.i ar.lc=r18;;
+}
+{.mii
+ (p11) sub r24=r24,r25
+ (p10) sub r24=r25,r24
+ nop.i 0;;
+}
+{.mii
+ nop.m 0
+ shr.u r25=r24,32;;
+ cmp.ne p6,p7 = r25,r0;;
+}
+{.mii
+ nop.m 0
+ (p6) shr.u r26=r25,16
+ (p7) shr.u r26=r24,16;;
+}
+{.mmi
+ (p7) mov r25=r24
+ cmp.ne p8,p9 = r26,r0
+ nop.i 0;;
+}
+{.mii
+ nop.m 0
+ (p8) shr.u r24=r26,8
+ (p9) shr.u r24=r25,8;;
+}
+{.mmi
+ (p9) mov r26=r25
+ cmp.ne p6,p7 = r24,r0
+ nop.i 0;;
+}
+{.mii
+ (p6) mov r8=r24
+ (p7) mov r8=r26
+ nop.i 0;;
+}
+{.mib
+ (p10) sub r8=r0,r8
+ nop.i 0
+ br.ret.sptk.many b0;;
+}
+
+aligned_cmp:
+{.mii
+ nop.m 0
+ mux1 r24=r34,@rev
+ mux1 r25=r38,@rev;;
+}
+{.mii
+ cmp.ltu p10,p11=r24,r25
+ mov pr=r15,0x10000 // restore rotating predicates
+ mov.i ar.lc=r18;;
+}
+{.mii
+ (p11) sub r24=r24,r25
+ (p10) sub r24=r25,r24
+ nop.i 0;;
+}
+{.mii
+ nop.m 0
+ shr.u r25=r24,32;;
+ cmp.ne p6,p7 = r25,r0;;
+}
+{.mii
+ nop.m 0
+ (p6) shr.u r26=r25,16
+ (p7) shr.u r26=r24,16;;
+}
+{.mmi
+ (p7) mov r25=r24
+ cmp.ne p8,p9 = r26,r0
+ nop.i 0;;
+}
+{.mii
+ nop.m 0
+ (p8) shr.u r24=r26,8
+ (p9) shr.u r24=r25,8;;
+}
+{.mmi
+ (p9) mov r26=r25
+ cmp.ne p6,p7 = r24,r0
+ nop.i 0;;
+}
+{.mii
+ (p6) mov r8=r24
+ (p7) mov r8=r26
+ nop.i 0;;
+}
+{.mib
+ (p10) sub r8=r0,r8
+ nop.i 0
+ br.ret.sptk.many b0;;
+}
+
+// end of memcmp
+ .endp memcmp#
+// End
-weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)
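
As a reading aid for the new memcmp above: its header comment describes a short path (byte-by-byte comparison for small lengths) and a long path that scans both buffers one 8-byte word at a time, dropping back to byte granularity only once a differing word is found. The C sketch below is purely illustrative (a hypothetical helper, without the rotating registers, software pipelining and mux1 byte reversal the assembly uses):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static int
memcmp_sketch (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  /* Short path: small lengths are compared byte by byte.  */
  if (n <= 16)
    goto bytewise;

  /* Long path: compare 8 bytes at a time; stop at the first word that
     differs and let the byte loop below find the ordering.  */
  while (n >= 8)
    {
      uint64_t w1, w2;
      memcpy (&w1, s1, 8);   /* the assembly uses aligned ld8 plus shifts */
      memcpy (&w2, s2, 8);
      if (w1 != w2)
        break;
      s1 += 8; s2 += 8; n -= 8;
    }

bytewise:
  for (; n > 0; --n, ++s1, ++s2)
    if (*s1 != *s2)
      return *s1 < *s2 ? -1 : 1;
  return 0;
}
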
--- sysdeps/ia64/memcopyD-large-al.S.intel 2006-12-01 09:24:41.000000000 -0800
+++ sysdeps/ia64/memcopyD-large-al.S 2006-11-30 11:03:41.000000000 -0800
@@ -0,0 +1,316 @@
+// memmove: copy a counted number of bytes.
+//
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// copy lengths greater than or equal to 8 bytes using a descending method (decreasing addresses).
+// To be used only if data are aligned on 8-byte boundaries.
+// This version supports interleaved accesses to limit bank conflicts
+//
+// Author: Steve Skedzielewski, JT Acquaviva
+// Date: May, 2002
+//
+
+// -- Begin ?0_memcpyD
+ .section .text
+ .proc ?0_memcpyD#
+ .align 32
+ .global ?0_memcpyD#
+?0_memcpyD:
+ { .mib // ---> cycle 0
+// bundle for quick exit in case of length equal to 0
+ cmp.eq p13,p0=r34,r0
+ mov r8=r32
+ (p13) br.ret.sptk.many b0
+ }
+// computing the base address for src and destination
+ { .mmi
+ and r10=7,r34
+ add r25=r33,r34
+ add r24=r32,r34
+ ;;
+ }
+ { .mmi // ---> cycle 1
+ alloc r56=ar.pfs,3,26,3,24
+ cmp4.le p6,p0 = 2,r10
+ tbit.z.unc p0,p9 = r34,0 // check if the length is odd
+ }
+// bunch of comparisons for short length copy
+// short copy: duplication of address for handling 2 streams of ls and st
+ { .mmi
+ cmp4.le p7 = 4,r10
+ add r42=-1,r25
+ add r43=-2,r25
+ ;;
+ }
+ { .mmi // ---> cycle 2
+ cmp4.le p8 = 6,r10
+ add r40=-1,r24
+ add r41=-2,r24
+ }
+ { .mmi
+ (p9) ld1 r44=[r42], -2
+ (p6) ld1 r45=[r43], -2
+ and r14=-8,r33 // aligned source address
+ ;;
+ }
+ // short copy: start to move bytes around!
+ { .mmi // ---> cycle 3
+ (p6) ld1 r46=[r42], -2
+ (p7) ld1 r47=[r43], -2
+ and r15=-8,r32 // aligned destination address
+ }
+ {
+ (p9) st1 [r40]=r44, -2
+ (p6) st1 [r41]=r45, -2
+ sub r63=r34,r10 // testing length to know if everything is done
+ ;;
+ }
+ { .mmi // --> cycle 4
+ (p7) ld1 r44=[r42], -2
+ (p8) ld1 r45=[r43], -2
+ cmp.ne p11 = 0,r0 // p11 is initialized to 0 (false)
+ }
+ { .mmi
+ (p6) st1 [r40]=r46, -2
+ (p7) st1 [r41]=r47, -2
+ cmp.ne p12 = 0,r0 // p12 is initialized to 0 (false)
+ ;;
+ }
+ { .mmi // --> cycle 5
+ (p8) ld1 r46=[r42], -2
+ nop.m 0
+ cmp.eq.unc p13=r0,r63
+ }
+ { .mmi
+ (p7) st1 [r40]=r44, -2
+ (p8) st1 [r41]=r45, -2
+ cmp.ne.unc p11,p12=r33,r14 // if src not aligned, p11 is set to 1, else p12 is set to 1
+ ;;
+ }
+ { .mib // --> cycle 6
+ (p8) st1 [r40]=r46, -2
+ (p12) cmp.ne p11=r32,r15 // if dest not aligned, p11 is set to 1.
+ (p13) br.ret.spnt.many b0 // if everything is done, just branch out
+ }
+//
+// end of short copy
+//
+// If there are more bytes to copy, continue. First of all, the short copy was used to
+// copy the 'tail' (i.e. length mod 8), therefore we need to decrease the amount
+// of bytes to copy by the tail length.
+//
+ { .mmi
+ and r34=-8,r34 // decrease the length by what was already done
+ nop.m 0
+ nop.i 0
+ }
+ { .mib // if p11 is set then there are unaligned addresses, branch to the corresponding routine
+ add r26=-8,r25 // rewind 8 Byte address for src + length address (used later)
+ add r29=-8,r24 // rewind 8 Byte address for dst + length address (used later)
+ (p11) br.cond.spnt ?0_memcopyDu#
+ ;;
+ }
+ { .mmi
+ and r3=-8,r26 // aligned src to the word base address
+ and r2=-8,r29 // aligned dst to the word base address
+ shr.u r25=r63,3 // word_count is length >> 3
+ ;;
+ }
+ { .mmi
+ nop.m 0
+ add r63= -128, r2 // base address for prefetch write stream
+ shr.u r28=r25,3 // count is word_count >> 3 (i.e. r63 >> 6)
+ }
+//
+// start prefetch, then after branch if short case
+//
+ { .mmi
+ add r62= -256, r3 // base address for prefetch read stream
+ add r23= -8, r3
+ and r10= 7,r25
+ ;;
+ }
+ { .mmi
+ add r18=-8,r2
+ add r24=-1,r28
+ mov r58=ar.lc
+ ;;
+ }
+// Prolog copy: start to move bytes around!
+// The idea is to perform up to seven 8-byte chunk copies; all the remainder (if any)
+// will be done by an unrolled copy loop. This is equivalent to starting with the tail of the
+// unrolled loop.
+//
+ { .mmi
+ cmp4.le p6,p0 = 2,r10
+ cmp4.le p7,p0 = 4,r10
+ tbit.z.unc p10,p9 = r10,0 // check if the length is odd
+ ;;
+ }
+ { .mmi
+ (p9) ld8 r44=[r3], -16
+ (p6) ld8 r45=[r23], -16
+ cmp4.le p8,p0 = 6,r10
+ ;;
+ }
+ { .mmi
+ (p6) ld8 r46=[r3], -16
+ (p7) ld8 r47=[r23], -16
+ mov r59=pr
+ }
+ { .mmi
+ (p9) st8 [r2]=r44, -16
+ (p6) st8 [r18]=r45, -16
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ (p7) ld8 r44=[r3], -16
+ (p8) ld8 r45=[r23], -16
+ sub r25=r25,r10 // decrease the length of the to do work
+ }
+ { .mmi
+ (p6) st8 [r2]=r46, -16
+ (p7) st8 [r18]=r47, -16
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ (p8) ld8 r46=[r3], -16
+ nop.m 0
+ cmp.eq p13,p0=r25,r0
+ }
+ { .mmi
+ (p7) st8 [r2]=r44, -16
+ (p8) st8 [r18]=r45, -16
+ nop.i 0
+ ;;
+ }
+ { .mib
+ (p8) st8 [r2]=r46, -16
+ nop.i 0
+ (p13) br.ret.dptk.many b0 // if everything is done, just branch out
+ ;;
+ }
+//
+// end of the copy prolog
+//
+// Now the 8-times unrolled copy loop can be used.
+//
+ { .mmi
+ lfetch [r62],-64 // prefetch read stream
+ lfetch.excl [r63],-128 // prefetch write stream
+ mov ar.ec=3
+ ;;
+ }
+// spacing the addresses to avoid bank conflict in the core loop
+// this operation is legal due to the overlapping of accesses: loads
+// are done 2 cache lines ahead, preventing any store from overwriting
+// source data.
+//
+// check p9 to know which of the duplicate pointers was the last one used
+ { .mmi
+ (p10) add r23=-8,r23 // now, r3 and r23 are 16 Byte away to limit bank conflict
+ (p10) add r18=-8,r18 // now, r2 and r18 are 16 Byte away to limit bank conflict
+ mov ar.lc=r24
+ }
+ { .mmi
+ (p9) add r3=r0,r23 // r23 was the last, switch both pointers
+ (p9) add r2=r0,r18 // r18 was the last, switch both pointers
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ (p9) add r23=-16,r3 // now, r3 and r23 are 16 Byte away to limit bank conflict
+ (p9) add r18=-16,r2 // now, r2 and r18 are 16 Byte away to limit bank conflict
+ mov pr.rot=0x10000
+ ;;
+ }
+
+.b1_11:
+ { .mmi
+ (p16) ld8 r32=[r3],-8
+ (p16) ld8 r35=[r23],-8
+ nop.i 0
+ }
+ { .mmi
+ (p18) st8 [r2]=r34,-8
+ (p18) st8 [r18]=r37,-8
+ tbit.z.unc p0,p8=r62,6
+ ;;
+ }
+ { .mmi
+ (p16) ld8 r38=[r3],-24
+ (p16) ld8 r41=[r23],-24
+ nop.i 0
+ }
+ { .mmi
+ (p18) st8 [r2]=r40,-24
+ (p18) st8 [r18]=r43,-24
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ (p16) ld8 r44=[r3],-8
+ (p16) ld8 r47=[r23],-8
+ nop.i 0
+ }
+ { .mmi
+ (p18) st8 [r2]=r46,-8
+ (p18) st8 [r18]=r49,-8
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ lfetch [r62],-64 // prefetch read stream
+ (p8) lfetch.excl [r63],-128 // prefetch write stream
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ (p16) ld8 r50=[r3],-24
+ (p16) ld8 r53=[r23],-24
+ }
+ { .mmb
+ (p18) st8 [r2]=r52,-24
+ (p18) st8 [r18]=r55,-24
+ br.ctop.sptk .b1_11
+ ;;
+ }
+ { .mii
+ nop.m 0
+ mov ar.lc=r58
+ mov pr=r59
+ ;;
+ }
+ { .mib
+ nop.m 0
+ nop.i 0
+ br.ret.sptk.many b0 // just branch out
+ }
+//
+// End of memcopyD !
+//
+ .endp ?0_memcpyD#
+ .type ?0_memcopyDu#,@function
+ .global ?0_memcopyDu#
+// End
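
Stripped of the software pipelining, the interleaved pointer pairs and the lfetch read/write streams, the descending copy that ?0_memcpyD performs on 8-byte aligned data corresponds to the C sketch below (a hypothetical helper for illustration only; the real routine also copies the byte tail first and branches to ?0_memcopyDu when the addresses are not aligned):

#include <stddef.h>
#include <stdint.h>

/* Copy n bytes from src to dst walking downwards in memory, assuming
   dst and src are 8-byte aligned and n is a multiple of 8.  */
static void
copy_backward_aligned (void *dst, const void *src, size_t n)
{
  uint64_t *d = (uint64_t *) ((char *) dst + n);
  const uint64_t *s = (const uint64_t *) ((const char *) src + n);

  /* Two words per iteration, mirroring the two interleaved store
     streams in the assembly.  */
  while (n >= 16)
    {
      *--d = *--s;
      *--d = *--s;
      n -= 16;
    }
  if (n >= 8)
    *--d = *--s;
}
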
--- sysdeps/ia64/memcopyD-large-ual.S.intel 2006-12-01 09:24:41.000000000 -0800
+++ sysdeps/ia64/memcopyD-large-ual.S 2006-11-30 09:12:55.000000000 -0800
@@ -0,0 +1,426 @@
+// memmove: copy a counted number of bytes.
+//
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// routine specialized for the copy of large (length > 8 bytes) chunks
+// of memory by decreasing address. Unaligned boundaries.
+//
+// Author: Steve Skedzielewski, JT Acquaviva
+// Date: February, 2002
+//
+
+ .section .text
+ .proc ?0_memcopyDu#
+ .align 32
+ .global ?0_memcopyDu#
+?0_memcopyDu:
+ { .mii
+ alloc r3=ar.pfs,3,34,0,16
+ mov r62=pr
+ add r21=r33,r34
+ }
+ { .mmi
+ mov r8=r32
+ add r65=0,r0
+ add r31=0,r32
+ ;;
+ }
+ { .mii
+ sub r20=8,r21
+ sxt4 r23=r65
+ and r19=-8,r21
+ }
+ { .mmf
+ add r22=r8,r34
+ mov r66=ar.unat
+ nop.f 0 ;;
+ }
+ { .mmi
+ cmp.ne.unc p0,p8=r19,r21
+ and r29=7,r20
+ mov r63=ar.lc
+ }
+ { .mmi
+ shladd r39=r23,3,r0
+ sub r9=8,r22
+ and r2=-8,r22 ;;
+ }
+ { .mii
+ (p8) add r19=-8,r19
+ and r3=7,r9
+ cmp.ne.unc p0,p9=r2,r22
+ }
+ { .mmf
+ shladd r14=r29,3,r0
+ nop.m 0
+ nop.f 0 ;;
+ }
+ { .mmi
+ ld8 r11=[r19]
+ cmp.leu.unc p7,p6=r29,r3
+ shladd r28=r3,3,r0
+ }
+ { .mmi
+ add r15=r34,r3
+ add r26=-8,r19
+ (p9) add r2=-8,r2 ;;
+ }
+ { .mmi
+ ld8 r29=[r2]
+ sub r25=64,r28
+ shr.u r60=r15,3
+ }
+ { .mmi
+ (p7) sub r52=r28,r14
+ add r10=-8,r2
+ (p6) sub r27=r28,r14 ;;
+ }
+ { .mmi
+ shladd r19=r60,3,r0
+ (p7) sub r21=64,r52
+ shl r16=r11,r14
+ }
+ { .mmi
+ (p7) add r60=-1,r60
+ mov r3=r26
+ (p6) add r52=64,r27 ;;
+ }
+ { .mmi
+ add r58=-8,r26
+ add r22=-24,r26
+ shr.u r20=r29,r25
+ }
+ { .mmi
+ add r45=-32,r26
+ add r24=-48,r26
+ sub r64=r15,r19 ;;
+ }
+ { .mmi
+ shladd r61=r64,3,r0
+ add r23=-56,r26
+ shr.u r15=r60,3
+ }
+ { .mmi
+ add r29=-40,r26
+ add r19=-16,r26
+ shr.u r9=r16,r28 ;;
+ }
+ { .mmi
+ ld8.s r17=[r3]
+ sub r59=64,r61
+ shl r27=r20,r25
+ }
+ { .mmi
+ cmp.leu.unc p15,p0=r15,r0
+ sub r26=r60,r39
+ (p7) shl r18=r11,r21
+ ;;
+ }
+ { .mmi
+ or r25=r9,r27 ;;
+ (p7) st8 [r2]=r25
+ (p7) mov r2=r10
+ }
+ { .mmi
+ (p7) add r10=-8,r10
+ add r67=-64,r3 // prefetch read stream
+ (p6) mov r18=r25
+ ;;
+ }
+ { .mmi
+ add r20=-40,r2
+ add r11=-24,r2
+ add r28=-16,r2
+ }
+ { .mmi
+ add r16=-48,r2
+ add r25=-32,r2
+ add r14=-56,r2
+ ;;
+ }
+ { .mib
+ add r68=-64,r29 // prefetch write stream
+ add r21=8,r2
+ (p15) br.cond.dpnt .b1_8
+ ;;
+//
+// prolog of the core loop. This loop should benefit from prefetch
+//
+ }
+ { .mmi
+ lfetch [r67], -64 // first read prefetch
+ lfetch.excl [r68], -128 // first write prefetch
+ mov pr.rot=0x10000
+ }
+ { .mmi
+ mov r30=r45
+ mov r33=r65
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ lfetch [r67], -64 // second read prefetch
+ lfetch.excl [r68], -128 // second write prefetch
+ add r17=-1,r15
+ }
+ { .mii
+ nop.m 0
+ mov ar.ec=2
+ mov r8=r52 ;;
+ }
+ { .mmi
+ lfetch [r67], -64 // third read prefetch
+ lfetch.excl [r68], -128 // third write prefetch
+ sub r27=64,r52
+ }
+ { .mii
+ nop.m 0
+ mov ar.lc=r17
+ nop.i 0 ;;
+ }
+.b1_9:
+ { .mmi
+ (p16) ld8 r49=[r3],-64
+ (p16) ld8 r36=[r58],-64
+ (p17) shr.u r48=r21,r8
+ }
+ { .mmi
+ (p17) or r54=r17,r37
+ (p17) st8 [r28]=r9,-64
+ (p17) shl r34=r21,r27 ;;
+ }
+ { .mmi
+ (p16) ld8 r37=[r22],-64
+ (p16) ld8 r57=[r19],-64
+ (p17) shl r18=r39,r27
+ }
+ { .mmi
+ (p17) or r56=r15,r35
+ (p17) st8 [r11]=r54,-64
+ (p17) shr.u r55=r39,r8 ;;
+ }
+ { .mmi
+ (p16) ld8 r54=[r30],-64
+ (p16) ld8 r35=[r29],-64
+ (p16) shr.u r53=r49,r8
+ }
+ { .mmi
+ (p17) or r50=r26,r38
+ (p17) st8 [r25]=r56,-64
+ (p16) shl r39=r49,r27 ;;
+ }
+ { .mmi
+ (p16) ld8 r21=[r24],-64
+ (p16) ld8 r38=[r23],-64
+ (p16) shr.u r52=r36,r8
+ }
+ { .mmi
+ (p17) or r49=r48,r40
+ (p17) st8 [r20]=r50,-64
+ (p16) shl r51=r36,r27 ;;
+ }
+ { .mmi
+ (p9) lfetch.excl [r68], -128 // write stream prefetch
+ lfetch [r67], -64 // read stream prefetch
+ (p17) or r48=r55,r34
+ }
+ { .mii
+ (p17) st8 [r16]=r49,-64
+ (p16) shr.u r50=r57,r8
+ (p16) shl r36=r57,r27
+ ;; //12:107 107
+ }
+ { .mii
+ (p16) or r49=r53,r18
+ tbit.z.unc p0,p9=r67,6 // test to make a write lfetch every 2 iteration
+ (p16) shr.u r17=r37,r8
+ }
+ { .mii
+ (p17) st8 [r14]=r48,-64
+ (p16) shl r34=r37,r27 ;;
+ (p16) shr.u r15=r54,r8
+ }
+ { .mmi
+ (p16) or r48=r52,r39
+ (p16) st8 [r2]=r49,-64
+ (p16) shl r37=r54,r27 ;;
+ }
+ { .mii
+ (p16) or r9=r50,r51
+ (p16) shr.u r26=r35,r8
+ (p16) shl r39=r35,r27
+ }
+ { .mmb
+ (p16) st8 [r10]=r48,-64
+ (p16) add r32=1,r33
+ br.ctop.sptk .b1_9 ;;
+ }
+ { .mii
+ mov r65=r34
+ nop.i 0
+ mov r52=r8
+ }
+ { .bbb
+ nop.b 0
+ nop.b 0
+ nop.b 0 ;;
+ }
+ { .mii
+ ld8.s r17=[r3]
+ sxt4 r23=r65
+ add r21=8,r2 ;;
+ }
+ { .mmi
+ shladd r39=r23,3,r0 ;;
+ sub r26=r60,r39
+ nop.i 0 ;;
+ }
+.b1_8:
+ { .mib
+ cmp.leu.unc p14,p0=r26,r0
+ add r20=-4,r21
+ (p14) br.cond.dpnt .b1_10 ;;
+// Block 26: prolog Pred: 8 Succ: 11
+// Freq 1.0e+001, Prob 1.00, Ipc 3.33
+ }
+ { .mii
+ add r15=-1,r26
+ mov pr.rot=0x10000
+ nop.i 0
+ }
+ { .mfb
+ sub r9=64,r52
+ nop.f 0
+ clrrrb.pr ;;
+ }
+ { .mii
+ nop.m 0
+ mov ar.ec=7
+ mov r32=r18
+ }
+ { .mmi
+ mov r8=r52 ;;
+ nop.m 0
+ mov ar.lc=r15 ;;
+ }
+.b1_11:
+ { .mii
+ (p21) or r45=r41,r37
+ (p18) shl r33=r44,r9
+ (p18) shr.u r38=r44,r8
+ }
+ { .mmb
+ (p16) ld8 r42=[r3],-8
+ (p22) st8 [r2]=r46,-8
+ br.ctop.sptk .b1_11 ;;
+ }
+ { .mii
+ mov r18=r38
+ nop.i 0
+ mov r52=r8 ;;
+ }
+ { .mii
+ ld8.s r17=[r3]
+ add r21=8,r2
+ nop.i 0 ;;
+ }
+ { .mmi
+ add r20=-4,r21
+ nop.m 0
+ nop.i 0 ;;
+ }
+.b1_10:
+ { .mmi
+ cmp.geu.unc p0,p13=r52,r61 ;;
+ cmp.gtu.unc p0,p12=4,r64
+ (p13) shr.u r19=r17,r52 ;;
+ }
+ { .mii
+ (p13) chk.s r17,.b1_66
+ (p12) tbit.z.unc p0,p10=r64,0
+ (p12) mov r21=r20 ;;
+ }
+.b1_67:
+ { .mii
+ add r14=-2,r21
+ (p12) tbit.z.unc p0,p9=r64,1
+ (p13) or r18=r18,r19 ;;
+ }
+ { .mii
+ nop.m 0
+ tbit.z.unc p0,p8=r64,1 ;;
+ (p8) tbit.z.unc p0,p7=r64,0
+ }
+ { .mii
+ (p8) mov r21=r14
+ shr.u r11=r18,r59 ;;
+ tbit.z.unc p0,p6=r64,0 ;;
+ }
+ { .mii
+ (p6) add r10=-1,r21
+ mov ar.lc=r63
+ nop.i 0 ;;
+ }
+ { .mmi
+ mov r9=r11 ;;
+ nop.m 0
+ (p10) shr.u r11=r11,8 ;;
+ }
+ { .mii
+ nop.m 0
+ (p9) shr.u r11=r11,16
+ nop.i 0 ;;
+ }
+ { .mii
+ (p12) st4 [r20]=r11
+ (p12) mov r11=r9 ;;
+ (p7) shr.u r11=r11,8 ;;
+ }
+ { .mii
+ (p8) st2 [r14]=r11
+ (p8) mov r11=r9
+ nop.i 0 ;;
+ }
+ { .mii
+ (p6) st1 [r10]=r11
+ mov pr=r62,0x1003e
+ mov r8=r31
+ }
+ { .mmb
+ nop.m 0
+ mov ar.unat=r66
+ br.ret.sptk.many b0 ;;
+ }
+.b1_66:
+ { .mii
+ ld8 r17=[r3]
+ (p12) tbit.z.unc p0,p10=r64,0
+ (p12) mov r21=r20 ;;
+ }
+ { .mib
+ nop.m 0
+ (p13) shr.u r19=r17,r52
+ br.cond.sptk .b1_67
+ ;;
+ }
+ .endp ?0_memcopyDu#
+// End
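
When the two pointers are not mutually aligned, the core loop above cannot store what it loads directly; it loads aligned 8-byte words from the source side and reconstructs every destination word from two neighbouring source words with a shift pair and an or (the shr.u/shl/or triples in the rotating-register loop). The ascending, little-endian form of that step looks roughly like the C sketch below (a hypothetical helper; the real routine runs descending and mixes in prefetch):

#include <stddef.h>
#include <stdint.h>

/* Copy n bytes (a multiple of 8) to an 8-byte aligned dst from a src
   that sits mis bytes (1..7) past an 8-byte boundary.  Like the
   assembly, this reads whole aligned words spanning the source range.  */
static void
copy_shift_merge (uint64_t *dst, const unsigned char *src, size_t n,
                  unsigned mis)
{
  const uint64_t *as = (const uint64_t *) (src - mis); /* aligned down */
  unsigned sh = 8 * mis;
  uint64_t w0 = *as++;          /* priming load */

  for (; n >= 8; n -= 8)
    {
      uint64_t w1 = *as++;
      /* Little endian: the low bytes of the destination word come from
         the high bytes of w0, the rest from the low bytes of w1.  */
      *dst++ = (w0 >> sh) | (w1 << (64 - sh));
      w0 = w1;
    }
}
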
--- sysdeps/ia64/memcpy-a0-mt-array.S.intel 2006-12-01 09:24:41.000000000 -0800
+++ sysdeps/ia64/memcpy-a0-mt-array.S 2006-11-30 09:12:55.000000000 -0800
@@ -0,0 +1,442 @@
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+//
+// Author: Steve Skedzielewski
+// Date: January, 2002
+// + changed by Boris Shurygin, Moscow
+// October 2004
+//
+ .section .text
+// -- Begin ?0_memcopyA
+ .proc ?0_memcopyA#
+ .align 32
+ .global ?0_memcopyA#
+ .prologue
+?0_memcopyA:
+ { .mii
+ alloc r3=ar.pfs,3,18,0,16 //0: 5 358
+ add r26=-8,r0 //0: 37 19
+ nop.i 0 ;;
+ }
+ { .mmi
+ and r24=7,r32 //1: 18 7
+ and r27=-8,r33 //1: 37 17
+ .save ar.lc,r52
+ mov r52=pr //1: 5 356
+ }
+ { .mmi
+ and r14=r33,r26 //1: 37 20
+ mov r51=r32 //1: 5 3
+ and r28=7,r33 ;; //1: 20 8
+ }
+ { .mmi
+ add r10=r34,r24 //2: 29 13
+ ld8 r9=[r14] //2: 37 21
+ .save ar.lc,r50
+ mov r50=ar.lc //2: 5 2
+ }
+ .body
+ { .mmi
+ add r25=7,r32 //2: 22 11
+ shladd r11=r28,3,r0 //2: 20 9
+ cmp.leu.unc p13,p12=r28,r24 ;; //2: 40 267
+ }
+ { .mii
+ (p12) cmp.eq.unc p9,p8=r24,r0 //3: 43 271
+ shr.u r49=r10,3 //3: 30 14
+ add r48=8,r27 //3: 37 18
+ }
+ { .mfb
+ (p13) cmp.eq.unc p10,p0=r24,r0 //3: 61 269
+ nop.f 0
+ (p10) br.cond.dpnt.many .b1_58 ;; //3: 61 30
+// Block 2: collapsed Pred: 0 Succ: 3 4 -G
+// Freq 7.5e-001, Prob 0.25
+ }
+ { .mii
+ (p8) ld8 r22=[r48] //0: 44 36
+ (p8) sub r21=64,r11 //0: 45 38
+ and r20=-8,r25 //0: 22 306
+ }
+ { .mmb
+ shladd r27=r24,3,r0 //0: 21 305
+ shladd r26=r49,3,r0 //0: 31 307
+ (p9) br.cond.dpnt.many .b1_3 ;; //0: 43 34
+// Block 4: collapsed Pred: 2 Succ: 57 8 -GO
+// Freq 5.6e-001, Prob 0.50
+ }
+ { .mmi
+ sub r25=8,r24 //0: 63 51
+ add r49=-1,r49 //0: 62 50
+ shr.u r19=r9,r11 //0: 37 311
+ }
+ { .mmi
+ (p8) add r48=8,r48 //0: 44 37
+ sub r24=r10,r26 //0: 31 312
+ (p12) sub r18=r27,r11 ;; //0: 42 31
+ }
+ { .mii
+ cmp.gtu.unc p0,p6=4,r25 //1: 66 266
+ tbit.z.unc p7,p0=r25,0 //1: 64 52
+ (p8) shl r17=r22,r21 //1: 45 39
+ }
+ { .mfb
+ (p12) add r3=64,r18 //1: 42 315
+ nop.f 0
+ (p7) br.cond.dpnt.many .b1_57 ;; //1: 64 53
+// Block 8: Pred: 4 Succ: 7 -GO
+// Freq 2.5e-001, Prob 1.00
+ }
+ { .mii
+ (p13) sub r3=r27,r11 //0: 53 319
+ (p13) sub r16=64,r27 //0: 55 317
+ (p8) sub r15=64,r3 //0: 46 318
+ }
+ { .mmi
+ add r32=1,r51 ;; //0: 64 56
+.pred.rel "mutex", p8, p13
+.pred.rel "mutex", p13, p8
+ (p8) or r19=r19,r17 //2: 45 323
+ (p13) shr.u r35=r19,r16 //2: 55 324
+ }
+ { .mmi
+ nop.m 0
+ nop.m 0
+ (p8) shr.u r35=r22,r15 ;; //2: 46 325
+ }
+ { .mib
+ st1 [r51]=r19 //3: 64 57
+ shr.u r19=r19,8 //3: 64 58
+ br.cond.sptk.many .b1_7 ;; //3: 64 320
+// Block 57: Pred: 4 Succ: 7 -GO
+// Freq 2.8e-001, Prob 1.00
+ }
+.b1_57:
+ { .mii
+.pred.rel "mutex", p8, p13
+ (p13) sub r3=r27,r11 //0: 53 321
+ (p13) sub r16=64,r27 //0: 55 27
+ (p8) sub r15=64,r3 ;; //0: 46 41
+.pred.rel "mutex", p8, p13
+ }
+ { .mii
+ (p8) or r19=r19,r17 //2: 45 40
+ (p8) shr.u r35=r22,r15 //2: 46 42
+ (p13) shr.u r35=r19,r16 ;; //2: 55 28
+// Block 7: Pred: 8 57 Succ: 9 10 -G
+// Freq 5.0e-001, Prob 0.50
+ }
+.b1_7:
+ { .mib
+ nop.m 0
+ tbit.z.unc p15,p0=r25,1 //0: 65 54
+ (p15) br.cond.dpnt.many .b1_9 ;; //0: 65 55
+// Block 10: Pred: 7 Succ: 9 -G
+// Freq 2.5e-001, Prob 1.00
+ }
+ { .mii
+ st2 [r32]=r19,2 //0: 65 61
+ shr.u r19=r19,16 //0: 65 63
+ nop.i 0 ;;
+// Block 9: collapsed Pred: 7 10 Succ: 5 -G
+// Freq 5.0e-001, Prob 1.00
+ }
+.b1_9:
+ { .mib
+ (p6) st4 [r32]=r19 //0: 66 64
+ nop.i 0
+ br.cond.sptk.many .b1_5 ;; //0: 66 274
+// Block 3: Pred: 2 Succ: 5 -GO
+// Freq 2.5e-001, Prob 1.00
+ }
+.b1_3:
+ { .mii
+ sub r24=r10,r26 //0: 31 313
+ shr.u r19=r9,r11 //0: 37 309
+ (p12) sub r18=r27,r11 //0: 42 310
+ }
+ { .mmi
+ (p13) sub r3=r27,r11 ;; //0: 53 26
+ (p12) add r3=64,r18 //1: 42 32
+ nop.i 0 ;;
+ }
+ { .mib
+ mov r35=r19 //3: 48 35
+ nop.i 0
+ br.cond.sptk.many .b1_5 ;; //3: 48 308
+// Block 58: Pred: 0 Succ: 5 -GO
+// Freq 2.5e-001, Prob 1.00
+ }
+.b1_58:
+ { .mii
+ shladd r26=r49,3,r0 //0: 31 15
+ shladd r27=r24,3,r0 //0: 21 10
+ shr.u r19=r9,r11 //0: 37 22
+ }
+ { .mmi
+ and r20=-8,r25 ;; //0: 22 12
+ sub r24=r10,r26 //1: 31 16
+ (p13) sub r16=64,r27 //1: 55 314
+ }
+ { .mii
+ nop.m 0
+ (p13) sub r3=r27,r11 ;; //1: 53 316
+ (p13) shr.u r35=r19,r16 ;; //3: 55 322
+// Block 5: Pred: 9 3 58 Succ: 12 13 -G
+// Freq 1.0e+000, Prob 0.01
+ }
+.b1_5:
+ { .mmi
+ add r2=8,r20 //0: 89 46
+ mov r10=r48 //0: 89 44
+ shr.u r14=r49,1 //0: 89 47
+ }
+ { .mmi
+ add r11=8,r48 //0: 89 43
+ add r30=1,r0 //0: 89 65
+ mov r8=r20 ;; //0: 89 45
+ }
+ { .mib
+ cmp.gt.unc p14,p0=1,r14 //1: 89 48
+ shladd r29=r14,1,r30 //1: 89 66
+ (p14) br.cond.dpnt.many .b1_12 ;; //1: 89 49
+// Block 13: prolog Pred: 5 Succ: 14 -O
+// Freq 2.5e-001, Prob 1.00
+ }
+ { .mmi
+ add r30=8000,r48
+ add r31=1400,r8 //0: 70 75
+ mov pr.rot=0x10000 //0: 89 172
+ }
+ { .mii
+ sub r9=64,r3 //0: 73 76
+ add r23=-1,r14 //0: 73 77
+ mov ar.ec=3 ;; //1: 89 174
+ }
+ { .mii
+ add r14=460,r48
+ mov ar.lc=r23 //2: 73 78
+ add r15=5120,r48
+}
+{ .mii
+ add r34=1600,r48 //0: 70 74
+ add r33=1000,r20
+ shr.u r27=r23,16 //How many Megs we have to copy?
+ ;;
+}{ .mib
+ cmp.lt p15,p0=8,r27
+ nop.i 0
+ (p15) br.cond.spnt.many unaligned_huge_loop
+;;
+// Block 14: lentry lexit ltail collapsed pipelined Pred: 14 13 Succ: 14 29 -S
+// Freq 5.0e+001, Prob 1.00
+}
+.b1_14:
+{ .mmi
+ (p16) ld8 r37=[r11],16 //0: 71 84
+ (p16) ld8 r45=[r10],16 //0: 71 79
+ (p17) shr.u r35=r38,r9 //3: 73 88
+ }
+ { .mmi
+ (p18) or r46=r43,r40 //6: 72 86
+ (p18) st8 [r8]=r44,16 //6: 72 82
+ (p17) shl r42=r38,r3 ;; //3: 72 85
+ }
+ { .mmi
+ (p18) st8 [r2]=r46,16 //7: 72 87
+ (p16) lfetch.nt1 [r34] //1: 70 89
+ nop.i 0 ;;
+ }
+ { .mii
+ (p17) or r43=r41,r36 //5: 72 81
+ (p16) shl r40=r45,r3 //2: 72 80
+ (p16) shr.u r38=r45,r9 //2: 73 83
+ }
+ { .mmb
+ (p16) add r32=32,r34 //2: 70 90
+ (p16) lfetch [r14],16
+ br.ctop.sptk .b1_14 ;; //2: 89 97
+// Block 29: epilog Pred: 14 Succ: 12 -O
+// Freq 5.0e-001, Prob 1.00
+ }
+// Block 29: epilog Pred: 14 Succ: 12 -O
+// Freq 5.0e-001, Prob 1.00
+{ .mii
+ mov r35=r37 //0: 89 171
+ nop.i 0
+ nop.i 0 ;;
+// Block 12: Pred: 5 29 Succ: 16 26 -G
+// Freq 1.0e+000, Prob 0.01
+ }
+.b1_12:
+ { .mii
+ shladd r28=r29,3,r0 //0: 89 67
+ cmp.gt.unc p13,p0=r29,r49 ;; //0: 89 72
+ add r27=r48,r28 //1: 89 70
+ }
+ { .mmi
+ add r26=r20,r28 ;; //1: 89 68
+ add r14=-8,r26 //2: 89 69
+ add r15=-8,r27 //2: 89 71
+ }
+ { .mib
+ nop.m 0
+ nop.i 0
+ (p13) br.cond.dpnt.many .b1_16 ;; //2: 89 73
+// Block 26: prolog Pred: 12 Succ: 17 -O
+// Freq 5.0e-003, Prob 1.00
+ }
+ { .mii
+ sub r16=64,r3 //0: 73 130
+ mov pr.rot=0x10000 //0: 89 144
+ sub r22=r49,r29 //0: 73 131
+ }
+ { .mfb
+ nop.m 0
+ nop.f 0
+ clrrrb.pr ;; //0: 89 145
+ }
+ { .mii
+ mov r32=r35 //1: 89 142
+ mov ar.ec=7 ;; //1: 89 146
+ mov ar.lc=r22 ;; //2: 73 132
+// Block 17: lentry lexit ltail collapsed pipelined Pred: 26 17 Succ: 17 28 -S
+// Freq 1.0e+000, Prob 0.99
+ }
+.b1_17:
+ { .mii
+ (p21) or r45=r41,r37 //5: 72 103
+ (p18) shr.u r33=r44,r16 //2: 73 105
+ (p18) shl r38=r44,r3 //2: 72 102
+ }
+ { .mmb
+ (p16) ld8 r42=[r15],8 //0: 71 101
+ (p22) st8 [r14]=r46,8 //6: 72 104
+ br.ctop.sptk .b1_17 ;; //0: 89 108
+// Block 28: epilog Pred: 17 Succ: 16 -O
+// Freq 1.0e-002, Prob 1.00
+ }
+ { .mii
+ mov r35=r38 //0: 89 143
+ nop.i 0
+ nop.i 0 ;;
+// Block 16: Pred: 12 28 Succ: 18 19 -G
+// Freq 1.0e+000, Prob 0.50
+ }
+.b1_16:
+ { .mmi
+ shladd r21=r49,3,r20 //0: 84 109
+ shladd r17=r24,3,r0 //0: 80 98
+ tbit.z.unc p0,p12=r24,0 //0: 87 265
+ }
+ { .mii
+ cmp.gtu.unc p7,p8=4,r24 //0: 85 110
+ shladd r20=r49,3,r48 ;; //0: 81 112
+ mov ar.lc=r50 //1: 88 126
+ }
+ { .mib
+ cmp.geu.unc p6,p0=r3,r17 //1: 80 99
+ nop.i 0
+ (p6) br.cond.dpnt.many .b1_18 ;; //1: 80 100
+// Block 19: Pred: 16 Succ: 18 -G
+// Freq 5.0e-001, Prob 1.00
+ }
+ { .mmi
+ ld8 r19=[r20] ;; //0: 81 113
+ nop.m 0
+ shl r18=r19,r3 ;; //2: 81 114
+ }
+ { .mii
+ or r35=r35,r18 //5: 81 115
+ nop.i 0
+ nop.i 0 ;;
+// Block 18: Pred: 16 19 Succ: 20 21 -G
+// Freq 1.0e+000, Prob 0.50
+ }
+.b1_18:
+ { .mib
+ (p8) st4 [r21]=r35,4 //0: 85 118
+ tbit.z.unc p9,p10=r24,1 //0: 86 116
+ (p7) br.cond.dpnt.many .b1_20 ;; //0: 85 111
+// Block 21: Pred: 18 Succ: 20 -G
+// Freq 5.0e-001, Prob 1.00
+ }
+ { .mii
+ nop.m 0
+ shr.u r35=r35,32 //0: 85 120
+ nop.i 0 ;;
+// Block 20: Pred: 18 21 Succ: 22 -GO
+// Freq 1.0e+000, Prob 1.00
+ }
+.b1_20:
+// Block 22: exit collapsed Pred: 20 Succ: -GO
+// Freq 1.0e+000, Prob 1.00
+ { .mii
+ (p10) st2 [r21]=r35,2 //0: 86 123
+ (p10) shr.u r35=r35,16 //0: 86 125
+ mov r8=r51 ;; //0: 88 127
+ }
+ { .mib
+ (p12) st1 [r21]=r35 //1: 87 129
+ mov pr=r52,0x1003e //1: 88 357
+ br.ret.sptk.many b0 ;; //1: 88 128
+ }
+unaligned_huge_loop:
+ {.mmi
+ (p16) ld8 r37=[r11],16
+ (p16) ld8 r45=[r10],16
+ (p17) shr.u r35=r38,r9
+ }
+ {.mmi
+ (p18) st8 [r8]=r44,16
+ (p18) or r46=r43,r40
+ (p17) shl r42=r38,r3 ;;
+ }
+ {.mmi
+ (p18) st8 [r2]=r46,16
+ (p16) lfetch.nt1 [r31],16 //fetch line for writing
+ nop.i 0;;
+ }
+ {.mmi
+ (p16) lfetch.nta [r30],16
+ nop.m 0
+ nop.i 0;;
+ }
+ {.mmi
+ (p16) lfetch.nt1 [r15],16 //fetch line to L2 for reading
+ (p16) lfetch [r14],16 //fetch line to L1 for reading
+ (p16) shr.u r38=r45,r9
+ }
+ {.mib
+ (p17) or r43=r41,r36
+ (p16) shl r40=r45,r3
+ br.ctop.sptk unaligned_huge_loop;;
+ }
+{ .mib
+ mov r35=r37
+ nop.i 0
+ br.cond.sptk.many .b1_12;;
+ }
+
+// -- End ?0_memcopyA
+ .endp ?0_memcopyA#
+// End
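
A pattern this file shares with the large-copy loops in memcpy.S below is issuing lfetch (and lfetch.excl for the store side) a fixed distance ahead of the copy pointers, so cache misses overlap with the copy itself. In portable C the same idea looks roughly like the sketch below; the 512-byte distance and one prefetch per 64-byte line are assumptions for illustration, the assembly tunes both per loop:

#include <stddef.h>
#include <stdint.h>

#define PREFETCH_DIST 512   /* bytes ahead of the copy pointers (assumed) */

static void
copy_with_prefetch (uint64_t *dst, const uint64_t *src, size_t nwords)
{
  for (size_t i = 0; i < nwords; ++i)
    {
      if ((i & 7) == 0)     /* one prefetch per 64-byte line */
        {
          __builtin_prefetch ((const char *) src + i * 8 + PREFETCH_DIST, 0);
          __builtin_prefetch ((char *) dst + i * 8 + PREFETCH_DIST, 1);
        }
      dst[i] = src[i];
    }
}
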
--- sysdeps/ia64/memcpy.S.intel 2003-11-18 22:11:26.000000000 -0800
+++ sysdeps/ia64/memcpy.S 2006-11-30 11:29:19.000000000 -0800
@@ -1,436 +1,533 @@
-/* Optimized version of the standard memcpy() function.
- This file is part of the GNU C Library.
- Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
- Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
- Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-/* Return: dest
-
- Inputs:
- in0: dest
- in1: src
- in2: byte count
-
- An assembly implementation of the algorithm used by the generic C
- version from glibc. The case when source and sest are aligned is
- treated separately, for extra performance.
-
- In this form, memcpy assumes little endian mode. For big endian mode,
- sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
- and the order of r[MEMLAT] and r[MEMLAT+1] must be reverted in the
- shrp instruction. */
-
-#define USE_LFETCH
-#define USE_FLP
+// memcpy: copy a counted number of bytes.
+//
+// Copyright (c) 2000-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// The copy is performed in ascending order (increasing addresses). The code is divided
+// into 6 parts.
+// 1/ First part is straight-line code for short copies (i.e. length < 8 bytes).
+// 2/ Second part tests alignment; if accesses are not aligned on word boundaries
+// the unaligned version is jumped to.
+// 3/ Third part is the medium-length copy (less than 64 bytes); this part is also used as
+// an epilog for the unrolled loops (parts 4, 5).
+// 4/ Fourth part: used for copying more than 64 bytes, but less than 1K.
+// To do this we use the unrolled SWP loop.
+// 5/ Fifth part: aligned large loops: uses floating-point loadpair8/store8 to keep
+// data in the L2 cache (avoid moving it into L1); also uses lfetches.
+// 6/ Sixth part: for very huge moves: > 8 MB. Similar to part 4 but
+// with an additional lfetch.nta to get the data closer.
+//
+// Author: Steve Skedzielewski, JT Acquaviva
+// Date: February, 2002
+// + changed by Boris Shurygin, Moscow
+// September 2004
+
#include <sysdep.h>
#undef ret
-#define LFETCH_DIST 500
-
-#define ALIGN_UNROLL_no 4 // no. of elements
-#define ALIGN_UNROLL_sh 2 // (shift amount)
-
-#define MEMLAT 8
-#define Nrot ((4*(MEMLAT+2) + 7) & ~7)
-
-#define OP_T_THRES 16
-#define OPSIZ 8
-
-#define loopcnt r14
-#define elemcnt r15
-#define saved_pr r16
-#define saved_lc r17
-#define adest r18
-#define dest r19
-#define asrc r20
-#define src r21
-#define len r22
-#define tmp2 r23
-#define tmp3 r24
-#define tmp4 r25
-#define ptable r26
-#define ploop56 r27
-#define loopaddr r28
-#define sh1 r29
-#define ptr1 r30
-#define ptr2 r31
-
-#define movi0 mov
-
-#define p_scr p6
-#define p_xtr p7
-#define p_nxtr p8
-#define p_few p9
-
-#if defined(USE_FLP)
-#define load ldf8
-#define store stf8
-#define tempreg f6
-#define the_r fr
-#define the_s fs
-#define the_t ft
-#define the_q fq
-#define the_w fw
-#define the_x fx
-#define the_y fy
-#define the_z fz
-#elif defined(USE_INT)
-#define load ld8
-#define store st8
-#define tempreg tmp2
-#define the_r r
-#define the_s s
-#define the_t t
-#define the_q q
-#define the_w w
-#define the_x x
-#define the_y y
-#define the_z z
-#endif
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
-/* Manually force proper loop-alignment. Note: be sure to
- double-check the code-layout after making any changes to
- this routine! */
-# define ALIGN(n) { nop 0 }
-#else
-# define ALIGN(n) .align n
-#endif
-
-#if defined(USE_LFETCH)
-#define LOOP(shift) \
- ALIGN(32); \
-.loop##shift##: \
-{ .mmb \
-(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
-(p[0]) lfetch.nt1 [ptr1], 16 ; \
- nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
- nop.b 0 ;; \
- } { .mmb \
-(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
-(p[0]) lfetch.nt1 [ptr2], 16 ; \
- nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
- br.ctop.sptk.many .loop##shift \
-;; } \
-{ .mib \
- br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
-}
-#else
-#define LOOP(shift) \
- ALIGN(32); \
-.loop##shift##: \
-{ .mmb \
-(p[0]) ld8.nt1 r[0] = [asrc], 8 ; \
- nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp3, 8 ; \
-(p[MEMLAT]) shrp tmp3 = r[MEMLAT], s[MEMLAT+1], shift ; \
- nop.b 0 ;; \
- } { .mmb \
-(p[0]) ld8.nt1 s[0] = [asrc], 8 ; \
- nop.b 0 ; \
-} { .mib \
-(p[MEMLAT+1]) st8 [dest] = tmp4, 8 ; \
-(p[MEMLAT]) shrp tmp4 = s[MEMLAT], r[MEMLAT], shift ; \
- br.ctop.sptk.many .loop##shift \
-;; } \
-{ .mib \
- br.cond.sptk.many .copy_bytes ; /* deal with the remaining bytes */ \
-}
-#endif
-
-
-ENTRY(memcpy)
-{ .mmi
+ .section .text
+ .proc memcpy#
+ .align 32
+ .global memcpy#
.prologue
- alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
- .rotr r[MEMLAT+1], s[MEMLAT+2], q[MEMLAT+1], t[MEMLAT+1]
- .rotp p[MEMLAT+2]
- .rotf fr[MEMLAT+1], fq[MEMLAT+1], fs[MEMLAT+1], ft[MEMLAT+1]
- mov ret0 = in0 // return tmp2 = dest
- .save pr, saved_pr
- movi0 saved_pr = pr // save the predicate registers
-} { .mmi
- and tmp4 = 7, in0 // check if destination is aligned
- mov dest = in0 // dest
- mov src = in1 // src
-;; }
-{ .mii
- cmp.eq p_scr, p0 = in2, r0 // if (len == 0)
- .save ar.lc, saved_lc
- movi0 saved_lc = ar.lc // save the loop counter
- .body
- cmp.ge p_few, p0 = OP_T_THRES, in2 // is len <= OP_T_THRESH
-} { .mbb
- mov len = in2 // len
-(p_scr) br.cond.dpnt.few .restore_and_exit // Branch no. 1: return dest
-(p_few) br.cond.dpnt.many .copy_bytes // Branch no. 2: copy byte by byte
-;; }
-{ .mmi
-#if defined(USE_LFETCH)
- lfetch.nt1 [dest] //
- lfetch.nt1 [src] //
-#endif
- shr.u elemcnt = len, 3 // elemcnt = len / 8
-} { .mib
- cmp.eq p_scr, p0 = tmp4, r0 // is destination aligned?
- sub loopcnt = 7, tmp4 //
-(p_scr) br.cond.dptk.many .dest_aligned
-;; }
-{ .mmi
- ld1 tmp2 = [src], 1 //
- sub len = len, loopcnt, 1 // reduce len
- movi0 ar.lc = loopcnt //
-} { .mib
- cmp.ne p_scr, p0 = 0, loopcnt // avoid loading beyond end-point
-;; }
+memcpy:
+ { .mmi // -----------------> cycle 0
+ mov r8=r32 // save dest pointer for return value
+ cmp.eq p8,p0 = 0,r34 // is length 0 ?
+ cmp.le.unc p9,p0=8,r34 // length >= 8 ?
+ }
+ { .mib
+ and r3=7,r33 // used to compute alignment
+ and r2=7,r32 // used to compute alignment
+ (p8) br.ret.sptk.many b0 // ---> if length is 0 branch out
+ ;;
+ }
+ { .mib // -----------------> cycle 1
+ add r22=1,r32 // second dst pointer (for short copy)
+ add r23=1,r33 // second src pointer (for short copy)
+ (p9) br.cond.dpnt .b1_2 // ---> if length > 7 goto large memcopy
+ }
+// start short copy
+short:
+{ .mmi
+ cmp.le p8,p9 = 2,r34
+ cmp.le p10,p11 = 4,r34
+ tbit.z.unc p0,p14=r34,0 // check if the length is odd
+ ;;
+ }
+ { .mmi // -----------------> cycle 2
+(p14) ld1 r14=[r33],2 // load byte 0
+(p8) ld1 r15=[r23],2 // load byte 1
+ cmp.le p12,p13 = 6,r34
+ ;;
+ }
+ { .mmi // -----------------> cycle 4
+(p14) st1 [r32]=r14,2 // store byte 0
+(p8) st1 [r22]=r15,2 // store byte 1
+ nop.i 0
+ }
+ { .mmb
+(p8) ld1 r16=[r33],2 // load byte 2
+(p10) ld1 r17=[r23],2 // load byte 3
+(p9) br.ret.dpnt b0 // if length <= 1 everything is done branch out
+ ;;
+ }
+ { .mmb // -----------------> cycle 5
+(p8) st1 [r32]=r16,2 // store byte 2
+(p10) st1 [r22]=r17,2 // store byte 3
+(p11) br.ret.dpnt b0 // if length <= 3 everything is done branch out
+ }
+ { .mmi
+(p10) ld1 r14=[r33],2 // load byte 4
+(p12) ld1 r15=[r23],2 // load byte 5
+ nop.i 0
+ ;;
+ }
+ { .mmb // -----------------> cycle 6
+(p10) st1 [r32]=r14,2 // store byte 4
+(p12) st1 [r22]=r15,2 // store byte 5
+(p13) br.ret.dptk b0 // if length <= 5 everything is done branch out
+ }
+ { .mii
+(p12) ld1 r16=[r33],2 // load byte 6
+ nop.i 0
+ nop.i 0
+ ;;
+ }
+ { .mmb // -----------------> cycle 7
+(p12) st1 [r32]=r16,2 // store byte 6
+ nop.m 0
+ br.ret.dptk b0 // done all cases
+ ;;
+ }
+// end of short memcopy
+//
+// large memcopy: branch to huge loop if necessary, else remain
+// on straight code.
+// Definition of BIG, i.e. size used by the unrolled loop
+ BIG=0x40
+.b1_2:
+{ .mmi
+ cmp.ne.unc p7,p6=r2,r3
+ sub r15=8,r2 // for dst
+ sub r16=8,r3 // for src
+}
+{ .mii
+ sub r28=r3,r2
+ tbit.nz p8,p0 = r3,0 // src byte-aligned?
+ mov ar.ec = 3 // Only needed by align_huge
+ ;;
+}
+{.mii
+ (p6) cmp.ne.unc p6=r2,r0
+ .save ar.lc,r11
+ mov r11=ar.lc
+ nop.i 0
+}
+{.mbb
+ mov r17=1
+ (p6) br.cond.dptk unaligned
+ (p7) br.cond.dptk ?0_memcopyA#;; // --> unaligned branch to it
+}
-.l0: // ---------------------------- // L0: Align src on 8-byte boundary
+//
+// Medium length, aligned copy. Used also as an epilog for loops
+//
+aligned:
+{ .mii // ---> cycle 0
+ cmp.leu p10= 0x40, r34
+ cmp.leu p8 = 0x10, r34
+ cmp.le p9 = 0x18, r34
+}
+{ .mib
+ cmp.le p7= 0x8, r34
+ add r28 = 8, r33 //second source pointer
+ (p10) br.cond.dpnt aligned_huge
+ ;;
+}
+{ .mmi // ---> cycle 1
+(p7) ld8 r17 = [r33],16
+(p8) ld8 r18 = [r28],16
+ cmp.le p11 = 0x28,r34
+}
{ .mmi
- st1 [dest] = tmp2, 1 //
-(p_scr) ld1 tmp2 = [src], 1 //
-} { .mib
- cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
- add loopcnt = -1, loopcnt
- br.cloop.dptk.few .l0 //
-;; }
-
-.dest_aligned:
+ cmp.le p10 = 0x20, r34
+ cmp.le p12 = 0x30, r34
+ cmp.le p13 = 0x38, r34
+ ;;
+}
+{ .mmi // --> cycle 2
+ (p9) ld8 r19 = [r33],16
+ (p10) ld8 r20 = [r28],16
+ tbit.nz.unc p6 = r34, 3 // count >= 8 mod 16?
+ ;;
+}
+{ .mmi // ---> cycle 3
+ (p11) ld8 r21 = [r33],16
+ (p12) ld8 r22 = [r28],16
+ tbit.nz.unc p15 = r34, 1
+}
{ .mmi
- and tmp4 = 7, src // ready for alignment check
- shr.u elemcnt = len, 3 // elemcnt = len / 8
-;; }
+ add r27 = 8, r32 // second destination pointer
+ nop.m 0
+ nop.i 0
+ ;;
+}
+{ .mmi // ---> cycle 4
+ (p13) ld8 r23 = [r33],16
+ (p7) st8 [r32] = r17,16
+ tbit.nz.unc p14 = r34, 2
+ ;;
+}
+{ .mmi // ---> cycle 5
+ (p8) st8 [r27] = r18,16
+ (p9) st8 [r32] = r19,16
+ (p6) add r33 = 0, r28 // r33 did the last ld8, r28 has next
+ ;;
+}
+{ .mmi // ---> cycle 6
+ (p10) st8 [r27] = r20,16
+ (p14) ld4 r24 = [r33],4
+ tbit.nz.unc p7 = r34, 0
+ ;;
+}
+{ .mmi // ---> cycle 7
+ (p11) st8 [r32] = r21,16
+ (p15) ld2 r25 = [r33],2
+ nop.i 0
+ ;;
+} { .mmi // ---> cycle 8
+ (p12) st8 [r27] = r22,16
+ (p7) ld1 r26 = [r33]
+ nop.i 0
+ ;;
+} { .mmi // ---> cycle 9
+ (p13) st8 [r32] = r23,16
+ ;;
+ (p6) mov r32 = r27 // r32 did the last st8, r27 has next
+ ;;
+} { .mmi // ---> cycle 11
+ (p14) st4 [r32] = r24,4
+ ;;
+ (p15) st2 [r32] = r25,2
+ ;;
+} { .mib // ---> cycle 13
+ (p7) st1 [r32] = r26
+ br.ret.sptk b0
+ ;;
+}
+// Here we deal with sizes that are greater than or equal to 64 bytes.
+// First we check if the source address is 16-byte aligned (if not, we copy the first 8 bytes and continue).
+// Then, if the length is > 8 MB, we use very_huge_loop; otherwise we use huge_loop.
+// These two loops are very much alike, but very_huge_loop uses lfetch.nta to bring lines from memory to L3.
+// They both use the ldfp instruction in order to occupy only one memory port (M0 or M1) for loading 16 bytes, and use the other for lfetch.
+//
+// Long copies benefit from prefetch
+//
+aligned_huge:
+{ .mii
+ alloc r31=ar.pfs,3,29,0,32
+ shl r17=r17,10 //1K
+ .save pr,r22
+ mov r22 = pr
+}
+{ .mii
+ mov r29 = r33
+ shr r23 = r34,6
+ add r27 = 16, r32 // one bank away
+ ;;
+}{ .mmi
+ add r25 = 512, r33 // read stream
+ add r23 = -1,r23
+ add r28 = 16 , r33 // one bank away
+}
{ .mib
- cmp.ne p_scr, p0 = tmp4, r0 // is source also aligned
- tbit.nz p_xtr, p_nxtr = src, 3 // prepare a separate move if src
-} { .mib // is not 16B aligned
- add ptr2 = LFETCH_DIST, dest // prefetch address
- add ptr1 = LFETCH_DIST, src
-(p_scr) br.cond.dptk.many .src_not_aligned
-;; }
-
-// The optimal case, when dest, and src are aligned
-
-.both_aligned:
-{ .mmi
- .pred.rel "mutex",p_xtr,p_nxtr
-(p_xtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no+1, elemcnt // Need N + 1 to qualify
-(p_nxtr) cmp.gt p_scr, p0 = ALIGN_UNROLL_no, elemcnt // Need only N to qualify
- movi0 pr.rot = 1 << 16 // set rotating predicates
-} { .mib
-(p_scr) br.cond.dpnt.many .copy_full_words
-;; }
-
+ cmp.le p12=r17,r34
+ mov r26 = r32
+ (p12) br.cond.spnt.many aligned_very_huge // branch to copying very huge amounts (>= 1K)
+ ;;
+}
+{ .mii
+ lfetch [r25], 64
+ mov ar.lc = r23
+ nop.i 0
+} { .mii
+ add r24 = 512, r32 // write stream
+ mov r31 = r34
+ mov pr.rot = 0x10000
+ ;;
+}
+huge_loop: // Accesses are interleaved to avoid bank conflict
+ .body
+{ .mmi
+ lfetch [r25], 64
+ (p8) lfetch.excl [r24], 128
+ ;;
+}
{ .mmi
-(p_xtr) load tempreg = [src], 8
-(p_xtr) add elemcnt = -1, elemcnt
- movi0 ar.ec = MEMLAT + 1 // set the epilog counter
-;; }
+ (p16) ld8 r35 = [r29], 8
+ (p16) ld8 r38 = [r28], 8
+ (p16) add r31 = -64, r31
+}
{ .mmi
-(p_xtr) add len = -8, len //
- add asrc = 16, src // one bank apart (for USE_INT)
- shr.u loopcnt = elemcnt, ALIGN_UNROLL_sh // cater for unrolling
-;;}
+ (p18) st8 [r26] = r37, 8
+ (p18) st8 [r27] = r40, 8
+ ;;
+}
{ .mmi
- add loopcnt = -1, loopcnt
-(p_xtr) store [dest] = tempreg, 8 // copy the "extra" word
- nop.i 0
-;; }
-{ .mib
- add adest = 16, dest
- movi0 ar.lc = loopcnt // set the loop counter
-;; }
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
- { nop 0 }
-#else
- .align 32
-#endif
-#if defined(USE_FLP)
-.l1: // ------------------------------- // L1: Everything a multiple of 8
+ (p16) ld8 r41 = [r29], 24
+ (p16) ld8 r44 = [r28], 24
+ cmp.le p9 = 8, r31
+}
{ .mmi
-#if defined(USE_LFETCH)
-(p[0]) lfetch.nt1 [ptr2],32
-#endif
-(p[0]) ldfp8 the_r[0],the_q[0] = [src], 16
-(p[0]) add len = -32, len
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
-(p[MEMLAT]) store [adest] = the_s[MEMLAT], 8
-;; }
+ (p18) st8 [r26] = r43, 24
+ (p18) st8 [r27] = r46, 24
+ cmp.le p10=1,r31
+ ;;
+}
{ .mmi
-#if defined(USE_LFETCH)
-(p[0]) lfetch.nt1 [ptr1],32
-#endif
-(p[0]) ldfp8 the_s[0], the_t[0] = [src], 16
-} {.mmb
-(p[MEMLAT]) store [dest] = the_q[MEMLAT], 24
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
- br.ctop.dptk.many .l1
-;; }
-#elif defined(USE_INT)
-.l1: // ------------------------------- // L1: Everything a multiple of 8
+ (p16) ld8 r47 = [r29], 8
+ (p16) ld8 r50 = [r28], 8
+ (p10) cmp.gt p10 = 8, r31
+}
{ .mmi
-(p[0]) load the_r[0] = [src], 8
-(p[0]) load the_q[0] = [asrc], 8
-(p[0]) add len = -32, len
-} {.mmb
-(p[MEMLAT]) store [dest] = the_r[MEMLAT], 8
-(p[MEMLAT]) store [adest] = the_q[MEMLAT], 8
-;; }
+ (p18) st8 [r26] = r49, 8
+ (p18) st8 [r27] = r52, 8
+ cmp.ne p11=0,r31
+ ;;
+}
{ .mmi
-(p[0]) load the_s[0] = [src], 24
-(p[0]) load the_t[0] = [asrc], 24
-} {.mmb
-(p[MEMLAT]) store [dest] = the_s[MEMLAT], 24
-(p[MEMLAT]) store [adest] = the_t[MEMLAT], 24
-#if defined(USE_LFETCH)
-;; }
+ (p16) ld8 r53 = [r29], 24
+ (p16) ld8 r56 = [r28], 24
+ tbit.z.unc p0,p8=r25,6 // set the predicate for a write prefetch every 2 iterations
+}
{ .mmb
-(p[0]) lfetch.nt1 [ptr2],32
-(p[0]) lfetch.nt1 [ptr1],32
-#endif
- br.ctop.dptk.many .l1
-;; }
-#endif
-
-.copy_full_words:
-{ .mib
- cmp.gt p_scr, p0 = 8, len //
- shr.u elemcnt = len, 3 //
-(p_scr) br.cond.dpnt.many .copy_bytes
-;; }
-{ .mii
- load tempreg = [src], 8
- add loopcnt = -1, elemcnt //
-;; }
-{ .mii
- cmp.ne p_scr, p0 = 0, loopcnt //
- mov ar.lc = loopcnt //
-;; }
-
-.l2: // ------------------------------- // L2: Max 4 words copied separately
-{ .mmi
- store [dest] = tempreg, 8
-(p_scr) load tempreg = [src], 8 //
- add len = -8, len
-} { .mib
- cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
- add loopcnt = -1, loopcnt
- br.cloop.dptk.few .l2
-;; }
-
-.copy_bytes:
-{ .mib
- cmp.eq p_scr, p0 = len, r0 // is len == 0 ?
- add loopcnt = -1, len // len--;
-(p_scr) br.cond.spnt .restore_and_exit
-;; }
-{ .mii
- ld1 tmp2 = [src], 1
- movi0 ar.lc = loopcnt
- cmp.ne p_scr, p0 = 0, loopcnt // avoid load beyond end-point
-;; }
-
-.l3: // ------------------------------- // L3: Final byte move
-{ .mmi
- st1 [dest] = tmp2, 1
-(p_scr) ld1 tmp2 = [src], 1
-} { .mib
- cmp.lt p_scr, p0 = 1, loopcnt // avoid load beyond end-point
- add loopcnt = -1, loopcnt
- br.cloop.dptk.few .l3
-;; }
+ (p18) st8 [r26] = r55, 24
+ (p18) st8 [r27] = r58, 24
+ br.ctop.dptk huge_loop
+ ;;
+} { .mmi
+ mov r33 = r29
+ mov r34 = r31
+ mov pr = r22,0x10000 // restore rotating predicates
+ ;;
+} { .mii
+ mov r32 = r26
+ mov ar.lc = r11
+ cmp.le p9 = 8, r31
+}
+{ .mbb
+ cmp.le p10=1,r31
+ (p11) br.cond.dptk aligned // use pred code for 1 <= n < 64
+ br.ret.sptk b0
+ ;;
+}
+aligned_very_huge:
+{ .mii
+ alloc r29=ar.pfs,3,37,0,40
+ tbit.nz p8,p0=r33,3
+ shr.u r17=r34,3 //length in 8-byte blocks
+}
+{.mmi
+ add r30=1,r0
+ add r28=1280,r33
+ adds r31 = 448,r33
+ ;;
+}
+{.mmi
+ (p8) ldf8 f6=[r33],8
+ (p8) adds r17=-1,r17
+ (p8) adds r34=-8,r34
+}
+{.mii
+ // add r24=8000,r0 //- suggested for advance distance for src lfetch.nt1 in very_huge_loop
+ add r27=4000,r28
+ shl r30=r30,23 //8Megs
+
+ ;;
+}
-.restore_and_exit:
-{ .mmi
- movi0 pr = saved_pr, -1 // restore the predicate registers
-;; }
-{ .mib
- movi0 ar.lc = saved_lc // restore the loop counter
- br.ret.sptk.many b0
-;; }
+{.mii
+ (p8) stf8 [r32]=f6,8
+ add r25=4000,r0 //advance distance for src lfetch.nta in very_huge_loop
+ shr.u r17=r17,2
+ ;;
+}
+{.mii
+ adds r17=-1,r17
+ adds r18=16,r32// 16 bytes away to avoid bank conflict
+ nop.i 0
+}
+{.mii
+ add r21=0,r34
+ mov.i ar.ec=9
+ cmp.lt p10,p0=r30,r34 //length > 8Megs
+ ;;
+}
+{.mii
+ mov r19=r32
+ mov r20=r33
+ mov.i ar.lc=r17
+}
+{.mib
+ adds r30=448,r32
+ mov pr.rot = 0x10000
+ (p10) br.cond.dptk.many ultra_huge_loop
+ ;;
+}
+very_huge_loop:
+{.mmi
+ (p16) lfetch.nt1 [r31],32
+ (p16) ldfp8 f32,f41=[r20],16
+ (p16) adds r21=-32,r21
+}
+{.mmb
+ (p24) stf8 [r19]=f40,8
+ (p24) stf8 [r18]=f58,8
+ nop.b 0x0;;
+}
+{.mmi
+ (p16) lfetch.nt1 [r30],32
+ (p16) ldfp8 f50,f59=[r20],16
+// tbit.z p11,p0=r20,6
+}
+{.mmb
+ (p24) stf8 [r19]=f49,24
+ (p24) stf8 [r18]=f67,24
+ br.ctop.dptk.many very_huge_loop;;
+}
+{.mmi
+ mov r32=r19
+ mov r33=r20
+ mov r34=r21
+ ;;
+}
+{ .mii
+ cmp.ne p11,p0=0,r21
+ mov pr = r22,0x10000 // restore rotating predicates
+ mov.i ar.lc = r11
+
+}
+{ .mbb
+ nop.m 0
+ (p11) br.cond.dptk aligned // use pred code for 1 <= n < 64
+ br.ret.sptk b0
+ ;;
+}
+//Used to copy more than 8Megs of data
+//Uses lfetch.nta to move data to L3 and lfetch.nt1 to move lines from L3 to L2
+//Copies 32 bytes per iteration
+ultra_huge_loop:
+{.mmi
+ (p10) lfetch.nt1 [r31],128
+ (p16) ldfp8 f32,f41=[r20],16
+ (p16) adds r21=-32,r21
+}
+{.mmi
+ (p24) stf8 [r19]=f40,8
+ (p24) stf8 [r18]=f58,8
+ cmp.ge p11,p0=r25,r26;;
+}
+{.mmi
+ (p11) lfetch.nta [r28],128
+ (p16) lfetch.nt1 [r30],32
+ sub r27=r31,r20
+ ;;
+}
+{.mmi
+ cmp.ge p10,p0=r25,r27
+ (p16) ldfp8 f50,f59=[r20],16
+ sub r26=r28,r31
+}
+{.mmb
+ (p24) stf8 [r19]=f49,24
+ (p24) stf8 [r18]=f67,24
+ br.ctop.dptk.many ultra_huge_loop;;
+}
+{.mmi
+ mov r32=r19
+ mov r33=r20
+ mov r34=r21
+ ;;
+}
+{ .mii
+ cmp.ne p11,p0=0,r21
+ mov pr = r22,0x10000 // restore rotating predicates
+ mov.i ar.lc = r11
+
+}
+{ .mbb
+ nop.m 0
+ (p11) br.cond.dptk aligned // use pred code for 1 <= n < 64
+ br.ret.sptk b0
+ ;;
+}
-.src_not_aligned:
-{ .mmi
- cmp.gt p_scr, p0 = 16, len
- and sh1 = 7, src // sh1 = src % 8
- shr.u loopcnt = len, 4 // element-cnt = len / 16
-} { .mib
- add tmp4 = @ltoff(.table), gp
- add tmp3 = @ltoff(.loop56), gp
-(p_scr) br.cond.dpnt.many .copy_bytes // do byte by byte if too few
-;; }
-{ .mmi
- and asrc = -8, src // asrc = (-8) -- align src for loop
- add loopcnt = -1, loopcnt // loopcnt--
- shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
-} { .mmi
- ld8 ptable = [tmp4] // ptable = &table
- ld8 ploop56 = [tmp3] // ploop56 = &loop56
- and tmp2 = -16, len // tmp2 = len & -OPSIZ
-;; }
-{ .mmi
- add tmp3 = ptable, sh1 // tmp3 = &table + sh1
- add src = src, tmp2 // src += len & (-16)
- movi0 ar.lc = loopcnt // set LC
-;; }
-{ .mmi
- ld8 tmp4 = [tmp3] // tmp4 = loop offset
- sub len = len, tmp2 // len -= len & (-16)
- movi0 ar.ec = MEMLAT + 2 // one more pass needed
-;; }
+//Used when the src and dst alignments are the same
+//Copies up to 7 bytes by conditionally copying 1, 2 and 4 bytes, depending on the value of the last three bits of the address
+unaligned:
+{ .mii
+ cmp.ne p14,p0= r28,r0
+ tbit.nz p9,p0 = r16,1 //src
+ tbit.nz p10,p0 = r16,2 //src
+}
+{ .mii
+ (p8) ld1 r17 = [r33],1
+ ;;
+}
+{ .mii
+ (p9) ld2 r18 = [r33],2
+ ;;
+}
+{ .mii
+ (p10) ld4 r19 = [r33],4
+ ;;
+}
{ .mmi
- ld8 s[1] = [asrc], 8 // preload
- sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset
- movi0 pr.rot = 1 << 16 // set rotating predicates
-;; }
+(p8) st1 [r32] = r17,1
+(p8) add r34 = -1,r34
+ nop.i 0
+ ;;
+}
+{ .mii
+(p9) st2 [r32] = r18,2
+(p9) add r34 = -2,r34
+ nop.i 0
+ ;;
+}
{ .mib
- nop.m 0
- movi0 b6 = loopaddr
- br b6 // jump to the appropriate loop
-;; }
+(p10) st4 [r32] = r19,4
+(p10) add r34 = -4,r34
+ br.cond.dpnt aligned
+ ;;
+}
+{ .mib
+ br.ret.sptk b0
+ ;;
+}
+// end of memcpy
+ .endp memcpy#
+ .type ?0_memcopyA,@function
+ .global ?0_memcopyA#
+// End
- LOOP(8)
- LOOP(16)
- LOOP(24)
- LOOP(32)
- LOOP(40)
- LOOP(48)
- LOOP(56)
-END(memcpy)
libc_hidden_builtin_def (memcpy)
-
- .rodata
- .align 8
-.table:
- data8 0 // dummy entry
- data8 .loop56 - .loop8
- data8 .loop56 - .loop16
- data8 .loop56 - .loop24
- data8 .loop56 - .loop32
- data8 .loop56 - .loop40
- data8 .loop56 - .loop48
- data8 .loop56 - .loop56
--- sysdeps/ia64/memmove.S.intel 2003-11-18 22:11:48.000000000 -0800
+++ sysdeps/ia64/memmove.S 2006-11-30 12:42:32.000000000 -0800
@@ -1,251 +1,113 @@
-/* Optimized version of the standard memmove() function.
- This file is part of the GNU C Library.
- Copyright (C) 2000, 2001, 2003 Free Software Foundation, Inc.
- Contributed by Dan Pop <Dan.Pop@cern.ch>.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-/* Return: dest
-
- Inputs:
- in0: dest
- in1: src
- in2: byte count
-
- The core of the function is the memcpy implementation used in memcpy.S.
- When bytes have to be copied backwards, only the easy case, when
- all arguments are multiples of 8, is optimised.
-
- In this form, it assumes little endian mode. For big endian mode,
- sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
- or the UM.be bit should be cleared at the beginning and set at the end. */
+// memmove: copy a counted number of bytes.
+//
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// The first step is to test whether the two memory regions overlap or not.
+// Depending on the result, the copy is performed in an ascending (increasing
+// addresses) or descending (decreasing addresses) way.
+//
+// The main loop uses address interleaving to avoid, or at least limit, bank conflicts.
+//
+// Author: Steve Skedzielewski, JT Acquaviva
+// Date: February, 2002
+//
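+// A rough C-level sketch of the dispatch below (for illustration only):
+//   if (n == 0) return dst;
+//   if (dst > src && src + n > dst) goto ?0_memcpyD;   // overlap: copy with descending addresses
+//   if (n >= 8)  goto memcpy;                          // ascending copy
+//   /* otherwise fall through to the predicated short copy (n <= 7) below */
+//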
#include <sysdep.h>
#undef ret
-#define OP_T_THRES 16
-#define OPSIZ 8
-
-#define adest r15
-#define saved_pr r17
-#define saved_lc r18
-#define dest r19
-#define src r20
-#define len r21
-#define asrc r22
-#define tmp2 r23
-#define tmp3 r24
-#define tmp4 r25
-#define ptable r26
-#define ploop56 r27
-#define loopaddr r28
-#define sh1 r29
-#define loopcnt r30
-#define value r31
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
-# define ALIGN(n) { nop 0 }
-#else
-# define ALIGN(n) .align n
+#ifdef IS_IN_rtld
+#undef HIDDEN_JUMPTARGET
+#define HIDDEN_JUMPTARGET(name) name
#endif
-#define LOOP(shift) \
- ALIGN(32); \
-.loop##shift##: \
-(p[0]) ld8 r[0] = [asrc], 8 ; /* w1 */ \
-(p[MEMLAT+1]) st8 [dest] = value, 8 ; \
-(p[MEMLAT]) shrp value = r[MEMLAT], r[MEMLAT+1], shift ; \
- nop.b 0 ; \
- nop.b 0 ; \
- br.ctop.sptk .loop##shift ; \
- br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
-
-#define MEMLAT 21
-#define Nrot (((2*MEMLAT+3) + 7) & ~7)
-
-ENTRY(memmove)
- .prologue
- alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot
- .rotr r[MEMLAT + 2], q[MEMLAT + 1]
- .rotp p[MEMLAT + 2]
- mov ret0 = in0 // return value = dest
- .save pr, saved_pr
- mov saved_pr = pr // save the predicate registers
- .save ar.lc, saved_lc
- mov saved_lc = ar.lc // save the loop counter
- .body
- or tmp3 = in0, in1 ;; // tmp3 = dest | src
- or tmp3 = tmp3, in2 // tmp3 = dest | src | len
- mov dest = in0 // dest
- mov src = in1 // src
- mov len = in2 // len
- sub tmp2 = r0, in0 // tmp2 = -dest
- cmp.eq p6, p0 = in2, r0 // if (len == 0)
-(p6) br.cond.spnt .restore_and_exit;;// return dest;
- and tmp4 = 7, tmp3 // tmp4 = (dest | src | len) & 7
- cmp.le p6, p0 = dest, src // if dest <= src it's always safe
-(p6) br.cond.spnt .forward // to copy forward
- add tmp3 = src, len;;
- cmp.lt p6, p0 = dest, tmp3 // if dest > src && dest < src + len
-(p6) br.cond.spnt .backward // we have to copy backward
-
-.forward:
- shr.u loopcnt = len, 4 ;; // loopcnt = len / 16
- cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
-(p6) br.cond.sptk .next // goto next;
-
-// The optimal case, when dest, src and len are all multiples of 8
-
- and tmp3 = 0xf, len
- mov pr.rot = 1 << 16 // set rotating predicates
- mov ar.ec = MEMLAT + 1 ;; // set the epilog counter
- cmp.ne p6, p0 = tmp3, r0 // do we have to copy an extra word?
- adds loopcnt = -1, loopcnt;; // --loopcnt
-(p6) ld8 value = [src], 8;;
-(p6) st8 [dest] = value, 8 // copy the "odd" word
- mov ar.lc = loopcnt // set the loop counter
- cmp.eq p6, p0 = 8, len
-(p6) br.cond.spnt .restore_and_exit;;// the one-word special case
- adds adest = 8, dest // set adest one word ahead of dest
- adds asrc = 8, src ;; // set asrc one word ahead of src
- nop.b 0 // get the "golden" alignment for
- nop.b 0 // the next loop
-.l0:
-(p[0]) ld8 r[0] = [src], 16
-(p[0]) ld8 q[0] = [asrc], 16
-(p[MEMLAT]) st8 [dest] = r[MEMLAT], 16
-(p[MEMLAT]) st8 [adest] = q[MEMLAT], 16
- br.ctop.dptk .l0 ;;
-
- mov pr = saved_pr, -1 // restore the predicate registers
- mov ar.lc = saved_lc // restore the loop counter
- br.ret.sptk.many b0
-.next:
- cmp.ge p6, p0 = OP_T_THRES, len // is len <= OP_T_THRES
- and loopcnt = 7, tmp2 // loopcnt = -dest % 8
-(p6) br.cond.spnt .cpyfew // copy byte by byte
+ .section .text
+ .proc memmove#
+ .align 32
+ .global memmove#
+memmove:
+ { .mmi // -----------------> cycle 0
+ mov r8=r32 // save dest pointer for return value
+ cmp.eq p8=0,r34 // is length 0 ?
+ cmp.gtu p7=r32,r33 // dst > src ?
+ }
+ { .mib
+ add r14=r34,r33
+ cmp.leu p9=8,r34 // length >= 8 ?
+ (p8) br.ret.dpnt.many b0 // ---> if length is 0 branch out
+ ;;
+ }
+// determine whether we need memcpy with ascending or descending addresses.
+// if both comparisons are true, jump to memcpyD
+ { .mmi // -----------------> cycle 1
+ (p7) cmp.gtu.unc p8=r14,r32 // (src + len) > dst ? (overlap)
+ cmp.le p6=3,r34
+ add r23=2,r33 // second src pointer (for short copy)
+ } { .mbb
+ add r22=2,r32 // second dst pointer (for short copy)
+ (p8) br.cond.dpnt ?0_memcpyD#
+ (p9) br.cond.dpnt HIDDEN_JUMPTARGET (memcpy) // ---> if length > 7 goto large memcopy
;;
- cmp.eq p6, p0 = loopcnt, r0
-(p6) br.cond.sptk .dest_aligned
- sub len = len, loopcnt // len -= -dest % 8
- adds loopcnt = -1, loopcnt // --loopcnt
+ }
+// start short copy
+/// move the first byte to load the first level cache
+ { .mmi // -----------------> cycle 2
+ ld1 r16=[r33],1 // load byte 1
+ ;;
+ st1 [r32]=r16,1 // store byte 1
+ cmp.le p8=5,r34
;;
- mov ar.lc = loopcnt
-.l1: // copy -dest % 8 bytes
- ld1 value = [src], 1 // value = *src++
+ } { .mmi // -----------------> cycle 3
+(p6) ld1 r14=[r33],2 // load byte 2
+(p6) ld1 r15=[r23],2 // load byte 3
+ cmp.le p12,p13=7,r34
;;
- st1 [dest] = value, 1 // *dest++ = value
- br.cloop.dptk .l1
-.dest_aligned:
- and sh1 = 7, src // sh1 = src % 8
- and tmp2 = -8, len // tmp2 = len & -OPSIZ
- and asrc = -8, src // asrc = src & -OPSIZ -- align src
- shr.u loopcnt = len, 3 // loopcnt = len / 8
- and len = 7, len;; // len = len % 8
- adds loopcnt = -1, loopcnt // --loopcnt
- addl tmp4 = @ltoff(.table), gp
- addl tmp3 = @ltoff(.loop56), gp
- mov ar.ec = MEMLAT + 1 // set EC
- mov pr.rot = 1 << 16;; // set rotating predicates
- mov ar.lc = loopcnt // set LC
- cmp.eq p6, p0 = sh1, r0 // is the src aligned?
-(p6) br.cond.sptk .src_aligned
- add src = src, tmp2 // src += len & -OPSIZ
- shl sh1 = sh1, 3 // sh1 = 8 * (src % 8)
- ld8 ploop56 = [tmp3] // ploop56 = &loop56
- ld8 ptable = [tmp4];; // ptable = &table
- add tmp3 = ptable, sh1;; // tmp3 = &table + sh1
- mov ar.ec = MEMLAT + 1 + 1 // one more pass needed
- ld8 tmp4 = [tmp3];; // tmp4 = loop offset
- sub loopaddr = ploop56,tmp4 // loopadd = &loop56 - loop offset
- ld8 r[1] = [asrc], 8;; // w0
- mov b6 = loopaddr;;
- br b6 // jump to the appropriate loop
-
- LOOP(8)
- LOOP(16)
- LOOP(24)
- LOOP(32)
- LOOP(40)
- LOOP(48)
- LOOP(56)
-
-.src_aligned:
-.l3:
-(p[0]) ld8 r[0] = [src], 8
-(p[MEMLAT]) st8 [dest] = r[MEMLAT], 8
- br.ctop.dptk .l3
-.cpyfew:
- cmp.eq p6, p0 = len, r0 // is len == 0 ?
- adds len = -1, len // --len;
-(p6) br.cond.spnt .restore_and_exit ;;
- mov ar.lc = len
-.l4:
- ld1 value = [src], 1
+ } { .mmi
+(p8) ld1 r16=[r33],2 // load byte 4
+(p8) ld1 r17=[r23],2 // load byte 5
+(p13) tbit.z p12=r34,0 // move last byte if len=7 or len is even
+ } { .mmi // -----------------> cycle 4
+(p6) st1 [r32]=r14,2 // store byte 2
+(p6) st1 [r22]=r15,2 // store byte 3
+ cmp.le p14=7,r34
;;
- st1 [dest] = value, 1
- br.cloop.dptk .l4 ;;
-.restore_and_exit:
- mov pr = saved_pr, -1 // restore the predicate registers
- mov ar.lc = saved_lc // restore the loop counter
- br.ret.sptk.many b0
-
-// In the case of a backward copy, optimise only the case when everything
-// is a multiple of 8, otherwise copy byte by byte. The backward copy is
-// used only when the blocks are overlapping and dest > src.
-
-.backward:
- shr.u loopcnt = len, 3 // loopcnt = len / 8
- add src = src, len // src points one byte past the end
- add dest = dest, len ;; // dest points one byte past the end
- mov ar.ec = MEMLAT + 1 // set the epilog counter
- mov pr.rot = 1 << 16 // set rotating predicates
- adds loopcnt = -1, loopcnt // --loopcnt
- cmp.ne p6, p0 = tmp4, r0 // if ((dest | src | len) & 7 != 0)
-(p6) br.cond.sptk .bytecopy ;; // copy byte by byte backward
- adds src = -8, src // src points to the last word
- adds dest = -8, dest // dest points to the last word
- mov ar.lc = loopcnt;; // set the loop counter
-.l5:
-(p[0]) ld8 r[0] = [src], -8
-(p[MEMLAT]) st8 [dest] = r[MEMLAT], -8
- br.ctop.dptk .l5
- br.cond.sptk .restore_and_exit
-.bytecopy:
- adds src = -1, src // src points to the last byte
- adds dest = -1, dest // dest points to the last byte
- adds loopcnt = -1, len;; // loopcnt = len - 1
- mov ar.lc = loopcnt;; // set the loop counter
-.l6:
-(p[0]) ld1 r[0] = [src], -1
-(p[MEMLAT]) st1 [dest] = r[MEMLAT], -1
- br.ctop.dptk .l6
- br.cond.sptk .restore_and_exit
-END(memmove)
-
- .rodata
- .align 8
-.table:
- data8 0 // dummy entry
- data8 .loop56 - .loop8
- data8 .loop56 - .loop16
- data8 .loop56 - .loop24
- data8 .loop56 - .loop32
- data8 .loop56 - .loop40
- data8 .loop56 - .loop48
- data8 .loop56 - .loop56
+ } { .mmi // -----------------> cycle 5
+(p12) ld1 r14=[r33],2 // load byte 2, 4, or 6
+(p14) ld1 r15=[r23],2 // load byte 7
+ } { .mmi
+(p8) st1 [r32]=r16,2 // store byte 4
+(p8) st1 [r22]=r17,2 // store byte 5
+ ;;
+ } { .mmb // -----------------> cycle 6
+(p12) st1 [r32]=r14,2 // store byte 2, 4, or 6
+(p14) st1 [r22]=r15,2 // store byte 7
+ br.ret.dptk b0
+ }
+// end of short memcopy
+// -- End memmove
+ .endp memmove#
+ .type HIDDEN_JUMPTARGET (memcpy),@function
+ .global HIDDEN_JUMPTARGET (memcpy)
+ .type ?0_memcpyD#,@function
+ .global ?0_memcpyD#
+// End
libc_hidden_builtin_def (memmove)
--- sysdeps/ia64/memset.S.intel 2003-11-18 22:11:26.000000000 -0800
+++ sysdeps/ia64/memset.S 2006-11-30 11:29:24.000000000 -0800
@@ -1,400 +1,388 @@
-/* Optimized version of the standard memset() function.
- This file is part of the GNU C Library.
- Copyright (C) 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
- Contributed by Dan Pop for Itanium <Dan.Pop@cern.ch>.
- Rewritten for McKinley by Sverre Jarp, HP Labs/CERN <Sverre.Jarp@cern.ch>
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
- 02111-1307 USA. */
-
-/* Return: dest
-
- Inputs:
- in0: dest
- in1: value
- in2: count
-
- The algorithm is fairly straightforward: set byte by byte until we
- we get to a 16B-aligned address, then loop on 128 B chunks using an
- early store as prefetching, then loop on 32B chucks, then clear remaining
- words, finally clear remaining bytes.
- Since a stf.spill f0 can store 16B in one go, we use this instruction
- to get peak speed when value = 0. */
+// memset: function to set a number of bytes to a char value
+//
+// Copyright (c) 2000-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// Author: Steve Skedzielewski
+// Date: June, 2000
+// Date: February, 2002 - update
+//
#include <sysdep.h>
#undef ret
-#define dest in0
-#define value in1
-#define cnt in2
-
-#define tmp r31
-#define save_lc r30
-#define ptr0 r29
-#define ptr1 r28
-#define ptr2 r27
-#define ptr3 r26
-#define ptr9 r24
-#define loopcnt r23
-#define linecnt r22
-#define bytecnt r21
-
-#define fvalue f6
-
-// This routine uses only scratch predicate registers (p6 - p15)
-#define p_scr p6 // default register for same-cycle branches
-#define p_nz p7
-#define p_zr p8
-#define p_unalgn p9
-#define p_y p11
-#define p_n p12
-#define p_yy p13
-#define p_nn p14
-
-#define movi0 mov
-
-#define MIN1 15
-#define MIN1P1HALF 8
-#define LINE_SIZE 128
-#define LSIZE_SH 7 // shift amount
-#define PREF_AHEAD 8
-
-#define USE_FLP
-#if defined(USE_INT)
-#define store st8
-#define myval value
-#elif defined(USE_FLP)
-#define store stf8
-#define myval fvalue
-#endif
-
-.align 64
-ENTRY(memset)
-{ .mmi
+ .section .text
+// -- Begin memset
+ .proc memset#
+ .align 32
+// Replicate the value into all bytes using a mux1 broadcast
+// Fall through to aligned short (<16 bytes) code
+// live out: r21 (alignment), r31(replicated c),
+// r32(s), r33(c), r34(n)
+ .global memset#
.prologue
- alloc tmp = ar.pfs, 3, 0, 0, 0
- lfetch.nt1 [dest]
- .save ar.lc, save_lc
- movi0 save_lc = ar.lc
+memset:
+ mov r8=r32 // Return value
+ cmp.le p14=16,r34
+ and r22=0xF,r32 // Spec test for 16-byte boundary
+ and r21=7,r32 // Spec test for 8-byte boundary
+ mux1 r31=r33,@brcst // Replicate byte value
+ (p14) br.cond.dpnt Not_short
+ ;;
+// Handle short values quickly
+ cmp.ne p15=0,r21 // If zero, skip alignment
+ cmp.le p11,p10=8,r34 // Spec test for st8 safety
+ tbit.nz p13,p12=r32,0 // Spec test for st1 alignment
+ cmp.ge p14=0,r34 // Spec test for early exit
+ (p14) br.ret.dpnt b0
+ (p15) br.cond.dpnt Align_short
+ ;;
+// We're aligned and p11/p10 is set/clear if we need to do the st8
+// Use complementary predicates to allow length tests in parallel with store
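+// For example, an aligned n = 13 is filled as st8 + st4 + st1: the st8 leaves 5
+// bytes, the st4 leaves 1, and the final st1 completes the fill.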
+Short:
+{ .mmi
+ .pred.rel "mutex",p10,p11
+ (p11) st8 [r32]=r31,8
+ (p11) cmp.le p13,p12=12,r34
+ (p10) cmp.le p13,p12=4,r34
} { .mmi
- .body
- mov ret0 = dest // return value
- cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
- cmp.eq p_scr, p0 = cnt, r0
-;; }
-{ .mmi
- and ptr2 = -(MIN1+1), dest // aligned address
- and tmp = MIN1, dest // prepare to check for alignment
- tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
-} { .mib
- mov ptr1 = dest
- mux1 value = value, @brcst // create 8 identical bytes in word
-(p_scr) br.ret.dpnt.many rp // return immediately if count = 0
-;; }
-{ .mib
- cmp.ne p_unalgn, p0 = tmp, r0
-} { .mib // NB: # of bytes to move is 1 higher
- sub bytecnt = (MIN1+1), tmp // than loopcnt
- cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task?
-(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
-;; }
-{ .mmi
-(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment
-(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ?
-;; }
-{ .mib
-(p_y) add cnt = -8, cnt
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
-} { .mib
-(p_y) st8 [ptr2] = value, -4
-(p_n) add ptr2 = 4, ptr2
-;; }
-{ .mib
-(p_yy) add cnt = -4, cnt
-(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
+ .pred.rel "mutex",p12,p13
+ (p11) add r34=-8,r34
+ ;;
+ (p13) st4 [r32]=r31,4
+ (p13) cmp.le p11,p10=6,r34
+} { .mii
+ (p12) cmp.le p11,p10=2,r34
+ .pred.rel "mutex",p10,p11
+ (p13) add r34=-4,r34
+ ;;
+ (p11) cmp.le p13=3,r34
+} { .mii
+ (p11) st2 [r32]=r31,2
+ (p10) cmp.le p13=1,r34
+ ;;
} { .mib
-(p_yy) st4 [ptr2] = value, -2
-(p_nn) add ptr2 = 2, ptr2
-;; }
-{ .mmi
- mov tmp = LINE_SIZE+1 // for compare
-(p_y) add cnt = -2, cnt
-(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
+ (p13) st1 [r32]=r31
+ br.ret.sptk b0
+ ;;
+}
+// Align, while taking care not to exceed length
+// Similar to aligned code above, but adds an alignment test to length test
+Align_short:
+{ .mmi
+ .pred.rel "mutex",p12,p13
+ (p13) st1 [r32]=r33,1
+ (p13) cmp.le p11,p10=3,r34
+ (p12) cmp.le p11,p10=2,r34
+} { .mii
+ (p13) add r34=-1,r34
+ ;;
+ (p11) tbit.nz p11,p10=r32,1 // length is OK, are we on 2-byte boundary?
+ ;;
} { .mmi
- setf.sig fvalue=value // transfer value to FLP side
-(p_y) st2 [ptr2] = value, -1
-(p_n) add ptr2 = 1, ptr2
-;; }
-
-{ .mmi
-(p_yy) st1 [ptr2] = value
- cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
-} { .mbb
-(p_yy) add cnt = -1, cnt
-(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
-;; }
-
-{ .mib
- nop.m 0
- shr.u linecnt = cnt, LSIZE_SH
-(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
-;; }
-
-#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
- .align 32 // -------- // L1A: store ahead into cache lines; fill later
-#endif
-{ .mmi
- and tmp = -(LINE_SIZE), cnt // compute end of range
- mov ptr9 = ptr1 // used for prefetching
- and cnt = (LINE_SIZE-1), cnt // remainder
+ .pred.rel "mutex",p10,p11
+ (p11) st2 [r32]=r31,2
+ (p10) cmp.le p13,p12=4,r34
+ (p11) cmp.le p13,p12=6,r34
} { .mmi
- mov loopcnt = PREF_AHEAD-1 // default prefetch loop
- cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
-;; }
-{ .mmi
-(p_scr) add loopcnt = -1, linecnt // start of stores
- add ptr2 = 8, ptr1 // (beyond prefetch stores)
- add ptr1 = tmp, ptr1 // first address beyond total
-;; } // range
-{ .mmi
- add tmp = -1, linecnt // next loop count
- movi0 ar.lc = loopcnt
-;; }
-.pref_l1a:
-{ .mib
- store [ptr9] = myval, 128 // Do stores one cache line apart
- nop.i 0
- br.cloop.dptk.few .pref_l1a
-;; }
-{ .mmi
- add ptr0 = 16, ptr2 // Two stores in parallel
- movi0 ar.lc = tmp
-;; }
-.l1ax:
- { .mmi
- store [ptr2] = myval, 8
- store [ptr0] = myval, 8
- ;; }
- { .mmi
- store [ptr2] = myval, 24
- store [ptr0] = myval, 24
- ;; }
- { .mmi
- store [ptr2] = myval, 8
- store [ptr0] = myval, 8
- ;; }
- { .mmi
- store [ptr2] = myval, 24
- store [ptr0] = myval, 24
- ;; }
- { .mmi
- store [ptr2] = myval, 8
- store [ptr0] = myval, 8
- ;; }
- { .mmi
- store [ptr2] = myval, 24
- store [ptr0] = myval, 24
- ;; }
- { .mmi
- store [ptr2] = myval, 8
- store [ptr0] = myval, 32
- cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
- ;; }
-{ .mmb
- store [ptr2] = myval, 24
-(p_scr) store [ptr9] = myval, 128
- br.cloop.dptk.few .l1ax
-;; }
-{ .mbb
- cmp.le p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
- br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
-;; }
-
-#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
- { nop 0 }
-#else
- .align 32
-#endif
-.l1b: // ------------------ // L1B: store ahead into cache lines; fill later
-{ .mmi
- and tmp = -(LINE_SIZE), cnt // compute end of range
- mov ptr9 = ptr1 // used for prefetching
- and cnt = (LINE_SIZE-1), cnt // remainder
+ (p11) add r34=-2,r34
+ ;;
+ (p13) tbit.nz p13,p12=r32,2
+ ;;
} { .mmi
- mov loopcnt = PREF_AHEAD-1 // default prefetch loop
- cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
-;; }
-{ .mmi
-(p_scr) add loopcnt = -1, linecnt
- add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
- add ptr1 = tmp, ptr1 // first address beyond total range
-;; }
-{ .mmi
- add tmp = -1, linecnt // next loop count
- movi0 ar.lc = loopcnt
-;; }
-.pref_l1b:
-{ .mib
- stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
- nop.i 0
- br.cloop.dptk.few .pref_l1b
-;; }
-{ .mmi
- add ptr0 = 16, ptr2 // Two stores in parallel
- movi0 ar.lc = tmp
-;; }
-.l1bx:
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 32
- ;; }
- { .mmi
- stf.spill [ptr2] = f0, 32
- stf.spill [ptr0] = f0, 64
- cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
- ;; }
-{ .mmb
- stf.spill [ptr2] = f0, 32
-(p_scr) stf.spill [ptr9] = f0, 128
- br.cloop.dptk.few .l1bx
-;; }
-{ .mib
- cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .move_bytes_from_alignment
-;; }
-
-.fraction_of_line:
-{ .mib
- add ptr2 = 16, ptr1
- shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
-;; }
-{ .mib
- cmp.eq p_scr, p0 = loopcnt, r0
- add loopcnt = -1, loopcnt
-(p_scr) br.cond.dpnt.many store_words
-;; }
-{ .mib
- and cnt = 0x1f, cnt // compute the remaining cnt
- movi0 ar.lc = loopcnt
-;; }
-#ifndef GAS_ALIGN_BREAKS_UNWIND_INFO
- .align 32
-#endif
-.l2: // ---------------------------- // L2A: store 32B in 2 cycles
-{ .mmb
- store [ptr1] = myval, 8
- store [ptr2] = myval, 8
-;; } { .mmb
- store [ptr1] = myval, 24
- store [ptr2] = myval, 24
- br.cloop.dptk.many .l2
-;; }
-store_words:
-{ .mib
- cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
-(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
-;; }
-
-{ .mmi
- store [ptr1] = myval, 8 // store
- cmp.le p_y, p_n = 16, cnt //
- add cnt = -8, cnt // subtract
-;; }
-{ .mmi
-(p_y) store [ptr1] = myval, 8 // store
-(p_y) cmp.le.unc p_yy, p_nn = 16, cnt //
-(p_y) add cnt = -8, cnt // subtract
-;; }
-{ .mmi // store
-(p_yy) store [ptr1] = myval, 8 //
-(p_yy) add cnt = -8, cnt // subtract
-;; }
-
-.move_bytes_from_alignment:
-{ .mib
- cmp.eq p_scr, p0 = cnt, r0
- tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
-(p_scr) br.cond.dpnt.few .restore_and_exit
-;; }
-{ .mib
-(p_y) st4 [ptr1] = value, 4
- tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
-;; }
-{ .mib
-(p_yy) st2 [ptr1] = value, 2
- tbit.nz.unc p_y, p0 = cnt, 0
-;; }
-
-{ .mib
-(p_y) st1 [ptr1] = value
-;; }
-.restore_and_exit:
-{ .mib
- nop.m 0
- movi0 ar.lc = save_lc
- br.ret.sptk.many rp
-;; }
+ .pred.rel "mutex",p12,p13
+ (p13) st4 [r32]=r31,4
+ (p12) cmp.le p11,p10=8,r34
+ (p13) cmp.le p11,p10=12,r34
+} { .mib
+ (p13) add r34=-4,r34
+ br.cond.sptk Short
+ ;;
+}
+// Code for lengths >= 16
+// If we're not on a 16-byte boundary, move to one
+// live out: r31 (replicated c), r33(unsigned c), r32(s), r34(unsigned n)
+Not_short:
+ cmp.ne p15=0,r22 //0: Low 4 bits zero?
+ cmp.ne p11,p10=0,r33
+ tbit.nz p13,p12=r32,0 // Spec test for st1 alignment
+ (p15) br.cond.dpnt Align_long
+ ;;
+// OK, it's long, it's aligned to a 16-byte boundary.
+// If r33 is not zero, skip to st8 code, otherwise fall into spill f0 version
+Is_aligned:
+ cmp.ne p14=0,r33 // Check value of fill character
+ add r16=128,r32 // prefetch pointer
+ .save ar.lc,r11
+ mov r11=ar.lc
+ mov r24=r34
+ (p14) br.cond.dpnt Nonzero
+ ;;
+//
+// Version when memset is clearing memory
+//
+ .body
+ add r17=16,r32 // second spill pointer
+ cmp.le p13=32,r34 // Spec for first set of spills
+ cmp.ge p14=127,r34
+ and r24=127,r34
+ mov r21=144 // = 128+16, length needed for second prefetch
+ (p14) br.cond.dpnt Zero_medium
+//
+/// Enter loop code when length is at least 128
+/// Prefetch each line with a spill
+///
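+// r16 is a prefetching spill pointer that runs a full 128-byte line ahead; r21 holds
+// the minimum length for which that next prefetch store still lies inside the buffer,
+// so each (p9) spill through r16 is guarded by cmp.le p9=r21,r34 and r21 advances by 128.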
+ stf.spill [r32]=f0,32
+ ;;
+ cmp.le p9=r21,r34
+ shr.u r22=r34,7 // line size is 128
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ (p9) stf.spill [r16]=f0,128
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ add r22=-1,r22 // Loop count
+ ;;
+ mov ar.lc=r22
+ (p9) stf.spill [r16]=f0,128
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ (p9) stf.spill [r16]=f0,128
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ (p9) stf.spill [r16]=f0,128
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ (p9) stf.spill [r16]=f0,128
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ ;;
+// Counted loop storing 128 bytes/iteration,
+/// with out-of-order spills causing line prefetch
+// live out: r11(ar.lc), r17(s+16), r23(128-n&15), r24(n&15), r32(s)
+// r33(replicated c), r34(n), p13(n&15>32)
+Zero_loop:
+ (p9) stf.spill [r16]=f0,128
+ stf.spill [r17]=f0,32
+ cmp.le p9=r21,r34
+ ;;
+ stf.spill [r32]=f0,32
+ stf.spill [r17]=f0,32
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ stf.spill [r32]=f0,32
+ stf.spill [r17]=f0,32
+ cmp.le p13=32,r24
+ ;;
+ stf.spill [r32]=f0,64
+ stf.spill [r17]=f0,32
+ br.cloop.sptk Zero_loop
+ ;;
+ add r32=-32,r32
+ ;;
+Zero_medium:
+ (p13) stf.spill [r32]=f0,32 // Redundant if entered from loop path
+ (p13) stf.spill [r17]=f0,32
+ cmp.le p12=64,r24
+ ;;
+ (p12) stf.spill [r32]=f0,32
+ (p12) stf.spill [r17]=f0,32
+ cmp.le p11=96,r24
+ ;;
+ (p11) stf.spill [r32]=f0,32
+ (p11) stf.spill [r17]=f0,32
+ tbit.nz p10=r24,4
+ ;;
+ (p10) stf.spill [r32]=f0,16
+ tbit.nz p9=r24,3
+ ;;
+ (p9) st8 [r32]=r0,8
+ tbit.nz p13=r24,2
+ ;;
+//
+// Clean up any partial word stores.
+//
+ tbit.nz p12=r24,1
+ (p13) st4 [r32]=r0,4
+ ;;
+ (p12) st2 [r32]=r0,2
+ tbit.nz p11=r24,0
+ ;;
+ (p11) st1 [r32]=r0,1
+ mov ar.lc=r11
+ br.ret.sptk.many b0
+ ;;
+//
+// Fill character is not zero
+// Now that p is aligned to a 16-byte boundary
+// use straight-line code for n<=64, a loop otherwise
+// live out: r8 (return value, original value of r32)
+// p14 (n>=MINIMUM_LONG)
+//
+Nonzero:
+ MINIMUM_LONG=0x40
+ add r17=8,r32 //0: second pointer
+ mov r21=136 // = 128+8, length needed for second prefetch
+ add r22=64,r34 // May need extra 1/2 iteration
+ cmp.le p13=16,r34 // Spec for use when loop is skipped
+ cmp.gt p14=MINIMUM_LONG,r34
+ (p14) br.cond.dpnt Nonzero_medium
+ ;;
+//
+/// Enter loop code when length is at least 128
+/// Prefetch each line with a st8
+///
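+// Same prefetch bookkeeping as the zero path, but using st8 of the replicated value:
+// r16 stays a line ahead under the cmp.le p9=r21,r34 guard, while r32 and the second
+// pointer r17 (= s + 8) store in pairs so that each st8/st8 pair covers 16 bytes.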
+ st8 [r32]=r31,16
+ cmp.le p9=r21,r34
+ shr.u r22=r22,7 // line size is 128
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ (p9) st8 [r16]=r31,128
+ add r22=-1,r22 // Loop count
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ mov ar.lc=r22
+ (p9) st8 [r16]=r31,128
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ (p9) st8 [r16]=r31,128
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ (p9) st8 [r16]=r31,128
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ ;;
+ (p9) st8 [r16]=r31,128
+ cmp.le p9=r21,r34
+ add r21=128,r21 // next prefetch safe length
+ ;;
+// Counted loop storing 128 bytes/iteration,
+/// with out-of-order spills causing line prefetch
+// live out: r11(ar.lc), r17(s+16), r23(128-n&15), r24(n&15), r32(s)
+// r33(replicated c), r34(n), p13(n&15>32)
+Nonzero_loop:
+ (p9) st8 [r16]=r31,128
+ st8 [r17]=r31,16
+ cmp.lt p10,p11=127,r24 // should we store the last 64?
+ ;;
+ st8 [r32]=r31,16
+ st8 [r17]=r31,16
+ (p10) add r24=-128,r24 // Update count of remaining bytes
+ ;;
+ st8 [r32]=r31,16
+ st8 [r17]=r31,16
+ (p11) add r24=-64,r24 // Update count of remaining bytes
+ ;;
+ st8 [r32]=r31,16
+ st8 [r17]=r31,16
+ cmp.le p9=r21,r34 // Compare prefetch offset with length
+ ;;
+ (p10) st8 [r32]=r31,16
+ (p10) st8 [r17]=r31,16
+ add r21=128,r21 // next prefetch-safe length
+ ;;
+ (p10) st8 [r32]=r31,16
+ (p10) st8 [r17]=r31,16
+ cmp.le p13=16,r24 // Spec for epilog
+ ;;
+ (p10) st8 [r32]=r31,16
+ (p10) st8 [r17]=r31,16
+// (p10) cmp.lt.unc p11,p12=64,r24 // p11 true if we need another iter
+ ;;
+// {.mmi
+ (p10) st8 [r32]=r31,32
+ (p10) st8 [r17]=r31,16
+//} {.mib
+// .pred.rel "mutex",p11,p12
+// (p11) add r32=32,r32 // skip the bytes stored out-of-order
+// (p12) add r32=16,r32 // prepare for epilogue
+ br.cloop.sptk Nonzero_loop
+ ;;
+//}
+ (p10) add r32=-16,r32
+ ;;
+// Short memsets are done with predicated straightline code
+// live out: r8 (return value, original value of r32)
+Nonzero_medium:
+ (p13) st8 [r32]=r31,16
+ (p13) st8 [r17]=r31,16
+ cmp.le p12=0x20,r24 //0: 32 <= n?
+ ;;
+ (p12) st8 [r32]=r31,16
+ (p12) st8 [r17]=r31,16
+ cmp.le p11=0x30,r24 //0: 48 <= n?
+ ;;
+ (p11) st8 [r32]=r31,16
+ (p11) st8 [r17]=r31,16
+ tbit.nz p10=r24,3
+ ;;
+ (p10) st8 [r32]=r31,8
+ tbit.nz p9=r24,2
+ ;;
+//
+// Clean up any partial word stores.
+//
+ tbit.nz p8=r24,1
+ (p9) st4 [r32]=r31,4
+ ;;
+ (p8) st2 [r32]=r31,2
+ tbit.nz p7=r24,0
+ ;;
+ (p7) st1 [r32]=r31,1
+ mov ar.lc=r11
+ br.ret.sptk.many b0
+ ;;
+Align_long:
+ (p13) st1 [r32]=r33,1
+ (p13) add r34=-1,r34
+ ;;
+ tbit.nz p13=r32,1
+ ;;
+ (p13) st2 [r32]=r31,2
+ (p13) add r34=-2,r34
+ ;;
+ tbit.nz p13=r32,2
+ ;;
+ (p13) st4 [r32]=r31,4
+ (p13) add r34=-4,r34
+ ;;
+ tbit.nz p13,p12=r32,3
+ ;;
+ (p13) st8 [r32]=r31,8
+ (p13) add r34=-8,r34
+ ;;
+ cmp.le p11,p10=8,r34 // Spec for entry to Short
+ cmp.le p13,p12=16,r34
+ (p12) br.cond.dpnt Short
+ br.cond.dptk Is_aligned
+ ;;
+//
+// -- End memset
+ .endp memset#
+// End
-.move_bytes_unaligned:
-{ .mmi
- .pred.rel "mutex",p_y, p_n
- .pred.rel "mutex",p_yy, p_nn
-(p_n) cmp.le p_yy, p_nn = 4, cnt
-(p_y) cmp.le p_yy, p_nn = 5, cnt
-(p_n) add ptr2 = 2, ptr1
-} { .mmi
-(p_y) add ptr2 = 3, ptr1
-(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte
-(p_y) add cnt = -1, cnt // [15, 14 (or less) left]
-;; }
-{ .mmi
-(p_yy) cmp.le.unc p_y, p0 = 8, cnt
- add ptr3 = ptr1, cnt // prepare last store
- movi0 ar.lc = save_lc
-} { .mmi
-(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes
-(p_yy) add cnt = -4, cnt // [11, 10 (o less) left]
-;; }
-{ .mmi
-(p_y) cmp.le.unc p_yy, p0 = 8, cnt
- add ptr3 = -1, ptr3 // last store
- tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
-} { .mmi
-(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes
-(p_y) add cnt = -4, cnt // [7, 6 (or less) left]
-;; }
-{ .mmi
-(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
-(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes
- // [3, 2 (or less) left]
- tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
-} { .mmi
-(p_yy) add cnt = -4, cnt
-;; }
-{ .mmb
-(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
-(p_y) st1 [ptr3] = value // fill last byte (using ptr3)
- br.ret.sptk.many rp
-;; }
-END(memset)
libc_hidden_builtin_def (memset)
--- sysdeps/ia64/serial-memmove.S.intel 2006-12-01 09:24:41.000000000 -0800
+++ sysdeps/ia64/serial-memmove.S 2006-11-30 12:39:45.000000000 -0800
@@ -0,0 +1,518 @@
+// ?1__serial_memmove:
+//
+// Copyright (c) 2002-2006, Intel Corporation
+// All rights reserved.
+//
+// WARRANTY DISCLAIMER
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Intel Corporation is the author of this code, and requests that all
+// problem reports or change requests be submitted to it directly at
+// http://www.intel.com/opensource/
+//
+// This routine tests the two memory zones selected as source / destination
+// for a copy loop.
+// 1/ if there is no overlap, i.e. (source + length) < destination, call memcpy
+// 2/ if source < destination, a Write-After-Read dependency is assumed,
+// therefore make a call to memcpy in a descending address mode.
+// 3/ if source > (destination + length),
+// again there is no overlap and memcpy can be safely called
+// 4/ Otherwise we have overlap and a Read-After-Write dependency.
+// Therefore the copy loop is done explicitly using the ld size given
+// by the fourth argument
+//
+// NOTE: To avoid versioning for aligned and unaligned access, coalescing
+// is not done in the unrolled loops
+//
+// Author: Steve Skedzielewski, JT Acquaviva
+// Date: July, 2002
+//
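+// A rough C-level sketch of this dispatch (illustration only; 'size' is the element
+// size passed in the fourth argument):
+//   if (n < size) return dst;
+//   if (dst > src + n || src > dst + n) return memcpy(dst, src, n);   // no overlap
+//   if (src > dst) return memmove(dst, src, n);                       // ascending copy is safe
+//   /* otherwise copy element by element, 4-way unrolled when the overlap distance allows */
+//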
+
+#include <sysdep.h>
+#undef ret
+
+#ifdef IS_IN_rtld
+#undef HIDDEN_JUMPTARGET
+#define HIDDEN_JUMPTARGET(name) name
+#endif
+
+ .section .text
+ .proc ?1__serial_memmove#
+ .align 32
+ .global ?1__serial_memmove#
+// Arguments: r32 is dest, r33 is src, r34 is length in bytes, r35 is element size.
+// Length is in Bytes to allow a faster call to memcpy than if the length were
+// given in number of elements
+//
+?1__serial_memmove:
+ .prologue
+ { .mmi // -----------------> cycle 0
+ add r15=r34,r33 // r15= src + length
+ add r16=r34,r32 // r16= dst + length
+ cmp.gtu p9=r33,r32 // dest < src ? i.e. no overlap, or a WAR dependency
+} { .mib
+ mov r8=r32 // save dest pointer for return value
+ cmp.gtu p8=r35,r34 // is length < element size ?
+ (p8) br.ret.dpnt.many b0 // ---> if length is < element size, branch out
+ ;;
+ }
+ { .mmi
+ sub r28=r32,r33 // distance is destination - source, in bytes
+ cmp.gtu p10,p11=r32,r15 // is src + length < dest ? i.e. no overlap
+ .save ar.lc, r31
+ mov r31 = ar.lc // saving the loop iteration counter
+ ;;
+ }
+ .body
+ { .mmb
+ // if one of the 2 following comparisons is true, it means either no overlap or a WAR
+ // dependency, therefore it is safe to branch to memcpy
+ cmp.eq p6=1,r35 // various compares for code versioning
+ (p11) cmp.gtu p10,p0=r33,r16 // is dst + length < src ? i.e. no overlap
+ (p10) br.cond.dpnt HIDDEN_JUMPTARGET (memcpy) // no overlap, branch to memcpy
+ ;;
+ }
+ { .mib
+ add r23=r33,r35 // source duplication
+ add r22=r32,r35 // dest duplication
+ (p9) br.cond.dpnt HIDDEN_JUMPTARGET (memmove) // good overlap (WAR) branch to memmove
+ ;;
+ }
+ { .mmi // various compares for code versioning
+ cmp.eq p7=2,r35
+ cmp.eq p8=4,r35
+ cmp.eq p9=16,r35 // notice, we do not test size of 8, which is the DEFAULT case
+ }
+//
+// Note for length < 4:
+// Despite the unrolling, it may seem that the size is never tested to know whether
+// we can jump to the unrolled case or not, i.e. length < 4 seems not to be handled.
+// In fact it is done implicitly:
+// 1/ if there is no dependency, or a 'good' one, memcpy is going to be called and
+// is going to handle this correctly.
+// 2/ if there is a bad dependency, the distance will always be < 4, therefore
+// the unrolled case (unrolled 4 times) will never be taken and we will always
+// jump to the serial loop, which is fine for short lengths.
+
+//
+// branch to the loop corresponding to element size
+//
+ { .bbb
+ (p6) br.cond.dpnt .copy_loop_size_1
+ (p7) br.cond.dpnt .copy_loop_size_2
+ (p8) br.cond.dpnt .copy_loop_size_4
+ ;;
+ }
+ { .mib
+// For a size of 16 we just proceed as for a size of 8. This is functionally safe and
+// there is no easy optimization to add for size 16; the performance of the size-8 path is already OK.
+// Since this is the last case, if we reach this point the branch is unconditional.
+// But the pointer duplication is predicated and done with an 8-byte object, not 16;
+// therefore, instead of using r35 we use the immediate value 8.
+ (p9) add r22=8,r32 // dest duplication
+ (p9) add r23=8,r33 // source duplication
+ br.cond.dpnt .copy_loop_size_8 // by DEFAULT branch to size of 8
+ ;;
+ }
+
+// If we have not yet branched to memcpy there is a RAW dependency,
+// so we need to perform an element-per-iteration copy loop.
+// An optimization is to first compute the distance of the dependency:
+// if the distance is at least 4 times the size of the element we unroll by 4,
+// else we do the one-element-per-iteration copy loop.
+// To do this, we compute the distance in bytes of the dependency,
+// then we divide it by the size of the element (using a shift right).
+//
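+// Example: with 4-byte elements and dst - src == 16 bytes, the distance is 4 elements,
+// so the 4-way unrolled loop can be used; a distance of 3 elements falls back to the serial loop.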
+
+.copy_loop_size_1:
+// These 2 bundles depend on the element size. r11 is the length in bytes shifted by 2 to get the number of
+// iterations for a 4 times unrolled loop with 1-byte elements. r14 is the number of iterations
+// in 1-byte elements which is going to be used by the epilog; a priori epilog size = loop size
+ { .mmi
+ cmp.gtu p13=4,r28 // check distance; if distance < 4 there is a dependency, branch to the serial loop
+ add r17=-1,r34 // number of iterations for the serial loop
+ shr.u r11=r34,2 // number of iterations + 1 for the unrolled loop
+ ;;
+ }
+ { .mib // Check if unrolled version is ok or not
+ add r10=-1,r11 // exact number of iteration for the unrolled loop
+ (p13) mov ar.lc=r17 // if serial, set loop counter to r17
+ (p13) br.cond.dptk .serial_loop_size_1 // if serial jump to serial part
+ ;;
+ }
+ { .mmi
+ nop.m 0
+ and r14=3,r34 // look at the epilog size for the unrolled loop
+ mov ar.lc=r10 // set the loop counter for unrolled loop
+ ;;
+ }
+.copy_loop_unrolled_size_1:
+ { .mmi
+ ld1 r24=[r33],2
+ ld1 r25=[r23],2
+ cmp.leu p6=1,r14 // since the loop is unrolled 4 times, the epilog length is at most 3
+ ;;
+ }
+ { .mmi
+ ld1 r26=[r33],2
+ ld1 r27=[r23],2
+ cmp.leu p7=2,r14
+ } { .mmi
+ st1 [r32]=r24,2
+ st1 [r22]=r25,2
+ cmp.leu p8=3,r14
+ ;;
+ } { .mmb
+ st1 [r32]=r26,2
+ st1 [r22]=r27,2
+ br.cloop.sptk .copy_loop_unrolled_size_1
+ ;;
+ } { .mmi
+ (p6) ld1 r24=[r33],2
+ (p7) ld1 r25=[r23]
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ (p8) ld1 r26=[r33]
+ (p6) st1 [r32]=r24,2
+ nop.i 0
+ }
+ { .mmi
+ (p7) st1 [r22]=r25
+ nop.m 0
+ nop.i 0
+ ;;
+ }
+ { .mib
+ (p8) st1 [r32]=r26
+ mov ar.lc=r31 // restoring loop counter
+ br.ret.dpnt.many b0 // end of program return
+ ;;
+ }
+.serial_loop_size_1:
+ { .mmi
+ ld1 r3=[r33],1
+ ;;
+ st1 [r32]=r3,1
+ nop.i 0
+ }
+ { .mib
+ nop.m 0
+ nop.i 0
+ br.cloop.sptk .serial_loop_size_1
+ ;;
+ }
+// end of copy loop
+ { .mib
+ nop.m 0
+ mov ar.lc=r31 // restoring loop counter
+ br.ret.dpnt.many b0 // end of program return
+ ;;
+ }
+//
+// End of copy for elements of size 1 Byte
+//
+
+.copy_loop_size_2:
+// These 2 bundles depend on the element size. r11 is the length in bytes shifted by 3 to get the number of
+// iterations for a 4 times unrolled loop with 2-byte elements. r14 is the number of iterations
+// in 2-byte elements which is going to be used by the epilog; a priori epilog size = loop size
+ { .mii
+ nop.m 0
+ shr.u r11=r34,3 // number of iteration for the unrolled loop
+ ;;
+ shr.u r17=r34,1 // number of iteration for the serial loop
+ ;;
+ }
+ { .mmi
+ and r14=3,r17 // epilog length
+ add r10=-1,r11 // unrolled loop exact counter
+ shr.u r28=r28,1 // distance in elements, where the element size is 2 bytes
+ ;;
+ }
+ { .mmi
+ cmp.gtu p13=4,r28 // check distance; if distance < 4 there is a dependency, branch to the serial loop
+ add r17=-1,r17 // serial loop exact counter
+ mov ar.lc=r10 // by default set the loop counter to the unrolled loop
+ ;;
+ }
+ { .mib
+ (p13) mov ar.lc=r17
+ (p13) br.cond.dptk .serial_loop_size_2
+ }
+.copy_loop_unrolled_size_2:
+ { .mmi
+ ld2 r24=[r33],4
+ ld2 r25=[r23],4
+ cmp.leu p6=1,r14 // since the loop is unrolled 4 times, the epilog length is at most 3
+ ;;
+ } { .mmi
+ ld2 r26=[r33],4
+ ld2 r27=[r23],4
+ cmp.leu p7=2,r14
+ } { .mmi
+ st2 [r32]=r24,4
+ st2 [r22]=r25,4
+ cmp.leu p8=3,r14
+ ;;
+ } { .mmb
+ st2 [r32]=r26,4
+ st2 [r22]=r27,4
+ br.cloop.sptk .copy_loop_unrolled_size_2
+ ;;
+ } { .mmi
+ (p6) ld2 r24=[r33],4
+ (p7) ld2 r25=[r23]
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ (p8) ld2 r26=[r33]
+ (p6) st2 [r32]=r24,4
+ nop.i 0
+ }
+ { .mmi
+ (p7) st2 [r22]=r25
+ nop.m 0
+ nop.i 0
+ ;;
+ }
+ { .mib
+ (p8) st2 [r32]=r26
+ mov ar.lc=r31 // restoring loop counter
+ br.ret.dpnt.many b0 // end of program return
+ ;;
+ }
+.serial_loop_size_2:
+ { .mmi
+ ld2 r3=[r33],2
+ ;;
+ st2 [r32]=r3,2
+ nop.i 0
+ }
+ { .mib
+ nop.m 0
+ nop.i 0
+ br.cloop.sptk .serial_loop_size_2
+ ;;
+ }
+// end of copy loop
+ { .mib
+ nop.m 0
+ mov ar.lc=r31 // restoring loop counter
+ br.ret.dpnt.many b0 // end of program return
+ ;;
+ }
+//
+// End of copy for 2 Byte elements
+//
+
+.copy_loop_size_4:
+// These 2 bundles depend on the element size. r11 is the length in bytes shifted by 4 to get the number of
+// iterations for a 4 times unrolled loop with 4-byte elements. r14 is the number of iterations
+// in 4-byte elements which is going to be used by the epilog; a priori epilog size = loop size
+ { .mii
+ nop.m 0
+ shr.u r11=r34,4
+ ;;
+ shr.u r17=r34,2
+ ;;
+ }
+ { .mmi
+ add r10=-1,r11 // unrolled loop counter
+ and r14=3,r17 // looks at the last 2 bits for epilog length
+ shr.u r28=r28,2 // distance in elements, where the element size is 4 bytes
+ ;;
+ }
+ { .mmi
+
+ cmp.gtu p13=4,r28 // check distance; if distance < 4 there is a dependency, branch to the serial loop
+ add r17=-1,r17 // serial loop counter
+ mov ar.lc=r10
+ ;;
+ }
+ { .mib // if we reach this bundle it means that we can unroll
+ (p13) mov ar.lc=r17
+ (p13) br.cond.dptk .serial_loop_size_4
+ ;;
+ }
+.copy_loop_unrolled_size_4:
+ { .mmi
+ ld4 r24=[r33],8
+ ld4 r25=[r23],8
+ cmp.leu p6=1,r14 // since the loop is unrolled 4 times, the epilog length is at most 3
+ ;;
+ } { .mmi
+ ld4 r26=[r33],8
+ ld4 r27=[r23],8
+ cmp.leu p7=2,r14
+ } { .mmi
+ st4 [r32]=r24,8
+ st4 [r22]=r25,8
+ cmp.leu p8=3,r14
+ ;;
+ } { .mmb
+ st4 [r32]=r26,8
+ st4 [r22]=r27,8
+ br.cloop.sptk .copy_loop_unrolled_size_4
+ ;;
+ } { .mmi
+ (p6) ld4 r24=[r33],8
+ (p7) ld4 r25=[r23]
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ (p8) ld4 r26=[r33]
+ (p6) st4 [r32]=r24,8
+ nop.i 0
+ }
+ { .mmi
+ (p7) st4 [r22]=r25
+ nop.m 0
+ nop.i 0
+ ;;
+ }
+ { .mib
+ (p8) st4 [r32]=r26
+ mov ar.lc=r31 // restoring loop counter
+ br.ret.dpnt.many b0 // end of program return
+ ;;
+ }
+.serial_loop_size_4:
+ { .mmi
+ ld4 r3=[r33],4
+ ;;
+ st4 [r32]=r3,4
+ nop.i 0
+ }
+ { .mib
+ nop.m 0
+ nop.i 0
+ br.cloop.sptk .serial_loop_size_4
+ ;;
+ }
+// end of copy loop
+{ .mib
+ nop.m 0
+ mov ar.lc=r31 // restoring loop counter
+ br.ret.dpnt.many b0 // end of program return
+ ;;
+}
+//
+// End of copy for elements of size 4 Byte.
+//
+
+.copy_loop_size_8:
+// These 2 bundles depend on the element size. r11 is the length in bytes shifted by 5 to get the number of
+// iterations for a 4 times unrolled loop with 8-byte elements. r14 is the number of iterations
+// in 8-byte elements which is going to be used by the epilog; a priori epilog size = loop size
+ { .mii
+ nop.m 0
+ shr.u r11=r34,5
+ ;;
+ shr.u r17=r34,3
+ ;;
+ }
+ { .mmi
+ add r10=-1,r11
+ and r14=3,r17 // looks at the last 2 bits
+ shr.u r28=r28,3 // distance in elements, where the element size is 8 bytes
+ ;;
+ }
+ { .mmi
+ cmp.gtu p13=4,r28 // check distance; if distance < 4 there is a dependency, branch to the serial loop
+ add r17=-1,r17
+ mov ar.lc=r10
+ ;;
+ }
+ { .mib // if we reach this bundle it means that we can unroll
+ (p13) mov ar.lc=r17
+ (p13) br.cond.dptk .serial_loop_size_8
+ ;;
+ }
+.copy_loop_unrolled_size_8:
+ { .mmi
+ ld8 r24=[r33],16
+ ld8 r25=[r23],16
+ cmp.leu p6=1,r14 // since the loop is unrolled 4 times, the epilog length is at most 3
+ ;;
+ } { .mmi
+ ld8 r26=[r33],16
+ ld8 r27=[r23],16
+ cmp.leu p7=2,r14
+ } { .mmi
+ st8 [r32]=r24,16
+ st8 [r22]=r25,16
+ cmp.leu p8=3,r14
+ ;;
+ } { .mmb
+ st8 [r32]=r26,16
+ st8 [r22]=r27,16
+ br.cloop.sptk .copy_loop_unrolled_size_8
+ ;;
+ } { .mmi
+ (p6) ld8 r24=[r33],16
+ (p7) ld8 r25=[r23]
+ nop.i 0
+ ;;
+ }
+ { .mmi
+ (p8) ld8 r26=[r33]
+ (p6) st8 [r32]=r24,16
+ nop.i 0
+ }
+ { .mmi
+ (p7) st8 [r22]=r25
+ nop.m 0
+ nop.i 0
+ ;;
+ }
+ { .mib
+ (p8) st8 [r32]=r26
+ mov ar.lc=r31 // restoring loop counter
+ br.ret.dpnt.many b0 // end of program return
+ ;;
+ }
+.serial_loop_size_8:
+ { .mmi
+ ld8 r3=[r33],8
+ ;;
+ st8 [r32]=r3,8
+ nop.i 0
+ }
+ { .mib
+ nop.m 0
+ nop.i 0
+ br.cloop.sptk .serial_loop_size_8
+ ;;
+ }
+// end of copy loop
+ { .mib
+ nop.m 0
+ mov ar.lc=r31 // restoring loop counter
+ br.ret.dpnt.many b0 // end of program return
+ ;;
+ }
+//
+// End of copy for element of 8 Bytes.
+//
+
+// -- End ?1__serial_memmove
+ .endp ?1__serial_memmove#
+ .type HIDDEN_JUMPTARGET (memcpy),@function
+ .global HIDDEN_JUMPTARGET (memcpy)
+ .type HIDDEN_JUMPTARGET (memmove),@function
+ .global HIDDEN_JUMPTARGET (memmove)
+// End