/* Copy SIZE bytes from SRC to DEST.
   For UltraSPARC-III.
   Copyright (C) 2001, 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by David S. Miller (davem@redhat.com)

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
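/* VISEntryHalf saves the caller's %fprs in %o5 and enables the FPU by
   writing FPRS_FEF; VISExitHalf masks the saved value down to the FEF
   bit and writes it back, restoring the previous FPU-enable state.
   %o5 must therefore stay live between the two (see the notes before
   memcpy below).  */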
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#define SMALL_COPY_USES_FPU

#ifndef XCC
#define USE_BPR
#define XCC xcc
#endif

	.text
	.align	32

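	/* bcopy(src, dst, n): swap the first two arguments into memcpy's
	 * (dst, src, n) order.  If (dst - src), taken as an unsigned
	 * value, is >= n, a forward copy cannot overwrite source bytes
	 * before they are read, so we fall into memcpy at 100; otherwise
	 * we take the backward-copy path at 220, shared with memmove.
	 */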
ENTRY(bcopy)
	sub	%o1, %o0, %o4		/* IEU0 Group */
	mov	%o0, %g3		/* IEU1 */
	cmp	%o4, %o2		/* IEU1 Group */
	mov	%o1, %o0		/* IEU0 */
	bgeu,pt	%XCC, 100f		/* CTI */
	mov	%g3, %o1		/* IEU0 Group */
#ifndef USE_BPR
	srl	%o2, 0, %o2		/* IEU1 */
#endif
	brnz,pn	%o2, 220f		/* CTI Group */
	add	%o0, %o2, %o0		/* IEU0 */
	retl
	nop
END(bcopy)

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 *
	 * The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */
	.align	32
ENTRY(memcpy)

100: /* %o0=dst, %o1=src, %o2=len */
#ifndef __KERNEL__
	/* Save away original 'dst' for memcpy return value. */
	mov	%o0, %g3		! A0 Group
#endif
	/* Anything to copy at all? */
	cmp	%o2, 0			! A1
	ble,pn	%XCC, 102f		! BR

	/* Extremely small copy? */
218:	cmp	%o2, 31			! A0 Group
	ble,pn	%XCC, 101f		! BR

	/* Large enough to use unrolled prefetch loops? */
	cmp	%o2, 0x100		! A1
	bge,a,pt %XCC, 103f		! BR Group
	andcc	%o0, 0x3f, %g2		! A0

	ba,pt	%XCC, 108f		! BR Group
	andcc	%o0, 0x7, %g2		! A0

	.align	32
101:
	/* Copy %o2 bytes from src to dst, one byte at a time. */
	ldub	[%o1 + 0x00], %o3	! MS Group
	add	%o1, 0x1, %o1		! A0
	add	%o0, 0x1, %o0		! A1
	subcc	%o2, 1, %o2		! A0 Group

	bg,pt	%XCC, 101b		! BR
	stb	%o3, [%o0 + -1]		! MS Group (1-cycle stall)

102:
#ifdef __KERNEL__
	retl				! BR Group (0-4 cycle stall)
	clr	%o0			! A0
#else
	retl				! BR Group (0-4 cycle stall)
	mov	%g3, %o0		! A0
#endif

	/* Here len >= 0x100 and condition codes reflect execution
	 * of "andcc %o0, 0x3f, %g2", done by the caller.
	 */
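	/* The main loops store with "stda ... ASI_BLK_P", which operates
	 * on naturally aligned 64-byte blocks, so 'dst' must first be
	 * brought to a 64-byte boundary.
	 */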
	.align	64
103:
	/* Is 'dst' already aligned on a 64-byte boundary? */
	be,pt	%XCC, 2f		! BR

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
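	/* For example: if dst & 0x3f == 0x28, then %g2 becomes
	 * 0x28 - 0x40 = -0x18, which negated gives 0x18, i.e. the 24
	 * bytes needed to reach the next 64-byte boundary.
	 */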
	sub	%g2, 0x40, %g2		! A0 Group
	sub	%g0, %g2, %g2		! A0 Group
	sub	%o2, %g2, %o2		! A0 Group

	/* Copy %g2 bytes from src to dst, one byte at a time. */
1:	ldub	[%o1 + 0x00], %o3	! MS (Group)
	add	%o1, 0x1, %o1		! A1
	add	%o0, 0x1, %o0		! A0 Group
	subcc	%g2, 0x1, %g2		! A1

	bg,pt	%XCC, 1b		! BR Group
	stb	%o3, [%o0 + -1]		! MS Group

2:	VISEntryHalf			! MS+MS
	and	%o1, 0x7, %g1		! A1
	ba,pt	%XCC, 104f		! BR
	alignaddr %o1, %g0, %o1		! MS (Break-after)

	.align	64
104:
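	/* The staggered cmp/bge,a pairs below issue each successive
	 * prefetch only when the rounded-down length in %o4 reaches far
	 * enough, honoring rule 3 above: never prefetch past the end of
	 * the source buffer.
	 */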
	prefetch [%o1 + 0x000], #one_read	! MS Group1
	prefetch [%o1 + 0x040], #one_read	! MS Group2
	andn	%o2, (0x40 - 1), %o4		! A0
	prefetch [%o1 + 0x080], #one_read	! MS Group3
	cmp	%o4, 0x140			! A0
	prefetch [%o1 + 0x0c0], #one_read	! MS Group4
	ldd	[%o1 + 0x000], %f0		! MS Group5 (%f0 results at G8)
	bge,a,pt %XCC, 1f			! BR

	prefetch [%o1 + 0x100], #one_read	! MS Group6
1:	ldd	[%o1 + 0x008], %f2		! AX (%f2 results at G9)
	cmp	%o4, 0x180			! A1
	bge,a,pt %XCC, 1f			! BR
	prefetch [%o1 + 0x140], #one_read	! MS Group7
1:	ldd	[%o1 + 0x010], %f4		! AX (%f4 results at G10)
	cmp	%o4, 0x1c0			! A1
	bge,a,pt %XCC, 1f			! BR

	prefetch [%o1 + 0x180], #one_read	! MS Group8
1:	faligndata %f0, %f2, %f16		! FGA Group9 (%f16 at G12)
	ldd	[%o1 + 0x018], %f6		! AX (%f6 results at G12)
	faligndata %f2, %f4, %f18		! FGA Group10 (%f18 results at G13)
	ldd	[%o1 + 0x020], %f8		! MS (%f8 results at G13)
	faligndata %f4, %f6, %f20		! FGA Group12 (1-cycle stall, %f20 at G15)
	ldd	[%o1 + 0x028], %f10		! MS (%f10 results at G15)
	faligndata %f6, %f8, %f22		! FGA Group13 (%f22 results at G16)

	ldd	[%o1 + 0x030], %f12		! MS (%f12 results at G16)
	faligndata %f8, %f10, %f24		! FGA Group15 (1-cycle stall, %f24 at G18)
	ldd	[%o1 + 0x038], %f14		! MS (%f14 results at G18)
	faligndata %f10, %f12, %f26		! FGA Group16 (%f26 results at G19)
	ldd	[%o1 + 0x040], %f0		! MS (%f0 results at G19)

	/* We only use the first loop if len > (7 * 64). */
	subcc	%o4, 0x1c0, %o4		! A0 Group17
	bg,pt	%XCC, 105f		! BR
	add	%o1, 0x40, %o1		! A1

	add	%o4, 0x140, %o4		! A0 Group18
	ba,pt	%XCC, 106f		! BR
	srl	%o4, 6, %o3		! A0 Group19
	nop
	nop
	nop
	nop
	nop

	nop
	nop

	/* This loop performs the copy and queues new prefetches.
	 * We drop into the second loop when len <= (5 * 64).  Note
	 * that this (5 * 64) factor has been subtracted from len
	 * already.
	 */
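	/* Each iteration below stores one 64-byte block with
	 * "stda ... ASI_BLK_P" while the next eight doublewords are
	 * loaded and re-aligned with faligndata, and issues a new
	 * prefetch 0x180 bytes ahead of the current src pointer.
	 */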
105:
	ldd	[%o1 + 0x008], %f2	! MS Group2 (%f2 results at G5)
	faligndata %f12, %f14, %f28	! FGA (%f28 results at G5)
	ldd	[%o1 + 0x010], %f4	! MS Group3 (%f4 results at G6)
	faligndata %f14, %f0, %f30	! FGA Group4 (1-cycle stall, %f30 at G7)
	stda	%f16, [%o0] ASI_BLK_P	! MS
	ldd	[%o1 + 0x018], %f6	! AX (%f6 results at G7)

	faligndata %f0, %f2, %f16	! FGA Group12 (7-cycle stall)
	ldd	[%o1 + 0x020], %f8	! MS (%f8 results at G15)
	faligndata %f2, %f4, %f18	! FGA Group13 (%f18 results at G16)
	ldd	[%o1 + 0x028], %f10	! MS (%f10 results at G16)
	faligndata %f4, %f6, %f20	! FGA Group14 (%f20 results at G17)
	ldd	[%o1 + 0x030], %f12	! MS (%f12 results at G17)
	faligndata %f6, %f8, %f22	! FGA Group15 (%f22 results at G18)
	ldd	[%o1 + 0x038], %f14	! MS (%f14 results at G18)

	faligndata %f8, %f10, %f24	! FGA Group16 (%f24 results at G19)
	ldd	[%o1 + 0x040], %f0	! AX (%f0 results at G19)
	prefetch [%o1 + 0x180], #one_read	! MS
	faligndata %f10, %f12, %f26	! FGA Group17 (%f26 results at G20)
	subcc	%o4, 0x40, %o4		! A0
	add	%o1, 0x40, %o1		! A1
	bg,pt	%XCC, 105b		! BR
	add	%o0, 0x40, %o0		! A0 Group18

	mov	5, %o3			! A1

	/* This loop performs the copy; no new prefetches are
	 * queued.  We do things this way so that we do not perform
	 * any spurious prefetches past the end of the src buffer.
	 */
106:
	ldd	[%o1 + 0x008], %f2	! MS
	faligndata %f12, %f14, %f28	! FGA Group2
	ldd	[%o1 + 0x010], %f4	! MS
	faligndata %f14, %f0, %f30	! FGA Group4 (1-cycle stall)
	stda	%f16, [%o0] ASI_BLK_P	! MS
	ldd	[%o1 + 0x018], %f6	! AX
	faligndata %f0, %f2, %f16	! FGA Group12 (7-cycle stall)

	ldd	[%o1 + 0x020], %f8	! MS
	faligndata %f2, %f4, %f18	! FGA Group13
	ldd	[%o1 + 0x028], %f10	! MS
	faligndata %f4, %f6, %f20	! FGA Group14
	ldd	[%o1 + 0x030], %f12	! MS
	faligndata %f6, %f8, %f22	! FGA Group15
	ldd	[%o1 + 0x038], %f14	! MS
	faligndata %f8, %f10, %f24	! FGA Group16

	ldd	[%o1 + 0x040], %f0	! AX
	faligndata %f10, %f12, %f26	! FGA Group17
	subcc	%o3, 0x01, %o3		! A0
	add	%o1, 0x40, %o1		! A1
	bg,pt	%XCC, 106b		! BR
	add	%o0, 0x40, %o0		! A0 Group18

	/* Finally we copy the last full 64-byte block. */
	ldd	[%o1 + 0x008], %f2	! MS
	faligndata %f12, %f14, %f28	! FGA
	ldd	[%o1 + 0x010], %f4	! MS Group19
	faligndata %f14, %f0, %f30	! FGA
	stda	%f16, [%o0] ASI_BLK_P	! MS Group20
	ldd	[%o1 + 0x018], %f6	! AX
	faligndata %f0, %f2, %f16	! FGA Group11 (7-cycle stall)
	ldd	[%o1 + 0x020], %f8	! MS
	faligndata %f2, %f4, %f18	! FGA Group12
	ldd	[%o1 + 0x028], %f10	! MS
	faligndata %f4, %f6, %f20	! FGA Group13
	ldd	[%o1 + 0x030], %f12	! MS
	faligndata %f6, %f8, %f22	! FGA Group14
	ldd	[%o1 + 0x038], %f14	! MS
	faligndata %f8, %f10, %f24	! FGA Group15
	cmp	%g1, 0			! A0
	be,pt	%XCC, 1f		! BR
	add	%o0, 0x40, %o0		! A1
	ldd	[%o1 + 0x040], %f0	! MS
1:	faligndata %f10, %f12, %f26	! FGA Group16
	faligndata %f12, %f14, %f28	! FGA Group17
	faligndata %f14, %f0, %f30	! FGA Group18
	stda	%f16, [%o0] ASI_BLK_P	! MS
	add	%o0, 0x40, %o0		! A0
	add	%o1, 0x40, %o1		! A1
	membar	#Sync			! MS Group26 (7-cycle stall)

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer just like similar
	 * code found in 'toosmall' processing.
	 */
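	/* For example: if len % 64 == 0x2c, then %g2 = 0x2c & 0x38 = 0x28;
	 * the loop below moves 0x20 of those bytes as 8-byte words (it
	 * always stops one word short, so the trailing ldd cannot read
	 * past the end of src) and the remaining 0xc bytes are finished
	 * by the byte loop at 101.
	 */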
	and	%o2, 0x3f, %o2		! A0 Group
	andcc	%o2, 0x38, %g2		! A0 Group
	be,pn	%XCC, 107f		! BR
	subcc	%g2, 0x8, %g2		! A1
	be,pn	%XCC, 107f		! BR Group
	cmp	%g1, 0			! A0

	be,a,pt	%XCC, 1f		! BR Group
	ldd	[%o1 + 0x00], %f0	! MS

1:	ldd	[%o1 + 0x08], %f2	! MS Group
	add	%o1, 0x8, %o1		! A0
	sub	%o2, 0x8, %o2		! A1
	subcc	%g2, 0x8, %g2		! A0 Group
	faligndata %f0, %f2, %f8	! FGA Group
	std	%f8, [%o0 + 0x00]	! MS (XXX does it stall here? XXX)
	be,pn	%XCC, 107f		! BR
	add	%o0, 0x8, %o0		! A0
	ldd	[%o1 + 0x08], %f0	! MS Group
	add	%o1, 0x8, %o1		! A0
	sub	%o2, 0x8, %o2		! A1
	subcc	%g2, 0x8, %g2		! A0 Group
	faligndata %f2, %f0, %f8	! FGA
	std	%f8, [%o0 + 0x00]	! MS (XXX does it stall here? XXX)
	bne,pn	%XCC, 1b		! BR
	add	%o0, 0x8, %o0		! A0 Group

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
107:
	cmp	%o2, 0
	add	%o1, %g1, %o1
	VISExitHalf
	be,pn	%XCC, 102b
	nop
	ba,a,pt	%XCC, 101b

	/* If we get here, then 32 <= len < 0x100 */
108:

#ifdef SMALL_COPY_USES_FPU

	/* Is 'dst' already aligned on an 8-byte boundary? */
	be,pt	%XCC, 2f		! BR Group

	/* Compute abs((dst & 7) - 8) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 8-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
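	/* For example: if dst & 0x7 == 0x3, then %g2 becomes
	 * 0x3 - 0x8 = -0x5, negated to 0x5, the number of bytes to copy
	 * before dst is 8-byte aligned.
	 */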
	sub	%g2, 0x8, %g2		! A0
	sub	%g0, %g2, %g2		! A0 Group (reg-dep)
	sub	%o2, %g2, %o2		! A0 Group (reg-dep)

	/* Copy %g2 bytes from src to dst, one byte at a time. */
1:	ldub	[%o1 + 0x00], %o3	! MS (Group) (%o3 in 3 cycles)
	add	%o1, 0x1, %o1		! A1
	add	%o0, 0x1, %o0		! A0 Group
	subcc	%g2, 0x1, %g2		! A1

	bg,pt	%XCC, 1b		! BR Group
	stb	%o3, [%o0 + -1]		! MS Group

2:	VISEntryHalf			! MS+MS

	/* Compute (len - (len % 8)) into %g2.  This is guaranteed
	 * to be nonzero.
	 */
	andn	%o2, 0x7, %g2		! A0 Group

	/* You may read this and believe that it allows reading
	 * one 8-byte longword past the end of src.  It actually
	 * does not, as %g2 is subtracted as loads are done from
	 * src, so we always stop before running off the end.
	 * Also, we are guaranteed to have at least 0x10 bytes
	 * to move here.
	 */
	sub	%g2, 0x8, %g2		! A0 Group (reg-dep)
	alignaddr %o1, %g0, %g1		! MS (Break-after)
	ldd	[%g1 + 0x00], %f0	! MS Group (1-cycle stall)
	add	%g1, 0x8, %g1		! A0

1:	ldd	[%g1 + 0x00], %f2	! MS Group
	add	%g1, 0x8, %g1		! A0
	sub	%o2, 0x8, %o2		! A1
	subcc	%g2, 0x8, %g2		! A0 Group

	faligndata %f0, %f2, %f8	! FGA Group (1-cycle stall)
	std	%f8, [%o0 + 0x00]	! MS Group (2-cycle stall)
	add	%o1, 0x8, %o1		! A0
	be,pn	%XCC, 2f		! BR

	add	%o0, 0x8, %o0		! A1
	ldd	[%g1 + 0x00], %f0	! MS Group
	add	%g1, 0x8, %g1		! A0
	sub	%o2, 0x8, %o2		! A1

	subcc	%g2, 0x8, %g2		! A0 Group
	faligndata %f2, %f0, %f8	! FGA Group (1-cycle stall)
	std	%f8, [%o0 + 0x00]	! MS Group (2-cycle stall)
	add	%o1, 0x8, %o1		! A0

	bne,pn	%XCC, 1b		! BR
	add	%o0, 0x8, %o0		! A1

	/* Nothing left to copy? */
2:	cmp	%o2, 0			! A0 Group
	VISExitHalf			! A0+MS
	be,pn	%XCC, 102b		! BR Group
	nop				! A0
	ba,a,pt	%XCC, 101b		! BR Group

#else /* !(SMALL_COPY_USES_FPU) */

	xor	%o1, %o0, %g2
	andcc	%g2, 0x7, %g0
	bne,pn	%XCC, 101b
	andcc	%o1, 0x7, %g2

	be,pt	%XCC, 2f
	sub	%g2, 0x8, %g2
	sub	%g0, %g2, %g2
	sub	%o2, %g2, %o2

1:	ldub	[%o1 + 0x00], %o3
	add	%o1, 0x1, %o1
	add	%o0, 0x1, %o0
	subcc	%g2, 0x1, %g2
	bg,pt	%XCC, 1b
	stb	%o3, [%o0 + -1]

2:	andn	%o2, 0x7, %g2
	sub	%o2, %g2, %o2

3:	ldx	[%o1 + 0x00], %o3
	add	%o1, 0x8, %o1
	add	%o0, 0x8, %o0
	subcc	%g2, 0x8, %g2
	bg,pt	%XCC, 3b
	stx	%o3, [%o0 + -8]

	cmp	%o2, 0
	bne,pn	%XCC, 101b
	nop
	ba,a,pt	%XCC, 102b

#endif /* !(SMALL_COPY_USES_FPU) */
END(memcpy)

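/* The helpers below implement the backward (descending address) copy
   used by memmove.  RMOVE_BIGCHUNK moves 32 bytes per expansion with
   8-byte loads and 4-byte stores, for destinations that are only
   word aligned.  */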
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldx	[%src - offset - 0x20], %t0; \
	ldx	[%src - offset - 0x18], %t1; \
	ldx	[%src - offset - 0x10], %t2; \
	ldx	[%src - offset - 0x08], %t3; \
	stw	%t0, [%dst - offset - 0x1c]; \
	srlx	%t0, 32, %t0; \
	stw	%t0, [%dst - offset - 0x20]; \
	stw	%t1, [%dst - offset - 0x14]; \
	srlx	%t1, 32, %t1; \
	stw	%t1, [%dst - offset - 0x18]; \
	stw	%t2, [%dst - offset - 0x0c]; \
	srlx	%t2, 32, %t2; \
	stw	%t2, [%dst - offset - 0x10]; \
	stw	%t3, [%dst - offset - 0x04]; \
	srlx	%t3, 32, %t3; \
	stw	%t3, [%dst - offset - 0x08];

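/* RMOVE_BIGALIGNCHUNK moves 64 bytes per expansion with 8-byte loads
   and stores; it is used when src and dst are both 8-byte aligned.  */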
#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldx	[%src - offset - 0x20], %t0; \
	ldx	[%src - offset - 0x18], %t1; \
	ldx	[%src - offset - 0x10], %t2; \
	ldx	[%src - offset - 0x08], %t3; \
	stx	%t0, [%dst - offset - 0x20]; \
	stx	%t1, [%dst - offset - 0x18]; \
	stx	%t2, [%dst - offset - 0x10]; \
	stx	%t3, [%dst - offset - 0x08]; \
	ldx	[%src - offset - 0x40], %t0; \
	ldx	[%src - offset - 0x38], %t1; \
	ldx	[%src - offset - 0x30], %t2; \
	ldx	[%src - offset - 0x28], %t3; \
	stx	%t0, [%dst - offset - 0x40]; \
	stx	%t1, [%dst - offset - 0x38]; \
	stx	%t2, [%dst - offset - 0x30]; \
	stx	%t3, [%dst - offset - 0x28];

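/* RMOVE_LASTCHUNK moves 16 bytes with two 8-byte loads and four
   4-byte stores.  Seven consecutive expansions form the computed
   branch table entered at 279 below.  */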
#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldx	[%src + offset + 0x00], %t0; \
	ldx	[%src + offset + 0x08], %t1; \
	stw	%t0, [%dst + offset + 0x04]; \
	srlx	%t0, 32, %t2; \
	stw	%t2, [%dst + offset + 0x00]; \
	stw	%t1, [%dst + offset + 0x0c]; \
	srlx	%t1, 32, %t3; \
	stw	%t3, [%dst + offset + 0x08];

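/* RMOVE_LASTALIGNCHUNK moves 16 bytes with two 8-byte loads and two
   8-byte stores; its seven expansions form the table entered at
   283 below.  */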
#define RMOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1) \
	ldx	[%src + offset + 0x00], %t0; \
	ldx	[%src + offset + 0x08], %t1; \
	stx	%t0, [%dst + offset + 0x00]; \
	stx	%t1, [%dst + offset + 0x08];

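/* Backward byte copier for overlapping regions of at most 15 bytes
   (memmove branches here via "bleu ... 228b"): an odd count copies a
   single byte first, then the rest moves two bytes per iteration.  */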
	.align	32
228:	andcc	%o2, 1, %g0		/* IEU1 Group */
	be,pt	%icc, 2f+4		/* CTI */
1:	ldub	[%o1 - 1], %o5		/* LOAD Group */
	sub	%o1, 1, %o1		/* IEU0 */
	sub	%o0, 1, %o0		/* IEU1 */
	subcc	%o2, 1, %o2		/* IEU1 Group */
	be,pn	%xcc, 229f		/* CTI */
	stb	%o5, [%o0]		/* Store */
2:	ldub	[%o1 - 1], %o5		/* LOAD Group */
	sub	%o0, 2, %o0		/* IEU0 */
	ldub	[%o1 - 2], %g5		/* LOAD Group */
	sub	%o1, 2, %o1		/* IEU0 */
	subcc	%o2, 2, %o2		/* IEU1 Group */
	stb	%o5, [%o0 + 1]		/* Store */
	bne,pt	%xcc, 2b		/* CTI */
	stb	%g5, [%o0]		/* Store */
229:	retl
	mov	%g4, %o0

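	/* memmove: if (dst - src), taken as an unsigned value, is >= len,
	 * a forward copy is safe and we reuse the memcpy path at 218.
	 * Otherwise both pointers are advanced past the end of their
	 * buffers and the copy proceeds backwards from 220.
	 */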
	.align	32
ENTRY(memmove)
	mov	%o0, %g3
#ifndef USE_BPR
	srl	%o2, 0, %o2		/* IEU1 Group */
#endif
	brz,pn	%o2, 102b		/* CTI Group */
	sub	%o0, %o1, %o4		/* IEU0 */
	cmp	%o4, %o2		/* IEU1 Group */
	bgeu,pt	%XCC, 218b		/* CTI */
	mov	%o0, %g4		/* IEU0 */
	add	%o0, %o2, %o0		/* IEU0 Group */
220:	add	%o1, %o2, %o1		/* IEU1 */
	cmp	%o2, 15			/* IEU1 Group */
	bleu,pn	%xcc, 228b		/* CTI */
	andcc	%o0, 7, %g2		/* IEU1 Group */
	sub	%o0, %o1, %g5		/* IEU0 */
	andcc	%g5, 3, %o5		/* IEU1 Group */
	bne,pn	%xcc, 232f		/* CTI */
	andcc	%o1, 3, %g0		/* IEU1 Group */
	be,a,pt	%xcc, 236f		/* CTI */
	andcc	%o1, 4, %g0		/* IEU1 Group */
	andcc	%o1, 1, %g0		/* IEU1 Group */
	be,pn	%xcc, 4f		/* CTI */
	andcc	%o1, 2, %g0		/* IEU1 Group */
	ldub	[%o1 - 1], %g2		/* Load Group */
	sub	%o1, 1, %o1		/* IEU0 */
	sub	%o0, 1, %o0		/* IEU1 */
	sub	%o2, 1, %o2		/* IEU0 Group */
	be,pn	%xcc, 5f		/* CTI Group */
	stb	%g2, [%o0]		/* Store */
4:	lduh	[%o1 - 2], %g2		/* Load Group */
	sub	%o1, 2, %o1		/* IEU0 */
	sub	%o0, 2, %o0		/* IEU1 */
	sub	%o2, 2, %o2		/* IEU0 */
	sth	%g2, [%o0]		/* Store Group + bubble */
5:	andcc	%o1, 4, %g0		/* IEU1 */
236:	be,a,pn	%xcc, 2f		/* CTI */
	andcc	%o2, -128, %g6		/* IEU1 Group */
	lduw	[%o1 - 4], %g5		/* Load Group */
	sub	%o1, 4, %o1		/* IEU0 */
	sub	%o0, 4, %o0		/* IEU1 */
	sub	%o2, 4, %o2		/* IEU0 Group */
	stw	%g5, [%o0]		/* Store */
	andcc	%o2, -128, %g6		/* IEU1 Group */
2:	be,pn	%xcc, 235f		/* CTI */
	andcc	%o0, 4, %g0		/* IEU1 Group */
	be,pn	%xcc, 282f + 4		/* CTI Group */
5:	RMOVE_BIGCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
	RMOVE_BIGCHUNK(o1, o0, 0x20, g1, g3, g5, o5)
	RMOVE_BIGCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
	RMOVE_BIGCHUNK(o1, o0, 0x60, g1, g3, g5, o5)
	subcc	%g6, 128, %g6		/* IEU1 Group */
	sub	%o1, 128, %o1		/* IEU0 */
	bne,pt	%xcc, 5b		/* CTI */
	sub	%o0, 128, %o0		/* IEU0 Group */
235:	andcc	%o2, 0x70, %g6		/* IEU1 Group */
41:	be,pn	%xcc, 280f		/* CTI */
	andcc	%o2, 8, %g0		/* IEU1 Group */
	/* Clk1 8-( */
	/* Clk2 8-( */
	/* Clk3 8-( */
	/* Clk4 8-( */
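	/* %g6 = (len & 0x70) is the number of remaining 16-byte chunks
	 * times 16.  Each RMOVE_LASTCHUNK expansion below is eight
	 * instructions, i.e. 32 bytes of code per 16 bytes of data, so
	 * the sll computes the code offset %g5 = %g6 * 2 and the jmpl
	 * lands exactly %g6/16 expansions before 280.
	 */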
279:	rd	%pc, %o5		/* PDU Group */
	sll	%g6, 1, %g5		/* IEU0 Group */
	sub	%o1, %g6, %o1		/* IEU1 */
	sub	%o5, %g5, %o5		/* IEU0 Group */
	jmpl	%o5 + %lo(280f - 279b), %g0	/* CTI Group brk forced */
	sub	%o0, %g6, %o0		/* IEU0 Group */
	RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g5, o5)
280:	be,pt	%xcc, 281f		/* CTI */
	andcc	%o2, 4, %g0		/* IEU1 */
	ldx	[%o1 - 8], %g2		/* Load Group */
	sub	%o0, 8, %o0		/* IEU0 */
	stw	%g2, [%o0 + 4]		/* Store Group */
	sub	%o1, 8, %o1		/* IEU1 */
	srlx	%g2, 32, %g2		/* IEU0 Group */
	stw	%g2, [%o0]		/* Store */
281:	be,pt	%xcc, 1f		/* CTI */
	andcc	%o2, 2, %g0		/* IEU1 Group */
	lduw	[%o1 - 4], %g2		/* Load Group */
	sub	%o1, 4, %o1		/* IEU0 */
	stw	%g2, [%o0 - 4]		/* Store Group */
	sub	%o0, 4, %o0		/* IEU0 */
1:	be,pt	%xcc, 1f		/* CTI */
	andcc	%o2, 1, %g0		/* IEU1 Group */
	lduh	[%o1 - 2], %g2		/* Load Group */
	sub	%o1, 2, %o1		/* IEU0 */
	sth	%g2, [%o0 - 2]		/* Store Group */
	sub	%o0, 2, %o0		/* IEU0 */
1:	be,pt	%xcc, 211f		/* CTI */
	nop				/* IEU1 */
	ldub	[%o1 - 1], %g2		/* Load Group */
	stb	%g2, [%o0 - 1]		/* Store Group + bubble */
211:	retl
	mov	%g4, %o0

282:	RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
	subcc	%g6, 128, %g6		/* IEU1 Group */
	sub	%o1, 128, %o1		/* IEU0 */
	bne,pt	%xcc, 282b		/* CTI */
	sub	%o0, 128, %o0		/* IEU0 Group */
	andcc	%o2, 0x70, %g6		/* IEU1 */
	be,pn	%xcc, 284f		/* CTI */
	andcc	%o2, 8, %g0		/* IEU1 Group */
	/* Clk1 8-( */
	/* Clk2 8-( */
	/* Clk3 8-( */
	/* Clk4 8-( */
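	/* Same computed-branch trick as at 279, but an
	 * RMOVE_LASTALIGNCHUNK expansion is four instructions (16 bytes
	 * of code for 16 bytes of data), so %g6 itself is the code
	 * offset to back up from 284.
	 */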
283:	rd	%pc, %o5		/* PDU Group */
	sub	%o1, %g6, %o1		/* IEU0 Group */
	sub	%o5, %g6, %o5		/* IEU1 */
	jmpl	%o5 + %lo(284f - 283b), %g0	/* CTI Group brk forced */
	sub	%o0, %g6, %o0		/* IEU0 Group */
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3)
284:	be,pt	%xcc, 285f		/* CTI Group */
	andcc	%o2, 4, %g0		/* IEU1 */
	ldx	[%o1 - 8], %g2		/* Load Group */
	sub	%o0, 8, %o0		/* IEU0 */
	sub	%o1, 8, %o1		/* IEU0 Group */
	stx	%g2, [%o0]		/* Store */
285:	be,pt	%xcc, 1f		/* CTI */
	andcc	%o2, 2, %g0		/* IEU1 Group */
	lduw	[%o1 - 4], %g2		/* Load Group */
	sub	%o0, 4, %o0		/* IEU0 */
	sub	%o1, 4, %o1		/* IEU0 Group */
	stw	%g2, [%o0]		/* Store */
1:	be,pt	%xcc, 1f		/* CTI */
	andcc	%o2, 1, %g0		/* IEU1 Group */
	lduh	[%o1 - 2], %g2		/* Load Group */
	sub	%o0, 2, %o0		/* IEU0 */
	sub	%o1, 2, %o1		/* IEU0 Group */
	sth	%g2, [%o0]		/* Store */
1:	be,pt	%xcc, 1f		/* CTI */
	nop				/* IEU0 Group */
	ldub	[%o1 - 1], %g2		/* Load Group */
	stb	%g2, [%o0 - 1]		/* Store Group + bubble */
1:	retl
	mov	%g4, %o0

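	/* 232: backward copy for src and dst that differ in their low
	 * two address bits.  First copy %g2 = (dst & 7) single bytes so
	 * that dst becomes 8-byte aligned, then re-align whole 8-byte
	 * words backwards with faligndata, and finish any remainder a
	 * byte at a time at 237.
	 */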
232:	brz,pt	%g2, 2f			/* CTI Group */
	sub	%o2, %g2, %o2		/* IEU0 Group */
1:	ldub	[%o1 - 1], %g5		/* Load Group */
	sub	%o1, 1, %o1		/* IEU0 */
	sub	%o0, 1, %o0		/* IEU1 */
	subcc	%g2, 1, %g2		/* IEU1 Group */
	bne,pt	%xcc, 1b		/* CTI */
	stb	%g5, [%o0]		/* Store */
2:	andn	%o2, 7, %g5		/* IEU0 Group */
	and	%o2, 7, %o2		/* IEU1 */
	fmovd	%f0, %f2		/* FPU */
	alignaddr %o1, %g0, %g1		/* GRU Group */
	ldd	[%g1], %f4		/* Load Group */
1:	ldd	[%g1 - 8], %f6		/* Load Group */
	sub	%g1, 8, %g1		/* IEU0 Group */
	subcc	%g5, 8, %g5		/* IEU1 */
	faligndata %f6, %f4, %f0	/* GRU Group */
	std	%f0, [%o0 - 8]		/* Store */
	sub	%o1, 8, %o1		/* IEU0 Group */
	be,pn	%xcc, 233f		/* CTI */
	sub	%o0, 8, %o0		/* IEU1 */
	ldd	[%g1 - 8], %f4		/* Load Group */
	sub	%g1, 8, %g1		/* IEU0 */
	subcc	%g5, 8, %g5		/* IEU1 */
	faligndata %f4, %f6, %f0	/* GRU Group */
	std	%f0, [%o0 - 8]		/* Store */
	sub	%o1, 8, %o1		/* IEU0 */
	bne,pn	%xcc, 1b		/* CTI Group */
	sub	%o0, 8, %o0		/* IEU0 */
233:	brz,pn	%o2, 234f		/* CTI Group */
	nop				/* IEU0 */
237:	ldub	[%o1 - 1], %g5		/* LOAD */
	sub	%o1, 1, %o1		/* IEU0 */
	sub	%o0, 1, %o0		/* IEU1 */
	subcc	%o2, 1, %o2		/* IEU1 */
	bne,pt	%xcc, 237b		/* CTI */
	stb	%g5, [%o0]		/* Store Group */
234:	wr	%g0, FPRS_FEF, %fprs
	retl
	mov	%g4, %o0
END(memmove)

#ifdef USE_BPR
weak_alias(memcpy, __align_cpy_1)
weak_alias(memcpy, __align_cpy_2)
weak_alias(memcpy, __align_cpy_4)
weak_alias(memcpy, __align_cpy_8)
weak_alias(memcpy, __align_cpy_16)
#endif