]>
Commit | Line | Data |
---|---|---|
bb769ab6 UD |
1 | /* Copy SIZE bytes from SRC to DEST. |
2 | For UltraSPARC-III. | |
62f29da7 | 3 | Copyright (C) 2001, 2003 Free Software Foundation, Inc. |
bb769ab6 UD |
4 | This file is part of the GNU C Library. |
5 | Contributed by David S. Miller (davem@redhat.com) | |
6 | ||
7 | The GNU C Library is free software; you can redistribute it and/or | |
41bdb6e2 AJ |
8 | modify it under the terms of the GNU Lesser General Public |
9 | License as published by the Free Software Foundation; either | |
10 | version 2.1 of the License, or (at your option) any later version. | |
bb769ab6 UD |
11 | |
12 | The GNU C Library is distributed in the hope that it will be useful, | |
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
41bdb6e2 | 15 | Lesser General Public License for more details. |
bb769ab6 | 16 | |
41bdb6e2 AJ |
17 | You should have received a copy of the GNU Lesser General Public |
18 | License along with the GNU C Library; if not, write to the Free | |
19 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
20 | 02111-1307 USA. */ | |
bb769ab6 UD |
21 | |
#include <sysdep.h>

/* ASI used by "stda %fN, [addr] ASI" below: the UltraSPARC
   block-store ASI to the primary address space (64-byte block
   transfers from the FP register file).  */
#define ASI_BLK_P 0xf0

/* fprs.fef — the "enable FPU" bit of the %fprs register.  */
#define FPRS_FEF  0x04

/* Enter/leave "half" VIS mode.  Only the lower FPU registers are
   dirtied by this code, so it is enough to enable the FPU on entry
   and put back the original fef state on exit.  The original %fprs
   is carried in %o5 from VISEntryHalf to VISExitHalf (see invariant
   list in the big comment above memcpy).  */
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs

/* Select the FPU-based path in memcpy for mid-sized copies too
   (the "108:" code below).  */
#define SMALL_COPY_USES_FPU

#ifndef XCC
/* Unless overridden by the including configuration, assume the full
   64-bit environment: branch-on-register (brz/brnz) is available and
   comparisons use the 64-bit condition codes.  */
#define USE_BPR
#define XCC xcc
#endif

	.text
	.align	32
36 | ||
ENTRY(bcopy)
	/* bcopy(src=%o0, dst=%o1, len=%o2): swap src/dst into memcpy's
	   argument order, then dispatch:
	     - (dst - src) unsigned >= len: no harmful overlap, fall
	       into memcpy's forward path at 100f;
	     - otherwise, overlapping with dst above src: jump to
	       memmove's backward-copy path at 220f.  */
	sub	%o1, %o0, %o4		/* IEU0 Group */	/* %o4 = dst - src */
	mov	%o0, %g3		/* IEU1 */		/* save src */
	cmp	%o4, %o2		/* IEU1 Group */
	mov	%o1, %o0		/* IEU0 */		/* %o0 = dst */
	bgeu,pt	%XCC, 100f		/* CTI */
	mov	%g3, %o1		/* IEU0 Group */	/* delay slot: %o1 = src */
#ifndef USE_BPR
	srl	%o2, 0, %o2		/* IEU1 */		/* 32-bit mode: zero-extend len */
#endif
	brnz,pn	%o2, 220f		/* CTI Group */
	add	%o0, %o2, %o0		/* IEU0 */		/* delay slot: 220 expects dst end */
	retl
	nop
END(bcopy)
52 | ||
53 | /* Special/non-trivial issues of this code: | |
54 | * | |
55 | * 1) %o5 is preserved from VISEntryHalf to VISExitHalf | |
56 | * 2) Only low 32 FPU registers are used so that only the | |
57 | * lower half of the FPU register set is dirtied by this | |
58 | * code. This is especially important in the kernel. | |
59 | * 3) This code never prefetches cachelines past the end | |
60 | * of the source buffer. | |
61 | * | |
62 | * The cheetah's flexible spine, oversized liver, enlarged heart, | |
63 | * slender muscular body, and claws make it the swiftest hunter | |
64 | * in Africa and the fastest animal on land. Can reach speeds | |
65 | * of up to 2.4GB per second. | |
66 | */ | |
	.align	32
ENTRY(memcpy)

100: /* %o0=dst, %o1=src, %o2=len */
#ifndef __KERNEL__
	/* Save away original 'dst' for memcpy return value. */
	mov	%o0, %g3		! A0 Group
#endif
	/* Anything to copy at all? */
	cmp	%o2, 0			! A1
	ble,pn	%XCC, 102f		! BR

	/* Extremely small copy? */
218:	cmp	%o2, 31			! A0 Group
	ble,pn	%XCC, 101f		! BR

	/* Large enough to use unrolled prefetch loops? */
	cmp	%o2, 0x100		! A1
	bge,a,pt %XCC, 103f		! BR Group
	andcc	%o0, 0x3f, %g2		! A0

	ba,pt	%XCC, 108f		! BR Group
	andcc	%o0, 0x7, %g2		! A0

	.align	32
101:
	/* Copy %o2 bytes from src to dst, one byte at a time. */
	ldub	[%o1 + 0x00], %o3	! MS Group
	add	%o1, 0x1, %o1		! A0
	add	%o0, 0x1, %o0		! A1
	subcc	%o2, 1, %o2		! A0 Group

	bg,pt	%XCC, 101b		! BR
	stb	%o3, [%o0 + -1]		! MS Group (1-cycle stall)

102:
	/* Common return path; shared by memmove via "102b" branches,
	   which set %g3 to their own original dst first.  */
#ifdef __KERNEL__
	retl				! BR Group (0-4 cycle stall)
	clr	%o0			! A0
#else
	retl				! BR Group (0-4 cycle stall)
	mov	%g3, %o0		! A0
#endif

	/* Here len >= 0x100 and condition codes reflect execution
	 * of "andcc %o0, 0x3f, %g2", done by caller.
	 */
	.align	64
103:
	/* Is 'dst' already aligned on an 64-byte boundary? */
	be,pt	%XCC, 2f		! BR

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	sub	%g2, 0x40, %g2		! A0 Group
	sub	%g0, %g2, %g2		! A0 Group
	sub	%o2, %g2, %o2		! A0 Group

	/* Copy %g2 bytes from src to dst, one byte at a time. */
1:	ldub	[%o1 + 0x00], %o3	! MS (Group)
	add	%o1, 0x1, %o1		! A1
	add	%o0, 0x1, %o0		! A0 Group
	subcc	%g2, 0x1, %g2		! A1

	bg,pt	%XCC, 1b		! BR Group
	stb	%o3, [%o0 + -1]		! MS Group

2:	VISEntryHalf			! MS+MS
	/* Remember src's misalignment in %g1 for the tail code, then
	   round src down to an 8-byte boundary for the ldd stream;
	   faligndata re-extracts the unaligned data.  */
	and	%o1, 0x7, %g1		! A1
	ba,pt	%XCC, 104f		! BR
	alignaddr %o1, %g0, %o1		! MS	(Break-after)

	.align	64
104:
	/* Prime the prefetch pipeline, but never past the end of src:
	   each deeper prefetch is guarded by a comparison of the
	   64-byte-rounded length %o4.  */
	prefetch [%o1 + 0x000], #one_read	! MS Group1
	prefetch [%o1 + 0x040], #one_read	! MS Group2
	andn	%o2, (0x40 - 1), %o4		! A0
	prefetch [%o1 + 0x080], #one_read	! MS Group3
	cmp	%o4, 0x140			! A0
	prefetch [%o1 + 0x0c0], #one_read	! MS Group4
	ldd	[%o1 + 0x000], %f0		! MS Group5 (%f0 results at G8)
	bge,a,pt %XCC, 1f			! BR

	prefetch [%o1 + 0x100], #one_read	! MS Group6
1:	ldd	[%o1 + 0x008], %f2		! AX	(%f2 results at G9)
	cmp	%o4, 0x180			! A1
	bge,a,pt %XCC, 1f			! BR
	prefetch [%o1 + 0x140], #one_read	! MS Group7
1:	ldd	[%o1 + 0x010], %f4		! AX	(%f4 results at G10)
	cmp	%o4, 0x1c0			! A1
	bge,a,pt %XCC, 1f			! BR

	prefetch [%o1 + 0x180], #one_read	! MS Group8
1:	faligndata %f0, %f2, %f16		! FGA Group9 (%f16 at G12)
	ldd	[%o1 + 0x018], %f6		! AX	(%f6 results at G12)
	faligndata %f2, %f4, %f18		! FGA Group10 (%f18 results at G13)
	ldd	[%o1 + 0x020], %f8		! MS	(%f8 results at G13)
	faligndata %f4, %f6, %f20		! FGA Group12 (1-cycle stall,%f20 at G15)
	ldd	[%o1 + 0x028], %f10		! MS	(%f10 results at G15)
	faligndata %f6, %f8, %f22		! FGA Group13 (%f22 results at G16)

	ldd	[%o1 + 0x030], %f12		! MS	(%f12 results at G16)
	faligndata %f8, %f10, %f24		! FGA Group15 (1-cycle stall,%f24 at G18)
	ldd	[%o1 + 0x038], %f14		! MS	(%f14 results at G18)
	faligndata %f10, %f12, %f26		! FGA Group16 (%f26 results at G19)
	ldd	[%o1 + 0x040], %f0		! MS	(%f0 results at G19)

	/* We only use the first loop if len > (7 * 64). */
	subcc	%o4, 0x1c0, %o4			! A0 Group17
	bg,pt	%XCC, 105f			! BR
	add	%o1, 0x40, %o1			! A1

	add	%o4, 0x140, %o4			! A0 Group18
	ba,pt	%XCC, 106f			! BR
	srl	%o4, 6, %o3			! A0 Group19	(%o3 = block count for 106)
	nop
	nop
	nop
	nop
	nop

	nop
	nop

	/* This loop performs the copy and queues new prefetches.
	 * We drop into the second loop when len <= (5 * 64).  Note
	 * that this (5 * 64) factor has been subtracted from len
	 * already.
	 */
105:
	ldd	[%o1 + 0x008], %f2		! MS Group2	(%f2 results at G5)
	faligndata %f12, %f14, %f28		! FGA		(%f28 results at G5)
	ldd	[%o1 + 0x010], %f4		! MS Group3	(%f4 results at G6)
	faligndata %f14, %f0, %f30		! FGA Group4	(1-cycle stall, %f30 at G7)
	stda	%f16, [%o0] ASI_BLK_P		! MS
	ldd	[%o1 + 0x018], %f6		! AX		(%f6 results at G7)

	faligndata %f0, %f2, %f16		! FGA Group12	(7-cycle stall)
	ldd	[%o1 + 0x020], %f8		! MS		(%f8 results at G15)
	faligndata %f2, %f4, %f18		! FGA Group13	(%f18 results at G16)
	ldd	[%o1 + 0x028], %f10		! MS		(%f10 results at G16)
	faligndata %f4, %f6, %f20		! FGA Group14	(%f20 results at G17)
	ldd	[%o1 + 0x030], %f12		! MS		(%f12 results at G17)
	faligndata %f6, %f8, %f22		! FGA Group15	(%f22 results at G18)
	ldd	[%o1 + 0x038], %f14		! MS		(%f14 results at G18)

	faligndata %f8, %f10, %f24		! FGA Group16	(%f24 results at G19)
	ldd	[%o1 + 0x040], %f0		! AX		(%f0 results at G19)
	prefetch [%o1 + 0x180], #one_read	! MS
	faligndata %f10, %f12, %f26		! FGA Group17	(%f26 results at G20)
	subcc	%o4, 0x40, %o4			! A0
	add	%o1, 0x40, %o1			! A1
	bg,pt	%XCC, 105b			! BR
	add	%o0, 0x40, %o0			! A0 Group18

	mov	5, %o3				! A1	(5 * 64 bytes left for 106)

	/* This loop performs on the copy, no new prefetches are
	 * queued.  We do things this way so that we do not perform
	 * any spurious prefetches past the end of the src buffer.
	 */
106:
	ldd	[%o1 + 0x008], %f2		! MS
	faligndata %f12, %f14, %f28		! FGA Group2
	ldd	[%o1 + 0x010], %f4		! MS
	faligndata %f14, %f0, %f30		! FGA Group4	(1-cycle stall)
	stda	%f16, [%o0] ASI_BLK_P		! MS
	ldd	[%o1 + 0x018], %f6		! AX
	faligndata %f0, %f2, %f16		! FGA Group12	(7-cycle stall)

	ldd	[%o1 + 0x020], %f8		! MS
	faligndata %f2, %f4, %f18		! FGA Group13
	ldd	[%o1 + 0x028], %f10		! MS
	faligndata %f4, %f6, %f20		! FGA Group14
	ldd	[%o1 + 0x030], %f12		! MS
	faligndata %f6, %f8, %f22		! FGA Group15
	ldd	[%o1 + 0x038], %f14		! MS
	faligndata %f8, %f10, %f24		! FGA Group16

	ldd	[%o1 + 0x040], %f0		! AX
	faligndata %f10, %f12, %f26		! FGA Group17
	subcc	%o3, 0x01, %o3			! A0
	add	%o1, 0x40, %o1			! A1
	bg,pt	%XCC, 106b			! BR
	add	%o0, 0x40, %o0			! A0 Group18

	/* Finally we copy the last full 64-byte block.  If src was
	 * 8-byte aligned (%g1 == 0), the trailing ldd would read past
	 * the rounded end of src, so it is skipped.  */
	ldd	[%o1 + 0x008], %f2		! MS
	faligndata %f12, %f14, %f28		! FGA
	ldd	[%o1 + 0x010], %f4		! MS Group19
	faligndata %f14, %f0, %f30		! FGA
	stda	%f16, [%o0] ASI_BLK_P		! MS Group20
	ldd	[%o1 + 0x018], %f6		! AX
	faligndata %f0, %f2, %f16		! FGA Group11 (7-cycle stall)
	ldd	[%o1 + 0x020], %f8		! MS
	faligndata %f2, %f4, %f18		! FGA Group12
	ldd	[%o1 + 0x028], %f10		! MS
	faligndata %f4, %f6, %f20		! FGA Group13
	ldd	[%o1 + 0x030], %f12		! MS
	faligndata %f6, %f8, %f22		! FGA Group14
	ldd	[%o1 + 0x038], %f14		! MS
	faligndata %f8, %f10, %f24		! FGA Group15
	cmp	%g1, 0				! A0
	be,pt	%XCC, 1f			! BR
	add	%o0, 0x40, %o0			! A1
	ldd	[%o1 + 0x040], %f0		! MS
1:	faligndata %f10, %f12, %f26		! FGA Group16
	faligndata %f12, %f14, %f28		! FGA Group17
	faligndata %f14, %f0, %f30		! FGA Group18
	stda	%f16, [%o0] ASI_BLK_P		! MS
	add	%o0, 0x40, %o0			! A0
	add	%o1, 0x40, %o1			! A1
	membar	#Sync				! MS Group26 (7-cycle stall)

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer just like similar
	 * code found in 'toosmall' processing.
	 */
	and	%o2, 0x3f, %o2			! A0 Group
	andcc	%o2, 0x38, %g2			! A0 Group
	be,pn	%XCC, 107f			! BR
	subcc	%g2, 0x8, %g2			! A1
	be,pn	%XCC, 107f			! BR Group
	cmp	%g1, 0				! A0

	be,a,pt	%XCC, 1f			! BR Group
	ldd	[%o1 + 0x00], %f0		! MS

1:	ldd	[%o1 + 0x08], %f2		! MS Group
	add	%o1, 0x8, %o1			! A0
	sub	%o2, 0x8, %o2			! A1
	subcc	%g2, 0x8, %g2			! A0 Group
	faligndata %f0, %f2, %f8		! FGA Group
	std	%f8, [%o0 + 0x00]		! MS (XXX does it stall here? XXX)
	be,pn	%XCC, 107f			! BR
	add	%o0, 0x8, %o0			! A0
	ldd	[%o1 + 0x08], %f0		! MS Group
	add	%o1, 0x8, %o1			! A0
	sub	%o2, 0x8, %o2			! A1
	subcc	%g2, 0x8, %g2			! A0 Group
	faligndata %f2, %f0, %f8		! FGA
	std	%f8, [%o0 + 0x00]		! MS (XXX does it stall here? XXX)
	bne,pn	%XCC, 1b			! BR
	add	%o0, 0x8, %o0			! A0 Group

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
107:
	cmp	%o2, 0
	add	%o1, %g1, %o1			/* undo the alignaddr rounding */
	VISExitHalf
	be,pn	%XCC, 102b
	nop
	ba,a,pt	%XCC, 101b

	/* If we get here, then 32 <= len < 0x100.  */
108:

#ifdef SMALL_COPY_USES_FPU

	/* Is 'dst' already aligned on an 8-byte boundary? */
	be,pt	%XCC, 2f			! BR Group

	/* Compute abs((dst & 7) - 8) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 8-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	sub	%g2, 0x8, %g2			! A0
	sub	%g0, %g2, %g2			! A0 Group (reg-dep)
	sub	%o2, %g2, %o2			! A0 Group (reg-dep)

	/* Copy %g2 bytes from src to dst, one byte at a time. */
1:	ldub	[%o1 + 0x00], %o3		! MS (Group) (%o3 in 3 cycles)
	add	%o1, 0x1, %o1			! A1
	add	%o0, 0x1, %o0			! A0 Group
	subcc	%g2, 0x1, %g2			! A1

	bg,pt	%XCC, 1b			! BR Group
	stb	%o3, [%o0 + -1]			! MS Group

2:	VISEntryHalf				! MS+MS

	/* Compute (len - (len % 8)) into %g2.  This is guaranteed
	 * to be nonzero.
	 */
	andn	%o2, 0x7, %g2			! A0 Group

	/* You may read this and believe that it allows reading
	 * one 8-byte longword past the end of src.  It actually
	 * does not, as %g2 is subtracted as loads are done from
	 * src, so we always stop before running off the end.
	 * Also, we are guaranteed to have at least 0x10 bytes
	 * to move here.
	 */
	sub	%g2, 0x8, %g2			! A0 Group (reg-dep)
	alignaddr %o1, %g0, %g1			! MS	(Break-after)
	ldd	[%g1 + 0x00], %f0		! MS Group (1-cycle stall)
	add	%g1, 0x8, %g1			! A0

1:	ldd	[%g1 + 0x00], %f2		! MS Group
	add	%g1, 0x8, %g1			! A0
	sub	%o2, 0x8, %o2			! A1
	subcc	%g2, 0x8, %g2			! A0 Group

	faligndata %f0, %f2, %f8		! FGA Group (1-cycle stall)
	std	%f8, [%o0 + 0x00]		! MS Group (2-cycle stall)
	add	%o1, 0x8, %o1			! A0
	be,pn	%XCC, 2f			! BR

	add	%o0, 0x8, %o0			! A1
	ldd	[%g1 + 0x00], %f0		! MS Group
	add	%g1, 0x8, %g1			! A0
	sub	%o2, 0x8, %o2			! A1

	subcc	%g2, 0x8, %g2			! A0 Group
	faligndata %f2, %f0, %f8		! FGA Group (1-cycle stall)
	std	%f8, [%o0 + 0x00]		! MS Group (2-cycle stall)
	add	%o1, 0x8, %o1			! A0

	bne,pn	%XCC, 1b			! BR
	add	%o0, 0x8, %o0			! A1

	/* Nothing left to copy? */
2:	cmp	%o2, 0				! A0 Group
	VISExitHalf				! A0+MS
	be,pn	%XCC, 102b			! BR Group
	nop					! A0
	ba,a,pt	%XCC, 101b			! BR Group

#else /* !(SMALL_COPY_USES_FPU) */

	/* Integer-only mid-size path: only usable when src and dst
	   share the same 8-byte misalignment; otherwise fall back to
	   the byte loop at 101.  */
	xor	%o1, %o0, %g2
	andcc	%g2, 0x7, %g0
	bne,pn	%XCC, 101b
	andcc	%o1, 0x7, %g2

	be,pt	%XCC, 2f
	sub	%g2, 0x8, %g2
	sub	%g0, %g2, %g2
	sub	%o2, %g2, %o2

1:	ldub	[%o1 + 0x00], %o3
	add	%o1, 0x1, %o1
	add	%o0, 0x1, %o0
	subcc	%g2, 0x1, %g2
	bg,pt	%XCC, 1b
	stb	%o3, [%o0 + -1]

2:	andn	%o2, 0x7, %g2
	sub	%o2, %g2, %o2

3:	ldx	[%o1 + 0x00], %o3
	add	%o1, 0x8, %o1
	add	%o0, 0x8, %o0
	subcc	%g2, 0x8, %g2
	bg,pt	%XCC, 3b
	stx	%o3, [%o0 + -8]

	cmp	%o2, 0
	bne,pn	%XCC, 101b
	nop
	ba,a,pt	%XCC, 102b

#endif /* !(SMALL_COPY_USES_FPU) */
END(memcpy)
439 | ||
/* Backward-copy helpers for memmove: all copy from higher to lower
   addresses through integer registers, 0x20-0x40 bytes per expansion.  */

/* 0x20 bytes, dst only 4-byte aligned: 64-bit loads, paired 32-bit
   stores (high word at the lower address via srlx).  */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldx [%src - offset - 0x20], %t0; \
	ldx [%src - offset - 0x18], %t1; \
	ldx [%src - offset - 0x10], %t2; \
	ldx [%src - offset - 0x08], %t3; \
	stw %t0, [%dst - offset - 0x1c]; \
	srlx %t0, 32, %t0; \
	stw %t0, [%dst - offset - 0x20]; \
	stw %t1, [%dst - offset - 0x14]; \
	srlx %t1, 32, %t1; \
	stw %t1, [%dst - offset - 0x18]; \
	stw %t2, [%dst - offset - 0x0c]; \
	srlx %t2, 32, %t2; \
	stw %t2, [%dst - offset - 0x10]; \
	stw %t3, [%dst - offset - 0x04]; \
	srlx %t3, 32, %t3; \
	stw %t3, [%dst - offset - 0x08];

/* 0x40 bytes, both sides 8-byte aligned: straight ldx/stx pairs.  */
#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldx [%src - offset - 0x20], %t0; \
	ldx [%src - offset - 0x18], %t1; \
	ldx [%src - offset - 0x10], %t2; \
	ldx [%src - offset - 0x08], %t3; \
	stx %t0, [%dst - offset - 0x20]; \
	stx %t1, [%dst - offset - 0x18]; \
	stx %t2, [%dst - offset - 0x10]; \
	stx %t3, [%dst - offset - 0x08]; \
	ldx [%src - offset - 0x40], %t0; \
	ldx [%src - offset - 0x38], %t1; \
	ldx [%src - offset - 0x30], %t2; \
	ldx [%src - offset - 0x28], %t3; \
	stx %t0, [%dst - offset - 0x40]; \
	stx %t1, [%dst - offset - 0x38]; \
	stx %t2, [%dst - offset - 0x30]; \
	stx %t3, [%dst - offset - 0x28];

/* 0x10-byte tail chunk, dst 4-byte aligned.  Expansions of this macro
   form a table that is entered mid-way by a computed jmpl (label 279),
   so its size and layout must not change.  */
#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldx [%src + offset + 0x00], %t0; \
	ldx [%src + offset + 0x08], %t1; \
	stw %t0, [%dst + offset + 0x04]; \
	srlx %t0, 32, %t2; \
	stw %t2, [%dst + offset + 0x00]; \
	stw %t1, [%dst + offset + 0x0c]; \
	srlx %t1, 32, %t3; \
	stw %t3, [%dst + offset + 0x08];

/* 0x10-byte tail chunk, both sides 8-byte aligned; also a computed
   jmpl table (label 283) — size/layout must not change.  */
#define RMOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1) \
	ldx [%src + offset + 0x00], %t0; \
	ldx [%src + offset + 0x08], %t1; \
	stx %t0, [%dst + offset + 0x00]; \
	stx %t1, [%dst + offset + 0x08];
491 | ||
	.align	32
	/* Backward copy of at most 15 bytes, entered from memmove
	   (228b) with %o0/%o1 pointing one past the end of dst/src and
	   the original dst saved in %g4 for the return value.  If len
	   is even, the branch targets 2f+4 — but its delay slot (the
	   ldub at 1:) still executes, performing the first load of the
	   two-byte loop iteration it jumps into.  */
228:	andcc	%o2, 1, %g0		/* IEU1 Group */
	be,pt	%icc, 2f+4		/* CTI */
1:	ldub	[%o1 - 1], %o5		/* LOAD Group */
	sub	%o1, 1, %o1		/* IEU0 */
	sub	%o0, 1, %o0		/* IEU1 */
	subcc	%o2, 1, %o2		/* IEU1 Group */
	be,pn	%xcc, 229f		/* CTI */
	stb	%o5, [%o0]		/* Store */
2:	ldub	[%o1 - 1], %o5		/* LOAD Group */
	sub	%o0, 2, %o0		/* IEU0 */
	ldub	[%o1 - 2], %g5		/* LOAD Group */
	sub	%o1, 2, %o1		/* IEU0 */
	subcc	%o2, 2, %o2		/* IEU1 Group */
	stb	%o5, [%o0 + 1]		/* Store */
	bne,pt	%xcc, 2b		/* CTI */
	stb	%g5, [%o0]		/* Store */
229:	retl
	mov	%g4, %o0		/* return original dst */
511 | ||
	.align	32
ENTRY(memmove)
	/* memmove(dst=%o0, src=%o1, len=%o2).  Non-overlapping (or
	   dst below src) cases are forwarded to memcpy's paths (218b,
	   102b); true backward copies fall through to 220 with %o0/%o1
	   advanced past the buffer ends and original dst in %g4.

	   NOTE(review): this code clobbers %g5 and %g6, which some
	   SPARC V9 ABIs reserve as application/system registers —
	   confirm this is acceptable for the targeted ABI.  */
	mov	%o0, %g3			/* %g3 = dst for 102b's return */
#ifndef USE_BPR
	srl	%o2, 0, %o2			/* IEU1 Group */
#endif
	brz,pn	%o2, 102b			/* CTI Group */
	sub	%o0, %o1, %o4			/* IEU0 */	/* %o4 = dst - src */
	cmp	%o4, %o2			/* IEU1 Group */
	bgeu,pt	%XCC, 218b			/* CTI */	/* no overlap: forward copy */
	mov	%o0, %g4			/* IEU0 */	/* delay slot: save dst */
	add	%o0, %o2, %o0			/* IEU0 Group */
220:	add	%o1, %o2, %o1			/* IEU1 */
	cmp	%o2, 15				/* IEU1 Group */
	bleu,pn	%xcc, 228b			/* CTI */	/* tiny: byte-pair loop */
	andcc	%o0, 7, %g2			/* IEU1 Group */
	sub	%o0, %o1, %g5			/* IEU0 */
	andcc	%g5, 3, %o5			/* IEU1 Group */
	bne,pn	%xcc, 232f			/* CTI */	/* mixed alignment: VIS path */
	andcc	%o1, 3, %g0			/* IEU1 Group */
	be,a,pt	%xcc, 236f			/* CTI */
	andcc	%o1, 4, %g0			/* IEU1 Group */
	/* Align src down to a 4-byte boundary, copying 1/2 bytes.  */
	andcc	%o1, 1, %g0			/* IEU1 Group */
	be,pn	%xcc, 4f			/* CTI */
	andcc	%o1, 2, %g0			/* IEU1 Group */
	ldub	[%o1 - 1], %g2			/* Load Group */
	sub	%o1, 1, %o1			/* IEU0 */
	sub	%o0, 1, %o0			/* IEU1 */
	sub	%o2, 1, %o2			/* IEU0 Group */
	be,pn	%xcc, 5f			/* CTI Group */
	stb	%g2, [%o0]			/* Store */
4:	lduh	[%o1 - 2], %g2			/* Load Group */
	sub	%o1, 2, %o1			/* IEU0 */
	sub	%o0, 2, %o0			/* IEU1 */
	sub	%o2, 2, %o2			/* IEU0 */
	sth	%g2, [%o0]			/* Store Group + bubble */
5:	andcc	%o1, 4, %g0			/* IEU1 */
236:	be,a,pn	%xcc, 2f			/* CTI */
	andcc	%o2, -128, %g6			/* IEU1 Group */
	lduw	[%o1 - 4], %g5			/* Load Group */
	sub	%o1, 4, %o1			/* IEU0 */
	sub	%o0, 4, %o0			/* IEU1 */
	sub	%o2, 4, %o2			/* IEU0 Group */
	stw	%g5, [%o0]			/* Store */
	andcc	%o2, -128, %g6			/* IEU1 Group */	/* %g6 = len rounded to 128 */
2:	be,pn	%xcc, 235f			/* CTI */
	andcc	%o0, 4, %g0			/* IEU1 Group */
	be,pn	%xcc, 282f + 4			/* CTI Group */		/* 8-byte aligned: stx path */
5:	RMOVE_BIGCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
	RMOVE_BIGCHUNK(o1, o0, 0x20, g1, g3, g5, o5)
	RMOVE_BIGCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
	RMOVE_BIGCHUNK(o1, o0, 0x60, g1, g3, g5, o5)
	subcc	%g6, 128, %g6			/* IEU1 Group */
	sub	%o1, 128, %o1			/* IEU0 */
	bne,pt	%xcc, 5b			/* CTI */
	sub	%o0, 128, %o0			/* IEU0 Group */
235:	andcc	%o2, 0x70, %g6			/* IEU1 Group */
41:	be,pn	%xcc, 280f			/* CTI */
	andcc	%o2, 8, %g0			/* IEU1 Group */
	/* Clk1 8-( */
	/* Clk2 8-( */
	/* Clk3 8-( */
	/* Clk4 8-( */
	/* Computed jump into the RMOVE_LASTCHUNK table below: each
	   16-byte chunk is 8 instructions (32 bytes), i.e. 2*%g6 bytes
	   of code per %g6 bytes of data — hence the sll by 1.  */
279:	rd	%pc, %o5			/* PDU Group */
	sll	%g6, 1, %g5			/* IEU0 Group */
	sub	%o1, %g6, %o1			/* IEU1 */
	sub	%o5, %g5, %o5			/* IEU0 Group */
	jmpl	%o5 + %lo(280f - 279b), %g0	/* CTI Group brk forced*/
	sub	%o0, %g6, %o0			/* IEU0 Group */
	RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g5, o5)
	RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g5, o5)
280:	be,pt	%xcc, 281f			/* CTI */
	andcc	%o2, 4, %g0			/* IEU1 */
	ldx	[%o1 - 8], %g2			/* Load Group */
	sub	%o0, 8, %o0			/* IEU0 */
	stw	%g2, [%o0 + 4]			/* Store Group */
	sub	%o1, 8, %o1			/* IEU1 */
	srlx	%g2, 32, %g2			/* IEU0 Group */
	stw	%g2, [%o0]			/* Store */
281:	be,pt	%xcc, 1f			/* CTI */
	andcc	%o2, 2, %g0			/* IEU1 Group */
	lduw	[%o1 - 4], %g2			/* Load Group */
	sub	%o1, 4, %o1			/* IEU0 */
	stw	%g2, [%o0 - 4]			/* Store Group */
	sub	%o0, 4, %o0			/* IEU0 */
1:	be,pt	%xcc, 1f			/* CTI */
	andcc	%o2, 1, %g0			/* IEU1 Group */
	lduh	[%o1 - 2], %g2			/* Load Group */
	sub	%o1, 2, %o1			/* IEU0 */
	sth	%g2, [%o0 - 2]			/* Store Group */
	sub	%o0, 2, %o0			/* IEU0 */
1:	be,pt	%xcc, 211f			/* CTI */
	nop					/* IEU1 */
	ldub	[%o1 - 1], %g2			/* Load Group */
	stb	%g2, [%o0 - 1]			/* Store Group + bubble */
211:	retl
	mov	%g4, %o0			/* return original dst */

	/* Fully 8-byte-aligned backward copy, 128 bytes per round.  */
282:	RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
	RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
	subcc	%g6, 128, %g6			/* IEU1 Group */
	sub	%o1, 128, %o1			/* IEU0 */
	bne,pt	%xcc, 282b			/* CTI */
	sub	%o0, 128, %o0			/* IEU0 Group */
	andcc	%o2, 0x70, %g6			/* IEU1 */
	be,pn	%xcc, 284f			/* CTI */
	andcc	%o2, 8, %g0			/* IEU1 Group */
	/* Clk1 8-( */
	/* Clk2 8-( */
	/* Clk3 8-( */
	/* Clk4 8-( */
	/* Computed jump into the RMOVE_LASTALIGNCHUNK table: each
	   16-byte chunk is 4 instructions (16 bytes) of code, so the
	   code offset equals %g6 directly — no shift needed here.  */
283:	rd	%pc, %o5			/* PDU Group */
	sub	%o1, %g6, %o1			/* IEU0 Group */
	sub	%o5, %g6, %o5			/* IEU1 */
	jmpl	%o5 + %lo(284f - 283b), %g0	/* CTI Group brk forced*/
	sub	%o0, %g6, %o0			/* IEU0 Group */
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3)
	RMOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3)
284:	be,pt	%xcc, 285f			/* CTI Group */
	andcc	%o2, 4, %g0			/* IEU1 */
	ldx	[%o1 - 8], %g2			/* Load Group */
	sub	%o0, 8, %o0			/* IEU0 */
	sub	%o1, 8, %o1			/* IEU0 Group */
	stx	%g2, [%o0]			/* Store */
285:	be,pt	%xcc, 1f			/* CTI */
	andcc	%o2, 2, %g0			/* IEU1 Group */
	lduw	[%o1 - 4], %g2			/* Load Group */
	sub	%o0, 4, %o0			/* IEU0 */
	sub	%o1, 4, %o1			/* IEU0 Group */
	stw	%g2, [%o0]			/* Store */
1:	be,pt	%xcc, 1f			/* CTI */
	andcc	%o2, 1, %g0			/* IEU1 Group */
	lduh	[%o1 - 2], %g2			/* Load Group */
	sub	%o0, 2, %o0			/* IEU0 */
	sub	%o1, 2, %o1			/* IEU0 Group */
	sth	%g2, [%o0]			/* Store */
1:	be,pt	%xcc, 1f			/* CTI */
	nop					/* IEU0 Group */
	ldub	[%o1 - 1], %g2			/* Load Group */
	stb	%g2, [%o0 - 1]			/* Store Group + bubble */
1:	retl
	mov	%g4, %o0			/* return original dst */

	/* src and dst differ in low-2-bit alignment: align dst to 8
	   bytes with a byte loop (%g2 = dst & 7), then use VIS
	   faligndata for the 8-byte backward stream.  */
232:	brz,pt	%g2, 2f				/* CTI Group */
	sub	%o2, %g2, %o2			/* IEU0 Group */
1:	ldub	[%o1 - 1], %g5			/* Load Group */
	sub	%o1, 1, %o1			/* IEU0 */
	sub	%o0, 1, %o0			/* IEU1 */
	subcc	%g2, 1, %g2			/* IEU1 Group */
	bne,pt	%xcc, 1b			/* CTI */
	stb	%g5, [%o0]			/* Store */
2:	andn	%o2, 7, %g5			/* IEU0 Group */	/* %g5 = 8-byte bulk count */
	and	%o2, 7, %o2			/* IEU1 */		/* %o2 = byte tail */
	fmovd	%f0, %f2			/* FPU */
	alignaddr %o1, %g0, %g1			/* GRU Group */
	ldd	[%g1], %f4			/* Load Group */
1:	ldd	[%g1 - 8], %f6			/* Load Group */
	sub	%g1, 8, %g1			/* IEU0 Group */
	subcc	%g5, 8, %g5			/* IEU1 */
	faligndata %f6, %f4, %f0		/* GRU Group */
	std	%f0, [%o0 - 8]			/* Store */
	sub	%o1, 8, %o1			/* IEU0 Group */
	be,pn	%xcc, 233f			/* CTI */
	sub	%o0, 8, %o0			/* IEU1 */
	ldd	[%g1 - 8], %f4			/* Load Group */
	sub	%g1, 8, %g1			/* IEU0 */
	subcc	%g5, 8, %g5			/* IEU1 */
	faligndata %f4, %f6, %f0		/* GRU Group */
	std	%f0, [%o0 - 8]			/* Store */
	sub	%o1, 8, %o1			/* IEU0 */
	bne,pn	%xcc, 1b			/* CTI Group */
	sub	%o0, 8, %o0			/* IEU0 */
233:	brz,pn	%o2, 234f			/* CTI Group */
	nop					/* IEU0 */
237:	ldub	[%o1 - 1], %g5			/* LOAD */
	sub	%o1, 1, %o1			/* IEU0 */
	sub	%o0, 1, %o0			/* IEU1 */
	subcc	%o2, 1, %o2			/* IEU1 */
	bne,pt	%xcc, 237b			/* CTI */
	stb	%g5, [%o0]			/* Store Group */
	/* NOTE(review): unlike the memcpy paths, this path never did
	   VISEntryHalf, and here FPRS_FEF is written unconditionally
	   rather than restoring a saved %fprs — confirm this matches
	   the intended FPU-state contract for this target.  */
234:	wr	%g0, FPRS_FEF, %fprs
	retl
	mov	%g4, %o0			/* return original dst */
END(memmove)
706 | ||
#ifdef USE_BPR
/* In the 64-bit environment, provide the __align_cpy_N entry points
   (aligned-copy variants emitted by the compiler) as aliases of
   memcpy, which satisfies all of their contracts regardless of
   alignment.  */
weak_alias(memcpy, __align_cpy_1)
weak_alias(memcpy, __align_cpy_2)
weak_alias(memcpy, __align_cpy_4)
weak_alias(memcpy, __align_cpy_8)
weak_alias(memcpy, __align_cpy_16)
#endif