]> sourceware.org Git - glibc.git/blob - sysdeps/powerpc/powerpc32/power4/memcpy.S
Remove doubled words.
[glibc.git] / sysdeps / powerpc / powerpc32 / power4 / memcpy.S
1 /* Optimized memcpy implementation for PowerPC32 on PowerPC64.
2 Copyright (C) 2003, 2006, 2011 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
18 02110-1301 USA. */
19
20 #include <sysdep.h>
21 #include <bp-sym.h>
22 #include <bp-asm.h>
23
24 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
25 Returns 'dst'.
26
27 Memcpy handles short copies (< 32-bytes) using binary move blocks
28 (no loops) of lwz/stw. The tail (remaining 1-3 bytes) is handled
29 with the appropriate combination of byte and halfword load/stores.
30 There is minimal effort to optimize the alignment of short moves.
31
32 Longer moves (>= 32-bytes) justify the effort to get at least the
33 destination word (4-byte) aligned. Further optimization is
34 possible when both source and destination are word aligned.
35 Each case has an optimized unrolled loop. */
36
37 .machine power4
38 EALIGN (BP_SYM (memcpy), 5, 0)
39 CALL_MCOUNT
40
   /* Prologue: allocate a 32-byte stack frame and save the two
      callee-saved GPRs used across the copy: r30 holds the original
      dst (returned at every exit) and r31 the remaining length.  */
41 stwu 1,-32(1)
42 cfi_adjust_cfa_offset(32)
43 stw 30,20(1)
44 cfi_offset(30,(20-32))
45 mr 30,3 /* Save original dst for the return value. */
46 cmplwi cr1,5,31
47 stw 31,24(1)
48 cfi_offset(31,(24-32))
49 neg 0,3
50 andi. 11,3,3 /* check alignment of dst. */
51 clrlwi 0,0,30 /* Number of bytes until the 1st word of dst. */
52 clrlwi 10,4,30 /* check alignment of src. */
53 cmplwi cr6,5,8
54 ble- cr1,.L2 /* If move < 32 bytes use short move code. */
55 cmplw cr6,10,11
56 mr 12,4 /* r12 is the working src pointer. */
57 srwi 9,5,2 /* Number of full words remaining. */
58 mtcrf 0x01,0 /* Low nibble of r0 -> cr7, tested by bf 30/31 below. */
59 mr 31,5 /* r31 = length remaining. */
60 beq .L0 /* dst already word aligned. */
61
62 subf 31,0,5 /* Length left after dst is word aligned. */
63 /* Move 0-3 bytes as needed to get the destination word aligned. */
64 1: bf 31,2f
65 lbz 6,0(12)
66 addi 12,12,1
67 stb 6,0(3)
68 addi 3,3,1
69 2: bf 30,0f
70 lhz 6,0(12)
71 addi 12,12,2
72 sth 6,0(3)
73 addi 3,3,2
74 0:
75 clrlwi 10,12,30 /* check alignment of src again. */
76 srwi 9,31,2 /* Number of full words remaining. */
77
78 /* Copy words from source to destination, assuming the destination is
79 aligned on a word boundary.
80
81 At this point we know there are at least 25 bytes left (32-7) to copy.
82 The next step is to determine if the source is also word aligned.
83 If not branch to the unaligned move code at .L6, which uses
84 a load, shift, store strategy.
85
86 Otherwise source and destination are word aligned, and we can use
87 the optimized word copy loop. */
88 .L0:
89 clrlwi 11,31,30 /* calculate the number of tail bytes */
90 mtcrf 0x01,9 /* Low nibble of the word count -> cr7. */
91 bne- cr6,.L6 /* If source is not word aligned. */
92
93 /* Move words where destination and source are word aligned.
94 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
95 If the copy is not an exact multiple of 16 bytes, 1-3
96 words are copied as needed to set up the main loop. After
97 the main loop exits there may be a tail of 1-3 bytes. These bytes are
98 copied a halfword/byte at a time as needed to preserve alignment. */
99
100 srwi 8,31,4 /* calculate the 16 byte loop count */
101 cmplwi cr1,9,4
102 cmplwi cr6,11,0
103 mr 11,12
104
105 bf 30,1f /* Word count not 2 or 3 mod 4: skip this pair. */
106 lwz 6,0(12)
107 lwz 7,4(12)
108 addi 11,12,8
109 mtctr 8
110 stw 6,0(3)
111 stw 7,4(3)
112 addi 10,3,8
113 bf 31,4f /* Word count even: no odd word to pre-copy. */
114 lwz 0,8(12)
115 stw 0,8(3)
116 blt cr1,3f /* Fewer than 4 words total: done copying words. */
117 addi 11,12,12
118 addi 10,3,12
119 b 4f
120 .align 4
121 1:
122 mr 10,3
123 mtctr 8
124 bf 31,4f
125 lwz 6,0(12)
126 addi 11,12,4
127 stw 6,0(3)
128 addi 10,3,4
129
130 .align 4
   /* Main aligned loop: 4 words (16 bytes) per iteration,
      r11 = src cursor, r10 = dst cursor, CTR = iteration count. */
131 4:
132 lwz 6,0(11)
133 lwz 7,4(11)
134 lwz 8,8(11)
135 lwz 0,12(11)
136 stw 6,0(10)
137 stw 7,4(10)
138 stw 8,8(10)
139 stw 0,12(10)
140 addi 11,11,16
141 addi 10,10,16
142 bdnz 4b
143 3:
144 clrrwi 0,31,2 /* r0 = bytes moved by the word loop (len & ~3). */
145 mtcrf 0x01,31 /* Low nibble of length -> cr7 for the tail tests. */
146 beq cr6,0f /* No tail bytes: go straight to the epilogue. */
147 .L9:
148 add 3,3,0 /* Advance dst and src past the copied words. */
149 add 12,12,0
150
151 /* At this point we have a tail of 0-3 bytes and we know that the
152 destination is word aligned. */
153 2: bf 30,1f
154 lhz 6,0(12)
155 addi 12,12,2
156 sth 6,0(3)
157 addi 3,3,2
158 1: bf 31,0f
159 lbz 6,0(12)
160 stb 6,0(3)
161 0:
162 /* Return original dst pointer. */
163 mr 3,30
164 lwz 30,20(1)
165 lwz 31,24(1)
166 addi 1,1,32 /* Pop the 32-byte frame. */
167 blr
168
169 /* Copy up to 31 bytes. This is divided into two cases 0-8 bytes and
170 9-31 bytes. Each case is handled without loops, using binary
171 (1,2,4,8) tests.
172
173 In the short (0-8 byte) case no attempt is made to force alignment
174 of either source or destination. The hardware will handle the
175 unaligned load/stores with small delays for crossing 32-, 64-byte, and
176 4096-byte boundaries. Since these short moves are unlikely to be
177 unaligned or cross these boundaries, the overhead to force
178 alignment is not justified.
179
180 The longer (9-31 byte) move is more likely to cross 32- or 64-byte
181 boundaries. Since only loads are sensitive to the 32-/64-byte
182 boundaries it is more important to align the source than the
183 destination. If the source is not already word aligned, we first
184 move 1-3 bytes as needed. While the destination and stores may
185 still be unaligned, this is only an issue for page (4096 byte
186 boundary) crossing, which should be rare for these short moves.
187 The hardware handles this case automatically with a small delay. */
188
189 .align 4
190 .L2:
191 mtcrf 0x01,5 /* Low nibble of length -> cr7 for the bf tests. */
192 neg 8,4
193 clrrwi 11,4,2 /* r11 = src rounded down to a word boundary. */
194 andi. 0,8,3 /* r0 = bytes needed to word-align src (0-3). */
195 ble cr6,.LE8 /* Handle moves of 0-8 bytes. */
196 /* At least 9 bytes left. Get the source word aligned. */
197 cmplwi cr1,5,16
198 mr 10,5
199 mr 12,4
200 cmplwi cr6,0,2
201 beq .L3 /* If the source is already word aligned skip this. */
202 /* Copy 1-3 bytes to get source address word aligned. */
203 lwz 6,0(11) /* Load the aligned word containing the first src bytes. */
204 subf 10,0,5 /* r10 = length left after the head bytes. */
205 add 12,4,0 /* r12 = word-aligned src. */
206 blt cr6,5f /* Only 1 byte to move. */
207 srwi 7,6,16 /* Bytes 0-1 of the word -> low half of r7 (big-endian). */
208 bgt cr6,3f /* 3 bytes to move. */
209 sth 6,0(3) /* 2 bytes: store the low halfword. */
210 b 7f
211 .align 4
212 3: /* 3 bytes: store one byte then a halfword (big-endian order). */
213 stb 7,0(3)
214 sth 6,1(3)
215 b 7f
216 .align 4
217 5: /* 1 byte: store the low byte of the loaded word. */
218 stb 6,0(3)
219 7:
220 cmplwi cr1,10,16
221 add 3,3,0 /* Advance dst past the head bytes. */
222 mtcrf 0x01,10 /* Low nibble of remaining length -> cr7. */
223 .align 4
224 .L3:
225 /* At least 6 bytes left and the source is word aligned. */
226 blt cr1,8f
227 16: /* Move 16 bytes. */
228 lwz 6,0(12)
229 lwz 7,4(12)
230 stw 6,0(3)
231 lwz 6,8(12)
232 stw 7,4(3)
233 lwz 7,12(12)
234 addi 12,12,16
235 stw 6,8(3)
236 stw 7,12(3)
237 addi 3,3,16
238 8: /* Move 8 bytes. */
239 bf 28,4f
240 lwz 6,0(12)
241 lwz 7,4(12)
242 addi 12,12,8
243 stw 6,0(3)
244 stw 7,4(3)
245 addi 3,3,8
246 4: /* Move 4 bytes. */
247 bf 29,2f
248 lwz 6,0(12)
249 addi 12,12,4
250 stw 6,0(3)
251 addi 3,3,4
252 2: /* Move 2-3 bytes. */
253 bf 30,1f
254 lhz 6,0(12)
255 sth 6,0(3)
256 bf 31,0f
257 lbz 7,2(12)
258 stb 7,2(3)
259 mr 3,30 /* Return original dst pointer. */
260 lwz 30,20(1)
261 addi 1,1,32
262 blr
263 1: /* Move 1 byte. */
264 bf 31,0f
265 lbz 6,0(12)
266 stb 6,0(3)
267 0:
268 /* Return original dst pointer. */
269 mr 3,30
270 lwz 30,20(1)
271 addi 1,1,32
272 blr
273
274 /* Special case to copy 0-8 bytes. */
275 .align 4
276 .LE8:
277 mr 12,4
278 bne cr6,4f /* Length != 8: use the sized moves below. */
279 lwz 6,0(4) /* Exactly 8 bytes: two word copies. */
280 lwz 7,4(4)
281 stw 6,0(3)
282 stw 7,4(3)
283 /* Return original dst pointer. */
284 mr 3,30
285 lwz 30,20(1)
286 addi 1,1,32
287 blr
288 .align 4
289 4: bf 29,2b /* No word to move: handle 0-3 bytes at 2b above. */
290 lwz 6,0(4)
291 stw 6,0(3)
292 6:
293 bf 30,5f
294 lhz 7,4(4)
295 sth 7,4(3)
296 bf 31,0f
297 lbz 8,6(4)
298 stb 8,6(3)
299 mr 3,30 /* Return original dst pointer. */
300 lwz 30,20(1)
301 addi 1,1,32
302 blr
303 .align 4
304 5:
305 bf 31,0f
306 lbz 6,4(4)
307 stb 6,4(3)
308 .align 4
309 0:
310 /* Return original dst pointer. */
311 mr 3,30
312 lwz 30,20(1)
313 addi 1,1,32
314 blr
315
316 .align 4
317 .L6:
318
319 /* Copy words where the destination is aligned but the source is
320 not. Use aligned word loads from the source, shifted to realign
321 the data, to allow aligned destination stores.
322 Use an unrolled loop to copy 4 words (16-bytes) per iteration.
323 A single word is retained for storing at loop exit to avoid walking
324 off the end of a page within the loop.
325 If the copy is not an exact multiple of 16 bytes, 1-3
326 words are copied as needed to set up the main loop. After
327 the main loop exits there may be a tail of 1-3 bytes. These bytes are
328 copied a halfword/byte at a time as needed to preserve alignment. */
329
330
331 cmplwi cr6,11,0 /* are there tail bytes left ? */
332 subf 5,10,12 /* back up src pointer to prev word alignment */
333 slwi 10,10,3 /* calculate number of bits to shift 1st word left */
334 addi 11,9,-1 /* we move one word after the loop */
335 srwi 8,11,2 /* calculate the 16 byte loop count */
336 lwz 6,0(5) /* load 1st src word into R6 */
337 mr 4,3 /* r4 = working dst (r3 is left at the loop start). */
338 lwz 7,4(5) /* load 2nd src word into R7 */
339 mtcrf 0x01,11 /* low nibble of (words-1) -> cr7 for bf 30/31. */
340 subfic 9,10,32 /* number of bits to shift 2nd word right */
341 mtctr 8
342 bf 30,1f
343
344 /* there are at least two words to copy, so copy them */
345 slw 0,6,10 /* shift 1st src word to left align it in R0 */
346 srw 8,7,9 /* shift 2nd src word to right align it in R8 */
347 or 0,0,8 /* or them to get word to store */
348 lwz 6,8(5) /* load the 3rd src word */
349 stw 0,0(4) /* store the 1st dst word */
350 slw 0,7,10 /* now left align 2nd src word into R0 */
351 srw 8,6,9 /* shift 3rd src word to right align it in R8 */
352 or 0,0,8 /* or them to get word to store */
353 lwz 7,12(5)
354 stw 0,4(4) /* store the 2nd dst word */
355 addi 4,4,8
356 addi 5,5,16
357 bf 31,4f
358 /* there is a third word to copy, so copy it */
359 slw 0,6,10 /* shift 3rd src word to left align it in R0 */
360 srw 8,7,9 /* shift 4th src word to right align it in R8 */
361 or 0,0,8 /* or them to get word to store */
362 stw 0,0(4) /* store 3rd dst word */
363 mr 6,7
364 lwz 7,0(5)
365 addi 5,5,4
366 addi 4,4,4
367 b 4f
368 .align 4
369 1:
370 slw 0,6,10 /* shift 1st src word to left align it in R0 */
371 srw 8,7,9 /* shift 2nd src word to right align it in R8 */
372 addi 5,5,8
373 or 0,0,8 /* or them to get word to store */
374 bf 31,4f
375 mr 6,7
376 lwz 7,0(5)
377 addi 5,5,4
378 stw 0,0(4) /* store the 1st dst word */
379 addi 4,4,4
380
381 .align 4
    /* Main unaligned loop: software pipeline keeps the last loaded
       word in r6/r7 so the final store never reads past the source. */
382 4:
383 /* copy 16 bytes at a time */
384 slw 0,6,10
385 srw 8,7,9
386 or 0,0,8
387 lwz 6,0(5)
388 stw 0,0(4)
389 slw 0,7,10
390 srw 8,6,9
391 or 0,0,8
392 lwz 7,4(5)
393 stw 0,4(4)
394 slw 0,6,10
395 srw 8,7,9
396 or 0,0,8
397 lwz 6,8(5)
398 stw 0,8(4)
399 slw 0,7,10
400 srw 8,6,9
401 or 0,0,8
402 lwz 7,12(5)
403 stw 0,12(4)
404 addi 5,5,16
405 addi 4,4,16
406 bdnz+ 4b
407 8:
408 /* calculate and store the final word */
409 slw 0,6,10
410 srw 8,7,9
411 or 0,0,8
412 stw 0,0(4)
413 3:
414 clrrwi 0,31,2 /* r0 = bytes moved by the word loop (len & ~3). */
415 mtcrf 0x01,31 /* Low nibble of length -> cr7 for the tail tests. */
416 bne cr6,.L9 /* Tail bytes remain: copy them at .L9; else done. */
417
418 /* Return original dst pointer. */
419 mr 3,30
420 lwz 30,20(1)
421 lwz 31,24(1)
422 addi 1,1,32
423 blr
424 END (BP_SYM (memcpy))
425
426 libc_hidden_builtin_def (memcpy)
This page took 0.057785 seconds and 5 git commands to generate.