/* Optimized memcpy implementation for PowerPC32 on PowerPC64.
   Copyright (C) 2003-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32-bytes) using a binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.

   Longer moves (>= 32-bytes) justify the effort to get at least the
   destination word (4-byte) aligned.  Further optimization is
   possible when both source and destination are word aligned.
   Each case has an optimized unrolled loop.

   Register usage in the long-move (>= 32 byte) paths, as set up below:
     r3  - current dst pointer (r30 preserves the original for the return)
     r12 - current src pointer (r4 keeps the original src)
     r31 - bytes remaining once dst has been word aligned
     r9  - count of whole words remaining
     r10/r11 - alignment scratch, then loop pointers
   cr1 and cr6 hold length/alignment comparisons; cr7 (loaded via
   mtcrf 0x01,rX) supplies the bits tested by the bf 28-31 branches:
   bit 31 = rX & 1, bit 30 = rX & 2, bit 29 = rX & 4, bit 28 = rX & 8.  */

	.machine power4
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

    stwu  1,-32(1)	/* Create a 32-byte stack frame.  */
    cfi_adjust_cfa_offset(32)
    stw   30,20(1)	/* Save callee-saved r30: will hold original dst.  */
    cfi_offset(30,(20-32))
    mr    30,3
    cmplwi cr1,5,31	/* cr1: len vs 31 (short-move cutoff).  */
    stw   31,24(1)	/* Save callee-saved r31: will hold remaining len.  */
    cfi_offset(31,(24-32))
    neg   0,3
    andi. 11,3,3	/* check alignment of dst.  */
    clrlwi 0,0,30	/* Number of bytes until the 1st word of dst.  */
    clrlwi 10,4,30	/* check alignment of src.  */
    cmplwi cr6,5,8	/* cr6: len vs 8 (used by the short-move code).  */
    ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
    cmplw cr6,10,11	/* cr6 now: src alignment vs dst alignment.  */
    mr    12,4
    srwi  9,5,2		/* Number of full words remaining.  */
    mtcrf 0x01,0	/* Low bits of r0 (dst align count) into cr7.  */
    mr    31,5
    beq   .L0		/* dst already word aligned.  */

    subf  31,0,5	/* Length remaining after the alignment bytes.  */
  /* Move 0-3 bytes as needed to get the destination word aligned.  */
1:  bf    31,2f		/* Skip if (align count & 1) == 0.  */
    lbz   6,0(12)
    addi  12,12,1
    stb   6,0(3)
    addi  3,3,1
2:  bf    30,0f		/* Skip if (align count & 2) == 0.  */
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
0:
    clrlwi 10,12,30	/* check alignment of src again.  */
    srwi  9,31,2	/* Number of full words remaining.  */

  /* Copy words from source to destination, assuming the destination is
     aligned on a word boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also word aligned.
     If not branch to the unaligned move code at .L6. which uses
     a load, shift, store strategy.

     Otherwise source and destination are word aligned, and we can use
     the optimized word copy loop.  */
.L0:
    clrlwi	11,31,30  /* calculate the number of tail bytes */
    mtcrf 0x01,9	/* Low bits of word count into cr7 for bf 30/31.  */
    bne-  cr6,.L6   /* If source is not word aligned.  */

  /* Move words where destination and source are word aligned.
     Use an unrolled loop to copy 4 words (16-bytes) per iteration.
     If the copy is not an exact multiple of 16 bytes, 1-3
     words are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-3 bytes. These bytes are
     copied a halfword/byte at a time as needed to preserve alignment.  */

    srwi  8,31,4    /* calculate the 16 byte loop count */
    cmplwi	cr1,9,4	  /* cr1: fewer than 4 words in total?  */
    cmplwi	cr6,11,0  /* cr6: are there tail bytes?  */
    mr    11,12

    bf    30,1f		/* Skip if (word count & 2) == 0.  */
    lwz   6,0(12)	/* Copy 2 leading words.  */
    lwz   7,4(12)
    addi  11,12,8
    mtctr 8
    stw   6,0(3)
    stw   7,4(3)
    addi  10,3,8
    bf    31,4f		/* No odd leading word: enter main loop.  */
    lwz   0,8(12)	/* Copy the odd (3rd) leading word.  */
    stw   0,8(3)
    blt   cr1,3f	/* < 4 words total: main loop not needed.  */
    addi  11,12,12
    addi  10,3,12
    b     4f
    .align  4
1:
    mr    10,3
    mtctr 8
    bf    31,4f		/* No odd leading word: enter main loop.  */
    lwz   6,0(12)	/* Copy the single odd leading word.  */
    addi  11,12,4
    stw   6,0(3)
    addi  10,3,4

    .align  4
4:
  /* Main aligned loop: 4 words (16 bytes) per iteration.
     r11 = src, r10 = dst, ctr = number of iterations.  */
    lwz   6,0(11)
    lwz   7,4(11)
    lwz   8,8(11)
    lwz   0,12(11)
    stw   6,0(10)
    stw   7,4(10)
    stw   8,8(10)
    stw   0,12(10)
    addi  11,11,16
    addi  10,10,16
    bdnz  4b		/* Decrement ctr, loop while nonzero.  */
3:
    clrrwi 0,31,2	/* Bytes moved by the word copies (len & ~3).  */
    mtcrf 0x01,31	/* Low bits of length into cr7 for the byte tail.  */
    beq   cr6,0f	/* No tail bytes: restore and return.  */
.L9:
    add   3,3,0		/* Advance dst and src past the word-copied part.  */
    add   12,12,0

/*  At this point we have a tail of 0-3 bytes and we know that the
    destination is word aligned.  */
2:  bf    30,1f		/* Skip if (len & 2) == 0.  */
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
1:  bf    31,0f		/* Skip if (len & 1) == 0.  */
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    mr  3,30
    lwz 30,20(1)	/* Restore callee-saved regs and pop the frame.  */
    lwz 31,24(1)
    addi 1,1,32
    blr

/* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32- 64-byte, and
   4096-byte boundaries. Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  While the destination and stores may
   still be unaligned, this is only an issue for page (4096 byte
   boundary) crossing, which should be rare for these short moves.
   The hardware handles this case automatically with a small delay.  */

    .align  4
.L2:
    mtcrf 0x01,5	/* Low 4 bits of len into cr7 for bf 28-31.  */
    neg   8,4
    clrrwi 11,4,2	/* src rounded down to a word boundary.  */
    andi. 0,8,3		/* Bytes needed to word align src (0-3).  */
    ble   cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
    cmplwi	cr1,5,16
    mr    10,5
    mr    12,4
    cmplwi	cr6,0,2	  /* cr6: alignment count vs 2 (1, 2 or 3 bytes).  */
    beq   .L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
    lwz   6,0(11)	/* Aligned load covering the leading bytes.  */
    subf  10,0,5	/* Length remaining after the alignment copy.  */
    add   12,4,0	/* src advanced to the word boundary.  */
    blt   cr6,5f	/* 1 byte to copy.  */
    srwi  7,6,16
    bgt	  cr6,3f	/* 3 bytes to copy.  */
/* Copy 2 bytes (one halfword).  */
#ifdef __LITTLE_ENDIAN__
    sth   7,0(3)
#else
    sth   6,0(3)
#endif
    b     7f
    .align  4
3:			/* Copy 3 bytes: a byte then a halfword.  */
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,24
    stb   6,0(3)
    sth   7,1(3)
#else
    stb   7,0(3)
    sth   6,1(3)
#endif
    b     7f
    .align  4
5:			/* Copy 1 byte.  */
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,8
#endif
    stb   6,0(3)
7:
    cmplwi	cr1,10,16  /* cr1: remaining len vs 16.  */
    add   3,3,0		/* Advance dst past the alignment bytes.  */
    mtcrf 0x01,10	/* Low bits of remaining len into cr7.  */
    .align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
    blt   cr1,8f
16: /* Move 16 bytes.  */
    lwz   6,0(12)
    lwz   7,4(12)
    stw   6,0(3)
    lwz   6,8(12)
    stw   7,4(3)
    lwz   7,12(12)
    addi  12,12,16
    stw   6,8(3)
    stw   7,12(3)
    addi  3,3,16
8:  /* Move 8 bytes.  */
    bf    28,4f		/* Skip if (len & 8) == 0.  */
    lwz   6,0(12)
    lwz   7,4(12)
    addi  12,12,8
    stw   6,0(3)
    stw   7,4(3)
    addi  3,3,8
4:  /* Move 4 bytes.  */
    bf    29,2f		/* Skip if (len & 4) == 0.  */
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  /* Move 2-3 bytes.  */
    bf    30,1f		/* Skip if (len & 2) == 0.  */
    lhz   6,0(12)
    sth   6,0(3)
    bf    31,0f		/* Skip if (len & 1) == 0.  */
    lbz   7,2(12)
    stb   7,2(3)
  /* Return original dst pointer (r31 was never modified here).  */
    mr    3,30
    lwz   30,20(1)
    addi  1,1,32
    blr
1:  /* Move 1 byte.  */
    bf    31,0f		/* Skip if (len & 1) == 0.  */
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    mr   3,30
    lwz  30,20(1)
    addi 1,1,32
    blr

/* Special case to copy 0-8 bytes.  */
    .align  4
.LE8:
    mr    12,4
    bne   cr6,4f	/* len != 8: use the binary tests below.  */
/* Exactly 8 bytes: two unconditional word copies.  */
    lwz   6,0(4)
    lwz   7,4(4)
    stw   6,0(3)
    stw   7,4(3)
  /* Return original dst pointer.  */
    mr    3,30
    lwz   30,20(1)
    addi  1,1,32
    blr
    .align  4
4:  bf    29,2b		/* (len & 4) == 0: 0-3 bytes, reuse code at 2: above.  */
    lwz   6,0(4)
    stw   6,0(3)
6:
    bf    30,5f		/* Skip if (len & 2) == 0.  */
    lhz   7,4(4)
    sth   7,4(3)
    bf    31,0f		/* Skip if (len & 1) == 0.  */
    lbz   8,6(4)
    stb   8,6(3)
  /* Return original dst pointer.  */
    mr    3,30
    lwz   30,20(1)
    addi  1,1,32
    blr
    .align  4
5:
    bf    31,0f		/* Skip if (len & 1) == 0.  */
    lbz   6,4(4)
    stb   6,4(3)
    .align  4
0:
  /* Return original dst pointer.  */
    mr   3,30
    lwz  30,20(1)
    addi 1,1,32
    blr

    .align  4
.L6:

  /* Copy words where the destination is aligned but the source is
     not.  Use aligned word loads from the source, shifted to realign
     the data, to allow aligned destination stores.
     Use an unrolled loop to copy 4 words (16-bytes) per iteration.
     A single word is retained for storing at loop exit to avoid walking
     off the end of a page within the loop.
     If the copy is not an exact multiple of 16 bytes, 1-3
     words are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-3 bytes. These bytes are
     copied a halfword/byte at a time as needed to preserve alignment.  */


    cmplwi  cr6,11,0  /* are there tail bytes left ? */
    subf    5,10,12   /* back up src pointer to prev word alignment */
    slwi    10,10,3   /* calculate number of bits to shift 1st word left */
    addi    11,9,-1   /* we move one word after the loop */
    srwi    8,11,2    /* calculate the 16 byte loop count */
    lwz     6,0(5)    /* load 1st src word into R6 */
    mr      4,3       /* R4 is now the working dst pointer */
    lwz     7,4(5)    /* load 2nd src word into R7 */
    mtcrf   0x01,11   /* low bits of (words - 1) into cr7 for bf 30/31 */
    subfic  9,10,32   /* number of bits to shift 2nd word right */
    mtctr   8
    bf      30,1f

    /* there are at least two words to copy, so copy them */
#ifdef __LITTLE_ENDIAN__
    srw   0,6,10  /* LE: shift directions are mirrored vs BE below */
    slw   8,7,9
#else
    slw   0,6,10  /* shift 1st src word to left align it in R0 */
    srw   8,7,9   /* shift 2nd src word to right align it in R8 */
#endif
    or    0,0,8   /* or them to get word to store */
    lwz   6,8(5)  /* load the 3rd src word */
    stw   0,0(4)  /* store the 1st dst word */
#ifdef __LITTLE_ENDIAN__
    srw   0,7,10
    slw   8,6,9
#else
    slw   0,7,10  /* now left align 2nd src word into R0 */
    srw   8,6,9   /* shift 3rd src word to right align it in R8 */
#endif
    or    0,0,8   /* or them to get word to store */
    lwz   7,12(5)
    stw   0,4(4)  /* store the 2nd dst word */
    addi  4,4,8
    addi  5,5,16
    bf    31,4f
    /* there is a third word to copy, so copy it */
#ifdef __LITTLE_ENDIAN__
    srw   0,6,10
    slw   8,7,9
#else
    slw   0,6,10  /* shift 3rd src word to left align it in R0 */
    srw   8,7,9   /* shift 4th src word to right align it in R8 */
#endif
    or    0,0,8   /* or them to get word to store */
    stw   0,0(4)  /* store 3rd dst word */
    mr    6,7
    lwz   7,0(5)
    addi  5,5,4
    addi  4,4,4
    b     4f
    .align 4
1:
#ifdef __LITTLE_ENDIAN__
    srw     0,6,10
    slw     8,7,9
#else
    slw     0,6,10  /* shift 1st src word to left align it in R0 */
    srw     8,7,9   /* shift 2nd src word to right align it in R8 */
#endif
    addi  5,5,8
    or    0,0,8   /* or them to get word to store */
    bf    31,4f
    mr    6,7
    lwz   7,0(5)
    addi  5,5,4
    stw   0,0(4)  /* store the 1st dst word */
    addi  4,4,4

    .align  4
4:
    /* copy 16 bytes at a time */
#ifdef __LITTLE_ENDIAN__
    srw   0,6,10
    slw   8,7,9
#else
    slw   0,6,10
    srw   8,7,9
#endif
    or    0,0,8
    lwz   6,0(5)
    stw   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srw   0,7,10
    slw   8,6,9
#else
    slw   0,7,10
    srw   8,6,9
#endif
    or    0,0,8
    lwz   7,4(5)
    stw   0,4(4)
#ifdef __LITTLE_ENDIAN__
    srw   0,6,10
    slw   8,7,9
#else
    slw   0,6,10
    srw   8,7,9
#endif
    or    0,0,8
    lwz   6,8(5)
    stw   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srw   0,7,10
    slw   8,6,9
#else
    slw   0,7,10
    srw   8,6,9
#endif
    or    0,0,8
    lwz   7,12(5)
    stw   0,12(4)
    addi  5,5,16
    addi  4,4,16
    bdnz+ 4b
8:
    /* calculate and store the final word */
#ifdef __LITTLE_ENDIAN__
    srw   0,6,10
    slw   8,7,9
#else
    slw   0,6,10
    srw   8,7,9
#endif
    or    0,0,8
    stw   0,0(4)
3:
    clrrwi 0,31,2	/* Bytes handled as whole words (len & ~3).  */
    mtcrf 0x01,31	/* Low bits of length into cr7 for the byte tail.  */
    bne   cr6,.L9	/* Tail bytes remain: copy them at .L9.  */

  /* No tail: return original dst pointer.  */
    mr   3,30
    lwz  30,20(1)	/* Restore callee-saved regs and pop the frame.  */
    lwz  31,24(1)
    addi 1,1,32
    blr
END (memcpy)

libc_hidden_builtin_def (memcpy)
482