/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) with straight-line blocks
   (no loops) of lwz/stw moves, selected by binary tests on the length.
   The tail (the remaining 1-3 bytes) is handled with the appropriate
   combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.   */
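
/* As a rough, hedged illustration only (this is NOT the code generated
   below, and sketch_memcpy is an invented name), the overall strategy
   corresponds to something like the following C:

     #include <stddef.h>
     #include <stdint.h>

     void *
     sketch_memcpy (void *dst, const void *src, size_t len)
     {
       unsigned char *d = dst;
       const unsigned char *s = src;

       if (len < 32)
         {
           // Short case: straight-line moves, no alignment effort.
           while (len--)
             *d++ = *s++;
           return dst;
         }

       // Long case: first move 0-7 bytes so dst is 8-byte aligned.
       while ((uintptr_t) d & 7)
         {
           *d++ = *s++;
           len--;
         }

       if (((uintptr_t) s & 7) == 0)
         {
           // Source aligned too: unrolled doubleword (ld/std) loop.
           for (; len >= 8; len -= 8, d += 8, s += 8)
             *(uint64_t *) d = *(const uint64_t *) s;
         }
       // Otherwise the code below uses aligned loads plus shifts (.L6);
       // in this sketch the unaligned case just falls through to the
       // byte tail.

       while (len--)
         *d++ = *s++;
       return dst;
     }
 */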

#ifndef MEMCPY
# define MEMCPY memcpy
#endif
	.machine power4
ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

    cmpldi cr1,5,31
    neg   0,3
    std   3,-16(1)	/* Save dst; it is reloaded as the return value.  */
    std   31,-8(1)	/* Save callee-saved r31; the long-move path uses it for len.  */
    cfi_offset(31,-8)
    andi. 11,3,7	/* check alignment of dst.  */
    clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
    clrldi 10,4,61	/* check alignment of src.  */
    cmpldi cr6,5,8
    ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
    cmpld cr6,10,11
    mr    12,4
    srdi  9,5,3		/* Number of full double words remaining.  */
    mtcrf 0x01,0	/* Low bits of the alignment count to cr7.  */
    mr    31,5
    beq   .L0

    subf  31,0,5
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
1:  bf    31,2f
    lbz   6,0(12)
    addi  12,12,1
    stb   6,0(3)
    addi  3,3,1
2:  bf    30,4f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
4:  bf    29,0f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
0:
    clrldi 10,12,61	/* check alignment of src again.  */
    srdi  9,31,3	/* Number of full double words remaining.  */

  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not, branch to the unaligned move code at .L6, which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.  */
.L0:
    clrldi  11,31,61
    mtcrf   0x01,9
    cmpldi  cr1,11,0
    bne-    cr6,.L6   /* If source is not DW aligned.  */

  /* Move doublewords where destination and source are DW aligned.
     Use an unrolled loop to copy four doublewords (32 bytes) per iteration.
     If the copy is not an exact multiple of 32 bytes, 1-3
     doublewords are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-7 bytes.  These bytes are
     copied a word/halfword/byte at a time as needed to preserve alignment.  */
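
/* A hedged C sketch of this aligned case (illustrative only; the name
   copy_dw_aligned is invented, and the real code counts and peels with
   CR bits and the CTR rather than a switch):

     // Assumes d and s are 8-byte aligned and len >= 25.
     static void
     copy_dw_aligned (uint64_t *d, const uint64_t *s, size_t len)
     {
       size_t dwords = len >> 3;     // full doublewords to move
       size_t iters  = len >> 5;     // 32-byte loop iterations

       // Peel 1-3 doublewords so the main loop moves 32 bytes per pass.
       switch (dwords & 3)
         {
         case 3: *d++ = *s++;        // fall through
         case 2: *d++ = *s++;        // fall through
         case 1: *d++ = *s++;  break;
         }

       while (iters--)
         {
           d[0] = s[0];  d[1] = s[1];  d[2] = s[2];  d[3] = s[3];
           d += 4;  s += 4;
         }

       // A tail of (len & 7) bytes remains; it is copied with
       // word/halfword/byte moves below (.L9).
     }
 */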

    srdi  8,31,5
    cmpldi	cr1,9,4
    cmpldi	cr6,11,0
    mr    11,12

    bf    30,1f
    ld    6,0(12)
    ld    7,8(12)
    addi  11,12,16
    mtctr 8
    std   6,0(3)
    std   7,8(3)
    addi  10,3,16
    bf    31,4f
    ld    0,16(12)
    std   0,16(3)
    blt   cr1,3f
    addi  11,12,24
    addi  10,3,24
    b     4f
    .align  4
1:
    mr    10,3
    mtctr 8
    bf    31,4f
    ld    6,0(12)
    addi  11,12,8
    std   6,0(3)
    addi  10,3,8

    .align  4
4:
    ld    6,0(11)
    ld    7,8(11)
    ld    8,16(11)
    ld    0,24(11)
    addi  11,11,32
2:
    std   6,0(10)
    std   7,8(10)
    std   8,16(10)
    std   0,24(10)
    addi  10,10,32
    bdnz  4b
3:

    rldicr 0,31,0,60	/* Bytes copied by the doubleword loop (len & ~7).  */
    mtcrf 0x01,31	/* Low bits of len to cr7 for the tail tests.  */
    beq   cr6,0f	/* No tail, we are done.  */
.L9:
    add   3,3,0
    add   12,12,0

/*  At this point we have a tail of 0-7 bytes and we know that the
    destination is doubleword aligned.  */
4:  bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  bf    30,1f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
1:  bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use doubleword load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.  */
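
/* A hedged C sketch of the 9-31 byte path (illustrative only; the name
   copy_9_to_31 is invented, and the memcpy calls simply stand in for
   the straight-line lwz/stw, lhz/sth and lbz/stb pairs used below):

     #include <stdint.h>
     #include <string.h>

     static void
     copy_9_to_31 (unsigned char *d, const unsigned char *s, size_t len)
     {
       // Copy 1-3 bytes so the source is word aligned.
       size_t head = (-(uintptr_t) s) & 3;
       len -= head;
       while (head--)
         *d++ = *s++;

       // len is still at most 31; test its 16/8/4/2/1 bits in turn.
       if (len & 16) { memcpy (d, s, 16); d += 16; s += 16; }
       if (len & 8)  { memcpy (d, s, 8);  d += 8;  s += 8;  }
       if (len & 4)  { memcpy (d, s, 4);  d += 4;  s += 4;  }
       if (len & 2)  { memcpy (d, s, 2);  d += 2;  s += 2;  }
       if (len & 1)  *d = *s;
     }
 */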

    .align  4
.L2:
    mtcrf 0x01,5
    neg   8,4
    clrrdi	11,4,2
    andi. 0,8,3
    ble   cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
    cmpldi	cr1,5,16
    mr    10,5
    mr    12,4
    cmpldi	cr6,0,2
    beq   .L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
    lwz   6,0(11)
    subf  10,0,5
    add   12,4,0
    blt   cr6,5f
    srdi  7,6,16
    bgt	  cr6,3f
#ifdef __LITTLE_ENDIAN__
    sth   7,0(3)
#else
    sth   6,0(3)
#endif
    b     7f
    .align  4
3:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,24
    stb   6,0(3)
    sth   7,1(3)
#else
    stb   7,0(3)
    sth   6,1(3)
#endif
    b     7f
    .align  4
5:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,8
#endif
    stb   6,0(3)
7:
    cmpldi	cr1,10,16
    add   3,3,0
    mtcrf 0x01,10
    .align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
    blt   cr1,8f
16: /* Move 16 bytes.  */
    lwz   6,0(12)
    lwz   7,4(12)
    stw   6,0(3)
    lwz   6,8(12)
    stw   7,4(3)
    lwz   7,12(12)
    addi  12,12,16
    stw   6,8(3)
    stw   7,12(3)
    addi  3,3,16
8:  /* Move 8 bytes.  */
    bf    28,4f
    lwz   6,0(12)
    lwz   7,4(12)
    addi  12,12,8
    stw   6,0(3)
    stw   7,4(3)
    addi  3,3,8
4:  /* Move 4 bytes.  */
    bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  /* Move 2-3 bytes.  */
    bf    30,1f
    lhz   6,0(12)
    sth   6,0(3)
    bf    31,0f
    lbz   7,2(12)
    stb   7,2(3)
    ld 3,-16(1)
    blr
1:  /* Move 1 byte.  */
    bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

/* Special case to copy 0-8 bytes.  */
    .align  4
.LE8:
    mr    12,4
    bne   cr6,4f
/* Would have liked to use ld/std here but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned load/store word executes with only a 1 cycle penalty.  */
    lwz   6,0(4)
    lwz   7,4(4)
    stw   6,0(3)
    stw   7,4(3)
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr
    .align  4
4:  bf    29,2b
    lwz   6,0(4)
    stw   6,0(3)
6:
    bf    30,5f
    lhz   7,4(4)
    sth   7,4(3)
    bf    31,0f
    lbz   8,6(4)
    stb   8,6(3)
    ld 3,-16(1)
    blr
    .align  4
5:
    bf    31,0f
    lbz   6,4(4)
    stb   6,4(3)
    .align  4
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

    .align  4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.  */
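
/* A hedged C sketch of this shifted copy (illustrative only; the name
   copy_dw_shifted is invented, and the shift directions shown are the
   big-endian ones; the little-endian build swaps sld/srd exactly as
   the #ifdefs below do):

     // Assumes d is 8-byte aligned, s is NOT, and dwords > 0.
     static void
     copy_dw_shifted (uint64_t *d, const unsigned char *s, size_t dwords)
     {
       const uint64_t *as = (const uint64_t *) ((uintptr_t) s & ~(uintptr_t) 7);
       unsigned sh  = ((uintptr_t) s & 7) * 8;  // bits shifted out of each DW
       unsigned ish = 64 - sh;                  // bits taken from the next DW
       uint64_t lo = *as++;                     // first aligned doubleword

       while (dwords--)
         {
           uint64_t hi = *as++;
           *d++ = (lo << sh) | (hi >> ish);     // big-endian combine
           lo = hi;
         }
     }

   The final aligned load may fetch a few bytes past the last source
   byte, but it stays within the aligned doubleword that contains that
   byte, so it cannot cross into another page.  */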
    addi    11,9,-1  /* loop DW count is one less than total */
    subf    5,10,12
    sldi    10,10,3
    mr      4,3
    srdi    8,11,2   /* calculate the 32 byte loop count */
    ld      6,0(5)
    mtcrf   0x01,11
    cmpldi  cr6,9,4
    mtctr   8
    ld      7,8(5)
    subfic  9,10,64
    bf      30,1f

    /* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
    srd     0,6,10
    sld     8,7,9
#else
    sld     0,6,10
    srd     8,7,9
#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
#ifdef __LITTLE_ENDIAN__
    srd     0,7,10
    sld     8,6,9
#else
    sld     0,7,10
    srd     8,6,9
#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
    addi    4,4,16
    addi    5,5,32
    blt     cr6,8f  /* if total DWs = 3, then bypass loop */
    bf      31,4f
    /* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
    srd     0,6,10
    sld     8,7,9
#else
    sld     0,6,10
    srd     8,7,9
#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    addi    4,4,8
    beq     cr6,8f  /* if total DWs = 4, then bypass loop */
    b       4f
    .align 4
1:
#ifdef __LITTLE_ENDIAN__
    srd     0,6,10
    sld     8,7,9
#else
    sld     0,6,10
    srd     8,7,9
#endif
    addi    5,5,16
    or      0,0,8
    bf      31,4f
    mr      6,7
    ld      7,0(5)
    addi    5,5,8
    std     0,0(4)
    addi    4,4,8
    .align 4
/* copy 32 bytes at a time */
4:
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
    sld   8,7,9
#else
    sld   0,6,10
    srd   8,7,9
#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,7,10
    sld   8,6,9
#else
    sld   0,7,10
    srd   8,6,9
#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
    sld   8,7,9
#else
    sld   0,6,10
    srd   8,7,9
#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,7,10
    sld   8,6,9
#else
    sld   0,7,10
    srd   8,6,9
#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
    addi  5,5,32
    addi  4,4,32
    bdnz+ 4b
    .align 4
8:
    /* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
    sld   8,7,9
#else
    sld   0,6,10
    srd   8,7,9
#endif
    or    0,0,8
    std   0,0(4)
3:
    rldicr 0,31,0,60	/* Bytes copied by the doubleword code (len & ~7).  */
    mtcrf 0x01,31	/* Low bits of len to cr7 for the tail tests.  */
    bne   cr1,.L9	/* If there is a 1-7 byte tail, go copy it.  */
  /* Return original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
