/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.   */
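
/* As a rough, illustrative C sketch of the dispatch described above
   (the helper names here are hypothetical, not part of this file):

     void *memcpy_sketch (void *dst, const void *src, size_t len)
     {
       void *ret = dst;                          // memcpy returns the original dst
       if (len < 32)
         short_copy (dst, src, len);             // binary word/half/byte moves, no loop
       else
         {
           copy_0_to_7_to_align_dst (&dst, &src, &len);  // dst now 8-byte aligned
           if (((uintptr_t) src & 7) == 0)
             aligned_doubleword_copy (dst, src, len);    // ld/std unrolled loop
           else
             shifted_doubleword_copy (dst, src, len);    // load, shift, store (.L6)
         }
       return ret;
     }
*/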

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

    cmpldi cr1,5,31
    neg   0,3
    std   3,-16(1)
    std   31,-8(1)
    cfi_offset(31,-8)
    andi. 11,3,7	/* check alignment of dst.  */
    clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
    clrldi 10,4,61	/* check alignment of src.  */
    cmpldi cr6,5,8
    ble-  cr1,.L2	/* If move < 32 bytes use short move code.  */
    cmpld cr6,10,11
    mr    12,4
    srdi  9,5,3		/* Number of full double words remaining.  */
    mtcrf 0x01,0
    mr    31,5
    beq   .L0

    subf  31,0,5
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
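  /* Roughly, in C (illustrative only), this prologue does:

       size_t pad = (-(uintptr_t) dst) & 7;   // bytes until dst is 8-byte aligned
       unsigned char *d = dst;
       const unsigned char *s = src;
       if (pad & 1) { *d = *s; d += 1; s += 1; }                       // lbz/stb
       if (pad & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }      // lhz/sth
       if (pad & 4) { d[0] = s[0]; d[1] = s[1];
                      d[2] = s[2]; d[3] = s[3]; d += 4; s += 4; }      // lwz/stw
       len -= pad;

     The mtcrf above makes the low bits of r0 (the byte count) available to
     the bf tests below: bit 31 selects the 1-byte move, bit 30 the 2-byte
     move, and bit 29 the 4-byte move.  */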
1:  bf    31,2f
    lbz   6,0(12)
    addi  12,12,1
    stb   6,0(3)
    addi  3,3,1
2:  bf    30,4f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
4:  bf    29,0f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
0:
    clrldi 10,12,61	/* check alignment of src again.  */
    srdi  9,31,3	/* Number of full double words remaining.  */

  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not, branch to the unaligned move code at .L6, which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.  */
.L0:
    clrldi	11,31,61
    mtcrf 0x01,9
    bne-  cr6,.L6   /* If source is not DW aligned.  */

  /* Move doublewords where destination and source are DW aligned.
     Use an unrolled loop to copy 4 doublewords (32 bytes) per iteration.
     If the copy is not an exact multiple of 32 bytes, 1-3
     doublewords are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-7 bytes.  These bytes are
     copied a word/halfword/byte at a time as needed to preserve alignment.  */
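  /* A rough C equivalent of this aligned path (illustrative only):

       size_t ndw = len >> 3;                  // full doublewords to move
       uint64_t *d = (uint64_t *) dst;
       const uint64_t *s = (const uint64_t *) src;
       if (ndw & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
       if (ndw & 1) { *d = *s; d += 1; s += 1; }
       for (size_t i = 0; i < (ndw >> 2); i++) // 4 doublewords (32 bytes) per pass
         { d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3]; d += 4; s += 4; }
       // followed by the 0-7 byte tail, moved a word/halfword/byte at a time
  */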

    srdi  8,31,5
    cmpldi	cr1,9,4
    cmpldi	cr6,11,0
    mr    11,12

    bf    30,1f
    ld    6,0(12)
    ld    7,8(12)
    addi  11,12,16
    mtctr 8
    std   6,0(3)
    std   7,8(3)
    addi  10,3,16
    bf    31,4f
    ld    0,16(12)
    std   0,16(3)
    blt   cr1,3f
    addi  11,12,24
    addi  10,3,24
    b     4f
    .align  4
1:
    mr    10,3
    mtctr 8
    bf    31,4f
    ld    6,0(12)
    addi  11,12,8
    std   6,0(3)
    addi  10,3,8

    .align  4
4:
    ld    6,0(11)
    ld    7,8(11)
    ld    8,16(11)
    ld    0,24(11)
    addi  11,11,32
2:
    std   6,0(10)
    std   7,8(10)
    std   8,16(10)
    std   0,24(10)
    addi  10,10,32
    bdnz  4b
3:

    rldicr 0,31,0,60
    mtcrf 0x01,31
    beq   cr6,0f
.L9:
    add   3,3,0
    add   12,12,0

/*  At this point we have a tail of 0-7 bytes and we know that the
    destination is double word aligned.  */
4:  bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  bf    30,1f
    lhz   6,0(12)
    addi  12,12,2
    sth   6,0(3)
    addi  3,3,2
1:  bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and 9-31
   bytes.  Each case is handled without loops, using binary (1,2,4,8)
   tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we avoid
   doubleword load/stores, ensuring that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096 byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.  */
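
/* A rough C sketch of the 9-31 byte path (illustrative only):

     unsigned char *d = dst;
     const unsigned char *s = src;
     size_t pre = (-(uintptr_t) s) & 3;   // 0-3 bytes to word-align the source
     for (size_t i = 0; i < pre; i++)
       d[i] = s[i];
     d += pre; s += pre; len -= pre;
     // 6-31 bytes remain; move 16/8/4/2/1-byte chunks as the bits of the
     // remaining count dictate, using word loads from the aligned source.
     for (size_t chunk = 16; chunk >= 1; chunk >>= 1)
       if (len & chunk)
         {
           for (size_t i = 0; i < chunk; i++)
             d[i] = s[i];
           d += chunk; s += chunk;
         }
*/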

    .align  4
.L2:
    mtcrf 0x01,5
    neg   8,4
    clrrdi	11,4,2
    andi. 0,8,3
    ble   cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
    cmpldi	cr1,5,16
    mr    10,5
    mr    12,4
    cmpldi	cr6,0,2
    beq   .L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
    lwz   6,0(11)
    subf  10,0,5
    add   12,4,0
    blt   cr6,5f
    srdi  7,6,16
    bgt	  cr6,3f
#ifdef __LITTLE_ENDIAN__
    sth   7,0(3)
#else
    sth   6,0(3)
#endif
    b     7f
    .align  4
3:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,24
    stb   6,0(3)
    sth   7,1(3)
#else
    stb   7,0(3)
    sth   6,1(3)
#endif
    b     7f
    .align  4
5:
#ifdef __LITTLE_ENDIAN__
    rotlwi 6,6,8
#endif
    stb   6,0(3)
7:
    cmpldi	cr1,10,16
    add   3,3,0
    mtcrf 0x01,10
    .align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
    blt   cr1,8f
16: /* Move 16 bytes.  */
    lwz   6,0(12)
    lwz   7,4(12)
    stw   6,0(3)
    lwz   6,8(12)
    stw   7,4(3)
    lwz   7,12(12)
    addi  12,12,16
    stw   6,8(3)
    stw   7,12(3)
    addi  3,3,16
8:  /* Move 8 bytes.  */
    bf    28,4f
    lwz   6,0(12)
    lwz   7,4(12)
    addi  12,12,8
    stw   6,0(3)
    stw   7,4(3)
    addi  3,3,8
4:  /* Move 4 bytes.  */
    bf    29,2f
    lwz   6,0(12)
    addi  12,12,4
    stw   6,0(3)
    addi  3,3,4
2:  /* Move 2-3 bytes.  */
    bf    30,1f
    lhz   6,0(12)
    sth   6,0(3)
    bf    31,0f
    lbz   7,2(12)
    stb   7,2(3)
    ld 3,-16(1)
    blr
1:  /* Move 1 byte.  */
    bf    31,0f
    lbz   6,0(12)
    stb   6,0(3)
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

/* Special case to copy 0-8 bytes.  */
    .align  4
.LE8:
    mr    12,4
    bne   cr6,4f
/* Would have liked to use ld/std here but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned load/store word executes with only a 1 cycle penalty.  */
    lwz   6,0(4)
    lwz   7,4(4)
    stw   6,0(3)
    stw   7,4(3)
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr
    .align  4
4:  bf    29,2b
    lwz   6,0(4)
    stw   6,0(3)
6:
    bf    30,5f
    lhz   7,4(4)
    sth   7,4(3)
    bf    31,0f
    lbz   8,6(4)
    stb   8,6(3)
    ld 3,-16(1)
    blr
    .align  4
5:
    bf    31,0f
    lbz   6,4(4)
    stb   6,4(3)
    .align  4
0:
  /* Return original dst pointer.  */
    ld    3,-16(1)
    blr

    .align  4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.  */
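  /* A rough C sketch of this shift-and-merge technique (big-endian form,
     illustrative only; on little-endian the shift directions swap, as the
     __LITTLE_ENDIAN__ blocks below show):

       size_t ndw = len >> 3;                          // full doublewords to store
       unsigned sh = ((uintptr_t) src & 7) * 8;        // 8..56 on this path, never 0
       const uint64_t *s = (const uint64_t *) ((uintptr_t) src & ~(uintptr_t) 7);
       uint64_t *d = (uint64_t *) dst;
       uint64_t w0 = s[0], w1 = s[1];
       for (size_t i = 0; i + 1 < ndw; i++)            // all but the last doubleword
         {
           d[i] = (w0 << sh) | (w1 >> (64 - sh));
           w0 = w1;
           w1 = s[i + 2];
         }
       d[ndw - 1] = (w0 << sh) | (w1 >> (64 - sh));    // last one needs no extra load
  */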
    subf  5,10,12
    andi. 0,9,1
    cmpldi cr6,11,0
    sldi  10,10,3
    mr    11,9
    mr    4,3
    ld    6,0(5)
    ld    7,8(5)
    subfic  9,10,64
    beq   2f
#ifdef __LITTLE_ENDIAN__
    srd   0,6,10
#else
    sld   0,6,10
#endif
    cmpldi  11,1
    mr    6,7
    addi  4,4,-8
    addi  11,11,-1
    b     1f
2:  addi  5,5,8
    .align  4
#ifdef __LITTLE_ENDIAN__
0:  srd   0,6,10
    sld   8,7,9
#else
0:  sld   0,6,10
    srd   8,7,9
#endif
    cmpldi  11,2
    ld    6,8(5)
    or    0,0,8
    addi  11,11,-2
    std   0,0(4)
#ifdef __LITTLE_ENDIAN__
    srd   0,7,10
1:  sld   8,6,9
#else
    sld   0,7,10
1:  srd   8,6,9
#endif
    or    0,0,8
    beq   8f
    ld    7,16(5)
    std   0,8(4)
    addi  5,5,16
    addi  4,4,16
    b     0b
    .align 4
8:
    std   0,8(4)
    rldicr 0,31,0,60
    mtcrf 0x01,31
    bne   cr6,.L9	/* If the tail is 0 bytes we are done!  */
  /* Return original dst pointer.  */
    ld 31,-8(1)
    ld 3,-16(1)
    blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)