/* Optimized memcpy implementation for PowerPC32 on PowerPC64.
   Copyright (C) 2003-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32-bytes) using a binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3) bytes is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.

   Longer moves (>= 32-bytes) justify the effort to get at least the
   destination word (4-byte) aligned.  Further optimization is
   possible when both source and destination are word aligned.
   Each case has an optimized unrolled loop.

   Register usage in the long-move path (grounded in the code below):
     r3  - current dst pointer (advanced during the copy)
     r12 - current src pointer (working copy of r4)
     r30 - saved original dst, restored into r3 before every blr
     r31 - remaining length after the dst-alignment prologue
     r0  - bytes needed to word-align dst / scratch
     cr1, cr6 - length and alignment comparisons
     cr7 - low 4 bits of a count loaded via `mtcrf 0x01,rX'; the
	   `bf 30,...' / `bf 31,...' tests below branch on the 2-bit
	   and 1-bit of that count respectively.  */

	.machine power4
EALIGN (memcpy, 5, 0)
	CALL_MCOUNT

	/* Prologue: carve a 32-byte frame and save the two non-volatile
	   registers (r30, r31) this implementation uses.  */
	stwu  1,-32(1)
	cfi_adjust_cfa_offset(32)
	stw   30,20(1)
	cfi_offset(30,(20-32))
	mr    30,3		/* Preserve original dst for the return value.  */
	cmplwi cr1,5,31
	stw   31,24(1)
	cfi_offset(31,(24-32))
	neg   0,3
	andi. 11,3,3		/* check alignment of dst.  */
	clrlwi 0,0,30		/* Number of bytes until the 1st word of dst.  */
	clrlwi 10,4,30		/* check alignment of src.  */
	cmplwi cr6,5,8
	ble-  cr1,.L2		/* If move < 32 bytes use short move code.  */
	cmplw cr6,10,11		/* cr6: src and dst equally misaligned?  */
	mr    12,4
	srwi  9,5,2		/* Number of full words remaining.  */
	mtcrf 0x01,0		/* cr7 <- dst-alignment byte count for bf 30/31.  */
	mr    31,5
	beq   .L0		/* dst already word aligned (andi. result 0).  */

	subf  31,0,5		/* Length left after the alignment bytes.  */
  /* Move 0-3 bytes as needed to get the destination word aligned.  */
1:	bf    31,2f		/* bit 31 of count clear -> no single byte.  */
	lbz   6,0(12)
	addi  12,12,1
	stb   6,0(3)
	addi  3,3,1
2:	bf    30,0f		/* bit 30 of count clear -> no halfword.  */
	lhz   6,0(12)
	addi  12,12,2
	sth   6,0(3)
	addi  3,3,2
0:
	clrlwi 10,12,30		/* check alignment of src again.  */
	srwi  9,31,2		/* Number of full words remaining.  */

  /* Copy words from source to destination, assuming the destination is
     aligned on a word boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also word aligned.
     If not branch to the unaligned move code at .L6. which uses
     a load, shift, store strategy.

     Otherwise source and destination are word aligned, and we can use
     the optimized word copy loop.  */
.L0:
	clrlwi	11,31,30	/* calculate the number of tail bytes */
	mtcrf 0x01,9		/* cr7 <- low bits of word count for bf 30/31.  */
	bne-  cr6,.L6		/* If source is not word aligned.  */

  /* Move words where destination and source are word aligned.
     Use an unrolled loop to copy 4 words (16-bytes) per iteration.
     If the copy is not an exact multiple of 16 bytes, 1-3
     words are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-3 bytes.  These bytes are
     copied a halfword/byte at a time as needed to preserve alignment.  */

	srwi  8,31,4		/* calculate the 16 byte loop count */
	cmplwi	cr1,9,4
	cmplwi	cr6,11,0
	mr    11,12

	/* Pre-loop: peel off 1-3 words so the remaining count is a
	   multiple of 4 words; r11/r10 become the loop src/dst.  */
	bf    30,1f
	lwz   6,0(12)
	lwz   7,4(12)
	addi  11,12,8
	mtctr 8
	stw   6,0(3)
	stw   7,4(3)
	addi  10,3,8
	bf    31,4f
	lwz   0,8(12)
	stw   0,8(3)
	blt   cr1,3f		/* Fewer than 4 words total: skip main loop.  */
	addi  11,12,12
	addi  10,3,12
	b     4f
	.align  4
1:
	mr    10,3
	mtctr 8
	bf    31,4f
	lwz   6,0(12)
	addi  11,12,4
	stw   6,0(3)
	addi  10,3,4

	.align  4
	/* Main aligned loop: 4 words (16 bytes) per iteration, CTR counts.  */
4:
	lwz   6,0(11)
	lwz   7,4(11)
	lwz   8,8(11)
	lwz   0,12(11)
	stw   6,0(10)
	stw   7,4(10)
	stw   8,8(10)
	stw   0,12(10)
	addi  11,11,16
	addi  10,10,16
	bdnz  4b
3:
	clrrwi 0,31,2		/* r0 = bytes copied by the word loops.  */
	mtcrf 0x01,31		/* cr7 <- tail byte count for bf 30/31.  */
	beq   cr6,0f		/* No tail bytes -> done.  */
.L9:
	add   3,3,0		/* Advance dst/src past the word-copied span.  */
	add   12,12,0

/*  At this point we have a tail of 0-3 bytes and we know that the
    destination is word aligned.  */
2:	bf    30,1f
	lhz   6,0(12)
	addi  12,12,2
	sth   6,0(3)
	addi  3,3,2
1:	bf    31,0f
	lbz   6,0(12)
	stb   6,0(3)
0:
  /* Return original dst pointer.  */
	mr   3,30
	lwz  30,20(1)
	lwz  31,24(1)
	addi 1,1,32
	blr

/* Copy up to 31 bytes.  This is divided into two cases 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32- 64-byte, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  While the destination and stores may
   still be unaligned, this is only an issue for page (4096 byte
   boundary) crossing, which should be rare for these short moves.
   The hardware handles this case automatically with a small delay.  */

	.align  4
.L2:
	mtcrf 0x01,5		/* cr7 <- low bits of len for bf 28-31.  */
	neg   8,4
	clrrwi	11,4,2		/* r11 = src rounded down to word boundary.  */
	andi. 0,8,3		/* r0 = bytes to word-align src.  */
	ble   cr6,.LE8		/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmplwi	cr1,5,16
	mr    10,5
	mr    12,4
	cmplwi	cr6,0,2
	beq   .L3	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
	lwz   6,0(11)		/* Aligned load containing the leading bytes.  */
	subf  10,0,5
	add   12,4,0
	blt   cr6,5f		/* 1 alignment byte.  */
	srwi  7,6,16
	bgt   cr6,3f		/* 3 alignment bytes.  */
	/* Exactly 2 alignment bytes: store one halfword.  */
#ifdef __LITTLE_ENDIAN__
	sth   7,0(3)
#else
	sth   6,0(3)
#endif
	b     7f
	.align  4
3:	/* 3 alignment bytes: byte then halfword, endian-specific extract.  */
#ifdef __LITTLE_ENDIAN__
	rotlwi 6,6,24
	stb   6,0(3)
	sth   7,1(3)
#else
	stb   7,0(3)
	sth   6,1(3)
#endif
	b     7f
	.align  4
5:	/* 1 alignment byte.  */
#ifdef __LITTLE_ENDIAN__
	rotlwi 6,6,8
#endif
	stb   6,0(3)
7:
	cmplwi	cr1,10,16
	add   3,3,0
	mtcrf 0x01,10		/* cr7 <- low bits of remaining len.  */
	.align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
	blt   cr1,8f
16:	/* Move 16 bytes.  */
	lwz   6,0(12)
	lwz   7,4(12)
	stw   6,0(3)
	lwz   6,8(12)
	stw   7,4(3)
	lwz   7,12(12)
	addi  12,12,16
	stw   6,8(3)
	stw   7,12(3)
	addi  3,3,16
8:	/* Move 8 bytes.  */
	bf    28,4f
	lwz   6,0(12)
	lwz   7,4(12)
	addi  12,12,8
	stw   6,0(3)
	stw   7,4(3)
	addi  3,3,8
4:	/* Move 4 bytes.  */
	bf    29,2f
	lwz   6,0(12)
	addi  12,12,4
	stw   6,0(3)
	addi  3,3,4
2:	/* Move 2-3 bytes.  */
	bf    30,1f
	lhz   6,0(12)
	sth   6,0(3)
	bf    31,0f
	lbz   7,2(12)
	stb   7,2(3)
	/* Return original dst pointer (early exit; r31 was never saved
	   on this path, so only r30 is restored).  */
	mr   3,30
	lwz  30,20(1)
	addi 1,1,32
	blr
1:	/* Move 1 byte.  */
	bf    31,0f
	lbz   6,0(12)
	stb   6,0(3)
0:
  /* Return original dst pointer.  */
	mr   3,30
	lwz  30,20(1)
	addi 1,1,32
	blr

/* Special case to copy 0-8 bytes.  */
	.align  4
.LE8:
	mr    12,4
	bne   cr6,4f		/* len != 8: use the binary byte tests.  */
	/* Exactly 8 bytes: two word copies, no tests needed.  */
	lwz   6,0(4)
	lwz   7,4(4)
	stw   6,0(3)
	stw   7,4(3)
  /* Return original dst pointer.  */
	mr   3,30
	lwz  30,20(1)
	addi 1,1,32
	blr
	.align  4
4:	bf    29,2b		/* No 4-byte piece: share the 2-3 byte code above.  */
	lwz   6,0(4)
	stw   6,0(3)
6:
	bf    30,5f
	lhz   7,4(4)
	sth   7,4(3)
	bf    31,0f
	lbz   8,6(4)
	stb   8,6(3)
	mr   3,30
	lwz  30,20(1)
	addi 1,1,32
	blr
	.align  4
5:
	bf    31,0f
	lbz   6,4(4)
	stb   6,4(3)
	.align  4
0:
  /* Return original dst pointer.  */
	mr   3,30
	lwz  30,20(1)
	addi 1,1,32
	blr

	.align  4
.L6:

  /* Copy words where the destination is aligned but the source is
     not.  Use aligned word loads from the source, shifted to realign
     the data, to allow aligned destination stores.
     Use an unrolled loop to copy 4 words (16-bytes) per iteration.
     A single word is retained for storing at loop exit to avoid walking
     off the end of a page within the loop.
     If the copy is not an exact multiple of 16 bytes, 1-3
     words are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-3 bytes.  These bytes are
     copied a halfword/byte at a time as needed to preserve alignment.  */


	cmplwi	cr6,11,0	/* are there tail bytes left ?  */
	subf	5,10,12		/* back up src pointer to prev word alignment */
	slwi	10,10,3		/* calculate number of bits to shift 1st word left */
	addi	11,9,-1		/* we move one word after the loop */
	srwi	8,11,2		/* calculate the 16 byte loop count */
	lwz	6,0(5)		/* load 1st src word into R6 */
	mr	4,3
	lwz	7,4(5)		/* load 2nd src word into R7 */
	mtcrf	0x01,11
	subfic	9,10,32		/* number of bits to shift 2nd word right */
	mtctr	8
	bf	30,1f

	/* there are at least two words to copy, so copy them */
	/* NOTE: on little-endian the shift directions are mirrored
	   (srw/slw) because the lower-addressed bytes sit in the low
	   end of the register.  */
#ifdef __LITTLE_ENDIAN__
	srw   0,6,10
	slw   8,7,9
#else
	slw   0,6,10	/* shift 1st src word to left align it in R0 */
	srw   8,7,9	/* shift 2nd src word to right align it in R8 */
#endif
	or    0,0,8	/* or them to get word to store */
	lwz   6,8(5)	/* load the 3rd src word */
	stw   0,0(4)	/* store the 1st dst word */
#ifdef __LITTLE_ENDIAN__
	srw   0,7,10
	slw   8,6,9
#else
	slw   0,7,10	/* now left align 2nd src word into R0 */
	srw   8,6,9	/* shift 3rd src word to right align it in R8 */
#endif
	or    0,0,8	/* or them to get word to store */
	lwz   7,12(5)
	stw   0,4(4)	/* store the 2nd dst word */
	addi  4,4,8
	addi  5,5,16
	bf    31,4f
	/* there is a third word to copy, so copy it */
#ifdef __LITTLE_ENDIAN__
	srw   0,6,10
	slw   8,7,9
#else
	slw   0,6,10	/* shift 3rd src word to left align it in R0 */
	srw   8,7,9	/* shift 4th src word to right align it in R8 */
#endif
	or    0,0,8	/* or them to get word to store */
	stw   0,0(4)	/* store 3rd dst word */
	mr    6,7
	lwz   7,0(5)
	addi  5,5,4
	addi  4,4,4
	b     4f
	.align 4
1:
#ifdef __LITTLE_ENDIAN__
	srw   0,6,10
	slw   8,7,9
#else
	slw     0,6,10	/* shift 1st src word to left align it in R0 */
	srw     8,7,9	/* shift 2nd src word to right align it in R8 */
#endif
	addi  5,5,8
	or    0,0,8	/* or them to get word to store */
	bf    31,4f
	mr    6,7
	lwz   7,0(5)
	addi  5,5,4
	stw   0,0(4)	/* store the 1st dst word */
	addi  4,4,4

	.align  4
4:
	/* copy 16 bytes at a time.  R6/R7 carry the two most recently
	   loaded source words across iterations; each stored word is
	   assembled from an adjacent pair via the shift-and-or.  */
#ifdef __LITTLE_ENDIAN__
	srw   0,6,10
	slw   8,7,9
#else
	slw   0,6,10
	srw   8,7,9
#endif
	or    0,0,8
	lwz   6,0(5)
	stw   0,0(4)
#ifdef __LITTLE_ENDIAN__
	srw   0,7,10
	slw   8,6,9
#else
	slw   0,7,10
	srw   8,6,9
#endif
	or    0,0,8
	lwz   7,4(5)
	stw   0,4(4)
#ifdef __LITTLE_ENDIAN__
	srw   0,6,10
	slw   8,7,9
#else
	slw   0,6,10
	srw   8,7,9
#endif
	or    0,0,8
	lwz   6,8(5)
	stw   0,8(4)
#ifdef __LITTLE_ENDIAN__
	srw   0,7,10
	slw   8,6,9
#else
	slw   0,7,10
	srw   8,6,9
#endif
	or    0,0,8
	lwz   7,12(5)
	stw   0,12(4)
	addi  5,5,16
	addi  4,4,16
	bdnz+ 4b
8:
	/* calculate and store the final word (deferred from the loop so
	   the loop never loads past the last needed source word) */
#ifdef __LITTLE_ENDIAN__
	srw   0,6,10
	slw   8,7,9
#else
	slw   0,6,10
	srw   8,7,9
#endif
	or    0,0,8
	stw   0,0(4)
3:
	clrrwi 0,31,2
	mtcrf 0x01,31		/* cr7 <- tail byte count for bf 30/31.  */
	bne   cr6,.L9		/* If the tail is 0 bytes we are done!  */

  /* Return original dst pointer.  */
	mr   3,30
	lwz  30,20(1)
	lwz  31,24(1)
	addi 1,1,32
	blr
END (memcpy)

libc_hidden_builtin_def (memcpy)