/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.  */

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

ENTRY_TOCLESS (MEMCPY, 5)
        CALL_MCOUNT 3

        cmpldi  cr1,5,31
        neg     0,3
        std     3,-16(1)
        std     31,-8(1)
        cfi_offset(31,-8)
        andi.   11,3,7          /* check alignment of dst.  */
        clrldi  0,0,61          /* Number of bytes until the 1st doubleword of dst.  */
        clrldi  10,4,61         /* check alignment of src.  */
        cmpldi  cr6,5,8
        ble-    cr1,.L2         /* If move < 32 bytes use short move code.  */
        cmpld   cr6,10,11
        mr      12,4
        srdi    9,5,3           /* Number of full doublewords remaining.  */
        mtcrf   0x01,0
        mr      31,5
        beq     .L0

        subf    31,0,5
  /* Move 0-7 bytes as needed to get the destination doubleword aligned.  */
1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrldi  10,12,61        /* check alignment of src again.  */
        srdi    9,31,3          /* Number of full doublewords remaining.  */

  /* Copy doublewords from source to destination, assuming the
     destination is aligned on a doubleword boundary.

     At this point we know there are at least 25 bytes left (32-7) to copy.
     The next step is to determine if the source is also doubleword aligned.
     If not, branch to the unaligned move code at .L6, which uses
     a load, shift, store strategy.

     Otherwise source and destination are doubleword aligned, and we can
     use the optimized doubleword copy loop.  */
.L0:
        clrldi  11,31,61
        mtcrf   0x01,9
        bne-    cr6,.L6         /* If source is not DW aligned.  */

  /* Move doublewords where destination and source are DW aligned.
     Use an unrolled loop to copy 4 doublewords (32 bytes) per iteration.
     If the copy is not an exact multiple of 32 bytes, 1-3
     doublewords are copied as needed to set up the main loop.  After
     the main loop exits there may be a tail of 1-7 bytes.  These bytes
     are copied a word/halfword/byte at a time as needed to preserve
     alignment.  */
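
/* For illustration only (not part of the build): a rough C sketch of the
   doubleword-aligned path below, under the assumptions stated in the
   comment above (both pointers doubleword aligned, at least 25 bytes
   left).  The function name and variables are hypothetical; the assembly
   keeps the same quantities in r31 (remaining length), r9 (doubleword
   count) and the CR bits set by mtcrf.

   #include <stddef.h>
   #include <stdint.h>

   static void
   dw_aligned_copy (uint64_t *d, const uint64_t *s, size_t len)
   {
     size_t dwords = len >> 3;   // full doublewords to move
     size_t chunks = len >> 5;   // 32-byte (4-doubleword) loop iterations

     // Prologue: 1-3 doublewords so the main loop sees whole 32-byte chunks.
     if (dwords & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
     if (dwords & 1) { *d++ = *s++; }

     while (chunks--)            // the unrolled "4:" loop in the assembly
       {
         d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
         d += 4; s += 4;
       }

     // 0-7 byte tail; the assembly uses word/halfword/byte moves here.
     unsigned char *dc = (unsigned char *) d;
     const unsigned char *sc = (const unsigned char *) s;
     for (size_t tail = len & 7; tail != 0; tail--)
       *dc++ = *sc++;
   }  */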

        srdi    8,31,5
        cmpldi  cr1,9,4
        cmpldi  cr6,11,0
        mr      11,12

        bf      30,1f
        ld      6,0(12)
        ld      7,8(12)
        addi    11,12,16
        mtctr   8
        std     6,0(3)
        std     7,8(3)
        addi    10,3,16
        bf      31,4f
        ld      0,16(12)
        std     0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f
        .align  4
1:
        mr      10,3
        mtctr   8
        bf      31,4f
        ld      6,0(12)
        addi    11,12,8
        std     6,0(3)
        addi    10,3,8

        .align  4
4:
        ld      6,0(11)
        ld      7,8(11)
        ld      8,16(11)
        ld      0,24(11)
        addi    11,11,32
2:
        std     6,0(10)
        std     7,8(10)
        std     8,16(10)
        std     0,24(10)
        addi    10,10,32
        bdnz    4b
3:

        rldicr  0,31,0,60
        mtcrf   0x01,31
        beq     cr6,0f
.L9:
        add     3,3,0
        add     12,12,0

/*  At this point we have a tail of 0-7 bytes and we know that the
    destination is doubleword aligned.  */
4:      bf      29,2f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      bf      30,1f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,0f
        lbz     6,0(12)
        stb     6,0(3)
0:
  /* Return original dst pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 64-, and
   4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 64-byte
   boundaries.  Since only loads are sensitive to the 32-/64-byte
   boundaries it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use doubleword load/stores, to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096-byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small delay.  */

        .align  4
.L2:
        mtcrf   0x01,5
        neg     8,4
        clrrdi  11,4,2
        andi.   0,8,3
        ble     cr6,.LE8        /* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
        cmpldi  cr1,5,16
        mr      10,5
        mr      12,4
        cmpldi  cr6,0,2
        beq     .L3             /* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
        lwz     6,0(11)
        subf    10,0,5
        add     12,4,0
        blt     cr6,5f
        srdi    7,6,16
        bgt     cr6,3f
#ifdef __LITTLE_ENDIAN__
        sth     7,0(3)
#else
        sth     6,0(3)
#endif
        b       7f
        .align  4
3:
#ifdef __LITTLE_ENDIAN__
        rotlwi  6,6,24
        stb     6,0(3)
        sth     7,1(3)
#else
        stb     7,0(3)
        sth     6,1(3)
#endif
        b       7f
        .align  4
5:
#ifdef __LITTLE_ENDIAN__
        rotlwi  6,6,8
#endif
        stb     6,0(3)
7:
        cmpldi  cr1,10,16
        add     3,3,0
        mtcrf   0x01,10
        .align  4
.L3:
/* At least 6 bytes left and the source is word aligned.  */
        blt     cr1,8f
16:     /* Move 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     6,8(12)
        stw     7,4(3)
        lwz     7,12(12)
        addi    12,12,16
        stw     6,8(3)
        stw     7,12(3)
        addi    3,3,16
8:      /* Move 8 bytes.  */
        bf      28,4f
        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Move 4 bytes.  */
        bf      29,2f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Move 2-3 bytes.  */
        bf      30,1f
        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)
        ld      3,-16(1)
        blr
1:      /* Move 1 byte.  */
        bf      31,0f
        lbz     6,0(12)
        stb     6,0(3)
0:
  /* Return original dst pointer.  */
        ld      3,-16(1)
        blr

/* Special case to copy 0-8 bytes.  */
        .align  4
.LE8:
        mr      12,4
        bne     cr6,4f
/* We would have liked to use ld/std here, but the 630 processors are
   slow for load/store doubles that are not at least word aligned.
   Unaligned load/store word executes with only a 1-cycle penalty.  */
        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)
  /* Return original dst pointer.  */
        ld      3,-16(1)
        blr
        .align  4
4:      bf      29,2b
        lwz     6,0(4)
        stw     6,0(3)
6:
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)
        ld      3,-16(1)
        blr
        .align  4
5:
        bf      31,0f
        lbz     6,4(4)
        stb     6,4(3)
        .align  4
0:
  /* Return original dst pointer.  */
        ld      3,-16(1)
        blr

        .align  4
.L6:

  /* Copy doublewords where the destination is aligned but the source is
     not.  Use aligned doubleword loads from the source, shifted to realign
     the data, to allow aligned destination stores.  */
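
/* For illustration only (not part of the build): a hypothetical C sketch of
   the little-endian load/shift/merge done below; the big-endian code simply
   swaps the shift directions.  'dst' is assumed doubleword aligned, 'n' is
   the number of whole doublewords to produce, and the source misalignment
   is nonzero (the only way to reach .L6).  As in the assembly, every load
   reads an aligned doubleword that contains at least one source byte.

   #include <stddef.h>
   #include <stdint.h>

   static void
   shift_merge_copy (uint64_t *dst, const unsigned char *src, size_t n)
   {
     unsigned int off = (uintptr_t) src & 7;              // source misalignment, 1-7
     const uint64_t *s = (const uint64_t *) (src - off);  // aligned source base
     unsigned int sh = off * 8;                           // bits to discard per load

     uint64_t lo = s[0];
     for (size_t i = 0; i < n; i++)
       {
         uint64_t hi = s[i + 1];                   // next aligned doubleword
         dst[i] = (lo >> sh) | (hi << (64 - sh));  // realign and merge
         lo = hi;
       }
   }  */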
        subf    5,10,12
        andi.   0,9,1
        cmpldi  cr6,11,0
        sldi    10,10,3
        mr      11,9
        mr      4,3
        ld      6,0(5)
        ld      7,8(5)
        subfic  9,10,64
        beq     2f
#ifdef __LITTLE_ENDIAN__
        srd     0,6,10
#else
        sld     0,6,10
#endif
        cmpldi  11,1
        mr      6,7
        addi    4,4,-8
        addi    11,11,-1
        b       1f
2:      addi    5,5,8
        .align  4
#ifdef __LITTLE_ENDIAN__
0:      srd     0,6,10
        sld     8,7,9
#else
0:      sld     0,6,10
        srd     8,7,9
#endif
        cmpldi  11,2
        ld      6,8(5)
        or      0,0,8
        addi    11,11,-2
        std     0,0(4)
#ifdef __LITTLE_ENDIAN__
        srd     0,7,10
1:      sld     8,6,9
#else
        sld     0,7,10
1:      srd     8,6,9
#endif
        or      0,0,8
        beq     8f
        ld      7,16(5)
        std     0,8(4)
        addi    5,5,16
        addi    4,4,16
        b       0b
        .align  4
8:
        std     0,8(4)
        rldicr  0,31,0,60
        mtcrf   0x01,31
        bne     cr6,.L9         /* If the tail is not 0 bytes, go copy it at .L9.  */
  /* Return original dst pointer.  */
        ld      31,-8(1)
        ld      3,-16(1)
        blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)