/* A Thunderx2 Optimized memcpy implementation for AARCH64.
   Copyright (C) 2018-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define tmp2	x6
#define tmp3	x7
#define tmp3w	w7
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7
#define I_q	q16
#define J_q	q17

#define A_v	v0
#define B_v	v1
#define C_v	v2
#define D_v	v3
#define E_v	v4
#define F_v	v5
#define G_v	v6
#define H_v	v7
#define I_v	v16
#define J_v	v17

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#if IS_IN (libc)

#undef MEMCPY
#define MEMCPY __memcpy_thunderx2
#undef MEMMOVE
#define MEMMOVE __memmove_thunderx2


/* Overlapping large forward memmoves use a loop that copies backwards.
   Otherwise memcpy is used.  Small moves branch to L(memcopy16) directly.
   The longer memcpy cases fall through to the memcpy head.  */

ENTRY_ALIGN (MEMMOVE, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	cmp	count, 16
	b.ls	L(memcopy16)
	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)


/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes.  Large copies align the destination and
   use a load-and-merge approach when src and dst are not equally
   misaligned, so that the actual loads and stores are always aligned.
   Large copies use loops processing 64 bytes per iteration for the
   unaligned case and 128 bytes per iteration for the aligned case.  */

#define MEMCPY_PREFETCH_LDR 640

	.p2align 4
ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	cmp	count, 16
	b.ls	L(memcopy16)
	ldr	A_q, [src], #16
	add	dstend, dstin, count
	and	tmp1, src, 15
	cmp	count, 96
	b.hi	L(memcopy_long)
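
	/* Illustrative C outline of the size dispatch just performed.
	   This is a sketch only: copy_0_to_16, copy_17_to_96 and
	   copy_large are hypothetical stand-ins for L(memcopy16), the
	   unrolled medium-copy code and L(memcopy_long); they do not
	   exist as functions anywhere.

	     #include <stddef.h>

	     void *copy_0_to_16 (void *, const void *, size_t);
	     void *copy_17_to_96 (void *, const void *, size_t);
	     void *copy_large (void *, const void *, size_t);

	     void *
	     memcpy_model (void *dstin, const void *src, size_t count)
	     {
	       if (count <= 16)
	         return copy_0_to_16 (dstin, src, count);
	       // The first 16 source bytes are loaded into A_q up front,
	       // so the medium and large paths can store the head with
	       // overlapping, possibly unaligned stores.
	       if (count <= 96)
	         return copy_17_to_96 (dstin, src, count);
	       return copy_large (dstin, src, count);
	     }
	*/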

	/* Medium copies: 17..96 bytes.  */
	ldr	E_q, [srcend, -16]
	cmp	count, 64
	b.gt	L(memcpy_copy96)
	cmp	count, 48
	b.le	L(bytes_17_to_48)
	/* 49..64 bytes.  */
	ldp	B_q, C_q, [src]
	str	E_q, [dstend, -16]
	stp	A_q, B_q, [dstin]
	str	C_q, [dstin, 32]
	ret

L(bytes_17_to_48):
	/* 17..48 bytes.  */
	cmp	count, 32
	b.gt	L(bytes_32_to_48)
	/* 17..32 bytes.  */
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

L(bytes_32_to_48):
	/* 33..48 bytes.  */
	ldr	B_q, [src]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	str	B_q, [dstin, 16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(memcopy16):
	cmp	count, 8
	b.lo	L(bytes_0_to_8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	add	dstend, dstin, count
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4

L(bytes_0_to_8):
	tbz	count, 2, L(bytes_0_to_3)
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	add	dstend, dstin, count
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
L(bytes_0_to_3):
	cbz	count, 1f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	add	dstend, dstin, count
	ldrb	B_lw, [src, tmp1]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
	strb	A_lw, [dstin]
1:
	ret

	.p2align 4

L(memcpy_copy96):
	/* Copying 65..96 bytes.  A_q (first 16 bytes) and E_q (last 16
	   bytes) are already loaded.  The size is large enough to benefit
	   from aligned loads.  */
	bic	src, src, 15
	ldp	B_q, C_q, [src]
	/* 64 bytes loaded so far.  Because src was rounded down, the B_q
	   chunk may overlap the A_q chunk by tmp1 bytes; A_q is stored to
	   dstin below, so the overlap is harmless.  */
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	/* count, which was in [65..96], is now in [65..111] after tmp1
	   (0..15) has been added to it; count is now
	   <bytes-left-to-load> + 48.  */
	cmp	count, 80
	b.gt	L(copy96_medium)
	ldr	D_q, [src, 32]
	stp	B_q, C_q, [dst, 16]
	str	D_q, [dst, 48]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

	.p2align 4
L(copy96_medium):
	ldp	D_q, G_q, [src, 32]
	cmp	count, 96
	b.gt	L(copy96_large)
	stp	B_q, C_q, [dst, 16]
	stp	D_q, G_q, [dst, 48]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

L(copy96_large):
	ldr	F_q, [src, 64]
	str	B_q, [dst, 16]
	stp	C_q, D_q, [dst, 32]
	stp	G_q, F_q, [dst, 64]
	str	A_q, [dstin]
	str	E_q, [dstend, -16]
	ret

	.p2align 4
L(memcopy_long):
	bic	src, src, 15
	ldp	B_q, C_q, [src], #32
	sub	dst, dstin, tmp1
	add	count, count, tmp1
	add	dst, dst, 16
	and	tmp1, dst, 15
	ldp	D_q, E_q, [src], #32
	str	A_q, [dstin]
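
	/* Simplified C model of the set-up above (illustrative only; the
	   real code also steps past the 16 bytes already loaded into A_q
	   and keeps everything in registers).  The idea: store the first
	   16 bytes with an unaligned store, then round src down to a
	   16-byte boundary and move dst and count back by the same skew,
	   so the loops below can use aligned loads throughout.  The
	   function name is made up for illustration.

	     #include <stddef.h>
	     #include <stdint.h>
	     #include <string.h>

	     static void
	     large_copy_setup_model (unsigned char *dstin,
	                             const unsigned char *src, size_t count)
	     {
	       size_t skew = (uintptr_t) src & 15;   // and tmp1, src, 15
	       memcpy (dstin, src, 16);              // str A_q, [dstin]
	       src -= skew;                          // bic src, src, 15
	       unsigned char *dst = dstin - skew;    // sub dst, dstin, tmp1
	       count += skew;                        // add count, count, tmp1
	       // For i in [skew, count), dst + i and src + i now name the
	       // same byte of the copy, and src + i is 16-byte aligned
	       // whenever i is a multiple of 16.
	       (void) dst; (void) count;
	     }
	*/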

	/* Already loaded 64+16 bytes.  Check if at least 64 more bytes
	   are left.  */
	subs	count, count, 64+64+16
	b.lt	L(loop128_exit0)
	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
	b.lt	L(loop128)
	cbnz	tmp1, L(dst_unaligned)
	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32

	.p2align 4

L(loop128_prefetch):
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	F_q, G_q, [src], #32
	stp	B_q, C_q, [dst], #32
	ldp	H_q, I_q, [src], #32
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
	ldp	B_q, C_q, [src], #32
	stp	D_q, E_q, [dst], #32
	ldp	D_q, E_q, [src], #32
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst], #32
	subs	count, count, 128
	b.ge	L(loop128_prefetch)

	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
	.p2align 4
L(loop128):
	ldp	F_q, G_q, [src], #32
	ldp	H_q, I_q, [src], #32
	stp	B_q, C_q, [dst], #32
	stp	D_q, E_q, [dst], #32
	subs	count, count, 64
	b.lt	L(loop128_exit1)
	ldp	B_q, C_q, [src], #32
	ldp	D_q, E_q, [src], #32
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst], #32
	subs	count, count, 64
	b.ge	L(loop128)
L(loop128_exit0):
	ldp	F_q, G_q, [srcend, -64]
	ldp	H_q, I_q, [srcend, -32]
	stp	B_q, C_q, [dst], #32
	stp	D_q, E_q, [dst]
	stp	F_q, G_q, [dstend, -64]
	stp	H_q, I_q, [dstend, -32]
	ret
L(loop128_exit1):
	ldp	B_q, C_q, [srcend, -64]
	ldp	D_q, E_q, [srcend, -32]
	stp	F_q, G_q, [dst], #32
	stp	H_q, I_q, [dst]
	stp	B_q, C_q, [dstend, -64]
	stp	D_q, E_q, [dstend, -32]
	ret

L(dst_unaligned_tail):
	ldp	C_q, D_q, [srcend, -64]
	ldp	E_q, F_q, [srcend, -32]
	stp	A_q, B_q, [dst], #32
	stp	H_q, I_q, [dst], #16
	str	G_q, [dst, tmp1]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstend, -32]
	ret

L(dst_unaligned):
	/* For the unaligned store case the code loads two aligned chunks
	   and then merges them using the ext instruction.  This can be up
	   to 30% faster than the simple unaligned store access.

	   Current state: tmp1 = dst % 16; C_q, D_q, E_q contain data yet
	   to be stored.  src and dst point to the next data to be
	   processed.  A_q, B_q contain data already stored earlier;
	   count = bytes left to be loaded, decremented by 64.

	   Control is passed here if at least 64 bytes are left to be
	   loaded.  The code does two aligned loads and then extracts
	   (16-tmp1) bytes from the first register and tmp1 bytes from the
	   next register, forming the value for the aligned store.

	   Since the ext instruction can only encode its index as an
	   immediate, there are 15 code chunks, one per possible index
	   value.  A computed goto is used to reach the required code.  */
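
	/* Byte-level C model (illustrative only, hypothetical helper name)
	   of how each aligned store value is formed below:
	   "ext Vd.16b, Vn.16b, Vm.16b, 16-shft" yields the top shft bytes
	   of N followed by the low (16 - shft) bytes of M, i.e. a 16-byte
	   window starting shft bytes before the end of N.

	     static void
	     ext_16b_model (unsigned char *out, const unsigned char *n,
	                    const unsigned char *m, unsigned int shft)
	     {
	       for (unsigned int i = 0; i < 16; i++)
	         out[i] = i < shft ? n[16 - shft + i] : m[i - shft];
	     }
	*/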

	/* Store 32 bytes to dst and align dst down for further
	   operations; several bytes will be stored at this address
	   once more.  */

	ldp	F_q, G_q, [src], #32
	stp	B_q, C_q, [dst], #32
	bic	dst, dst, 15
	sub	count, count, 32
	adrp	tmp2, L(ext_table)
	add	tmp2, tmp2, :lo12:L(ext_table)
	add	tmp2, tmp2, tmp1, LSL #2
	ldr	tmp3w, [tmp2]
	add	tmp2, tmp2, tmp3w, SXTW
	br	tmp2

	.p2align 4
	/* To keep the loop in each chunk 16-byte aligned.  */
	nop
#define EXT_CHUNK(shft) \
L(ext_size_ ## shft):;\
	ext	A_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ext	B_v.16b, D_v.16b, E_v.16b, 16-shft;\
	ext	H_v.16b, E_v.16b, F_v.16b, 16-shft;\
1:;\
	stp	A_q, B_q, [dst], #32;\
	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR];\
	ldp	C_q, D_q, [src], #32;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	stp	H_q, I_q, [dst], #32;\
	ext	A_v.16b, G_v.16b, C_v.16b, 16-shft;\
	ext	B_v.16b, C_v.16b, D_v.16b, 16-shft;\
	ldp	F_q, G_q, [src], #32;\
	ext	H_v.16b, D_v.16b, F_v.16b, 16-shft;\
	subs	count, count, 64;\
	b.ge	1b;\
2:;\
	ext	I_v.16b, F_v.16b, G_v.16b, 16-shft;\
	b	L(dst_unaligned_tail);

EXT_CHUNK(1)
EXT_CHUNK(2)
EXT_CHUNK(3)
EXT_CHUNK(4)
EXT_CHUNK(5)
EXT_CHUNK(6)
EXT_CHUNK(7)
EXT_CHUNK(8)
EXT_CHUNK(9)
EXT_CHUNK(10)
EXT_CHUNK(11)
EXT_CHUNK(12)
EXT_CHUNK(13)
EXT_CHUNK(14)
EXT_CHUNK(15)

L(move_long):
	.p2align 4
1:
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	and	tmp1, srcend, 15
	ldr	D_q, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	.p2align 4
1:
	subs	count, count, 64
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -64]!
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most
	   64 bytes, so it is safe to always copy 64 bytes from the start
	   even if there is just 1 byte left.  */
2:
	ldp	E_q, F_q, [src, 32]
	ldp	G_q, H_q, [src]
	stp	A_q, B_q, [dstend, -32]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	G_q, H_q, [dstin]
3:	ret


END (MEMCPY)
	.section .rodata
	.p2align 4

L(ext_table):
	/* The first entry is for the alignment of 0 and is never
	   actually used (could be any value).  */
	.word	0
	.word	L(ext_size_1) -.
	.word	L(ext_size_2) -.
	.word	L(ext_size_3) -.
	.word	L(ext_size_4) -.
	.word	L(ext_size_5) -.
	.word	L(ext_size_6) -.
	.word	L(ext_size_7) -.
	.word	L(ext_size_8) -.
	.word	L(ext_size_9) -.
	.word	L(ext_size_10) -.
	.word	L(ext_size_11) -.
	.word	L(ext_size_12) -.
	.word	L(ext_size_13) -.
	.word	L(ext_size_14) -.
	.word	L(ext_size_15) -.

libc_hidden_builtin_def (MEMCPY)
#endif
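
/* Illustrative only: a C model of how the adrp/ldr/add/br sequence at
   L(dst_unaligned) resolves its target from L(ext_table).  Each entry
   stores the signed 32-bit distance from the entry itself to its target
   (".word L(ext_size_N) -."), which keeps the table position
   independent; the final branch corresponds to a GNU C computed goto on
   the returned address.  The function name is made up for illustration.

     #include <stdint.h>

     static void *
     resolve_relative_entry (const int32_t *table, unsigned int idx)
     {
       const int32_t *entry = table + idx;  // add tmp2, tmp2, tmp1, LSL #2
       int64_t offset = *entry;             // ldr tmp3w, [tmp2] (sign-extended)
       return (void *) ((uintptr_t) entry + offset);  // add tmp2, tmp2, tmp3w, SXTW
     }
*/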