/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp %RDX_LP, %RCX_LP
	jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov %RDI_LP, %RAX_LP
	add %RDX_LP, %RAX_LP
	jmp L(start)
END (__mempcpy_avx512_no_vzeroupper)

ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmp %RDX_LP, %RCX_LP
	jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)

ENTRY (__memmove_avx512_no_vzeroupper)
	mov %RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
	add %RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov %edx, %edx
# endif
	lea (%rsi, %rdx), %rcx
	lea (%rdi, %rdx), %r9
	cmp $512, %rdx
	ja L(512bytesormore)

L(check):
	cmp $16, %rdx
	jbe L(less_16bytes)
	cmp $256, %rdx
	jb L(less_256bytes)
	vmovups (%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups -0x100(%rcx), %zmm4
	vmovups -0xC0(%rcx), %zmm5
	vmovups -0x80(%rcx), %zmm6
	vmovups -0x40(%rcx), %zmm7
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups %zmm4, -0x100(%r9)
	vmovups %zmm5, -0xC0(%r9)
	vmovups %zmm6, -0x80(%r9)
	vmovups %zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp $128, %dl
	jb L(less_128bytes)
	vmovups (%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups -0x80(%rcx), %zmm2
	vmovups -0x40(%rcx), %zmm3
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, -0x80(%r9)
	vmovups %zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp $64, %dl
	jb L(less_64bytes)
	vmovdqu (%rsi), %ymm0
	vmovdqu 0x20(%rsi), %ymm1
	vmovdqu -0x40(%rcx), %ymm2
	vmovdqu -0x20(%rcx), %ymm3
	vmovdqu %ymm0, (%rdi)
	vmovdqu %ymm1, 0x20(%rdi)
	vmovdqu %ymm2, -0x40(%r9)
	vmovdqu %ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp $32, %dl
	jb L(less_32bytes)
	vmovdqu (%rsi), %ymm0
	vmovdqu -0x20(%rcx), %ymm1
	vmovdqu %ymm0, (%rdi)
	vmovdqu %ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu (%rsi), %xmm0
	vmovdqu -0x10(%rcx), %xmm1
	vmovdqu %xmm0, (%rdi)
	vmovdqu %xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp $8, %dl
	jb L(less_8bytes)
	movq (%rsi), %rsi
	movq -0x8(%rcx), %rcx
	movq %rsi, (%rdi)
	movq %rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp $4, %dl
	jb L(less_4bytes)
	mov (%rsi), %esi
	mov -0x4(%rcx), %ecx
	mov %esi, (%rdi)
	mov %ecx, -0x4(%r9)
	ret

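/* Copies of 3 bytes and below fall through to word and byte moves.  As in
   the larger cases above, all loads are issued before any store, so
   overlapping source and destination are handled without a direction
   check.  */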
L(less_4bytes):
	cmp $2, %dl
	jb L(less_2bytes)
	mov (%rsi), %si
	mov -0x2(%rcx), %cx
	mov %si, (%rdi)
	mov %cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp $1, %dl
	jb L(less_1bytes)
	mov (%rsi), %cl
	mov %cl, (%rdi)
L(less_1bytes):
	ret

L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov $SHARED_CACHE_SIZE_HALF, %r8
# else
	mov __x86_shared_cache_size_half(%rip), %r8
# endif
	cmp %r8, %rdx
	jae L(preloop_large)
	cmp $1024, %rdx
	ja L(1024bytesormore)
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups (%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups 0x100(%rsi), %zmm4
	vmovups 0x140(%rsi), %zmm5
	vmovups 0x180(%rsi), %zmm6
	vmovups 0x1C0(%rsi), %zmm7
	vmovups -0x200(%rcx), %zmm8
	vmovups -0x1C0(%rcx), %zmm9
	vmovups -0x180(%rcx), %zmm10
	vmovups -0x140(%rcx), %zmm11
	vmovups -0x100(%rcx), %zmm12
	vmovups -0xC0(%rcx), %zmm13
	vmovups -0x80(%rcx), %zmm14
	vmovups -0x40(%rcx), %zmm15
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups %zmm4, 0x100(%rdi)
	vmovups %zmm5, 0x140(%rdi)
	vmovups %zmm6, 0x180(%rdi)
	vmovups %zmm7, 0x1C0(%rdi)
	vmovups %zmm8, -0x200(%r9)
	vmovups %zmm9, -0x1C0(%r9)
	vmovups %zmm10, -0x180(%r9)
	vmovups %zmm11, -0x140(%r9)
	vmovups %zmm12, -0x100(%r9)
	vmovups %zmm13, -0xC0(%r9)
	vmovups %zmm14, -0x80(%r9)
	vmovups %zmm15, -0x40(%r9)
	ret

L(1024bytesormore):
	cmp %rsi, %rdi
	ja L(1024bytesormore_bkw)
	sub $512, %r9
	vmovups -0x200(%rcx), %zmm8
	vmovups -0x1C0(%rcx), %zmm9
	vmovups -0x180(%rcx), %zmm10
	vmovups -0x140(%rcx), %zmm11
	vmovups -0x100(%rcx), %zmm12
	vmovups -0xC0(%rcx), %zmm13
	vmovups -0x80(%rcx), %zmm14
	vmovups -0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access.  */
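/* The last 512 bytes of the source were preloaded into %zmm8-%zmm15 above,
   so the forward loop below may overwrite them when source and destination
   overlap; they are stored to the end of the destination (%r9) once the
   loop exits.  */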
L(gobble_512bytes_loop):
	vmovups (%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups 0x100(%rsi), %zmm4
	vmovups 0x140(%rsi), %zmm5
	vmovups 0x180(%rsi), %zmm6
	vmovups 0x1C0(%rsi), %zmm7
	add $512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups %zmm4, 0x100(%rdi)
	vmovups %zmm5, 0x140(%rdi)
	vmovups %zmm6, 0x180(%rdi)
	vmovups %zmm7, 0x1C0(%rdi)
	add $512, %rdi
	cmp %r9, %rdi
	jb L(gobble_512bytes_loop)
	vmovups %zmm8, (%r9)
	vmovups %zmm9, 0x40(%r9)
	vmovups %zmm10, 0x80(%r9)
	vmovups %zmm11, 0xC0(%r9)
	vmovups %zmm12, 0x100(%r9)
	vmovups %zmm13, 0x140(%r9)
	vmovups %zmm14, 0x180(%r9)
	vmovups %zmm15, 0x1C0(%r9)
	ret

L(1024bytesormore_bkw):
	add $512, %rdi
	vmovups 0x1C0(%rsi), %zmm8
	vmovups 0x180(%rsi), %zmm9
	vmovups 0x140(%rsi), %zmm10
	vmovups 0x100(%rsi), %zmm11
	vmovups 0xC0(%rsi), %zmm12
	vmovups 0x80(%rsi), %zmm13
	vmovups 0x40(%rsi), %zmm14
	vmovups (%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups -0x40(%rcx), %zmm0
	vmovups -0x80(%rcx), %zmm1
	vmovups -0xC0(%rcx), %zmm2
	vmovups -0x100(%rcx), %zmm3
	vmovups -0x140(%rcx), %zmm4
	vmovups -0x180(%rcx), %zmm5
	vmovups -0x1C0(%rcx), %zmm6
	vmovups -0x200(%rcx), %zmm7
	sub $512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups %zmm0, -0x40(%r9)
	vmovups %zmm1, -0x80(%r9)
	vmovups %zmm2, -0xC0(%r9)
	vmovups %zmm3, -0x100(%r9)
	vmovups %zmm4, -0x140(%r9)
	vmovups %zmm5, -0x180(%r9)
	vmovups %zmm6, -0x1C0(%r9)
	vmovups %zmm7, -0x200(%r9)
	sub $512, %r9
	cmp %rdi, %r9
	ja L(gobble_512bytes_loop_bkw)
	vmovups %zmm8, -0x40(%rdi)
	vmovups %zmm9, -0x80(%rdi)
	vmovups %zmm10, -0xC0(%rdi)
	vmovups %zmm11, -0x100(%rdi)
	vmovups %zmm12, -0x140(%rdi)
	vmovups %zmm13, -0x180(%rdi)
	vmovups %zmm14, -0x1C0(%rdi)
	vmovups %zmm15, -0x200(%rdi)
	ret

L(preloop_large):
	cmp %rsi, %rdi
	ja L(preloop_large_bkw)
	vmovups (%rsi), %zmm4
	vmovups 0x40(%rsi), %zmm5

	mov %rdi, %r11
/* Align destination for access with non-temporal stores in the loop.  */
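/* %rdi is rounded up to the next 128-byte boundary; %r8 holds the
   (negative) adjustment, which is also applied to %rsi and %rdx.  The
   unaligned first 128 bytes were saved in %zmm4/%zmm5 above and are
   stored back to the original destination (%r11) after the loop.  */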
	mov %rdi, %r8
	and $-0x80, %rdi
	add $0x80, %rdi
	sub %rdi, %r8
	sub %r8, %rsi
	add %r8, %rdx
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub $256, %rdx
	add $256, %rsi
	add $256, %rdi
	cmp $256, %rdx
	ja L(gobble_256bytes_nt_loop)
	sfence
	vmovups %zmm4, (%r11)
	vmovups %zmm5, 0x40(%r11)
	jmp L(check)

L(preloop_large_bkw):
	vmovups -0x80(%rcx), %zmm4
	vmovups -0x40(%rcx), %zmm5

/* Align end of destination for access with non-temporal stores.  */
	mov %r9, %r8
	and $-0x80, %r9
	sub %r9, %r8
	sub %r8, %rcx
	sub %r8, %rdx
	add %r9, %r8
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0, -0x100(%r9)
	vmovntdq %zmm1, -0xC0(%r9)
	vmovntdq %zmm2, -0x80(%r9)
	vmovntdq %zmm3, -0x40(%r9)
	sub $256, %rdx
	sub $256, %rcx
	sub $256, %r9
	cmp $256, %rdx
	ja L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups %zmm4, -0x80(%r8)
	vmovups %zmm5, -0x40(%r8)
	jmp L(check)
END (__memmove_avx512_no_vzeroupper)

strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif