/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_evex
# endif

# define VMOVA		vmovdqa64

# ifdef USE_AS_WCSLEN
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define SHIFT_REG	ecx
#  define CHAR_SIZE	4
# else
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define SHIFT_REG	edx
#  define CHAR_SIZE	1
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

	.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(zero)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	/* Clear high bits from edi.  Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
	   null byte.  */
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
# ifdef USE_AS_STRNLEN
	/* If length < CHAR_PER_VEC handle special.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
	ret
# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	CHAR_PER_VEC(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
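	/* The constant works as follows (same pattern in the other
	   first_vec_x* paths): at L(aligned_more), rcx was set to the
	   character count from the string start up to
	   (VEC_SIZE * 5 + CHAR_SIZE) past the aligned pointer, i.e.
	   rcx = (aligned - start) / CHAR_SIZE + CHAR_PER_VEC * 5 + 1.
	   A null at index rax in the VEC at byte offset (VEC_SIZE * 2)
	   gives length = (aligned - start) / CHAR_SIZE
	   + CHAR_PER_VEC * 2 + rax = rcx + rax - (CHAR_PER_VEC * 3 + 1),
	   which is what the leal below computes.  */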
	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.  */
	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
	ret

	.p2align 5
L(aligned_more):
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-(VEC_SIZE), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + CHAR_SIZE because it simplifies the logic in
	   last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
# ifdef USE_AS_STRNLEN
	/* Adjust length.  If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	test	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	addq	$VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
	/* Check if at last VEC_SIZE * 4 length.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# endif
	/* Align data to VEC_SIZE * 4.  */
	andq	$-(VEC_SIZE * 4), %rdi

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Load first VEC regardless.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap.  */
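	/* Illustrative sketch of one loop iteration in C-like pseudocode
	   (descriptive names only, not part of this file):

	       v1 = load (p + 4 * VEC_SIZE);              // VMOVA
	       v2 = umin (v1, load (p + 5 * VEC_SIZE));   // VPMINU + load
	       v3 = load (p + 6 * VEC_SIZE);
	       v4 = umin (v3, load (p + 7 * VEC_SIZE));
	       k0 = (v2 == 0);  k1 = (v4 == 0);           // VPCMP
	       p += 4 * VEC_SIZE;
	       if (k0 | k1) break;                        // kortestd

	   An unsigned minimum has a zero lane iff one of its inputs
	   does, so testing v2/v4 covers all four vectors at once.  */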
	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4

	VPCMP	$0, %YMM2, %YMMZERO, %k0
	VPCMP	$0, %YMM4, %YMMZERO, %k1
	subq	$-(VEC_SIZE * 4), %rdi
	kortestd	%k0, %k1
	jz	L(loop_4x_vec)

	/* Check if end was in first half.  */
	kmovd	%k0, %eax
	subq	%rdx, %rdi
# ifdef USE_AS_WCSLEN
	shrq	$2, %rdi
# endif
	testl	%eax, %eax
	jz	L(second_vec_return)

	VPCMP	$0, %YMM1, %YMMZERO, %k2
	kmovd	%k2, %edx
	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
# ifdef USE_AS_WCSLEN
	sall	$CHAR_PER_VEC, %eax
	orl	%edx, %eax
	tzcntl	%eax, %eax
# else
	salq	$CHAR_PER_VEC, %rax
	orq	%rdx, %rax
	tzcntq	%rax, %rax
# endif
	addq	%rdi, %rax
	ret


# ifdef USE_AS_STRNLEN

L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, %YMM1, %YMMZERO, %k0
	addq	$(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
	kmovd	%k0, %eax
	/* If remaining length > CHAR_PER_VEC * 2.  This works even if
	   esi is off by CHAR_PER_VEC * 4.  */
	testl	$(CHAR_PER_VEC * 2), %esi
	jnz	L(last_4x_vec)

	/* Length may be off by an offset of CHAR_PER_VEC * 4 in either
	   direction, depending on where this was called from.  This
	   fixes that.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	/* Check the end of data.  */
	subl	$CHAR_PER_VEC, %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret
L(max):
	movq	%r8, %rax
	ret
# endif

	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
	   in the 4x VEC loop can use 2 byte encoding.  */
	.p2align 4
L(second_vec_return):
	VPCMP	$0, %YMM3, %YMMZERO, %k0
	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
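	/* kunpckdq concatenates the two 32-bit masks, placing YMM4's
	   matches (k1) in the high half and YMM3's matches (k0) in the
	   low half, so a single tzcntq finds the first null across both
	   vectors.  For wcslen each mask only has CHAR_PER_VEC (8)
	   significant bits, so kunpckbw on the low mask bytes
	   suffices.  */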
# ifdef USE_AS_WCSLEN
	kunpckbw	%k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# else
	kunpckdq	%k0, %k1, %k0
	kmovq	%k0, %rax
	tzcntq	%rax, %rax
# endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret


# ifdef USE_AS_STRNLEN
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	/* Check the end of data.  */
	subl	$(CHAR_PER_VEC * 3), %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(CHAR_PER_VEC * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
	ret
L(max_end):
	movq	%r8, %rax
	ret
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	/* Remove the leading bytes.  */
# ifdef USE_AS_WCSLEN
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	movl	%edx, %ecx
	shrl	$2, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
# endif
	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
	sarxl	%SHIFT_REG, %eax, %eax
	testl	%eax, %eax
# ifndef USE_AS_STRNLEN
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	ret
# else
	jnz	L(cross_page_less_vec)
#  ifndef USE_AS_WCSLEN
	movl	%edx, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
#  endif
	movl	$CHAR_PER_VEC, %eax
	subl	%ecx, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	ja	L(cross_page_continue)
	movl	%esi, %eax
	ret
L(cross_page_less_vec):
	tzcntl	%eax, %eax
	/* Select min of length and position of first null.  */
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	ret
# endif

END (STRLEN)
#endif
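/* Illustrative sketch of L(cross_page_boundary) above in C-like
   pseudocode (descriptive names only, not part of this file).  When
   the first load could straddle a page, the pointer is aligned down
   and the match mask is shifted right by the misalignment, measured
   in characters, so bits belonging to bytes before the start of the
   string are discarded:

       mask = compare_eq_zero (load (p & -VEC_SIZE));  // VPCMP/kmovd
       mask >>= misalignment_in_chars;                 // sarxl
       if (mask != 0)
           return tzcnt (mask);    // plain strlen case
       // else continue at L(cross_page_continue)

   sarxl only uses the low 5 bits of its count for 32-bit operands,
   which is why edx (the full address) can serve directly as
   SHIFT_REG in the strlen case.  */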