/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_evex
# endif

# ifdef USE_AS_WMEMCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPMINU	vpminud
#  define VPCMP		vpcmpd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPMINU	vpminub
#  define VPCMP		vpcmpb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_SIZE	1
# endif

	/* In the 4x loop the RTM and non-RTM versions have data pointers
	   that differ by VEC_SIZE * 4, with the RTM version's pointer
	   being VEC_SIZE * 4 greater.  This is represented by
	   BASE_OFFSET.  Also, because the RTM version uses vpcmp, which
	   stores a bit per element compared, whereas the non-RTM version
	   uses vpcmpeq, which stores a bit per byte compared, a
	   RET_SCALE of CHAR_SIZE is only relevant for the RTM
	   version.  */
# ifdef USE_IN_RTM
#  define VZEROUPPER
#  define BASE_OFFSET	(VEC_SIZE * 4)
#  define RET_SCALE	CHAR_SIZE
# else
#  define VZEROUPPER	vzeroupper
#  define BASE_OFFSET	0
#  define RET_SCALE	1
# endif

	/* In the return from the 4x loop the memchr and rawmemchr
	   versions have data pointers that differ by VEC_SIZE * 4, with
	   the memchr version's pointer being VEC_SIZE * 4 greater.  */
# ifdef USE_AS_RAWMEMCHR
#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
#  define RAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define RET_OFFSET	BASE_OFFSET
#  define RAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif

# define XMMZERO	xmm23
# define YMMZERO	ymm23
# define XMMMATCH	xmm16
# define YMMMATCH	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# ifndef SECTION
#  define SECTION(p)	p##.evex
# endif

# define VEC_SIZE	32
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE	4096

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	L(zero)

#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
# endif
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH
	/* Check if we may cross a page boundary with one vector load.  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
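	/* The test above checks ((uintptr_t) %rdi % PAGE_SIZE)
	   > (PAGE_SIZE - VEC_SIZE), i.e. whether an unaligned VEC_SIZE
	   byte load from %rdi would extend past the current page.  */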

	/* Check the first VEC_SIZE bytes.  */
	VPCMP	$0, (%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length < CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret

# ifndef USE_AS_RAWMEMCHR
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 5
L(first_vec_x0):
	/* Check if the first match was before the length.  */
	tzcntl	%eax, %eax
	xorl	%ecx, %ecx
	cmpl	%eax, %edx
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
	cmovle	%rcx, %rax
	ret
# else
	/* NB: first_vec_x0 is 17 bytes which will leave
	   cross_page_boundary (which is relatively cold) close enough
	   to ideal alignment.  So only realign L(cross_page_boundary)
	   if rawmemchr.  */
	.p2align 4
# endif
L(cross_page_boundary):
	/* Save the pointer before aligning as its original value is
	   necessary for computing the return address if a byte is found,
	   or for adjusting the length if it is not and this is memchr.  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE.  ALGN_PTR_REG is rcx for memchr and
	   rdi for rawmemchr.  */
	andq	$-VEC_SIZE, %ALGN_PTR_REG
	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
	kmovd	%k0, %r8d
# ifdef USE_AS_WMEMCHR
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	sarl	$2, %eax
# endif
# ifndef USE_AS_RAWMEMCHR
	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
	subl	%eax, %esi
# endif
# ifdef USE_AS_WMEMCHR
	andl	$(CHAR_PER_VEC - 1), %eax
# endif
	/* Remove the leading bytes.  */
	sarxl	%eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
# else
	addq	%RAW_PTR_REG, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 5
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
	/* Align data to VEC_SIZE.  */
L(cross_page_continue):
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	andq	$-VEC_SIZE, %rdi
	/* esi is for adjusting the length to see if near the end.  */
	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
# else
	andq	$-VEC_SIZE, %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
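	/* %rsi (set up above) is the number of chars from the original
	   %rdi to the end of the 4 aligned vectors checked here, i.e.
	   (VEC_SIZE * 5 - misalignment) / CHAR_SIZE.  So the jbe below
	   is taken when the original %rdx was <= %rsi, meaning the
	   buffer ends within these 4 vectors.  */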
	/* Adjust length.  If near the end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)


# ifndef USE_AS_RAWMEMCHR
	/* Check if at last CHAR_PER_VEC * 4 length.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi

	/* Align data to VEC_SIZE * 4 for the loop and readjust
	   length.  */
#  ifdef USE_AS_WMEMCHR
	movl	%edi, %ecx
	andq	$-(4 * VEC_SIZE), %rdi
	subl	%edi, %ecx
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
	addq	%rcx, %rdx
#  else
	addq	%rdi, %rdx
	andq	$-(4 * VEC_SIZE), %rdi
	subq	%rdi, %rdx
#  endif
# else
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
	andq	$-(4 * VEC_SIZE), %rdi
# endif
# ifdef USE_IN_RTM
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# else
	/* Copy YMMMATCH to ymm0 so we can use vpcmpeq, which is not
	   encodable with EVEX registers (ymm16-ymm31).  */
	vmovdqa64 %YMMMATCH, %ymm0
# endif

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Two versions of the loop: one that does not require
	   vzeroupper by not using ymm0-ymm15 and another that does
	   require vzeroupper because it uses ymm0-ymm15.  The reason
	   ymm0-ymm15 is used at all is because there is no EVEX
	   encoding of vpcmpeq and with vpcmpeq this loop can be
	   performed more efficiently.  The non-vzeroupper version is
	   safe for RTM while the vzeroupper version should be
	   preferred if RTM is not supported.  */
# ifdef USE_IN_RTM
	/* It would be possible to save some instructions using 4x VPCMP
	   but the bottleneck on port 5 makes it not worth it.  */
	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
	/* xor will set bytes that match esi to zero.  */
	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
	/* Reduce VEC2 / VEC3 with min and VEC1 with the zero mask.  */
	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
	VPCMP	$0, %YMM3, %YMMZERO, %k2
# else
	/* Since vptern can only take 3x vectors it is fastest to do 1
	   vec separately with EVEX vpcmp.  */
#  ifdef USE_AS_WMEMCHR
	/* vptern can only accept masks for epi32/epi64, so an
	   instruction can only be saved by using a not-equals mask on
	   vptern with wmemchr.  */
	VPCMP	$4, (%rdi), %YMMMATCH, %k1
#  else
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
#  endif
	/* Compare 3x with vpcmpeq and or them all together with
	   vptern.  */
	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
#  ifdef USE_AS_WMEMCHR
	/* This takes the NOT of the OR between ymm2, ymm3, ymm4 as
	   well as combining the result from VEC1 with the zero
	   mask.  */
	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
	vpmovmskb %ymm4, %ecx
#  else
	/* 254 is the mask for oring ymm2, ymm3, ymm4 into ymm4.  */
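	/* The imm8 of vpternlogd is an 8-entry truth table over the
	   three source bits: 254 (0xfe) is 1 for every combination
	   except all-zero, which is the bitwise OR of the three
	   vectors.  (The imm8 of 1 in the wmemchr path above is its
	   complement: the NOT of the OR.)  */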
	vpternlogd $254, %ymm2, %ymm3, %ymm4
	vpmovmskb %ymm4, %ecx
	kmovd	%k1, %eax
#  endif
# endif

# ifdef USE_AS_RAWMEMCHR
	subq	$-(VEC_SIZE * 4), %rdi
# endif
# ifdef USE_IN_RTM
	kortestd %k2, %k3
# else
#  ifdef USE_AS_WMEMCHR
	/* ecx contains the NOT of the matches.  All 1s means no
	   matches.  incl will overflow and set the zero flag if that
	   is the case.  */
	incl	%ecx
#  else
	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) is not zero.  Adding
	   to ecx is not an issue because if eax is non-zero it will be
	   used for returning the match.  If it is zero the add does
	   nothing.  */
	addq	%rax, %rcx
#  endif
# endif
# ifdef USE_AS_RAWMEMCHR
	jz	L(loop_4x_vec)
# else
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into the case of fewer than 4 vectors of length
	   remaining.  */
	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
	kmovd	%k0, %eax
	VZEROUPPER

L(last_4x_vec_or_less):
	/* Check if the first VEC contained a match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < CHAR_PER_VEC.  */
	addl	$CHAR_PER_VEC, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match with the remaining length.  */
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end):
	ret


	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Adjust length.  */
	subl	$-(CHAR_PER_VEC * 4), %edx
	/* Check if the match is within the remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret
L(set_zero_end):
	xorl	%eax, %eax
	ret

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if a match was found in
	   the loop.  */

# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
	/* k1 has the NOT of the matches with VEC1.  */
	kmovd	%k1, %eax
#  ifdef USE_AS_WMEMCHR
	subl	$((1 << CHAR_PER_VEC) - 1), %eax
#  else
	incl	%eax
#  endif
# else
	/* eax already has the matches for VEC1.  */
	testl	%eax, %eax
# endif
	jnz	L(last_vec_x1_return)

# ifdef USE_IN_RTM
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %eax
# else
	vpmovmskb %ymm2, %eax
# endif
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

# ifdef USE_IN_RTM
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_return)

	kmovd	%k3, %eax
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
# else
	vpmovmskb %ymm3, %eax
	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
	salq	$VEC_SIZE, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
	VZEROUPPER
# endif
	ret

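	/* In the return stubs below the matching vectors sit at
	   RET_OFFSET(%rdi): RET_OFFSET folds in both the VEC_SIZE * 4
	   load offset used by the RTM loop and the early VEC_SIZE * 4
	   pointer advance done by rawmemchr before the loop exit branch
	   (see BASE_OFFSET / RET_OFFSET above).  */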
	.p2align 4
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
	   USE_IN_RTM are both defined.  Otherwise RET_SCALE = 1).  */
	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
	VZEROUPPER
	ret

# ifdef USE_IN_RTM
	.p2align 4
L(last_vec_x3_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check the first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jle	L(last_2x_vec)

	.p2align 4
L(last_4x_vec):
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)


	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Create a mask for possible matches within the remaining
	   length.  */
#  ifdef USE_AS_WMEMCHR
	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
	bzhil	%edx, %ecx, %ecx
#  else
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
#  endif
	/* Test matches in data against the length mask.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after
	   the remaining length was found to be > CHAR_PER_VEC * 2).  */
	subl	$CHAR_PER_VEC, %edx
	jbe	L(zero_end2)


	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Shift the remaining length mask for the last VEC.  */
#  ifdef USE_AS_WMEMCHR
	shrl	$CHAR_PER_VEC, %ecx
#  else
	shrq	$CHAR_PER_VEC, %rcx
#  endif
	andl	%ecx, %eax
	jz	L(zero_end2)
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
	ret

L(last_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

END (MEMCHR)
#endif