/* strlen with SSE2
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* for strlen only SHARED version is optimized, for strcat, strncat, strnlen both STATIC and SHARED are optimized */

/* Overview (AT&T syntax, 32-bit x86, arguments read from the stack).

   STRLEN (default name __strlen_sse2) returns in %eax the offset of the
   first NUL byte of the string whose address is loaded from STR(%esp).
   When USE_AS_STRNLEN is defined the routine also loads a byte limit
   from LEN(%esp) and returns that limit when no NUL byte is found
   within it.

   When USE_AS_STRCAT is defined, the prologue (sysdep.h include, macro
   definitions, ENTRY and the load of %edx) and the final RETURN/END are
   not emitted.  NOTE(review): presumably the including file supplies
   them and arrives here with the string address in %edx -- confirm
   against the includer.

   Structure:
     1. Bytes 0..15 are tested one at a time.
     2. Up to sixteen aligned 16-byte blocks are tested with
	pcmpeqb/pmovmskb, one block per step.
     3. L(aligned_64_loop) tests 64 bytes per iteration by folding four
	vectors with pminub.
     4. L(exit) and friends turn the pmovmskb mask into a byte index.

   Register roles after the byte-wise prologue:
     %eax  address just past the 16-byte block most recently compared;
	   turned into the result by "sub %ecx, %eax" plus the index of
	   the NUL byte inside that block.
     %ecx  bias, initially string address + 16, kept such that
	   %eax - %ecx is the offset of that block from the string start.
     %edx  string address on entry, later the pmovmskb byte mask.
     %edi  (strnlen only) count of limit bytes not yet accounted for;
	   saved with PUSH on entry and restored by RETURN.
     %xmm0-%xmm3, %xmm6  compare and scratch vectors.  */

#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc)

# ifndef USE_AS_STRCAT

#  include <sysdep.h>
/* Offset of the first stack argument at entry (just above the return
   address).  */
#  define PARMS 4
#  define STR PARMS
#  define RETURN ret

#  ifdef USE_AS_STRNLEN
/* LEN is used only after PUSH (%edi), which moves %esp down by 4.  */
#   define LEN PARMS + 8
#   define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#   define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#   define PUSH(REG) pushl REG; CFI_PUSH (REG)
#   define POP(REG) popl REG; CFI_POP (REG)
#   undef RETURN
/* Restore %edi and return.  The CFI_PUSH after POP re-declares the
   "%edi is pushed" unwind state for the instructions that follow the
   ret in the file, which are reached with %edi still on the stack.  */
#   define RETURN POP (%edi); CFI_PUSH(%edi); ret
#  endif

#  ifndef STRLEN
#   define STRLEN __strlen_sse2
#  endif

	atom_text_section
ENTRY (STRLEN)
	mov	STR(%esp), %edx
#  ifdef USE_AS_STRNLEN
	PUSH	(%edi)
	movl	LEN(%esp), %edi
	/* %edi = limit - 4; a limit of 4 or less is handled separately.  */
	sub	$4, %edi
	jbe	L(len_less4_prolog)
#  endif
# endif
	/* Phase 1: test bytes 0..15 individually.  %eax stays 0 so the
	   L(exit_tailN) stubs produce N.  */
	xor	%eax, %eax
	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)

# ifdef USE_AS_STRNLEN
	sub	$4, %edi
	jbe	L(len_less8_prolog)
# endif

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)

# ifdef USE_AS_STRNLEN
	sub	$4, %edi
	jbe	L(len_less12_prolog)
# endif

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)

# ifdef USE_AS_STRNLEN
	sub	$4, %edi
	jbe	L(len_less16_prolog)
# endif

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)

	/* Phase 2: %ecx = string + 16 (the bias), %eax = that address
	   rounded down to a 16-byte boundary.  Bytes between %eax and
	   string + 16 were already found non-zero above.  */
	pxor	%xmm0, %xmm0
	lea	16(%edx), %eax
	mov	%eax, %ecx
	and	$-16, %eax

# ifdef USE_AS_STRNLEN
	/* Rounding down re-exposes (string & 15) bytes; add them back to
	   the remaining count before charging the next 64 bytes.  */
	and	$15, %edx
	add	%edx, %edi
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	/* Each step: compare one aligned block against zero, collect the
	   per-byte result in %edx, advance %eax.  lea leaves the flags
	   from test intact for the jnz.  The pxor instructions zero the
	   registers used by the following steps.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	/* A compare that found no NUL leaves its register all-zero, so
	   %xmm0-%xmm3 can be reused below without clearing them again.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	16(%eax), %eax
	jnz	L(exit)

# ifdef USE_AS_STRNLEN
	/* %eax is about to be rounded down to a 64-byte boundary; credit
	   the bytes that will be rescanned back to the remaining count.  */
	mov	%eax, %edx
	and	$63, %edx
	add	%edx, %edi
# endif

	and	$-0x40, %eax

	/* Phase 3: 64 bytes per iteration.  The byte-wise minimum of the
	   four vectors has a zero byte exactly when one of them does.
	   %xmm3 is all-zero here (its last compare found no NUL) and is
	   not written inside the loop.  */
	.p2align 4
L(aligned_64_loop):
# ifdef USE_AS_STRNLEN
	sub	$64, %edi
	jbe	L(len_less64)
# endif
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqb	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	test	%edx, %edx
	lea	64(%eax), %eax
	jz	L(aligned_64_loop)

	/* A NUL is somewhere in the 64 bytes ending at %eax.  Re-test the
	   four blocks in order; %ecx is adjusted before each branch so
	   that %eax - %ecx at L(exit) is the offset of the block that
	   matched.  */
	pcmpeqb	-64(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	48(%ecx), %ecx
	jnz	L(exit)

	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	pcmpeqb	-32(%eax), %xmm3
	pmovmskb %xmm3, %edx
	test	%edx, %edx
	lea	-16(%ecx), %ecx
	jnz	L(exit)

	/* The NUL must be in the last block; fall through to L(exit).  */
	pcmpeqb	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx

	/* Phase 4: %edx holds a non-zero 16-bit mask, bit N set when byte
	   N of the block is NUL.  %eax becomes the block offset and the
	   lowest set bit selects the amount to add.  */
L(exit):
	sub	%ecx, %eax
	test	%dl, %dl
	jz	L(exit_high)

	mov	%dl, %cl
	and	$15, %cl
	jz	L(exit_8)
	test	$0x01, %dl
	jnz	L(exit_tail0)
	test	$0x02, %dl
	jnz	L(exit_tail1)
	test	$0x04, %dl
	jnz	L(exit_tail2)
	add	$3, %eax
	RETURN

	.p2align 4
L(exit_8):
	test	$0x10, %dl
	jnz	L(exit_tail4)
	test	$0x20, %dl
	jnz	L(exit_tail5)
	test	$0x40, %dl
	jnz	L(exit_tail6)
	add	$7, %eax
	RETURN

	.p2align 4
L(exit_high):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(exit_high_8)
	test	$0x01, %dh
	jnz	L(exit_tail8)
	test	$0x02, %dh
	jnz	L(exit_tail9)
	test	$0x04, %dh
	jnz	L(exit_tail10)
	add	$11, %eax
	RETURN

	.p2align 4
L(exit_high_8):
	test	$0x10, %dh
	jnz	L(exit_tail12)
	test	$0x20, %dh
	jnz	L(exit_tail13)
	test	$0x40, %dh
	jnz	L(exit_tail14)
	add	$15, %eax
L(exit_tail0):
	RETURN

# ifdef USE_AS_STRNLEN

	/* strnlen: at most 64 limit bytes remain from %eax.  Undo the
	   "sub $64" that brought us here, then test up to four blocks,
	   charging 16 bytes to %edi after each block without a NUL.  */
	.p2align 4
L(len_less64):
	pxor	%xmm0, %xmm0
	add	$64, %edi

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	sub	$16, %edi
	jbe	L(return_start_len)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	sub	$16, %edi
	jbe	L(return_start_len)

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	sub	$16, %edi
	jbe	L(return_start_len)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(strnlen_exit)

	/* No NUL within the limit: return the limit itself.  */
	movl	LEN(%esp), %eax
	RETURN

	/* Like L(exit), but a NUL at index N inside the block only counts
	   when at least N + 1 limit bytes remain in %edi; otherwise the
	   limit is returned.  Index 0 needs no check and shares
	   L(exit_tail0).  */
	.p2align 4
L(strnlen_exit):
	sub	%ecx, %eax

	test	%dl, %dl
	jz	L(strnlen_exit_high)
	mov	%dl, %cl
	and	$15, %cl
	jz	L(strnlen_exit_8)
	test	$0x01, %dl
	jnz	L(exit_tail0)
	test	$0x02, %dl
	jnz	L(strnlen_exit_tail1)
	test	$0x04, %dl
	jnz	L(strnlen_exit_tail2)
	sub	$4, %edi
	jb	L(return_start_len)
	lea	3(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_8):
	test	$0x10, %dl
	jnz	L(strnlen_exit_tail4)
	test	$0x20, %dl
	jnz	L(strnlen_exit_tail5)
	test	$0x40, %dl
	jnz	L(strnlen_exit_tail6)
	sub	$8, %edi
	jb	L(return_start_len)
	lea	7(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_high):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(strnlen_exit_high_8)
	test	$0x01, %dh
	jnz	L(strnlen_exit_tail8)
	test	$0x02, %dh
	jnz	L(strnlen_exit_tail9)
	test	$0x04, %dh
	jnz	L(strnlen_exit_tail10)
	sub	$12, %edi
	jb	L(return_start_len)
	lea	11(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_high_8):
	test	$0x10, %dh
	jnz	L(strnlen_exit_tail12)
	test	$0x20, %dh
	jnz	L(strnlen_exit_tail13)
	test	$0x40, %dh
	jnz	L(strnlen_exit_tail14)
	sub	$16, %edi
	jb	L(return_start_len)
	lea	15(%eax), %eax
	RETURN

	/* L(strnlen_exit_tailN): return block offset + N when more than N
	   limit bytes remain, else the limit.  */
	.p2align 4
L(strnlen_exit_tail1):
	sub	$2, %edi
	jb	L(return_start_len)
	lea	1(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail2):
	sub	$3, %edi
	jb	L(return_start_len)
	lea	2(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail4):
	sub	$5, %edi
	jb	L(return_start_len)
	lea	4(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail5):
	sub	$6, %edi
	jb	L(return_start_len)
	lea	5(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail6):
	sub	$7, %edi
	jb	L(return_start_len)
	lea	6(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail8):
	sub	$9, %edi
	jb	L(return_start_len)
	lea	8(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail9):
	sub	$10, %edi
	jb	L(return_start_len)
	lea	9(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail10):
	sub	$11, %edi
	jb	L(return_start_len)
	lea	10(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail12):
	sub	$13, %edi
	jb	L(return_start_len)
	lea	12(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail13):
	sub	$14, %edi
	jb	L(return_start_len)
	lea	13(%eax), %eax
	RETURN

	.p2align 4
L(strnlen_exit_tail14):
	sub	$15, %edi
	jb	L(return_start_len)
	lea	14(%eax), %eax
	RETURN

	.p2align 4
L(return_start_len):
	movl	LEN(%esp), %eax
	RETURN

/* for prolog only */

	/* Limit is 0..4.  After undoing the "sub $4", %edi is the limit;
	   each byte is read only after the limit is known to cover it.  */
	.p2align 4
L(len_less4_prolog):
	xor	%eax, %eax

	add	$4, %edi
	jz	L(exit_tail0)

	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmp	$1, %edi
	je	L(exit_tail1)

	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmp	$2, %edi
	je	L(exit_tail2)

	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmp	$3, %edi
	je	L(exit_tail3)

	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)
	mov	$4, %eax
	RETURN

	/* Limit is 5..8; bytes 0..3 are non-zero and %eax is 0.  After the
	   add, %edi is the limit minus 4.  */
	.p2align 4
L(len_less8_prolog):
	add	$4, %edi

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmp	$1, %edi
	je	L(exit_tail5)

	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmp	$2, %edi
	je	L(exit_tail6)

	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmp	$3, %edi
	je	L(exit_tail7)

	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)
	mov	$8, %eax
	RETURN


	/* Limit is 9..12; bytes 0..7 are non-zero.  */
	.p2align 4
L(len_less12_prolog):
	add	$4, %edi

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmp	$1, %edi
	je	L(exit_tail9)

	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmp	$2, %edi
	je	L(exit_tail10)

	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmp	$3, %edi
	je	L(exit_tail11)

	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)
	mov	$12, %eax
	RETURN

	/* Limit is 13..16; bytes 0..11 are non-zero.  */
	.p2align 4
L(len_less16_prolog):
	add	$4, %edi

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmp	$1, %edi
	je	L(exit_tail13)

	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmp	$2, %edi
	je	L(exit_tail14)

	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmp	$3, %edi
	je	L(exit_tail15)

	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)
	mov	$16, %eax
	RETURN
# endif

	/* L(exit_tailN): add N to %eax and return.  %eax is either 0
	   (byte-wise prologue) or the offset of the matching 16-byte
	   block.  */
	.p2align 4
L(exit_tail1):
	add	$1, %eax
	RETURN

L(exit_tail2):
	add	$2, %eax
	RETURN

L(exit_tail3):
	add	$3, %eax
	RETURN

L(exit_tail4):
	add	$4, %eax
	RETURN

L(exit_tail5):
	add	$5, %eax
	RETURN

L(exit_tail6):
	add	$6, %eax
	RETURN

L(exit_tail7):
	add	$7, %eax
	RETURN

L(exit_tail8):
	add	$8, %eax
	RETURN

L(exit_tail9):
	add	$9, %eax
	RETURN

L(exit_tail10):
	add	$10, %eax
	RETURN

L(exit_tail11):
	add	$11, %eax
	RETURN

L(exit_tail12):
	add	$12, %eax
	RETURN

L(exit_tail13):
	add	$13, %eax
	RETURN

L(exit_tail14):
	add	$14, %eax
	RETURN

L(exit_tail15):
	add	$15, %eax
	/* For strcat no return is emitted here, so control continues
	   after the add.  NOTE(review): presumably into the including
	   file's code -- confirm.  */
# ifndef USE_AS_STRCAT
	RETURN
END (STRLEN)
# endif
#endif