/* memcmp with SSE2
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.text
ENTRY (memcmp)
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	test	%RDX_LP, %RDX_LP
	jz	L(finz)
	cmpq	$1, %rdx
	jbe	L(finr1b)
	/* Keep only the distance s2 - s1 in %rsi; all loads below reach
	   the second buffer through the (%rdi, %rsi) addressing mode.  */
	subq	%rdi, %rsi
	movq	%rdx, %r10
	cmpq	$32, %r10
	jae	L(gt32)
	/* Handle small chunks and last block of less than 32 bytes.  */
L(small):
	testq	$1, %r10
	jz	L(s2b)
	movzbl	(%rdi), %eax
	movzbl	(%rdi, %rsi), %edx
	subq	$1, %r10
	je	L(finz1)
	addq	$1, %rdi
	subl	%edx, %eax
	jnz	L(exit)
L(s2b):
	testq	$2, %r10
	jz	L(s4b)
	movzwl	(%rdi), %eax
	movzwl	(%rdi, %rsi), %edx
	subq	$2, %r10
#ifdef USE_AS_MEMCMPEQ
	je	L(finz1)
#else
	je	L(fin2_7)
#endif
	addq	$2, %rdi
	cmpl	%edx, %eax
#ifdef USE_AS_MEMCMPEQ
	jnz	L(neq_early)
#else
	jnz	L(fin2_7)
#endif
L(s4b):
	testq	$4, %r10
	jz	L(s8b)
	movl	(%rdi), %eax
	movl	(%rdi, %rsi), %edx
	subq	$4, %r10
#ifdef USE_AS_MEMCMPEQ
	je	L(finz1)
#else
	je	L(fin2_7)
#endif
	addq	$4, %rdi
	cmpl	%edx, %eax
#ifdef USE_AS_MEMCMPEQ
	jnz	L(neq_early)
#else
	jnz	L(fin2_7)
#endif
L(s8b):
	testq	$8, %r10
	jz	L(s16b)
	movq	(%rdi), %rax
	movq	(%rdi, %rsi), %rdx
	subq	$8, %r10
#ifdef USE_AS_MEMCMPEQ
	je	L(sub_return8)
#else
	je	L(fin2_7)
#endif
	addq	$8, %rdi
	cmpq	%rdx, %rax
#ifdef USE_AS_MEMCMPEQ
	jnz	L(neq_early)
#else
	jnz	L(fin2_7)
#endif
L(s16b):
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
#ifdef USE_AS_MEMCMPEQ
	pmovmskb %xmm1, %eax
	subl	$0xffff, %eax
	ret
#else
	pmovmskb %xmm1, %edx
	xorl	%eax, %eax
	subl	$0xffff, %edx
	jz	L(finz)
	bsfl	%edx, %ecx
	leaq	(%rdi, %rcx), %rcx
	movzbl	(%rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)
#endif
	.p2align 4,, 4
L(finr1b):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %edx
L(finz1):
	subl	%edx, %eax
L(exit):
	ret
#ifdef USE_AS_MEMCMPEQ
	.p2align 4,, 4
L(sub_return8):
	subq	%rdx, %rax
	movl	%eax, %edx
	shrq	$32, %rax
	orl	%edx, %eax
	ret
#else
	.p2align 4,, 4
L(fin2_7):
	cmpq	%rdx, %rax
	jz	L(finz)
	movq	%rax, %r11
	subq	%rdx, %r11
	/* The lowest set bit of rax - rdx lies inside the first (i.e.
	   lowest, little endian) differing byte.  Round its bit index
	   down to a byte boundary, shift that byte into the low
	   position on both sides and subtract.  */
	bsfq	%r11, %rcx
	sarq	$3, %rcx
	salq	$3, %rcx
	sarq	%cl, %rax
	movzbl	%al, %eax
	sarq	%cl, %rdx
	movzbl	%dl, %edx
	subl	%edx, %eax
	ret
#endif
	.p2align 4,, 4
L(finz):
	xorl	%eax, %eax
	ret
#ifdef USE_AS_MEMCMPEQ
	.p2align 4,, 4
L(neq_early):
	movl	$1, %eax
	ret
#endif
	/* For blocks bigger than 32 bytes
	   1. Advance one of the addr pointers to be 16B aligned.
	   2. Treat the case of both addr pointers aligned to 16B
	      separately to avoid movdqu.
	   3. Handle any blocks of greater than 64 consecutive bytes with
	      unrolling to reduce branches.
	   4. At least one addr pointer is 16B aligned, use memory version
	      of pcmpeqb.
	*/
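	/* As a hedged illustration only (this is not the source the
	   assembly was generated from, and `chunk16_differs' is a
	   hypothetical name), the 16-byte comparison step repeated in
	   all the blocks below corresponds to the following C, using
	   the SSE2 intrinsics for movdqu/pcmpeqb/pmovmskb:

		#include <emmintrin.h>

		static int
		chunk16_differs (const unsigned char *s1,
				 const unsigned char *s2)
		{
		  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
		  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
		  // pcmpeqb yields 0xff per equal byte; pmovmskb packs
		  // the byte sign bits, so the mask is 0xffff iff all
		  // 16 bytes match.
		  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b)) - 0xffff;
		}

	   The 16B-aligned cases (steps 2 and 4 above) replace
	   _mm_loadu_si128 with _mm_load_si128, i.e. movdqa or a
	   memory-operand pcmpeqb.  */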
	.p2align 4,, 4
L(gt32):
	movq	%rdx, %r11
	addq	%rdi, %r11
	movq	%rdi, %r8

	andq	$15, %r8
	jz	L(16am)
	/* Both pointers may be misaligned.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	neg	%r8
	leaq	16(%rdi, %r8), %rdi
L(16am):
	/* Handle two 16B aligned pointers separately.  */
	testq	$15, %rsi
	jz	L(ATR)
	testq	$16, %rdi
	jz	L(A32)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
L(A32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)
	/* Pre-unroll to be ready for unrolled 64B loop.  */
	testq	$32, %rdi
	jz	L(A64)
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(A64):
	movq	%r11, %r10
	andq	$-64, %r10
	cmpq	%r10, %rdi
	jae	L(mt32)

L(A64main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A64main)

L(mt32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)

L(A32main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A32main)
L(mt16):
	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)

	.p2align 4,, 4
L(neq):
#ifdef USE_AS_MEMCMPEQ
	movl	$1, %eax
	ret
#else
	bsfl	%edx, %ecx
	movzbl	(%rdi, %rcx), %eax
	addq	%rdi, %rsi
	movzbl	(%rsi,%rcx), %edx
	jmp	L(finz1)
#endif
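	/* Hedged sketch of the non-MEMCMPEQ path of L(neq) above
	   (illustrative only; `first_diff' is a hypothetical name).
	   On entry %edx holds mask - 0xffff, which equals
	   -(~mask & 0xffff); since bsf(-x) == bsf(x), the bit scan
	   lands on the first unequal byte:

		#include <emmintrin.h>

		static int
		first_diff (const unsigned char *s1,
			    const unsigned char *s2)
		{
		  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
		  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
		  unsigned int mask
		    = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b));
		  // Defined only when the chunks differ (mask != 0xffff).
		  unsigned int i = __builtin_ctz (mask - 0xffff);
		  return s1[i] - s2[i];
		}

	   %rsi holds the distance s2 - s1 throughout, which is why
	   L(neq) rebuilds the second pointer with `addq %rdi, %rsi'
	   before the byte loads.  */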
	.p2align 4,, 4
L(ATR):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)
	testq	$16, %rdi
	jz	L(ATR32)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	je	L(mt16)

L(ATR32):
	movq	%r11, %r10
	andq	$-64, %r10
	testq	$32, %rdi
	jz	L(ATR64)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(ATR64):
	cmpq	%rdi, %r10
	je	L(mt32)

L(ATR64main):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	jne	L(ATR64main)

	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)

L(ATR32res):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%r10, %rdi
	jne	L(ATR32res)

	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)
	/* Align to 16 bytes to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)

#ifdef USE_AS_MEMCMPEQ
libc_hidden_def (memcmp)
#else
# undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)
#endif
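/* Note: bcmp need only distinguish equal from unequal, so the weak
   alias to the full memcmp above trivially satisfies it.  Builds that
   define USE_AS_MEMCMPEQ get the equality-only variant, which takes
   the cheaper paths (L(neq_early), L(sub_return8)) that skip locating
   the first differing byte.  */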