/* Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef USE_AS_WMEMCHR
# define MEMCHR		wmemchr
# define PCMPEQ		pcmpeqd
# define CHAR_PER_VEC	4
#else
# define MEMCHR		memchr
# define PCMPEQ		pcmpeqb
# define CHAR_PER_VEC	16
#endif

/* Fast SSE2 version using pmaxub and a 64-byte loop.  */

	.text
ENTRY(MEMCHR)
	movd	%esi, %xmm1
	mov	%edi, %ecx

#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
#ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
#else
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
#endif

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	add	%rcx, %rdx
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" as
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
	   possible addition overflow.  */
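	/* For example, with "rdx" == SIZE_MAX and "rcx" == 8 the sum
	   "rdx + rcx" would wrap around to 7, and the "jbe" below would
	   falsely conclude that no characters remain.  */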
	neg	%rcx
	add	$16, %rcx
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)

	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
#ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
#endif
	add	%rcx, %rdx

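	/* Main 64-byte loop.  PCMPEQ leaves all-ones in each matching
	   element and zeros elsewhere, so folding the four compare
	   results with pmaxub yields a non-zero vector iff anything in
	   the 64-byte block matched; a single pmovmskb/test per
	   iteration then covers all four vectors.  */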
	.p2align 4
L(align64_loop):
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
	add	$(CHAR_PER_VEC * 2), %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$CHAR_PER_VEC, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	add	$(CHAR_PER_VEC * 2), %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(matches0):
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
#ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
#else
	sub	%rax, %rdx
#endif
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)

#ifndef USE_AS_WMEMCHR
strong_alias (memchr, __memchr)
libc_hidden_builtin_def(memchr)
#endif