/* Fast SSE2 memrchr with a 64-byte loop and the pmaxub instruction.

   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.text
ENTRY (__memrchr)
	movd	%esi, %xmm1

/* Lengths of at most 16 bytes are handled separately.  */
	sub	$16, %RDX_LP
	jbe	L(length_less16)

/* Broadcast the search byte to all 16 bytes of %xmm1
   (completed by the pshufd below).  */
	punpcklbw	%xmm1, %xmm1
	punpcklbw	%xmm1, %xmm1

	add	%RDX_LP, %RDI_LP
	pshufd	$0, %xmm1, %xmm1

/* Check the last 16 bytes with an unaligned load.  */
	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0

/* Check if there is a match.  */
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches0)

/* Back up 64 bytes and make %rdi 16-byte aligned so the movdqa loads
   below are aligned; %rdx is adjusted so the lower search bound stays
   the same.  */
	sub	$64, %rdi
	mov	%edi, %ecx
	and	$15, %ecx
	jz	L(loop_prolog)

	add	$16, %rdi
	add	$16, %rdx
	and	$-16, %rdi
	sub	%rcx, %rdx

/* Check 64 bytes (four aligned 16-byte chunks), highest addresses
   first.  */
	.p2align 4
L(loop_prolog):
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb	%xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

/* Second 64-byte block before entering the aligned loop.  */
	sub	$64, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches0)

/* 64-byte align the window base for the main loop, again adjusting
   %rdx so the lower search bound stays the same.  */
	mov	%edi, %ecx
	and	$63, %ecx
	jz	L(align64_loop)

	add	$64, %rdi
	add	$64, %rdx
	and	$-64, %rdi
	sub	%rcx, %rdx

/* Main loop: scan 64 bytes per iteration.  The four compare results
   are merged with pmaxub so a single pmovmskb/test detects whether any
   byte matched.  */
	.p2align 4
L(align64_loop):
	sub	$64, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb	%xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

/* There is a match somewhere in the 64-byte block; locate it, highest
   16-byte chunk first.  */
	pmovmskb	%xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm2

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%rdi), %xmm1

	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	pmovmskb	%xmm1, %eax
	bsr	%eax, %eax

	add	%rdi, %rax
	ret

/* Fewer than 64 bytes are left; only the top %edx bytes of the 64-byte
   window at %rdi are inside the buffer, so matches below the buffer
   start must be rejected (see the L(matches*_1) paths).  */
	.p2align 4
L(exit_loop):
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	32(%rdi), %xmm1
	pmovmskb	%xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(matches0):
	bsr	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsr	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsr	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches48):
	bsr	%eax, %eax
	lea	48(%rax, %rdi), %rax
	ret

/* The L(matches*_1) variants additionally check that the match lies
   within the caller's buffer.  */
	.p2align 4
L(matches0_1):
	bsr	%eax, %eax
	sub	$64, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsr	%eax, %eax
	sub	$48, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsr	%eax, %eax
	sub	$32, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsr	%eax, %eax
	sub	$16, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

/* Length at most 16 and %rdi already 16-byte aligned: compare one
   vector and mask off the result bits beyond the length.  */
	.p2align 4
L(length_less16_offset0):
	test	%edx, %edx
	jz	L(return_null)

	mov	%dl, %cl
	pcmpeqb	(%rdi), %xmm1

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	pmovmskb	%xmm1, %eax

	and	%edx, %eax
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	ret

/* The original length was at most 16 bytes; restore it in %edx.  */
	.p2align 4
L(length_less16):
	punpcklbw	%xmm1, %xmm1
	punpcklbw	%xmm1, %xmm1

	add	$16, %edx

	pshufd	$0, %xmm1, %xmm1

	mov	%edi, %ecx
	and	$15, %ecx
	jz	L(length_less16_offset0)

	mov	%cl, %dh
	mov	%ecx, %esi
	add	%dl, %dh
	and	$-16, %rdi

	sub	$16, %dh
	ja	L(length_less16_part2)

	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax

	sar	%cl, %eax
	mov	%dl, %cl

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %eax
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	add	%rsi, %rax
	ret

/* The short buffer straddles two aligned 16-byte chunks; check the
   upper chunk first.  */
	.p2align 4
L(length_less16_part2):
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax

	mov	%dh, %cl
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %eax

	test	%eax, %eax
	jnz	L(length_less16_part2_return)

	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax

	mov	%esi, %ecx
	sar	%cl, %eax
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	add	%rsi, %rax
	ret

	.p2align 4
L(length_less16_part2_return):
	bsr	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

END (__memrchr)
weak_alias (__memrchr, memrchr)
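
/* For reference, a minimal C sketch of the contract the routine above
   optimizes: scan the N bytes at S backwards and return a pointer to
   the last byte equal to (unsigned char) C, or NULL if there is none.
   This is an illustration only, not part of the build, and memrchr_ref
   is a made-up name; the SSE2 code above returns the same pointer, just
   computed 16 or 64 bytes at a time.

   #include <stddef.h>

   // Illustrative scalar equivalent of __memrchr (not the glibc code).
   void *
   memrchr_ref (const void *s, int c, size_t n)
   {
     const unsigned char *p = (const unsigned char *) s + n;
     while (n-- > 0)
       if (*--p == (unsigned char) c)
         return (void *) p;
     return NULL;
   }
*/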