/* strstr with unaligned loads
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* __strstr_sse2_unaligned scans the haystack for positions where the
   first two needle bytes appear adjacently, then verifies the rest of
   the needle at each such candidate.  %xmm1 holds the first needle byte
   broadcast to all lanes and %xmm2 the second; comparing them against
   two haystack windows offset by one byte and combining the results with
   pminub yields a candidate bitmask per 16-byte chunk.  One-byte needles
   are delegated to __strchr_sse2, and inputs that cause excessive
   reverification fall back to __strstr_sse2.  */

ENTRY(__strstr_sse2_unaligned)
	movzbl	(%rsi), %eax	/* First needle byte.  */
	testb	%al, %al
	je	L(empty)
	movzbl	1(%rsi), %edx	/* Second needle byte.  */
	testb	%dl, %dl
	je	L(strchr)
	movd	%eax, %xmm1
	movd	%edx, %xmm2
	movq	%rdi, %rax
	andl	$4095, %eax	/* Page offset of the haystack.  */
	punpcklbw	%xmm1, %xmm1
	cmpq	$4031, %rax	/* Fewer than 65 bytes left in the page?  */
	punpcklbw	%xmm2, %xmm2
	punpcklwd	%xmm1, %xmm1
	punpcklwd	%xmm2, %xmm2
	pshufd	$0, %xmm1, %xmm1
	pshufd	$0, %xmm2, %xmm2
	ja	L(cross_page)
	/* At least 65 bytes are readable before the next page: scan bytes
	   0..31 with unaligned loads for pair candidates or NUL.  */
	movdqu	(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	1(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	movdqu	16(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6
	pminub	%xmm4, %xmm3
	movdqa	%xmm3, %xmm4
	movdqu	17(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0
	pmovmskb	%xmm4, %r8d
	pmovmskb	%xmm0, %eax
	salq	$16, %rax
	orq	%rax, %r8
	je	L(next_32_bytes)
L(next_pair_index):
	/* Each set bit in %r8 is a candidate pair or NUL in bytes 0..31.  */
	bsf	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero1)
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found1)
	cmpb	2(%rax), %dl
	jne	L(next_pair)
	xorl	%edx, %edx
	jmp	L(pair_loop_start)

	.p2align 4
L(strchr):
	movzbl	%al, %esi
	jmp	__strchr_sse2

	.p2align 4
L(pair_loop):
	/* Verify the needle from its third byte onwards.  */
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair)
L(pair_loop_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop)
L(found1):
	ret
L(zero1):
	xorl	%eax, %eax
	ret

	.p2align 4
L(next_pair):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index)

	.p2align 4
L(next_32_bytes):
	/* No hit in bytes 0..31; scan bytes 32..63 the same way, keeping
	   the mask in bits 32..63 of %r8.  */
	movdqu	32(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	33(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	movdqu	48(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6
	pminub	%xmm4, %xmm3
	movdqa	%xmm3, %xmm4
	movdqu	49(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0
	pmovmskb	%xmm4, %eax
	salq	$32, %rax
	pmovmskb	%xmm0, %r8d
	salq	$48, %r8
	orq	%rax, %r8
	je	L(loop_header)
L(next_pair2_index):
	bsfq	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero2)
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found2)
	cmpb	2(%rax), %dl
	jne	L(next_pair2)
	xorl	%edx, %edx
	jmp	L(pair_loop2_start)

	.p2align 4
L(pair_loop2):
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair2)
L(pair_loop2_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop2)
L(found2):
	ret
L(zero2):
	xorl	%eax, %eax
	ret
L(empty):
	mov	%rdi, %rax	/* Empty needle matches at the start.  */
	ret

	.p2align 4
L(next_pair2):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair2_index)
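
/* No candidate pair in the first 64 bytes.  The loop below aligns %rdi
   down to a 64-byte boundary and scans 64 bytes per iteration: for each
   16-byte chunk it tests the first needle byte at offset -1 and the
   second at offset 0, folding the pair masks and a NUL check into one
   pminub chain so a single pmovmskb decides whether anything needs a
   closer look.  %r9 keeps the starting position and %r11 (biased by
   -512) accumulates verification work so L(next_pair3) can divert
   pathological cases to __strstr_sse2.  */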
L(loop_header):
	movq	$-512, %r11
	movq	%rdi, %r9

	pxor	%xmm7, %xmm7
	andq	$-64, %rdi

	.p2align 4
L(loop):
	movdqa	64(%rdi), %xmm3
	movdqu	63(%rdi), %xmm6
	movdqa	%xmm3, %xmm0
	pxor	%xmm2, %xmm3
	pxor	%xmm1, %xmm6
	movdqa	80(%rdi), %xmm10
	por	%xmm3, %xmm6
	pminub	%xmm10, %xmm0
	movdqu	79(%rdi), %xmm3
	pxor	%xmm2, %xmm10
	pxor	%xmm1, %xmm3
	movdqa	96(%rdi), %xmm9
	por	%xmm10, %xmm3
	pminub	%xmm9, %xmm0
	pxor	%xmm2, %xmm9
	movdqa	112(%rdi), %xmm8
	addq	$64, %rdi
	pminub	%xmm6, %xmm3
	movdqu	31(%rdi), %xmm4
	pminub	%xmm8, %xmm0
	pxor	%xmm2, %xmm8
	pxor	%xmm1, %xmm4
	por	%xmm9, %xmm4
	pminub	%xmm4, %xmm3
	movdqu	47(%rdi), %xmm5
	pxor	%xmm1, %xmm5
	por	%xmm8, %xmm5
	pminub	%xmm5, %xmm3
	pminub	%xmm3, %xmm0
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm0, %eax
	testl	%eax, %eax
	je	L(loop)
	/* Something in this 64-byte block: rebuild the per-chunk masks with
	   NUL bytes folded in and locate the exact positions.  */
	pminub	(%rdi), %xmm6
	pminub	32(%rdi), %xmm4
	pminub	48(%rdi), %xmm5
	pcmpeqb	%xmm7, %xmm6
	pcmpeqb	%xmm7, %xmm5
	pmovmskb	%xmm6, %edx
	movdqa	16(%rdi), %xmm8
	pcmpeqb	%xmm7, %xmm4
	movdqu	15(%rdi), %xmm0
	pmovmskb	%xmm5, %r8d
	movdqa	%xmm8, %xmm3
	pmovmskb	%xmm4, %ecx
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm3
	salq	$32, %rcx
	pcmpeqb	%xmm7, %xmm8
	salq	$48, %r8
	pminub	%xmm0, %xmm3
	orq	%rcx, %rdx
	por	%xmm3, %xmm8
	orq	%rdx, %r8
	pmovmskb	%xmm8, %eax
	salq	$16, %rax
	orq	%rax, %r8
	je	L(loop)
L(next_pair_index3):
	bsfq	%r8, %rcx
	addq	%rdi, %rcx
	cmpb	$0, (%rcx)
	je	L(zero)
	xorl	%eax, %eax
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(success3)
	cmpb	1(%rcx), %dl
	jne	L(next_pair3)
	jmp	L(pair_loop_start3)

	.p2align 4
L(pair_loop3):
	addq	$1, %rax
	cmpb	1(%rcx,%rax), %dl
	jne	L(next_pair3)
L(pair_loop_start3):
	movzbl	3(%rsi,%rax), %edx
	testb	%dl, %dl
	jne	L(pair_loop3)
L(success3):
	/* Candidates are recorded at the second byte of the pair; step back
	   to the first.  */
	lea	-1(%rcx), %rax
	ret

	.p2align 4
L(next_pair3):
	/* Give up on this implementation if reverification work outpaces
	   forward progress by more than 512 matched bytes.  */
	addq	%rax, %r11
	movq	%rdi, %rax
	subq	%r9, %rax
	cmpq	%r11, %rax
	jl	L(switch_strstr)
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index3)
	jmp	L(loop)

	.p2align 4
L(switch_strstr):
	movq	%rdi, %rdi
	jmp	__strstr_sse2	/* Resume with the generic strstr.  */

	.p2align 4
L(cross_page):
	/* The haystack starts within 64 bytes of a page end.  Scan the
	   surrounding 64-byte-aligned block, whose loads stay within the
	   haystack's page, then mask off the positions that precede the
	   haystack.  */
	movq	%rdi, %rax
	pxor	%xmm0, %xmm0
	andq	$-64, %rax
	movdqa	(%rax), %xmm3
	movdqu	-1(%rax), %xmm4
	movdqa	%xmm3, %xmm8
	movdqa	16(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm0, %xmm8
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm7
	pminub	%xmm4, %xmm3
	movdqu	15(%rax), %xmm4
	pcmpeqb	%xmm0, %xmm7
	por	%xmm3, %xmm8
	movdqa	%xmm5, %xmm3
	movdqa	32(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm6
	pmovmskb	%xmm8, %ecx
	pminub	%xmm4, %xmm3
	movdqu	31(%rax), %xmm4
	por	%xmm3, %xmm7
	movdqa	%xmm5, %xmm3
	pcmpeqb	%xmm0, %xmm6
	movdqa	48(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pmovmskb	%xmm7, %r8d
	pcmpeqb	%xmm2, %xmm3
	pcmpeqb	%xmm5, %xmm0
	pminub	%xmm4, %xmm3
	movdqu	47(%rax), %xmm4
	por	%xmm3, %xmm6
	movdqa	%xmm5, %xmm3
	salq	$16, %r8
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	pmovmskb	%xmm6, %r10d
	pminub	%xmm4, %xmm3
	por	%xmm3, %xmm0
	salq	$32, %r10
	orq	%r10, %r8
	orq	%rcx, %r8
	movl	%edi, %ecx
	pmovmskb	%xmm0, %edx
	subl	%eax, %ecx
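	/* %ecx = %rdi & 63, the haystack's offset within the aligned block;
	   shifting the mask right by %cl discards candidate bits that fall
	   before the start of the haystack.  */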
	salq	$48, %rdx
	orq	%rdx, %r8
	shrq	%cl, %r8
	je	L(loop_header)
L(next_pair_index4):
	bsfq	%r8, %rax
	addq	%rdi, %rax
	cmpb	$0, (%rax)
	je	L(zero)

	/* Skip the candidate at %rdi itself: the pair's first byte would
	   lie before the start of the haystack.  */
	cmpq	%rax, %rdi
	je	L(next_pair4)

	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found3)
	cmpb	1(%rax), %dl
	jne	L(next_pair4)
	xorl	%edx, %edx
	jmp	L(pair_loop_start4)

	.p2align 4
L(pair_loop4):
	addq	$1, %rdx
	cmpb	1(%rax,%rdx), %cl
	jne	L(next_pair4)
L(pair_loop_start4):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop4)
L(found3):
	subq	$1, %rax	/* Return the pair's first byte.  */
	ret

	.p2align 4
L(next_pair4):
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index4)
	jmp	L(loop_header)

	.p2align 4
L(found):
	/* Two-byte return (rep ret), avoiding a branch-prediction penalty
	   on older AMD processors.  */
	rep
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax	/* No match: return NULL.  */
	ret

END(__strstr_sse2_unaligned)
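
/* For reference, a minimal scalar C sketch of the filtering strategy
   implemented above.  This is illustrative only: strstr_pair_filter is a
   made-up name, and the assembly vectorizes the pair test 64 bytes at a
   time and bails out to __strstr_sse2 on pathological inputs instead of
   rescanning one byte at a time.

   #include <stddef.h>
   #include <string.h>

   static char *
   strstr_pair_filter (const char *h, const char *n)
   {
     if (n[0] == '\0')
       return (char *) h;		// L(empty): empty needle matches.
     if (n[1] == '\0')
       return strchr (h, n[0]);		// L(strchr): one-byte needle.
     for (; h[0] != '\0'; h++)
       {
	 // Cheap filter: both of the first two needle bytes must match.
	 if (h[0] != n[0] || h[1] != n[1])
	   continue;
	 // Verify the remainder, as in L(pair_loop).
	 size_t i = 2;
	 while (n[i] != '\0' && h[i] == n[i])
	   i++;
	 if (n[i] == '\0')
	   return (char *) h;
       }
     return NULL;
   }
*/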