/* Fast SSE2 rawmemchr with a 64-byte loop using the pmaxub instruction.

   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* __rawmemchr: return the address of the first byte, at or after the
   address in %rdi, that equals the low byte of %rsi.

   In:       %rdi = start address.
	     %rsi = byte to search for; only bits 0-7 are used.
   Out:      %rax = address of the first matching byte.
   Clobbers: %rax, %rcx, %rdi, %xmm0-%xmm4, flags.

   NOTE(review): presumably this implements
   void *rawmemchr (const void *s, int c); the prototype is not
   visible in this file, confirm against the declaring header.

   There is no length argument and no terminator check: the scan only
   stops at a match, so the caller must guarantee that one exists.

   Outline:
     1. Broadcast the search byte to all 16 bytes of %xmm1.
     2. Test the first 16 bytes, with an unaligned load when that load
	stays inside the current 64-byte block and with an aligned
	load plus mask shift otherwise.
     3. Test 64 or 128 further bytes in 16-byte chunks, then round
	%rdi down to a 64-byte boundary.
     4. Main loop: test 64 aligned bytes per iteration, folding the
	four compare results together with pmaxub so that a single
	mask test decides whether to leave the loop.  */

	.text
ENTRY (__rawmemchr)
	/* Seed %xmm1 with the search byte and copy the start address so
	   its alignment can be examined without disturbing %rdi.  */
	movd	%rsi, %xmm1
	mov	%rdi, %rcx

	/* Two self-interleaves replicate byte 0 of %xmm1 into its low
	   four bytes.  */
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1

	/* %rcx = offset of the start address within its 64-byte block.
	   The pshufd broadcasts the low dword, so afterwards every byte
	   of %xmm1 holds the search byte.  */
	and	$63, %rcx
	pshufd	$0, %xmm1, %xmm1

	/* With an offset above 48 a 16-byte load from %rdi would run
	   past the end of the 64-byte block; take the aligned-load path
	   in that case.  */
	cmp	$48, %rcx
	ja	L(crosscache)

	/* The unaligned load stays inside the block.  pcmpeqb turns each
	   byte equal to the search byte into 0xff and every other byte
	   into 0x00.  */
	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches)
	/* No match in the first 16 bytes.  Move %rdi to the next 16-byte
	   boundary; any bytes between that boundary and the end of the
	   load above are simply tested a second time.  */
	add	$16, %rdi
	and	$-16, %rdi
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* %rcx = misalignment of the start address within 16 bytes.
	   %rdi = start address rounded down to a 16-byte boundary.  The
	   aligned load therefore also covers up to 15 bytes that lie
	   before the start address.  */
	and	$15, %rcx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
/* Check which byte is a match.  */
	bsf	%eax, %eax

	/* After the shift the bit index is relative to the start
	   address, so result = aligned base + misalignment + index.  */
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* Step over the aligned chunk that was just tested.  */
	add	$16, %rdi

	.p2align 4
L(loop_prolog):
	/* %rdi is 16-byte aligned on every path that reaches this label.
	   Test the next 64 bytes one 16-byte chunk at a time; each
	   L(matchesNN) target adds the chunk offset NN to the result.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* %rdi is advanced by 64 before the branch, so L(matches0)
	   compensates with a -16 displacement (48 - 64).  */
	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* If %rdi is already 64-byte aligned, enter the main loop.  */
	test	$0x3f, %rdi
	jz	L(align64_loop)

	/* Otherwise test another 64 bytes in 16-byte chunks.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	/* Round %rdi down to a 64-byte boundary.  The 64 bytes below the
	   old %rdi were all tested just above, so rounding down only
	   causes already-tested bytes to be tested again.  */
	and	$-64, %rdi

	.p2align 4
L(align64_loop):
	/* Main loop: %rdi is 64-byte aligned.  Load and compare four
	   16-byte chunks per iteration.  */
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Every compare result byte is 0x00 or 0xff, so the unsigned
	   byte maximum acts as a bitwise OR.  %xmm4 ends up non-zero
	   exactly when any of the four chunks has a match.  This
	   overwrites %xmm3 and %xmm4; %xmm0 and %xmm2 keep their own
	   compare results.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	/* A match lies in the 64 bytes just tested; point %rdi back at
	   the start of that block and find the chunk that holds it.  */
	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* The result for chunk 2 was overwritten by pmaxub, so reload
	   and compare it again.  */
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3

	/* Compare chunk 3 directly into %xmm1.  This destroys the
	   broadcast search byte, which is not read again before the
	   function returns.  */
	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* Chunks 0-2 had no match, so the match is in chunk 3.  */
	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	/* Exit paths.  On entry %eax holds the non-zero match mask of one
	   16-byte chunk; bsf yields the index of the lowest matching
	   byte within that chunk.  */

	.p2align 4
L(matches0):
	/* Chunk at %rdi - 16: reached from the prolog after %rdi had
	   already been advanced by 64 past a chunk at offset 48.  */
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	/* Chunk at %rdi.  */
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	/* Chunk at %rdi + 16.  */
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	/* Chunk at %rdi + 32.  */
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

END (__rawmemchr)

/* NOTE(review): presumably these publish rawmemchr as a weak alias of
   __rawmemchr and emit the hidden definition used for internal calls;
   both macros are defined outside this file, confirm there.  */
weak_alias (__rawmemchr, rawmemchr)
libc_hidden_builtin_def (__rawmemchr)