/* wcsrchr with SSE2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* wcsrchr (s, c): return the address of the last 4-byte element of the
   zero-terminated string S that equals C, or NULL if there is none.
   The terminator itself takes part in the search, so C == 0 yields the
   address of the terminator.

   In:   %rdi = s, %esi = c.
   Out:  %rax = address of the last match, or 0.
   Uses: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %xmm0-%xmm5, flags.

   Register roles while scanning:
     %xmm1  C replicated into all four dwords.
     %xmm2  all zeroes before every compare; afterwards it holds the
	    "element == 0" result, which is all zeroes again unless the
	    terminator was found.
     %rdi   16 bytes past the start of the block examined last.
     %eax   pmovmskb mask of elements equal to C in that block.
     %ecx   pmovmskb mask of zero elements (or, right after a scan step,
	    that mask OR-ed with %eax).
     %r8d   match mask of the latest block known to contain C (0: none).
     %rsi   value %rdi had for that block.

   pmovmskb yields four identical bits per element: bits 4*i .. 4*i+3
   describe element i of the block.  */

/* One step of the unrolled loop: load the aligned block at (%rdi) into
   VEC, advance %rdi past it, and leave the match mask in %eax and
   (zero mask | match mask) in %ecx.  ZF is set iff the block contains
   neither C nor the terminator.  */
#define WCSRCHR_SCAN_VEC(vec)			\
	movdqa	(%rdi), vec;			\
	pcmpeqd	vec, %xmm2;			\
	add	$16, %rdi;			\
	pcmpeqd	%xmm1, vec;			\
	pmovmskb %xmm2, %ecx;			\
	pmovmskb vec, %eax;			\
	or	%eax, %ecx

/* %eax is a non-zero match mask for the block ending at %rdi: return
   the address of the highest matching element of that block.  */
#define WCSRCHR_RET_LAST			\
	test	$0xf0, %ah;			\
	jnz	L(ret_elem3);			\
	test	%ah, %ah;			\
	jnz	L(ret_elem2);			\
	test	$0xf0, %al;			\
	jnz	L(ret_elem1);			\
	lea	-16(%rdi), %rax;		\
	ret

	.text
ENTRY (wcsrchr)

	/* Replicate C into every dword of %xmm1 and clear %xmm2.  */
	movd	%esi, %xmm1
	pxor	%xmm2, %xmm2
	punpckldq %xmm1, %xmm1
	punpckldq %xmm1, %xmm1

	/* With an offset of at most 48 inside a 64-byte line, a 16-byte
	   load from S stays inside that line.  */
	mov	%edi, %ecx
	and	$63, %ecx
	cmp	$48, %ecx
	ja	L(head_aligned_load)

	/* Head, variant 1: unaligned load straight from S.  */
	movdqu	(%rdi), %xmm0
	add	$16, %rdi
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax

	test	%eax, %eax
	jnz	L(head_match_unaligned)

	/* No C in the head; if the string already ends here, fail.  */
	test	%ecx, %ecx
	jnz	L(ret_null)

	/* Round down to a 16-byte boundary.  The loop may look again at
	   bytes the head already covered.  */
	and	$-16, %rdi
	xor	%r8d, %r8d
	jmp	L(loop)

	.p2align 4
L(head_match_unaligned):
	test	%ecx, %ecx
	jnz	L(head_find_zero)

	/* Remember the head as the latest block holding a match.  */
	mov	%eax, %r8d
	mov	%rdi, %rsi
	and	$-16, %rdi
	jmp	L(loop)

	/* Head, variant 2: load the enclosing aligned block and shift the
	   bits that precede S out of both masks.  */
	.p2align 4
L(head_aligned_load):
	and	$15, %ecx
	and	$-16, %rdi
	pxor	%xmm3, %xmm3
	movdqa	(%rdi), %xmm0
	add	$16, %rdi
	pcmpeqd	%xmm0, %xmm3
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm3, %edx
	pmovmskb %xmm0, %eax
	shr	%cl, %edx
	shr	%cl, %eax

	test	%eax, %eax
	jnz	L(head_match_aligned)

	test	%edx, %edx
	jnz	L(ret_null)

	xor	%r8d, %r8d
	jmp	L(loop)

	.p2align 4
L(head_match_aligned):
	test	%edx, %edx
	jnz	L(head_find_zero_aligned)

	/* The shifted mask is relative to S, so the matching end pointer
	   is S + 16 = next aligned block + (S & 15).  */
	mov	%eax, %r8d
	lea	(%rdi, %rcx), %rsi

/* Main loop over 16-byte aligned blocks, unrolled four times.  */
	.p2align 4
L(loop):
	WCSRCHR_SCAN_VEC (%xmm0)
	jnz	L(block_hit)

	WCSRCHR_SCAN_VEC (%xmm3)
	jnz	L(block_hit)

	WCSRCHR_SCAN_VEC (%xmm4)
	jnz	L(block_hit)

	WCSRCHR_SCAN_VEC (%xmm5)
	jz	L(loop)

	/* The block just scanned contains C, the terminator, or both.  */
	.p2align 4
L(block_hit):
	test	%eax, %eax
	jnz	L(block_has_match)
	/* Only the terminator: the answer is the remembered match.  */
L(ret_saved):
	test	%r8d, %r8d
	jz	L(ret_null)
	mov	%r8d, %eax
	mov	%rsi, %rdi

	WCSRCHR_RET_LAST

	.p2align 4
L(block_has_match):
	/* %ecx was merged with %eax above; fetch the pure zero mask.  */
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(find_zero)
	/* Match without terminator: remember this block and go on.  */
	mov	%eax, %r8d
	mov	%rdi, %rsi
	jmp	L(loop)

	/* The block holds both C and the terminator.  Keep only the match
	   bits up to and including the lowest bit of the terminator's own
	   group (so that C == 0 finds the terminator); if nothing is left,
	   fall back to the remembered match.  */
	.p2align 4
L(find_zero):
	test	$0x0f, %cl
	jnz	L(zero_at_elem0)
	test	%cl, %cl
	jnz	L(zero_at_elem1)
	test	$0x0f, %ch
	jnz	L(zero_at_elem2)

	/* Terminator is element 3.  */
	and	$0x1fff, %eax
	jz	L(ret_saved)

	WCSRCHR_RET_LAST

	.p2align 4
L(zero_at_elem0):
	test	$1, %al
	jz	L(ret_saved)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(zero_at_elem1):
	and	$0x1f, %eax
	jz	L(ret_saved)

	test	$0xf0, %al
	jnz	L(ret_elem1)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(zero_at_elem2):
	and	$0x1ff, %eax
	jz	L(ret_saved)

	test	%ah, %ah
	jnz	L(ret_elem2)
	test	$0xf0, %al
	jnz	L(ret_elem1)
	lea	-16(%rdi), %rax
	ret

	/* Same decoding for the head, where no earlier match can exist and
	   the fallback is therefore NULL.  On entry to L(head_find_zero),
	   %rdi = S + 16, %eax = match mask and %ecx = zero mask, both
	   relative to S.  */
	.p2align 4
L(head_find_zero_aligned):
	add	%rcx, %rdi
	mov	%edx, %ecx
L(head_find_zero):
	test	$0x0f, %cl
	jnz	L(head_zero_at_elem0)
	test	%cl, %cl
	jnz	L(head_zero_at_elem1)
	test	$0x0f, %ch
	jnz	L(head_zero_at_elem2)

	and	$0x1fff, %eax
	jz	L(ret_null)

	WCSRCHR_RET_LAST

	.p2align 4
L(head_zero_at_elem0):
	test	$1, %al
	jz	L(ret_null)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(head_zero_at_elem1):
	and	$0x1f, %eax
	jz	L(ret_null)

	test	$0xf0, %al
	jnz	L(ret_elem1)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(head_zero_at_elem2):
	and	$0x1ff, %eax
	jz	L(ret_null)

	test	%ah, %ah
	jnz	L(ret_elem2)
	test	$0xf0, %al
	jnz	L(ret_elem1)
	lea	-16(%rdi), %rax
	ret

	/* Shared exits: element N (counted from 0) of the block that ends
	   at %rdi.  */
	.p2align 4
L(ret_elem1):
	lea	-12(%rdi), %rax
	ret

	.p2align 4
L(ret_elem2):
	lea	-8(%rdi), %rax
	ret

	.p2align 4
L(ret_elem3):
	lea	-4(%rdi), %rax
	ret

	.p2align 4
L(ret_null):
	xor	%eax, %eax
	ret

END (wcsrchr)