/* SSE2 version of strlen and SSE4.1 version of wcslen.
   Copyright (C) 2012-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef AS_WCSLEN
# define PMINU		pminud
# define PCMPEQ		pcmpeqd
# define SHIFT_RETURN	shrq $2, %rax
#else
# define PMINU		pminub
# define PCMPEQ		pcmpeqb
# define SHIFT_RETURN
#endif

/* Long lived registers in strlen (s) and strnlen (s, n) are:

	%xmm3 - zero
	%rdi  - s
	%r10  - (s + n) & (~(64 - 1))
	%r11  - s + n
*/


	.text
ENTRY(strlen)

/* Test 64 bytes from %rax for zero.  Save result as bitmask in %rdx.  */
#define FIND_ZERO	\
	PCMPEQ	(%rax), %xmm0;	\
	PCMPEQ	16(%rax), %xmm1;	\
	PCMPEQ	32(%rax), %xmm2;	\
	PCMPEQ	48(%rax), %xmm3;	\
	pmovmskb	%xmm0, %esi;	\
	pmovmskb	%xmm1, %edx;	\
	pmovmskb	%xmm2, %r8d;	\
	pmovmskb	%xmm3, %ecx;	\
	salq	$16, %rdx;	\
	salq	$16, %rcx;	\
	orq	%rsi, %rdx;	\
	orq	%r8, %rcx;	\
	salq	$32, %rcx;	\
	orq	%rcx, %rdx;

#ifdef AS_STRNLEN
/* Do not read anything when n == 0.  */
	test	%RSI_LP, %RSI_LP
	jne	L(n_nonzero)
	xor	%rax, %rax
	ret
L(n_nonzero):
# ifdef AS_WCSLEN
/* Check for overflow from maxlen * sizeof(wchar_t).  If it would
   overflow, the only way this program does not have undefined behavior
   is if there is a null terminator in valid memory, so wcslen will
   suffice.  */
	mov	%RSI_LP, %R10_LP
	sar	$62, %R10_LP
	jnz	__wcslen_sse4_1
	sal	$2, %RSI_LP
# endif

/* Initialize long lived registers.  */
	add	%RDI_LP, %RSI_LP
	mov	%RSI_LP, %R10_LP
	and	$-64, %R10_LP
	mov	%RSI_LP, %R11_LP
#endif

	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	movq	%rdi, %rax
	movq	%rdi, %rcx
	andq	$4095, %rcx
/* Offsets 4032-4047 will be aligned down to 4032, thus fit into a page.  */
	cmpq	$4047, %rcx
/* We cannot unify this branching as it would be ~6 cycles slower.  */
	ja	L(cross_page)

#ifdef AS_STRNLEN
/* Test if end is among first 64 bytes.  */
# define STRNLEN_PROLOG	\
	mov	%r11, %rsi;	\
	subq	%rax, %rsi;	\
	andq	$-64, %rax;	\
	testq	$-64, %rsi;	\
	je	L(strnlen_ret)
#else
# define STRNLEN_PROLOG  andq $-64, %rax;
#endif

/* Ignore bits in mask that come before start of string.  */
#define PROLOG(lab)	\
	movq	%rdi, %rcx;	\
	xorq	%rax, %rcx;	\
	STRNLEN_PROLOG;	\
	sarq	%cl, %rdx;	\
	test	%rdx, %rdx;	\
	je	L(lab);	\
	bsfq	%rdx, %rax;	\
	SHIFT_RETURN;	\
	ret

#ifdef AS_STRNLEN
	andq	$-16, %rax
	FIND_ZERO
#else
	/* Test first 16 bytes unaligned.  */
	movdqu	(%rax), %xmm4
	PCMPEQ	%xmm0, %xmm4
	pmovmskb	%xmm4, %edx
	test	%edx, %edx
	je	L(next48_bytes)
	bsf	%edx, %eax /* If eax is zeroed, 16-bit bsf can be used.  */
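/* %rax still equals s at this point, so the bsf result is already the
   length in bytes; for wcslen, SHIFT_RETURN (shrq $2, %rax) converts
   the byte index of the first zero wide character into a wchar_t
   count.  */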
	SHIFT_RETURN
	ret

L(next48_bytes):
/* Same as FIND_ZERO except that we do not check the first 16 bytes.  */
	andq	$-16, %rax
	PCMPEQ	16(%rax), %xmm1
	PCMPEQ	32(%rax), %xmm2
	PCMPEQ	48(%rax), %xmm3
	pmovmskb	%xmm1, %edx
	pmovmskb	%xmm2, %r8d
	pmovmskb	%xmm3, %ecx
	salq	$16, %rdx
	salq	$16, %rcx
	orq	%r8, %rcx
	salq	$32, %rcx
	orq	%rcx, %rdx
#endif

	/* When no zero byte is found, xmm1-3 are zero, so we do not have
	   to zero them.  */
	PROLOG(loop)

	.p2align 4
L(cross_page):
	andq	$-64, %rax
	FIND_ZERO
	PROLOG(loop_init)

#ifdef AS_STRNLEN
/* We must do this check to correctly handle strnlen (s, -1).  */
L(strnlen_ret):
	bts	%rsi, %rdx
	sarq	%cl, %rdx
	test	%rdx, %rdx
	je	L(loop_init)
	bsfq	%rdx, %rax
	SHIFT_RETURN
	ret
#endif
	.p2align 4
L(loop_init):
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
#ifdef AS_STRNLEN
	.p2align 4
L(loop):

	addq	$64, %rax
	cmpq	%rax, %r10
	je	L(exit_end)

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit)
	jmp	L(loop)

	.p2align 4
L(exit_end):
	cmp	%rax, %r11
	je	L(first) /* Do not read when end is at page boundary.  */
	pxor	%xmm0, %xmm0
	FIND_ZERO

L(first):
	bts	%r11, %rdx
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

	.p2align 4
L(exit):
	pxor	%xmm0, %xmm0
	FIND_ZERO

	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

#else

	/* Main loop.  Unrolled twice to improve L2 cache performance on
	   core2.  */
	.p2align 4
L(loop):

	movdqa	64(%rax), %xmm0
	PMINU	80(%rax), %xmm0
	PMINU	96(%rax), %xmm0
	PMINU	112(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit64)

	subq	$-128, %rax

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit0)
	jmp	L(loop)

	.p2align 4
L(exit64):
	addq	$64, %rax
L(exit0):
	pxor	%xmm0, %xmm0
	FIND_ZERO

	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

#endif

END(strlen)
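
/* For reference, a minimal C sketch of the FIND_ZERO technique above,
   written with SSE2 intrinsics.  This block is illustrative only and is
   discarded by the preprocessor; the helper name find_zero64 is
   hypothetical and not part of glibc.  It assumes p is 16-byte aligned
   with 64 readable bytes.  */
#if 0
#include <emmintrin.h>
#include <stdint.h>

static uint64_t
find_zero64 (const unsigned char *p)
{
  const __m128i zero = _mm_setzero_si128 ();
  /* Each pcmpeqb + pmovmskb pair yields a 16-bit mask in which bit i
     is set iff byte i of that 16-byte chunk is zero.  */
  uint64_t m0 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 0)), zero));
  uint64_t m1 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 16)), zero));
  uint64_t m2 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 32)), zero));
  uint64_t m3 = (uint16_t) _mm_movemask_epi8
    (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 48)), zero));
  /* Combine the four masks into one 64-bit mask, exactly as the
     salq/orq sequence does; bsfq on the result gives the offset of the
     first zero byte within the 64-byte window.  */
  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
}
#endif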