/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR	__strchr_avx2
# endif

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_REG	esi
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_REG	sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	VPBROADCAST	%xmm0, %ymm0
	vpxor	%xmm9, %xmm9, %xmm9

	/* Check if we cross a page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
	   null byte.  */
	vmovdqu	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	/* .p2align 5 helps keep performance more consistent if ENTRY()
	   alignment % 32 was either 16 or 0.  It also keeps the
	   alignment % 32 of loop_4x_vec fixed, which makes tuning it
	   easier.  */
	.p2align 5
L(first_vec_x4):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	incq	%rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

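	/* No CHAR or null byte was found in the first (unaligned)
	   vector checked at entry.  Check four more vectors one at a
	   time starting at the first aligned address, then align to a
	   4 * VEC_SIZE boundary and fall into the unrolled loop
	   below.  */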
	.p2align 4
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
	   on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vmovdqa	1(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)
	/* Align data to VEC_SIZE * 4 - 1.  */
	addq	$(VEC_SIZE * 4 + 1), %rdi
	andq	$-(VEC_SIZE * 4), %rdi
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	(%rdi), %ymm5
	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8

	/* Leaves only CHARs matching esi as 0.  */
	vpxor	%ymm5, %ymm0, %ymm1
	vpxor	%ymm6, %ymm0, %ymm2
	vpxor	%ymm7, %ymm0, %ymm3
	vpxor	%ymm8, %ymm0, %ymm4

	VPMINU	%ymm1, %ymm5, %ymm1
	VPMINU	%ymm2, %ymm6, %ymm2
	VPMINU	%ymm3, %ymm7, %ymm3
	VPMINU	%ymm4, %ymm8, %ymm4

	VPMINU	%ymm1, %ymm2, %ymm5
	VPMINU	%ymm3, %ymm4, %ymm6

	VPMINU	%ymm5, %ymm6, %ymm6

	VPCMPEQ	%ymm6, %ymm9, %ymm6
	vpmovmskb	%ymm6, %ecx
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm9, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)


	VPCMPEQ	%ymm5, %ymm9, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	%ymm3, %ymm9, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has the combined result from all 4 VEC.  It will only be
	   used if none of the first 3 VEC contained a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x0):
	tzcntl	%eax, %eax
	addq	$-(VEC_SIZE * 4), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	/* Cold case for crossing page with first load.  */
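	/* The aligned load below reads from the VEC_SIZE-aligned block
	   containing the start of the string, so it cannot cross into
	   the next page.  Match bits for bytes before the start are
	   shifted out with sarxl.  */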
	.p2align 4
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align rdi to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes.  sarxl only uses the low 5 bits of
	   COUNT so there is no need to manually mod edx.  */
	sarxl	%edx, %eax, %eax
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	xorl	%ecx, %ecx
	/* Found CHAR or the null byte.  */
	cmp	(%rdx, %rax), %CHAR_REG
	leaq	(%rdx, %rax), %rax
	cmovne	%rcx, %rax
# else
	addq	%rdx, %rax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

END (STRCHR)
# endif