/* memrchr optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMRCHR
#  define MEMRCHR	__memrchr_avx2
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32

	.section SECTION(.text),"ax",@progbits
ENTRY (MEMRCHR)
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0

	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx

	.p2align 4
L(aligned_more):
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE bytes.  Only one VEC_SIZE at a
	   time since data is only aligned to VEC_SIZE.  */
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time.  The loop walks backwards through
	   the buffer; the four VECs within a block are loaded in
	   ascending address order.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4

	vpcmpeqb %ymm1, %ymm0, %ymm1
	vpcmpeqb %ymm2, %ymm0, %ymm2
	vpcmpeqb %ymm3, %ymm0, %ymm3
	vpcmpeqb %ymm4, %ymm0, %ymm4

	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jz	L(loop_4x_vec)

	/* There is a match.  */
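	/* %ymm4 holds the compare result for the highest-addressed VEC
	   of this 4 * VEC_SIZE block and %ymm1 the lowest.  memrchr
	   must return the last occurrence, so scan the masks from high
	   to low and take the highest set bit of the first nonzero
	   one.  */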
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpmovmskb %ymm1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(last_4x_vec_or_less):
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN

	.p2align 4
L(null):
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_vec_or_less_aligned):
	movl	%edx, %ecx

	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	$1, %edx
	/* Support rdx << 32: use a 64-bit shift so that a count of
	   VEC_SIZE (32) does not wrap and rdx - 1 still yields an
	   all-ones mask.  */
	salq	%cl, %rdx
	subq	$1, %rdx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_or_less):
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(null)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

	movl	%ecx, %esi
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)

	/* Check the last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
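	/* Here %rdi has been rounded down to a VEC_SIZE boundary, %ecx
	   and %r8d hold the misalignment of the original pointer and
	   %edx the length.  Bit i of the compare mask corresponds to
	   byte i of the aligned VEC, so clear the bits before the
	   start and past the end before taking the highest set bit.  */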
	/* Remove the leading and trailing bytes.  */
	sarl	%cl, %eax
	movl	%edx, %ecx

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_2x_aligned):
	movl	%esi, %ecx

	/* Check the last VEC.  */
	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	%r8d, %ecx

	vpmovmskb %ymm1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER_RETURN
END (MEMRCHR)
#endif