1/* strchr/strchrnul optimized with 256-bit EVEX instructions. 2 Copyright (C) 2021 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#if IS_IN (libc) 20 21# include <sysdep.h> 22 23# ifndef STRCHR 24# define STRCHR __strchr_evex 25# endif 26 27# define VMOVU vmovdqu64 28# define VMOVA vmovdqa64 29 30# ifdef USE_AS_WCSCHR 31# define VPBROADCAST vpbroadcastd 32# define VPCMP vpcmpd 33# define VPMINU vpminud 34# define CHAR_REG esi 35# define SHIFT_REG ecx 36# define CHAR_SIZE 4 37# else 38# define VPBROADCAST vpbroadcastb 39# define VPCMP vpcmpb 40# define VPMINU vpminub 41# define CHAR_REG sil 42# define SHIFT_REG edx 43# define CHAR_SIZE 1 44# endif 45 46# define XMMZERO xmm16 47 48# define YMMZERO ymm16 49# define YMM0 ymm17 50# define YMM1 ymm18 51# define YMM2 ymm19 52# define YMM3 ymm20 53# define YMM4 ymm21 54# define YMM5 ymm22 55# define YMM6 ymm23 56# define YMM7 ymm24 57# define YMM8 ymm25 58 59# define VEC_SIZE 32 60# define PAGE_SIZE 4096 61# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) 62 63 .section .text.evex,"ax",@progbits 64ENTRY (STRCHR) 65 /* Broadcast CHAR to YMM0. */ 66 VPBROADCAST %esi, %YMM0 67 movl %edi, %eax 68 andl $(PAGE_SIZE - 1), %eax 69 vpxorq %XMMZERO, %XMMZERO, %XMMZERO 70 71 /* Check if we cross page boundary with one vector load. 72 Otherwise it is safe to use an unaligned load. */ 73 cmpl $(PAGE_SIZE - VEC_SIZE), %eax 74 ja L(cross_page_boundary) 75 76 /* Check the first VEC_SIZE bytes. Search for both CHAR and the 77 null bytes. */ 78 VMOVU (%rdi), %YMM1 79 80 /* Leaves only CHARS matching esi as 0. */ 81 vpxorq %YMM1, %YMM0, %YMM2 82 VPMINU %YMM2, %YMM1, %YMM2 83 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ 84 VPCMP $0, %YMMZERO, %YMM2, %k0 85 kmovd %k0, %eax 86 testl %eax, %eax 87 jz L(aligned_more) 88 tzcntl %eax, %eax 89# ifdef USE_AS_WCSCHR 90 /* NB: Multiply wchar_t count by 4 to get the number of bytes. 91 */ 92 leaq (%rdi, %rax, CHAR_SIZE), %rax 93# else 94 addq %rdi, %rax 95# endif 96# ifndef USE_AS_STRCHRNUL 97 /* Found CHAR or the null byte. */ 98 cmp (%rax), %CHAR_REG 99 jne L(zero) 100# endif 101 ret 102 103 /* .p2align 5 helps keep performance more consistent if ENTRY() 104 alignment % 32 was either 16 or 0. As well this makes the 105 alignment % 32 of the loop_4x_vec fixed which makes tuning it 106 easier. */ 107 .p2align 5 108L(first_vec_x3): 109 tzcntl %eax, %eax 110# ifndef USE_AS_STRCHRNUL 111 /* Found CHAR or the null byte. */ 112 cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG 113 jne L(zero) 114# endif 115 /* NB: Multiply sizeof char type (1 or 4) to get the number of 116 bytes. */ 117 leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax 118 ret 119 120# ifndef USE_AS_STRCHRNUL 121L(zero): 122 xorl %eax, %eax 123 ret 124# endif 125 126 .p2align 4 127L(first_vec_x4): 128# ifndef USE_AS_STRCHRNUL 129 /* Check to see if first match was CHAR (k0) or null (k1). */ 130 kmovd %k0, %eax 131 tzcntl %eax, %eax 132 kmovd %k1, %ecx 133 /* bzhil will not be 0 if first match was null. */ 134 bzhil %eax, %ecx, %ecx 135 jne L(zero) 136# else 137 /* Combine CHAR and null matches. */ 138 kord %k0, %k1, %k0 139 kmovd %k0, %eax 140 tzcntl %eax, %eax 141# endif 142 /* NB: Multiply sizeof char type (1 or 4) to get the number of 143 bytes. */ 144 leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax 145 ret 146 147 .p2align 4 148L(first_vec_x1): 149 tzcntl %eax, %eax 150# ifndef USE_AS_STRCHRNUL 151 /* Found CHAR or the null byte. */ 152 cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG 153 jne L(zero) 154 155# endif 156 /* NB: Multiply sizeof char type (1 or 4) to get the number of 157 bytes. */ 158 leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax 159 ret 160 161 .p2align 4 162L(first_vec_x2): 163# ifndef USE_AS_STRCHRNUL 164 /* Check to see if first match was CHAR (k0) or null (k1). */ 165 kmovd %k0, %eax 166 tzcntl %eax, %eax 167 kmovd %k1, %ecx 168 /* bzhil will not be 0 if first match was null. */ 169 bzhil %eax, %ecx, %ecx 170 jne L(zero) 171# else 172 /* Combine CHAR and null matches. */ 173 kord %k0, %k1, %k0 174 kmovd %k0, %eax 175 tzcntl %eax, %eax 176# endif 177 /* NB: Multiply sizeof char type (1 or 4) to get the number of 178 bytes. */ 179 leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax 180 ret 181 182 .p2align 4 183L(aligned_more): 184 /* Align data to VEC_SIZE. */ 185 andq $-VEC_SIZE, %rdi 186L(cross_page_continue): 187 /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since 188 data is only aligned to VEC_SIZE. Use two alternating methods 189 for checking VEC to balance latency and port contention. */ 190 191 /* This method has higher latency but has better port 192 distribution. */ 193 VMOVA (VEC_SIZE)(%rdi), %YMM1 194 /* Leaves only CHARS matching esi as 0. */ 195 vpxorq %YMM1, %YMM0, %YMM2 196 VPMINU %YMM2, %YMM1, %YMM2 197 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ 198 VPCMP $0, %YMMZERO, %YMM2, %k0 199 kmovd %k0, %eax 200 testl %eax, %eax 201 jnz L(first_vec_x1) 202 203 /* This method has higher latency but has better port 204 distribution. */ 205 VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 206 /* Each bit in K0 represents a CHAR in YMM1. */ 207 VPCMP $0, %YMM1, %YMM0, %k0 208 /* Each bit in K1 represents a CHAR in YMM1. */ 209 VPCMP $0, %YMM1, %YMMZERO, %k1 210 kortestd %k0, %k1 211 jnz L(first_vec_x2) 212 213 VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 214 /* Leaves only CHARS matching esi as 0. */ 215 vpxorq %YMM1, %YMM0, %YMM2 216 VPMINU %YMM2, %YMM1, %YMM2 217 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ 218 VPCMP $0, %YMMZERO, %YMM2, %k0 219 kmovd %k0, %eax 220 testl %eax, %eax 221 jnz L(first_vec_x3) 222 223 VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 224 /* Each bit in K0 represents a CHAR in YMM1. */ 225 VPCMP $0, %YMM1, %YMM0, %k0 226 /* Each bit in K1 represents a CHAR in YMM1. */ 227 VPCMP $0, %YMM1, %YMMZERO, %k1 228 kortestd %k0, %k1 229 jnz L(first_vec_x4) 230 231 /* Align data to VEC_SIZE * 4 for the loop. */ 232 addq $VEC_SIZE, %rdi 233 andq $-(VEC_SIZE * 4), %rdi 234 235 .p2align 4 236L(loop_4x_vec): 237 /* Check 4x VEC at a time. No penalty to imm32 offset with evex 238 encoding. */ 239 VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 240 VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 241 VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 242 VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 243 244 /* For YMM1 and YMM3 use xor to set the CHARs matching esi to 245 zero. */ 246 vpxorq %YMM1, %YMM0, %YMM5 247 /* For YMM2 and YMM4 cmp not equals to CHAR and store result in 248 k register. Its possible to save either 1 or 2 instructions 249 using cmp no equals method for either YMM1 or YMM1 and YMM3 250 respectively but bottleneck on p5 makes it not worth it. */ 251 VPCMP $4, %YMM0, %YMM2, %k2 252 vpxorq %YMM3, %YMM0, %YMM7 253 VPCMP $4, %YMM0, %YMM4, %k4 254 255 /* Use min to select all zeros from either xor or end of string). 256 */ 257 VPMINU %YMM1, %YMM5, %YMM1 258 VPMINU %YMM3, %YMM7, %YMM3 259 260 /* Use min + zeromask to select for zeros. Since k2 and k4 will 261 have 0 as positions that matched with CHAR which will set 262 zero in the corresponding destination bytes in YMM2 / YMM4. 263 */ 264 VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} 265 VPMINU %YMM3, %YMM4, %YMM4 266 VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} 267 268 VPCMP $0, %YMMZERO, %YMM4, %k1 269 kmovd %k1, %ecx 270 subq $-(VEC_SIZE * 4), %rdi 271 testl %ecx, %ecx 272 jz L(loop_4x_vec) 273 274 VPCMP $0, %YMMZERO, %YMM1, %k0 275 kmovd %k0, %eax 276 testl %eax, %eax 277 jnz L(last_vec_x1) 278 279 VPCMP $0, %YMMZERO, %YMM2, %k0 280 kmovd %k0, %eax 281 testl %eax, %eax 282 jnz L(last_vec_x2) 283 284 VPCMP $0, %YMMZERO, %YMM3, %k0 285 kmovd %k0, %eax 286 /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ 287# ifdef USE_AS_WCSCHR 288 sall $8, %ecx 289 orl %ecx, %eax 290 tzcntl %eax, %eax 291# else 292 salq $32, %rcx 293 orq %rcx, %rax 294 tzcntq %rax, %rax 295# endif 296# ifndef USE_AS_STRCHRNUL 297 /* Check if match was CHAR or null. */ 298 cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG 299 jne L(zero_end) 300# endif 301 /* NB: Multiply sizeof char type (1 or 4) to get the number of 302 bytes. */ 303 leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax 304 ret 305 306# ifndef USE_AS_STRCHRNUL 307L(zero_end): 308 xorl %eax, %eax 309 ret 310# endif 311 312 .p2align 4 313L(last_vec_x1): 314 tzcntl %eax, %eax 315# ifndef USE_AS_STRCHRNUL 316 /* Check if match was null. */ 317 cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG 318 jne L(zero_end) 319# endif 320 /* NB: Multiply sizeof char type (1 or 4) to get the number of 321 bytes. */ 322 leaq (%rdi, %rax, CHAR_SIZE), %rax 323 ret 324 325 .p2align 4 326L(last_vec_x2): 327 tzcntl %eax, %eax 328# ifndef USE_AS_STRCHRNUL 329 /* Check if match was null. */ 330 cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG 331 jne L(zero_end) 332# endif 333 /* NB: Multiply sizeof char type (1 or 4) to get the number of 334 bytes. */ 335 leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax 336 ret 337 338 /* Cold case for crossing page with first load. */ 339 .p2align 4 340L(cross_page_boundary): 341 movq %rdi, %rdx 342 /* Align rdi. */ 343 andq $-VEC_SIZE, %rdi 344 VMOVA (%rdi), %YMM1 345 /* Leaves only CHARS matching esi as 0. */ 346 vpxorq %YMM1, %YMM0, %YMM2 347 VPMINU %YMM2, %YMM1, %YMM2 348 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ 349 VPCMP $0, %YMMZERO, %YMM2, %k0 350 kmovd %k0, %eax 351 /* Remove the leading bits. */ 352# ifdef USE_AS_WCSCHR 353 movl %edx, %SHIFT_REG 354 /* NB: Divide shift count by 4 since each bit in K1 represent 4 355 bytes. */ 356 sarl $2, %SHIFT_REG 357 andl $(CHAR_PER_VEC - 1), %SHIFT_REG 358# endif 359 sarxl %SHIFT_REG, %eax, %eax 360 /* If eax is zero continue. */ 361 testl %eax, %eax 362 jz L(cross_page_continue) 363 tzcntl %eax, %eax 364# ifndef USE_AS_STRCHRNUL 365 /* Check to see if match was CHAR or null. */ 366 cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG 367 jne L(zero_end) 368# endif 369# ifdef USE_AS_WCSCHR 370 /* NB: Multiply wchar_t count by 4 to get the number of 371 bytes. */ 372 leaq (%rdx, %rax, CHAR_SIZE), %rax 373# else 374 addq %rdx, %rax 375# endif 376 ret 377 378END (STRCHR) 379# endif 380