/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# define VMOVA		vmovdqa64

# define YMMMATCH	ymm16

# define VEC_SIZE	32

	.section .text.evex,"ax",@progbits
ENTRY (__memrchr_evex)
	/* Broadcast CHAR to YMMMATCH.  */
	vpbroadcastb %esi, %YMMMATCH

	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

	add	%RDX_LP, %RDI_LP

	/* Check the last VEC_SIZE bytes.  */
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx

	.p2align 4
L(aligned_more):
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
	kmovd	%k4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
	   There are some overlaps with above if data isn't aligned
	   to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time, moving backward through the
	   buffer.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
	kord	%k1, %k2, %k5
	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4

	kord	%k3, %k4, %k6
	kortestd %k5, %k6
	jz	L(loop_4x_vec)

	/* There is a match.  */
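	/* The mask registers are checked from the highest-addressed
	   vector (%k4, covering VEC_SIZE * 3 above %rdi) down to %k1
	   so that the match closest to the end of the buffer wins.  */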
	kmovd	%k4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	kmovd	%k1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_4x_vec_or_less):
	addl	$(VEC_SIZE * 4), %edx
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k3
	kmovd	%k3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpb	$0, (%rdi), %YMMMATCH, %k4
	kmovd	%k4, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_2x_vec):
	vpcmpb	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpb	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_vec_or_less_aligned):
	movl	%edx, %ecx

	vpcmpb	$0, (%rdi), %YMMMATCH, %k1

	movl	$1, %edx
	/* Support rdx << 32.  */
	salq	%cl, %rdx
	subq	$1, %rdx

	kmovd	%k1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	ret

	.p2align 4
L(last_vec_or_less):
	addl	$VEC_SIZE, %edx

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(zero)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

	movl	%ecx, %esi
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)

	/* Check the last VEC.  */
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1
	kmovd	%k1, %eax

	/* Remove the leading and trailing bytes.  */
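	/* %ecx holds the misalignment of the original pointer, so the
	   shift below drops match bits that precede the requested
	   start, and the (1 << length) - 1 mask drops bits past the
	   requested length.  %r8 keeps the misalignment so the aligned
	   base in %rdi can be rebased to the caller's pointer when
	   forming the return value.  */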
	sarl	%cl, %eax
	movl	%edx, %ecx

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	ret

	.p2align 4
L(last_vec_2x_aligned):
	movl	%esi, %ecx

	/* Check the last VEC.  */
	vpcmpb	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	kmovd	%k1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second last VEC.  */
	vpcmpb	$0, (%rdi), %YMMMATCH, %k1

	movl	%r8d, %ecx

	kmovd	%k1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	ret
END (__memrchr_evex)
#endif
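
/* For reference only (not assembled): a minimal byte-at-a-time C sketch of
   the memrchr contract that this file vectorizes, assuming the usual
   semantics of returning a pointer to the last occurrence of C in the
   first N bytes of S, or NULL when there is none.  The name memrchr_ref
   is purely illustrative.

     #include <stddef.h>

     void *
     memrchr_ref (const void *s, int c, size_t n)
     {
       const unsigned char *p = (const unsigned char *) s + n;

       while (n--)
	 if (*--p == (unsigned char) c)
	   return (void *) p;

       return NULL;
     }
*/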