/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_evex
# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

/* Element-size dependent pieces.  For wcsrchr every mask bit stands
   for one 4-byte element, so the byte offset used as a shift count has
   to be scaled into a separate register (R8D); for strrchr the byte
   offset in ECX is used directly.  */
# ifdef USE_AS_WCSRCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMP		vpcmpd
#  define SHIFT_REG	r8d
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMP		vpcmpb
#  define SHIFT_REG	ecx
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMMMATCH	ymm17
# define YMM1		ymm18

# define VEC_SIZE	32

/* STRRCHR: find the last occurrence of CHAR in a null-terminated
   string.

   In:	RDI = string, ESI = CHAR.
   Out:	RAX = address of the last element equal to CHAR that is not
	past the terminating null, or 0 if there is none.

   Register roles inside the routine:
     RDI	cursor; it is advanced by VEC_SIZE right after each
		load, so it points one vector past the data whose masks
		are currently held in EAX/ECX.
     EAX	mask of CHAR matches in the current vector.
     ECX	mask of null elements in the current vector (also the
		byte offset of the string within its vector on the
		page-cross path, and scratch inside the loop).
     EDX	mask of the most recent vector that contained a match
		before any null was seen; zero when nothing has been
		remembered yet.
     RSI	value paired with EDX: the remembered match lives at
		RSI - VEC_SIZE + bit index * element size.
     R8D	scratch.
     K0/K1	null mask / CHAR mask produced by VPCMP.
     YMM16-YMM18 zero vector, broadcast CHAR, loaded data.  */

	.section .text.evex,"ax",@progbits
ENTRY (STRRCHR)
	movl	%edi, %ecx
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH

	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO

	/* Check if we may cross page boundary with one vector load.  Only
	   the offset within a 2 * VEC_SIZE block is examined: when it is
	   larger than VEC_SIZE the aligned-load path is taken instead of
	   an unaligned load.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cros_page_boundary)

	VMOVU	(%rdi), %YMM1

	/* Each bit in K0 represents a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM1, %k0
	/* Each bit in K1 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k0, %ecx
	kmovd	%k1, %eax

	addq	$VEC_SIZE, %rdi

	testl	%eax, %eax
	jnz	L(first_vec)

	/* No CHAR in the first vector: a null here means no match at
	   all.  */
	testl	%ecx, %ecx
	jnz	L(return_null)

	/* Round the cursor down to a VEC_SIZE boundary for the aligned
	   loop and clear EDX: no match has been remembered.  The loop may
	   look again at the tail of the first vector, which holds neither
	   CHAR nor null.  */
	andq	$-VEC_SIZE, %rdi
	xorl	%edx, %edx
	jmp	L(aligned_loop)

	.p2align 4
L(first_vec):
	/* Check if there is a null byte.  */
	testl	%ecx, %ecx
	jnz	L(char_and_nul_in_first_vec)

	/* Remember the match and keep searching.  RSI keeps the unaligned
	   cursor that belongs to the mask in EDX; only then is RDI rounded
	   down for the aligned loop.  */
	movl	%eax, %edx
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rdi
	jmp	L(aligned_loop)

	.p2align 4
L(cros_page_boundary):
	/* ECX = byte offset of the string inside its aligned vector;
	   RDI = start of that aligned vector.  */
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi

# ifdef USE_AS_WCSRCHR
	/* NB: Divide the shift count by 4 since each bit in K1 represents
	   4 bytes.  */
	movl	%ecx, %SHIFT_REG
	sarl	$2, %SHIFT_REG
# endif

	VMOVA	(%rdi), %YMM1

	/* Each bit in K0 represents a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM1, %k0
	/* Each bit in K1 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k0, %edx
	kmovd	%k1, %eax

	/* Shift out the mask bits that belong to data in front of the
	   string, so that bit 0 corresponds to the first element of the
	   string.  */
	shrxl	%SHIFT_REG, %edx, %edx
	shrxl	%SHIFT_REG, %eax, %eax
	addq	$VEC_SIZE, %rdi

	/* Check if there is a CHAR.  */
	testl	%eax, %eax
	jnz	L(found_char)

	testl	%edx, %edx
	jnz	L(return_null)

	/* EDX is zero here (tested just above), which is exactly the
	   "nothing remembered" state the loop expects.  */
	jmp	L(aligned_loop)

	.p2align 4
L(found_char):
	/* On this path EDX still holds the shifted null mask.  */
	testl	%edx, %edx
	jnz	L(char_and_nul)

	/* Remember the match and keep searching.  The masks were shifted
	   by the string offset, so that offset (RCX) is added back into
	   the saved cursor.  */
	movl	%eax, %edx
	leaq	(%rdi, %rcx), %rsi

	/* Main loop, unrolled four times.  Each copy loads one aligned
	   vector, advances RDI, and leaves the loop as soon as the vector
	   contains a CHAR or a null.  ECX is overwritten by the OR of both
	   masks; K0 still holds the null mask afterwards.  */
	.p2align 4
L(aligned_loop):
	VMOVA	(%rdi), %YMM1
	addq	$VEC_SIZE, %rdi

	/* Each bit in K0 represents a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM1, %k0
	/* Each bit in K1 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k0, %ecx
	kmovd	%k1, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	VMOVA	(%rdi), %YMM1
	addq	$VEC_SIZE, %rdi

	/* Each bit in K0 represents a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM1, %k0
	/* Each bit in K1 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k0, %ecx
	kmovd	%k1, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	VMOVA	(%rdi), %YMM1
	addq	$VEC_SIZE, %rdi

	/* Each bit in K0 represents a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM1, %k0
	/* Each bit in K1 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k0, %ecx
	kmovd	%k1, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	VMOVA	(%rdi), %YMM1
	addq	$VEC_SIZE, %rdi

	/* Each bit in K0 represents a null byte in YMM1.  */
	VPCMP	$0, %YMMZERO, %YMM1, %k0
	/* Each bit in K1 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k0, %ecx
	kmovd	%k1, %eax
	orl	%eax, %ecx
	jz	L(aligned_loop)

	.p2align 4
L(char_nor_null):
	/* Find a CHAR or a null byte in a loop.  */
	testl	%eax, %eax
	jnz	L(match)
	/* No CHAR in this vector, so it must contain the null: fall
	   through and return whatever was remembered.  */
L(return_value):
	testl	%edx, %edx
	jz	L(return_null)
	/* The highest set bit of the remembered mask is the last match;
	   RSI points one vector past the data that mask describes.  */
	movl	%edx, %eax
	movq	%rsi, %rdi
	bsrl	%eax, %eax
# ifdef USE_AS_WCSRCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
# else
	leaq	-VEC_SIZE(%rdi, %rax), %rax
# endif
	ret

	.p2align 4
L(match):
	/* Find a CHAR.  Check if there is a null byte.  ECX was
	   overwritten by the OR in the loop, so the null mask is read
	   back from K0.  */
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(find_nul)

	/* Remember the match and keep searching.  */
	movl	%eax, %edx
	movq	%rdi, %rsi
	jmp	L(aligned_loop)

	.p2align 4
L(find_nul):
	/* Mask out any matching bits after the null byte.  (ECX - 1) ^ ECX
	   has every bit set up to and including the lowest set bit of ECX,
	   so a match at the position of the null itself is kept.  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	testl	%eax, %eax
	/* If there is no CHAR here, return the remembered one.  */
	jz	L(return_value)
	bsrl	%eax, %eax
# ifdef USE_AS_WCSRCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
# else
	leaq	-VEC_SIZE(%rdi, %rax), %rax
# endif
	ret

	.p2align 4
L(char_and_nul):
	/* Find both a CHAR and a null byte.  Reached from the page-cross
	   path only: add the string offset (RCX) back to the cursor and
	   move the null mask from EDX into ECX so the code below can be
	   shared with the first-vector path.  */
	addq	%rcx, %rdi
	movl	%edx, %ecx
L(char_and_nul_in_first_vec):
	/* Mask out any matching bits after the null byte.  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	testl	%eax, %eax
	/* Return null pointer if the null byte comes first.  */
	jz	L(return_null)
	bsrl	%eax, %eax
# ifdef USE_AS_WCSRCHR
	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
# else
	leaq	-VEC_SIZE(%rdi, %rax), %rax
# endif
	ret

	.p2align 4
L(return_null):
	xorl	%eax, %eax
	ret

END (STRRCHR)
#endif