/* __memcmpeq optimized with AVX2.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* __memcmpeq is implemented as:
   1. Use ymm vector compares when possible.  The only case where
      vector compares are not possible is when size < VEC_SIZE and
      loading from either s1 or s2 would cause a page cross.
   2. Use xmm vector compares when size is from 16 to 31 bytes and a
      full ymm load could page cross.
   3. Optimistically compare up to the first 4 * VEC_SIZE bytes one VEC
      at a time to check for early mismatches.  Only do this if it is
      guaranteed the work is not wasted.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */

# include <sysdep.h>

# ifndef MEMCMPEQ
#  define MEMCMPEQ	__memcmpeq_avx2
# endif

# define VPCMPEQ	vpcmpeqb

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	jnz	L(return_neq0)
	cmpq	$(VEC_SIZE * 2), %rdx
	jbe	L(last_1x_vec)

	/* Check second VEC no matter what.  */
	vmovdqu	VEC_SIZE(%rsi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	/* If all bytes in the VEC were equal, eax will be all 1s, so incl
	   will overflow and set the zero flag.  */
	incl	%eax
	jnz	L(return_neq0)

	/* Less than 4 * VEC.  */
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_2x_vec)

	/* Check third and fourth VEC no matter what.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpmovmskb %ymm3, %eax
	incl	%eax
	jnz	L(return_neq0)

	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
	jnz	L(return_neq0)

	/* Go to 4x VEC loop.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)

	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
	   branches.  */

	/* Adjust rsi and rdi to avoid indexed address modes.  This ends up
	   saving 16 bytes of code, prevents unlamination, and avoids
	   bottlenecking the AGU.  */
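	/* A sketch of that transformation (illustrative, not assembled):
	   without the addq adjustments, the compares over the last 4 VECs
	   would need indexed memory operands, e.g.
	       VPCMPEQ	-(VEC_SIZE * 4)(%rdi, %rdx), %ymm1, %ymm1
	   which takes an extra SIB byte per instruction and unlaminates the
	   micro-fused load + compare on Intel cores.  Adding rdx to both
	   pointers once lets the same bytes be reached with base +
	   displacement addressing only.  */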
	addq	%rdx, %rsi
	vmovdqu	-(VEC_SIZE * 4)(%rsi), %ymm1
	vmovdqu	-(VEC_SIZE * 3)(%rsi), %ymm2
	addq	%rdx, %rdi

	VPCMPEQ	-(VEC_SIZE * 4)(%rdi), %ymm1, %ymm1
	VPCMPEQ	-(VEC_SIZE * 3)(%rdi), %ymm2, %ymm2

	vmovdqu	-(VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ	-(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vmovdqu	-VEC_SIZE(%rsi), %ymm4
	VPCMPEQ	-VEC_SIZE(%rdi), %ymm4, %ymm4

	/* Reduce ymm1 - ymm4.  */
	vpand	%ymm1, %ymm2, %ymm2
	vpand	%ymm3, %ymm4, %ymm4
	vpand	%ymm2, %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
L(return_neq0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	/* NB: p2align 5 here will ensure that L(loop_4x_vec) is also 32 byte
	   aligned.  */
	.p2align 5
L(less_vec):
	/* Check if size is one byte or less.  This is necessary for
	   size == 0 but is also faster for size == 1.  */
	cmpl	$1, %edx
	jbe	L(one_or_less)

	/* Check if loading one VEC from either s1 or s2 could cause a page
	   cross.  This can have false positives but is by far the fastest
	   method.  */
	movl	%edi, %eax
	orl	%esi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(page_cross_less_vec)

	/* No page cross possible.  */
	vmovdqu	(%rsi), %ymm2
	VPCMPEQ	(%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl	%eax
	/* Result will be zero if s1 and s2 match.  Otherwise the first set
	   bit will be the first mismatch.  */
	bzhil	%edx, %eax, %eax
	VZEROUPPER_RETURN

	/* Relatively cold but placed close to L(less_vec) for the 2 byte
	   jump encoding.  */
	.p2align 4
L(one_or_less):
	jb	L(zero)
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax
	subl	%ecx, %eax
	/* No ymm register was touched.  */
	ret
	/* Within the same 16 byte block as L(one_or_less).  */
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_1x_vec):
	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
	vpmovmskb %ymm1, %eax
	incl	%eax
	VZEROUPPER_RETURN

	.p2align 4
L(last_2x_vec):
	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm2
	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm2, %ymm2
	vpand	%ymm1, %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	incl	%eax
	VZEROUPPER_RETURN

	.p2align 4
L(more_8x_vec):
	/* Set end of s1 in rdx.  */
	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
	/* rsi stores s2 - s1.  This allows the loop to only update one
	   pointer.  */
	subq	%rdi, %rsi
	/* Align s1 pointer.  */
	andq	$-VEC_SIZE, %rdi
	/* Adjust because the first 4x VEC were already checked.  */
	subq	$-(VEC_SIZE * 4), %rdi
	.p2align 4
L(loop_4x_vec):
	/* rsi has s2 - s1, so get the correct address by adding s1 (in rdi).
	 */
	vmovdqu	(%rsi, %rdi), %ymm1
	VPCMPEQ	(%rdi), %ymm1, %ymm1

	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand	%ymm1, %ymm2, %ymm2
	vpand	%ymm3, %ymm4, %ymm4
	vpand	%ymm2, %ymm4, %ymm4
	vpmovmskb %ymm4, %eax
	incl	%eax
	jnz	L(return_neq1)
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check if the s1 pointer is at the end.  */
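	/* rdx was set to s1 + size - 4 * VEC_SIZE in L(more_8x_vec), so once
	   rdi reaches rdx at most 4 * VEC_SIZE bytes remain.  The tail
	   compares below are taken relative to rdx and may overlap bytes the
	   loop already checked, which is harmless for an equality-only
	   comparison.  */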
	cmpq	%rdx, %rdi
	jb	L(loop_4x_vec)

	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
	subq	%rdx, %rdi
	/* rdi has 4 * VEC_SIZE - remaining length.  */
	cmpl	$(VEC_SIZE * 3), %edi
	jae	L(8x_last_1x_vec)
	/* Load regardless of branch.  */
	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
	cmpl	$(VEC_SIZE * 2), %edi
	jae	L(8x_last_2x_vec)
	/* Check last 4 VEC.  */
	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm1
	VPCMPEQ	VEC_SIZE(%rdx), %ymm1, %ymm1

	vmovdqu	(%rsi, %rdx), %ymm2
	VPCMPEQ	(%rdx), %ymm2, %ymm2

	vpand	%ymm3, %ymm4, %ymm4
	vpand	%ymm1, %ymm2, %ymm3
L(8x_last_2x_vec):
	vpand	%ymm3, %ymm4, %ymm4
L(8x_last_1x_vec):
	vpmovmskb %ymm4, %eax
	/* Restore s1 pointer to rdi.  */
	incl	%eax
L(return_neq1):
	VZEROUPPER_RETURN

	/* Relatively cold case as page crosses are unexpected.  */
	.p2align 4
L(page_cross_less_vec):
	cmpl	$16, %edx
	jae	L(between_16_31)
	cmpl	$8, %edx
	ja	L(between_9_15)
	cmpl	$4, %edx
	jb	L(between_2_3)
	/* From 4 to 8 bytes.  No branch when size == 4.  */
	movl	(%rdi), %eax
	subl	(%rsi), %eax
	movl	-4(%rdi, %rdx), %ecx
	movl	-4(%rsi, %rdx), %edi
	subl	%edi, %ecx
	orl	%ecx, %eax
	ret

	.p2align 4,, 8
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */

	/* Safe to use xmm[0, 15] as no vzeroupper is needed, so this is RTM
	   safe.  */
	vmovdqu	(%rsi), %xmm1
	vpcmpeqb (%rdi), %xmm1, %xmm1
	vmovdqu	-16(%rsi, %rdx), %xmm2
	vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
	vpand	%xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	notw	%ax
	/* No ymm register was touched.  */
	ret

	.p2align 4,, 8
L(between_9_15):
	/* From 9 to 15 bytes.  */
	movq	(%rdi), %rax
	subq	(%rsi), %rax
	movq	-8(%rdi, %rdx), %rcx
	movq	-8(%rsi, %rdx), %rdi
	subq	%rdi, %rcx
	orq	%rcx, %rax
	/* edx is guaranteed to be a non-zero int.  */
	cmovnz	%edx, %eax
	ret

	/* Don't align.  This is cold and aligning here will cause code
	   to spill into the next cache line.  */
L(between_2_3):
	/* From 2 to 3 bytes.  No branch when size == 2.  */
	movzwl	(%rdi), %eax
	movzwl	(%rsi), %ecx
	subl	%ecx, %eax
	movzbl	-1(%rdi, %rdx), %ecx
	/* All machines that support evex will insert a "merging uop",
	   avoiding any serious partial register stalls.  */
	subb	-1(%rsi, %rdx), %cl
	orl	%ecx, %eax
	/* No ymm register was touched.  */
	ret

	/* 2 bytes from the next cache line.  */
END (MEMCMPEQ)
#endif
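
/* For reference, a minimal C sketch of the contract this file implements,
   assuming only that __memcmpeq must return zero when the buffers are equal
   and some nonzero value otherwise (the sign and magnitude of a nonzero
   result carry no meaning, which is why the code above can return raw
   vpmovmskb masks or subtraction results directly).  The helper name below
   is hypothetical and shown purely for illustration:

     #include <string.h>

     int
     memcmpeq_reference (const void *s1, const void *s2, size_t n)
     {
       return memcmp (s1, s2, n) != 0;
     }
 */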