/* strcat with AVX2
   Copyright (C) 2011-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT  __strcat_avx2
# endif

# define USE_AS_STRCAT

/* Number of bytes in a vector register */
# define VEC_SIZE	32

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* char *strcat (char *dest, const char *src)

   This file implements only the strlen(dest) phase: scan DEST for its
   terminating NUL using zero-comparisons against %ymm6, then fall into
   the copy phase provided by the "strcpy-avx2.S" include at the bottom.

   Register roles (SysV AMD64):
     %rdi  = dest (arg 1); repointed at dest's NUL before the copy phase
     %rsi  = src  (arg 2); passed through untouched to strcpy-avx2.S
     %rdx  = maxlen (arg 3, USE_AS_STRNCAT builds only), saved in %r8
     %r9   = saved original dest, moved to %rax as the return value
     %rax  = aligned scan pointer, then strlen(dest) byte offset
     %ymm6 = all-zero vector for NUL detection (vpcmpeqb)
     %edx  = per-vector NUL bitmask from vpmovmskb.  */

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCAT)
	mov	%rdi, %r9		/* save dest; it is the return value */
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8		/* save maxlen for the copy phase */
# endif

	xor	%eax, %eax
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx	/* ecx = dest offset within a 4-vector block */
	vpxor	%xmm6, %xmm6, %xmm6	/* ymm6 = 0 for NUL comparisons */
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)

	/* dest is not in the last VEC_SIZE bytes of a 4*VEC_SIZE block:
	   an unaligned first load cannot cross that block boundary.  */
	vpcmpeqb (%rdi), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax	/* round scan pointer down to VEC_SIZE */
	jmp	L(align_vec_size_start)
L(fourth_vector_boundary):
	/* Load the VEC_SIZE-aligned vector containing dest's first byte,
	   then mask off match bits for bytes that precede dest.  */
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpeqb (%rax), %ymm6, %ymm0
	mov	$-1, %r10d
	sub	%rax, %rcx
	shl	%cl, %r10d		/* r10d: 0s for bytes before dest, 1s after */
	vpmovmskb %ymm0, %edx
	and	%r10d, %edx		/* drop NUL matches before the string start */
	jnz	L(exit)

L(align_vec_size_start):
	/* Check vectors one at a time until %rax reaches 4*VEC_SIZE
	   alignment; each exit label encodes how many whole vectors
	   precede the one holding the NUL.  */
	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax	/* advance one 4-vector block */
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Step VEC_SIZE at a time until %rax is 4*VEC_SIZE aligned,
	   then enter the unrolled main loop.  */
	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

	.p2align 4
L(align_four_vec_loop):
	/* Main loop: scan 4 vectors (4 * VEC_SIZE bytes) per iteration.
	   vpminub folds the four vectors into one; its result has a zero
	   byte iff any of the four inputs did.  */
	vmovaps	(%rax), %ymm4
	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
	vmovaps	(VEC_SIZE * 2)(%rax), %ymm5
	vpminub	(VEC_SIZE * 3)(%rax), %ymm5, %ymm5
	add	$(VEC_SIZE * 4), %rax
	vpminub	%ymm4, %ymm5, %ymm5
	vpcmpeqb %ymm5, %ymm6, %ymm5
	vpmovmskb %ymm5, %edx
	test	%edx, %edx
	jz	L(align_four_vec_loop)

	/* A NUL is in one of the last 4 vectors; recheck them one by one
	   to find which.  %rax is rewound so the exit labels' offsets
	   stay consistent with the ramp-up code above.  */
	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
	sub	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	/* NUL must be in the fourth vector of the group.  */
	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	sub	%rdi, %rax		/* rax = offset of scan pointer from dest */
	bsf	%rdx, %rdx		/* rdx = index of first NUL byte in vector */
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

	/* Exit paths: convert (scan pointer, NUL bitmask) into
	   %rax = strlen(dest).  Each label adds the offset of the vector
	   the NUL was found in relative to %rax.  */
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	/* Falls through to L(StartStrcpyPart).  */

	.p2align 4
L(StartStrcpyPart):
	/* Hand off to the copy phase: %rdi -> dest's NUL terminator,
	   %rsi = src (in %rcx too), %rax = original dest (return value).
	   The function epilogue (and L(ExitZero)) is provided by the
	   included strcpy-avx2.S.  */
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax	/* save result */

# ifdef USE_AS_STRNCAT
	test	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

# include "strcpy-avx2.S"
#endif