/* strcat with 256-bit EVEX instructions.
   Copyright (C) 2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT  __strcat_evex
# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

/* All-zero vector; every null-byte comparison below is made against it.  */
# define XMMZERO	xmm16
# define YMMZERO	ymm16
/* Scratch vectors used by the four-vector loop.  */
# define YMM0		ymm17
# define YMM1		ymm18

# define USE_AS_STRCAT

/* Number of bytes in a vector register */
# define VEC_SIZE	32

/* STRCAT: find the terminating null byte of the string at %rdi, then
   hand over to the copy code included from strcpy-evex.S.

   Register roles in this file:
     %rdi  destination pointer as received; left untouched until
	   L(StartStrcpyPart) so the exit stubs can subtract it.
     %rsi  source pointer; only copied to %rcx at L(StartStrcpyPart).
     %r9   copy of the incoming %rdi.
     %r8   copy of the incoming %rdx (USE_AS_STRNCAT builds only).
     %rax  zero on entry to the scan, then a VEC_SIZE-aligned scan
	   cursor; the exit stubs turn it into the byte offset of the
	   null byte relative to %rdi.
     %edx  bit mask from vpcmpb: bit i is set when byte i of the
	   compared vector is zero.
     %rcx, %r10d  scratch while masking the first aligned vector.
     %k0-%k4  mask registers receiving the vpcmpb results.

   State on reaching L(StartStrcpyPart):
     %rdi = address of the destination's null byte
     %rcx = source pointer
     %rax = original destination pointer.
   How the included copy code consumes these is not visible in this
   file; see strcpy-evex.S.  */

	.section .text.evex,"ax",@progbits
ENTRY (STRCAT)
	movq	%rdi, %r9
# ifdef USE_AS_STRNCAT
	movq	%rdx, %r8
# endif

	/* %rax = 0 so that L(exit_null_on_first_vector) can add the bit
	   index straight onto it.  %ecx = offset of %rdi inside its
	   (VEC_SIZE * 4)-aligned block.  */
	xorl	%eax, %eax
	movl	%edi, %ecx
	andl	$((VEC_SIZE * 4) - 1), %ecx
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	/* With an offset of at most VEC_SIZE * 3, an unaligned VEC_SIZE
	   load from %rdi ends inside the same (VEC_SIZE * 4)-aligned
	   block.  Larger offsets take the masked aligned load instead.  */
	cmpl	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)
	vpcmpb	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	/* Round the cursor down to a vector boundary.  The unaligned
	   load above already covered everything below
	   VEC_SIZE(%rax).  */
	movq	%rdi, %rax
	andq	$-VEC_SIZE, %rax
	jmp	L(align_vec_size_start)
L(fourth_vector_boundary):
	/* Compare the aligned vector that contains %rdi and discard the
	   mask bits belonging to bytes in front of %rdi.  The 32-bit
	   shift uses only the low five bits of %cl, which equal
	   %rdi % VEC_SIZE because %rax is VEC_SIZE-aligned.  */
	movq	%rdi, %rax
	andq	$-VEC_SIZE, %rax
	vpcmpb	$0, (%rax), %YMMZERO, %k0
	movl	$-1, %r10d
	subq	%rax, %rcx
	shll	%cl, %r10d
	kmovd	%k0, %edx
	andl	%r10d, %edx
	jnz	L(exit)

L(align_vec_size_start):
	/* Unrolled scan, one aligned vector at a time.  The stub taken
	   on a hit names the vector's position relative to %rax:
	   "second" is VEC_SIZE(%rax), "fifth" is (VEC_SIZE * 4)(%rax).  */
	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	kmovd	%k0, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* The vector compared here sits at VEC_SIZE(%rax) once the
	   cursor has moved on by four vectors, hence the "second"
	   stub.  */
	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	addq	$(VEC_SIZE * 4), %rax
	kmovd	%k4, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Same re-basing step as above.  */
	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	kmovd	%k4, %edx
	addq	$(VEC_SIZE * 4), %rax
	testl	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Same re-basing step as above.  */
	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	addq	$(VEC_SIZE * 4), %rax
	kmovd	%k4, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	/* Every vector up to and including (VEC_SIZE * 4)(%rax) is now
	   known to be free of null bytes.  Advance one vector at a time
	   until the cursor is (VEC_SIZE * 4)-aligned, which the VMOVA
	   loads in the loop rely on.  From here on a hit goes to
	   L(exit), because %rax points at the vector just compared.  */
	testq	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
	addq	$(VEC_SIZE * 5), %rax
	kmovd	%k4, %edx
	testl	%edx, %edx
	jnz	L(exit)

	testq	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	addq	$VEC_SIZE, %rax
	kmovd	%k0, %edx
	testl	%edx, %edx
	jnz	L(exit)

	testq	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
	addq	$VEC_SIZE, %rax
	kmovd	%k0, %edx
	testl	%edx, %edx
	jnz	L(exit)

	testq	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	/* NOTE(review): %rax is always a multiple of VEC_SIZE here, so
	   after three single-vector steps the test just above appears
	   to succeed every time, which would make this last step
	   unreachable.  Kept unchanged; confirm before removing.  */
	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
	addq	$VEC_SIZE, %rax
	kmovd	%k1, %edx
	testl	%edx, %edx
	jnz	L(exit)

	addq	$VEC_SIZE, %rax

	.p2align 4
L(align_four_vec_loop):
	/* Fold four consecutive vectors with an unsigned byte minimum.
	   A zero byte in the result means at least one of the four
	   holds a null byte.  */
	VMOVA	(%rax), %YMM0
	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
	vpminub	%YMM0, %YMM1, %YMM0
	/* If K0 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM0, %YMMZERO, %k0
	addq	$(VEC_SIZE * 4), %rax
	ktestd	%k0, %k0
	jz	L(align_four_vec_loop)

	/* %rax is already past the group that holds the null byte.  Step
	   it back by five vectors so the group occupies
	   VEC_SIZE(%rax) .. (VEC_SIZE * 4)(%rax), matching the
	   second..fifth exit stubs.  */
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
	subq	$(VEC_SIZE * 5), %rax
	kmovd	%k0, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
	kmovd	%k1, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
	kmovd	%k2, %edx
	testl	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	/* The first three vectors were clean, so the null byte is in the
	   fourth and the mask needs no test.  Same arithmetic as
	   L(exit_null_on_fifth_vector).  */
	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
	kmovd	%k3, %edx
	subq	%rdi, %rax
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	addq	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

	/* Exit stubs.  Each one leaves in %rax the byte offset of the
	   null byte from %rdi: (vector base - %rdi) + index of the
	   lowest set bit in %edx + the vector's displacement from
	   %rax.  */
	.p2align 4
L(exit):
	subq	%rdi, %rax
L(exit_null_on_first_vector):
	/* Reached directly only from the unaligned first compare, where
	   %rax is still zero.  */
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	subq	%rdi, %rax
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	addq	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	subq	%rdi, %rax
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	addq	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	subq	%rdi, %rax
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	addq	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	subq	%rdi, %rax
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	addq	$(VEC_SIZE * 4), %rax

	.p2align 4
L(StartStrcpyPart):
	/* %rdi = address of the destination's null byte, %rcx = source
	   pointer.  */
	leaq	(%r9, %rax), %rdi
	movq	%rsi, %rcx
	movq	%r9, %rax	/* save result */

# ifdef USE_AS_STRNCAT
	/* A zero count skips the copy.  L(ExitZero) is not defined in
	   this file; presumably strcpy-evex.S provides it.  */
	testq	%r8, %r8
	jz	L(ExitZero)
#  define USE_AS_STRNCPY
# endif

# include "strcpy-evex.S"
#endif