/* Optimized strncpy implementation for POWER9 LE.
   Copyright (C) 2020-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifdef USE_AS_STPNCPY
# ifndef STPNCPY
#  define FUNC_NAME __stpncpy
# else
#  define FUNC_NAME STPNCPY
# endif
#else
# ifndef STRNCPY
#  define FUNC_NAME strncpy
# else
#  define FUNC_NAME STRNCPY
# endif
#endif  /* !USE_AS_STPNCPY  */

#ifndef MEMSET
/* For builds without IFUNC support, local calls should be made to internal
   GLIBC symbol (created by libc_hidden_builtin_def).  */
# ifdef SHARED
#  define MEMSET_is_local
#  define MEMSET   __GI_memset
# else
#  define MEMSET   memset
# endif
#endif

/* Frame used only around the call to memset: the minimum frame plus the
   slot in which r30 is preserved (at FRAMESIZE-8 from the new r1).
   16 bytes are reserved instead of 8 so that r1 stays quadword aligned
   across the call.  This assumes FRAME_MIN_SIZE, which comes from
   sysdep.h and is not visible here, is itself a multiple of 16.  */
#define FRAMESIZE (FRAME_MIN_SIZE+16)

/* Implements the function

   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])

   or

   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])

   if USE_AS_STPNCPY is defined.

   The implementation can load bytes past a null terminator, but only
   up to the next 16-byte aligned address, so it never crosses a page.

   Register usage inside the routine:

   r3	  return value (dest for strncpy; updated on each exit path for
	  stpncpy)
   r4	  current source pointer
   r5	  number of destination bytes still to be written
   r11	  current destination pointer
   r7-r9  null index / byte counts for the chunk being handled
   r10	  length operand for stxvl (count in the top 8 bits)
   r30	  holds the return value across the call to memset
   v0-v3  source data, v6 compare result, v18 all zeroes

   lxv, stxv and stxvl take VSX register numbers.  Vector register vN
   is VSX register 32+N, hence the "32+vN" operands below.  */

.machine power9
#ifdef MEMSET_is_local
ENTRY_TOCLESS (FUNC_NAME, 4)
#else
ENTRY (FUNC_NAME, 4)
#endif
	CALL_MCOUNT 2

	/* n == 0: nothing to copy, return dest unchanged.  */
	cmpdi	r5,0
	beqlr

	/* Copy the first byte on its own.  If it is the terminator, only
	   zero padding is left to do.  */
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r11,r3,1
	addi	r5,r5,-1
	vspltisb v18,0		/* Zeroes in v18  */
	cmpdi	r0,0
	beq	L(zero_padding)

	/* n == 1 with a non-null first byte: the copy is complete.  */
	cmpdi	r5,0
#ifdef USE_AS_STPNCPY
	bgt	L(cont)
	/* No terminator was written; return dest + 1.  */
	addi	r3,r3,1
	blr
L(cont):
#else
	beqlr
#endif

	addi	r4,r4,1
	neg	r7,r4
	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */

	/* Get source 16B aligned.  The permute combines the loaded
	   quadword with the zeroes in v18.  */
	lvx	v0,0,r4
	lvsr	v1,0,r4
	vperm	v0,v18,v0,v1

	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
	vctzlsbb r7,v6		/* Number of trailing zeroes  */
	addi	r8,r7,1		/* Add null terminator  */

	/* r8 = bytes including null
	   r9 = bytes to get source 16B aligned
	   if r8 > r9
	      no null, copy r9 bytes
	   else
	      there is a null, copy r8 bytes and return.  */
	cmpld	r8,r9
	bgt	L(no_null)

	cmpld	cr6,r8,r5	/* r8 <= n?  */
	ble	cr6,L(null)

	/* The null lies beyond the length limit: store the remaining r5
	   bytes and return without padding.  */
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */

#ifdef USE_AS_STPNCPY
	/* r3 = address just past the last byte written.  */
	add	r3,r11,r5
#endif
	blr

L(null):
	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */

#ifdef USE_AS_STPNCPY
	/* r3 = address of the null terminator written to dest.  */
	add	r3,r11,r7
#endif
	add	r11,r11,r8
	sub	r5,r5,r8
	b	L(zero_padding)

L(no_null):
	cmpld	r9,r5		/* Check if length was reached.  */
	bge	L(n_tail1)

	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */

	add	r4,r4,r9
	add	r11,r11,r9
	sub	r5,r5,r9

	/* Main loop: while more than 64 bytes remain, load four quadwords,
	   leave through a tail path as soon as one contains a null, and
	   otherwise store all 64 bytes.  */
L(loop):
	cmpldi	cr6,r5,64	/* Check if length was reached.  */
	ble	cr6,L(final_loop)

	lxv	32+v0,0(r4)
	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
	bne	cr6,L(prep_tail1)

	lxv	32+v1,16(r4)
	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
	bne	cr6,L(prep_tail2)

	lxv	32+v2,32(r4)
	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
	bne	cr6,L(prep_tail3)

	lxv	32+v3,48(r4)
	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
	bne	cr6,L(prep_tail4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,64
	addi	r11,r11,64
	addi	r5,r5,-64

	b	L(loop)

	/* At most 64 bytes remain.  For each quadword cr5 holds the length
	   check and cr6 the null check; r5 is reduced by 16 for every
	   quadword that is passed over.  */
L(final_loop):
	cmpldi	cr5,r5,16
	lxv	32+v0,0(r4)
	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
	ble	cr5,L(prep_n_tail1)
	bne	cr6,L(count_tail1)
	addi	r5,r5,-16

	cmpldi	cr5,r5,16
	lxv	32+v1,16(r4)
	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
	ble	cr5,L(prep_n_tail2)
	bne	cr6,L(count_tail2)
	addi	r5,r5,-16

	cmpldi	cr5,r5,16
	lxv	32+v2,32(r4)
	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
	ble	cr5,L(prep_n_tail3)
	bne	cr6,L(count_tail3)
	addi	r5,r5,-16

	lxv	32+v3,48(r4)
	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
	beq	cr6,L(n_tail4)

	vctzlsbb r8,v6		/* Number of trailing zeroes  */
	cmpld	r8,r5		/* r8 < n?  */
	blt	L(tail4)

	/* n_tailK: the length limit falls inside quadword K and no null
	   precedes it.  Store K-1 full quadwords plus the remaining r5
	   bytes, then return without padding.  */
L(n_tail4):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,48	/* Offset  */
	stxvl	32+v3,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* r3 = address just past the last byte written.  */
	add	r3,r11,r5
#endif
	blr

L(prep_n_tail1):
	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
	cmpld	r8,r5		/* r8 < n?  */
	blt	L(tail1)

L(n_tail1):
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* r3 = address just past the last byte written.  */
	add	r3,r11,r5
#endif
	blr

L(prep_n_tail2):
	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
	cmpld	r8,r5		/* r8 < n?  */
	blt	L(tail2)

L(n_tail2):
	stxv	32+v0,0(r11)
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,16	/* offset  */
	stxvl	32+v1,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* r3 = address just past the last byte written.  */
	add	r3,r11,r5
#endif
	blr

L(prep_n_tail3):
	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
	cmpld	r8,r5		/* r8 < n?  */
	blt	L(tail3)

L(n_tail3):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,32	/* Offset  */
	stxvl	32+v2,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* r3 = address just past the last byte written.  */
	add	r3,r11,r5
#endif
	blr

	/* tailK: a null was found in quadword K within the length limit.
	   The prep_tailK entries (from the main loop) first reduce r5 by
	   the 16*(K-1) bytes of the preceding quadwords; the count_tailK
	   entries (from the final loop) arrive with r5 already reduced.
	   Store everything up to and including the null, then pad.  */
L(prep_tail1):
L(count_tail1):
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
L(tail1):
	addi	r9,r8,1		/* Add null terminator  */
	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* r3 = address of the null terminator written to dest.  */
	add	r3,r11,r8
#endif
	add	r11,r11,r9
	sub	r5,r5,r9
	b	L(zero_padding)

L(prep_tail2):
	addi	r5,r5,-16
L(count_tail2):
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
L(tail2):
	addi	r9,r8,1		/* Add null terminator  */
	stxv	32+v0,0(r11)
	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,16	/* offset  */
	stxvl	32+v1,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* r3 = address of the null terminator written to dest.  */
	add	r3,r11,r8
#endif
	add	r11,r11,r9
	sub	r5,r5,r9
	b	L(zero_padding)

L(prep_tail3):
	addi	r5,r5,-32
L(count_tail3):
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
L(tail3):
	addi	r9,r8,1		/* Add null terminator  */
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,32	/* offset  */
	stxvl	32+v2,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* r3 = address of the null terminator written to dest.  */
	add	r3,r11,r8
#endif
	add	r11,r11,r9
	sub	r5,r5,r9
	b	L(zero_padding)

L(prep_tail4):
	addi	r5,r5,-48
	vctzlsbb r8,v6		/* Number of trailing zeroes  */
L(tail4):
	addi	r9,r8,1		/* Add null terminator  */
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
	addi	r11,r11,48	/* offset  */
	stxvl	32+v3,r11,r10	/* Partial store  */
#ifdef USE_AS_STPNCPY
	/* r3 = address of the null terminator written to dest.  */
	add	r3,r11,r8
#endif
	add	r11,r11,r9
	sub	r5,r5,r9

/* This code pads the remainder of dest with NULL bytes.  For large numbers
   memset gives a better performance, 255 was chosen through experimentation.

   On entry r11 points at the first byte to clear and r5 holds the number
   of bytes to clear (possibly zero, in which case the final stxvl stores
   nothing).  r3 already holds the return value.  */
L(zero_padding):
	cmpldi	r5,255
	bge	L(zero_padding_memset)

L(zero_padding_loop):
	cmpldi	cr6,r5,16	/* Check if length was reached.  */
	ble	cr6,L(zero_padding_end)

	/* The zeroes live in vector register v18, which is VSX register
	   32+v18.  A bare "v18" here would store VSX register 18, which
	   this routine never initializes.  */
	stxv	32+v18,0(r11)
	addi	r11,r11,16
	addi	r5,r5,-16

	b	L(zero_padding_loop)

L(zero_padding_end):
	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v18,r11,r10	/* Partial store  */
	blr

	.align	4
L(zero_padding_memset):
	std	r30,-8(r1)   /* Save r30 on the stack.  */
	cfi_offset(r30, -8)
	mr	r30,r3       /* Save the return value of strncpy/stpncpy.  */
	/* Prepare the call to memset.  r5 already holds the number of
	   bytes to clear.  */
	mr	r3,r11       /* Pointer to the area to be zero-filled.  */
	li	r4,0         /* Byte to be written (zero).  */

	/* We delayed the creation of the stack frame, as well as the saving of
	   the link register, because only at this point, we are sure that
	   doing so is actually needed.  */

	/* Save the link register.  */
	mflr	r0
	std	r0,16(r1)

	/* Create the stack frame.  */
	stdu	r1,-FRAMESIZE(r1)
	cfi_adjust_cfa_offset(FRAMESIZE)
	cfi_offset(lr, 16)

	bl	MEMSET
#ifndef MEMSET_is_local
	nop
#endif

	ld	r0,FRAMESIZE+16(r1)

	mr	r3,r30       /* Restore the return value saved before the
				call: dest for strncpy, or the pointer
				computed on the copy path for stpncpy.  */
	ld	r30,FRAMESIZE-8(r1) /* Restore r30.  */
	/* Restore the stack frame.  */
	addi	r1,r1,FRAMESIZE
	cfi_adjust_cfa_offset(-FRAMESIZE)
	/* Restore the link register.  */
	mtlr	r0
	cfi_restore(lr)
	blr

END (FUNC_NAME)
#ifndef USE_AS_STPNCPY
libc_hidden_builtin_def (strncpy)
#endif