/* Vector optimized 32/64 bit S/390 version of wcsncat.
   Copyright (C) 2015-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <ifunc-wcsncat.h>
#if HAVE_WCSNCAT_Z13

# include "sysdep.h"
# include "asm-syntax.h"

	.text

/* wchar_t * wcsncat (wchar_t *dest, const wchar_t *src, size_t n)
   Concatenate two strings - at most n characters of src.

   The routine works in two phases:
   1) WCSLEN:  scan dest for its terminating zero character.
   2) WCSNCPY: copy at most n characters of src to that position and
      always terminate the result with a zero character.
   If dest or src is not 4-byte aligned, the work is handed over to
   WCSNCAT_C.

   Register usage:
   -r0=saved dest pointer for return
   -r1=tmp
   -r2=dest (advanced to the end of the dest string for the copy phase)
   -r3=src
   -r4=n (character count on entry, byte count during the copy phase)
   -r5=current_len (byte offset)
   -r6=tmp
   -r7=tmp
   -v16=part of dest (length scan) / part of src (copy)
   -v17=index of zero
   -v18=part of src
   -v31=register save area for r6, r7
*/
ENTRY(WCSNCAT_Z13)
	.machine "z13"
	.machinemode "zarch_nohighgprs"

# if !defined __s390x__
	llgfr	%r4,%r4		/* Zero-extend 32bit n to 64bit.  */
# endif /* !defined __s390x__ */

	clgfi	%r4,0
	ber	%r14		/* Nothing to do, if n == 0.  */

	vlbb	%v16,0(%r2),6	/* Load dest until next 4k-byte boundary.  */
	lcbb	%r1,0(%r2),6	/* Get bytes to 4k-byte boundary or 16.  */

	/* If either src or dest is not 4byte aligned, use WCSNCAT_C.  */
	tmll	%r2,3		/* Test if dest is 4-byte aligned?  */
	jne	.Lfallback	/* And use common-code variant if not.  */
	tmll	%r3,3		/* Test if src is 4-byte aligned?  */
	jne	.Lfallback	/* And use common-code variant if not.  */

	lgr	%r0,%r2		/* Save destination pointer for return.  */
	vlvgp	%v31,%r6,%r7	/* Save registers.  */

	/* WCSLEN
	   %r1 = loaded bytes (tmp)
	   %r5 = zero byte index / current_len
	   %r2 = dst
	*/
	vfenezf	%v16,%v16,%v16	/* Find element not equal with zero search.  */
	vlgvb	%r5,%v16,7	/* Load zero index or 16 if not found.  */
	clrjl	%r5,%r1,.Llen_end /* Found zero within loaded bytes, end.  */

	/* Align dest to 16 byte.  */
	risbgn	%r1,%r2,60,128+63,0 /* %r1 = bits 60-63 of %r2 'and' 15.  */
	lghi	%r5,16		/* current_len = 16.  */
	slr	%r5,%r1		/* Compute bytes to 16bytes boundary.  */

	/* Find zero in 16byte aligned loop.  */
.Llen_loop:
	vl	%v16,0(%r5,%r2)	/* Load dest.  */
	vfenezfs %v16,%v16,%v16	/* Find element not equal with zero search.  */
	je	.Llen_found	/* Jump away if zero was found.  */
	vl	%v16,16(%r5,%r2)
	vfenezfs %v16,%v16,%v16
	je	.Llen_found16
	vl	%v16,32(%r5,%r2)
	vfenezfs %v16,%v16,%v16
	je	.Llen_found32
	vl	%v16,48(%r5,%r2)
	vfenezfs %v16,%v16,%v16
	je	.Llen_found48

	aghi	%r5,64
	j	.Llen_loop	/* No zero -> loop.  */

	/* The labels below fall through into each other, so that
	   current_len is advanced by 48, 32, 16 or 0 bytes.  */
.Llen_found48:
	aghi	%r5,16
.Llen_found32:
	aghi	%r5,16
.Llen_found16:
	aghi	%r5,16
.Llen_found:
	vlgvb	%r1,%v16,7	/* Load byte index of zero.  */
	algr	%r5,%r1		/* current_len = byte length of dest string.  */

.Llen_end:
	/* WCSNCPY
	   %r1 = zero byte index (tmp)
	   %r6 = loaded bytes (tmp)
	   %r3 = curr src pointer
	   %r2 = curr dst pointer
	   %r7 = border, tmp
	*/
	la	%r2,0(%r5,%r2)	/* strcpy at end of dst-string.  */

	vlbb	%v16,0(%r3),6	/* Load src until next 4k-byte boundary.  */
	lcbb	%r6,0(%r3),6	/* Get bytes to 4k-byte boundary or 16.  */
	llgfr	%r6,%r6		/* Convert 32bit to 64bit.  */

	lghi	%r5,0		/* current_len = 0.  */

	/* Check range of maxlen and convert to byte-count.
	   The condition code set by the test below is still valid at the
	   locgrne, as lghi/llilf and sllg do not change it.  */
# ifdef __s390x__
	tmhh	%r4,49152	/* Test bit 0 or 1 of maxlen.  */
	lghi	%r1,-4		/* Max byte-count is 18446744073709551612.  */
# else
	tmlh	%r4,49152	/* Test bit 0 or 1 of maxlen.  */
	llilf	%r1,4294967292	/* Max byte-count is 4294967292.  */
# endif /* !__s390x__ */
	sllg	%r4,%r4,2	/* Convert character-count to byte-count.  */
	locgrne	%r4,%r1		/* Use max byte-count, if bit 0/1 was one.  */

	clgrjle	%r4,%r6,.Lcpy_remaining_v16 /* If n <= loaded-bytes
					       -> process remaining.  */

	/* n > loaded-byte-count.  */
	vfenezf	%v17,%v16,%v16	/* Find element not equal with zero search.  */
	vlgvb	%r1,%v17,7	/* Load zero index or 16 if not found.  */
	clrjl	%r1,%r6,.Lcpy_found_v16_store /* Found zero within loaded bytes,
						 copy and return.  */

	/* Align src to 16 byte.  */
	risbgn	%r1,%r3,60,128+63,0 /* %r1 = bits 60-63 of %r3 'and' 15.  */
	lghi	%r5,15		/* current_len = 15.  */
	slr	%r5,%r1		/* Compute highest index to 16byte boundary.  */

	/* Zero not found and maxlen > loaded-byte-count.  */
	vstl	%v16,%r5,0(%r2)	/* Copy loaded characters - no zero.  */
	ahi	%r5,1		/* Start loop at next character.  */

	/*
	  Now we are 16byte aligned, so we can load a full vreg
	  without page fault.
	 */
	lgr	%r1,%r5		/* If %r5 + 64 < maxlen? -> loop64.  */
	aghi	%r1,64
	clgrjl	%r1,%r4,.Lcpy_loop64

	vl	%v16,0(%r5,%r3)	/* Load src.  */
	clgijl	%r4,17,.Lcpy_remaining_v16 /* If n <=16,
					       process remaining bytes.  */
.Lcpy_lt64:
	lgr	%r7,%r4
	slgfi	%r7,16		/* border_len = n - 16.  */

	clgrjhe	%r5,%r7,.Lcpy_remaining_v16
	vfenezfs %v17,%v16,%v16	/* Find element not equal with zero search.  */
	je	.Lcpy_found_v16	/* Jump away if zero was found.  */
	vl	%v18,16(%r5,%r3) /* Load next part of src.  */
	vst	%v16,0(%r5,%r2)	/* Save previous part without zero to dst.  */
	aghi	%r5,16

	clgrjhe	%r5,%r7,.Lcpy_remaining_v18
	vfenezfs %v17,%v18,%v18
	je	.Lcpy_found_v18
	vl	%v16,16(%r5,%r3)
	vst	%v18,0(%r5,%r2)
	aghi	%r5,16

	clgrjhe	%r5,%r7,.Lcpy_remaining_v16
	vfenezfs %v17,%v16,%v16
	je	.Lcpy_found_v16
	vl	%v18,16(%r5,%r3)
	vst	%v16,0(%r5,%r2)
	aghi	%r5,16

.Lcpy_remaining_v18:
	vlr	%v16,%v18
.Lcpy_remaining_v16:
	/* v16 contains the remaining bytes [1...16].
	   Store remaining bytes and append string-termination.  */
	vfenezf	%v17,%v16,%v16	/* Find element not equal with zero search.  */
	slgrk	%r7,%r4,%r5	/* Remaining bytes = maxlen - current_len.  */
	aghi	%r7,-1		/* vstl needs highest index.  */
	vlgvb	%r1,%v17,7	/* Load zero index or 16 if not found.  */
	la	%r2,0(%r5,%r2)	/* vstl has no index register.  */
	/* Zero-index within remaining-bytes, store up to zero and end.  */
	clgrjle	%r1,%r7,.Lcpy_found_v16_store
	vstl	%v16,%r7,0(%r2)	/* Store remaining bytes.  */
	lghi	%r1,0
	st	%r1,1(%r7,%r2)	/* Store string-null-termination beyond n.  */
.Lcpy_end:
	/* Restore saved registers.  */
	vlgvg	%r6,%v31,0
	vlgvg	%r7,%v31,1
	lgr	%r2,%r0		/* Load saved dest-ptr.  */
	br	%r14

	/* Entry points used by .Lcpy_loop64: advance current_len to the
	   16-byte part in which the zero was found.  */
.Lcpy_found_v16_32:
	aghi	%r5,32
	j	.Lcpy_found_v16
.Lcpy_found_v18_48:
	aghi	%r5,32
.Lcpy_found_v18_16:
	aghi	%r5,16
.Lcpy_found_v18:
	vlr	%v16,%v18
.Lcpy_found_v16:
	/* v16 contains a zero.  Store remaining bytes to zero.  current_len
	   has not reached border, thus checking for n is not needed!  */
	vlgvb	%r1,%v17,7	/* Load byte index of zero.  */
	la	%r2,0(%r5,%r2)
.Lcpy_found_v16_store:
	aghi	%r1,3		/* Also copy remaining bytes of zero.  */
	vstl	%v16,%r1,0(%r2)	/* Copy characters including zero.  */
	j	.Lcpy_end

	/* Find zero in 16byte aligned loop.
	   Only entered while current_len + 64 < maxlen, so four full
	   vregs can be processed without checking n.  */
.Lcpy_loop64:
	vl	%v16,0(%r5,%r3)
	vfenezfs %v17,%v16,%v16	/* Find element not equal with zero search.  */
	je	.Lcpy_found_v16	/* Jump away if zero was found.  */
	vl	%v18,16(%r5,%r3) /* Load next part of src.  */
	vst	%v16,0(%r5,%r2)	/* Save previous part without zero to dst.  */
	vfenezfs %v17,%v18,%v18
	je	.Lcpy_found_v18_16
	vl	%v16,32(%r5,%r3)
	vst	%v18,16(%r5,%r2)
	vfenezfs %v17,%v16,%v16
	je	.Lcpy_found_v16_32
	vl	%v18,48(%r5,%r3)
	vst	%v16,32(%r5,%r2)
	vfenezfs %v17,%v18,%v18
	je	.Lcpy_found_v18_48
	vst	%v18,48(%r5,%r2)

	aghi	%r5,64
	lgr	%r1,%r5		/* If %r5 + 64 < maxlen? -> loop64.  */
	aghi	%r1,64
	clgrjl	%r1,%r4,.Lcpy_loop64

	vl	%v16,0(%r5,%r3)	/* Load src.  */
	j	.Lcpy_lt64

.Lfallback:
	jg	WCSNCAT_C	/* Tail-call the common-code variant.  */
END(WCSNCAT_Z13)

# if ! HAVE_WCSNCAT_IFUNC
strong_alias (WCSNCAT_Z13, wcsncat)
# endif
#endif