/* Optimized version of the memccpy() function.
   This file is part of the GNU C Library.
   Copyright (C) 2000-2021 Free Software Foundation, Inc.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* Return: a pointer to the next byte after char in dest or NULL

   Inputs:
	in0:	dest
	in1:	src
	in2:	char
	in3:	byte count

   This implementation assumes little endian mode (UM.be = 0).

   This implementation assumes that it is safe to do read ahead
   in the src block, without getting beyond its limit.

   Structure of the routine:

     entry		Save pr, ar.lc and ar.ec; if the byte count is
			<= OP_T_THRES go straight to .cpyfew.
     .l1		Copy (-dest % 8) single bytes so that dest becomes
			8-byte aligned, testing each byte against char.
     .dest_aligned	Split the remaining count into whole 8-byte words
			(ar.lc) and a tail (len), then pick a word loop
			depending on whether src % 8 is zero.
     .l2		src not 8-byte aligned: each stored word is built
			from two aligned loads shifted by sh1 / sh2.
     .l3		src 8-byte aligned: words are loaded and stored
			directly.
     .cpyfew / .l4	Byte-by-byte copy of whatever is left.
     .gotit / .l5	A word loop saw char inside a word: store that
			word byte by byte up to the position held in
			pos0[1] and return dest.
     .recovery1/2	Targets of the chk.s checks on the speculative
			(ld8.s) loads issued by .l2 and .l3.

   In both word loops the word is XORed with charx8 (char replicated
   into all eight bytes by mux1 @brcst) and czx1.r is applied to the
   result; the loops treat a result different from 8 as "char found".  */

#include <sysdep.h>
#undef ret

/* Byte counts up to and including OP_T_THRES are copied byte by byte.  */
#define OP_T_THRES 	16
/* Word size, in bytes, referred to by the "& -OPSIZ" comments below.  */
#define OPSIZ 		8

/* Symbolic names for the static registers used by this routine.  */
#define saved_pr	r17	/* copy of the predicate registers on entry */
#define saved_lc	r18	/* copy of ar.lc on entry */
#define dest		r19	/* running destination pointer */
#define src		r20	/* running source pointer */
#define len		r21	/* bytes left for the byte-wise tail */
#define asrc		r22	/* src rounded down to a multiple of 8 */
#define tmp		r23	/* scratch */
#define char		r24	/* low 8 bits of in2 */
#define charx8		r25	/* char replicated into all 8 bytes */
#define saved_ec	r26	/* copy of ar.ec on entry */
#define sh2		r28	/* 64 - sh1 */
#define sh1		r29	/* 8 * (src % 8) */
#define loopcnt		r30	/* value loaded into ar.lc */
#define value		r31	/* byte (or word in .gotit) being stored */

#ifdef GAS_ALIGN_BREAKS_UNWIND_INFO
/* Manually force proper loop-alignment.  Note: be sure to
   double-check the code-layout after making any changes to
   this routine! */
# define ALIGN(n)	{ nop 0 }
#else
# define ALIGN(n)	.align n
#endif

ENTRY(memccpy)
	.prologue
	alloc 	r2 = ar.pfs, 4, 40 - 4, 0, 40

	// softpipe.h is expected to supply MEMLAT; the recovery code
	// below refuses to build unless MEMLAT is 6.
#include "softpipe.h"
	// Rotating register and predicate names used by the pipelined
	// loops .l2 and .l3.
	.rotr	r[MEMLAT + 7], tmp1[4], tmp2[4], val[4], tmp3[2], pos0[2]
	.rotp	p[MEMLAT + 6 + 1]

	mov	ret0 = r0		// return NULL if no match
	.save pr, saved_pr
	mov	saved_pr = pr		// save the predicate registers
	mov 	dest = in0		// dest
	.save ar.lc, saved_lc
        mov 	saved_lc = ar.lc	// save the loop counter
        mov	saved_ec = ar.ec	// save the epilog counter
	.body
	mov 	src = in1		// src
	extr.u	char = in2, 0, 8	// char = low 8 bits of in2
	mov	len = in3		// len
	sub	tmp = r0, in0		// tmp = -dest
	cmp.ne	p7, p0 = r0, r0		// clear p7 (.gotit selects on p6/p7)
	;;
	and	loopcnt = 7, tmp	// loopcnt = -dest % 8
	cmp.ge	p6, p0 = OP_T_THRES, len	// is len <= OP_T_THRES
	mov	ar.ec = 0		// ec not guaranteed zero on entry
(p6)	br.cond.spnt	.cpyfew		// copy byte by byte
	;;
	cmp.eq	p6, p0 = loopcnt, r0	// is dest already 8-byte aligned?
	mux1	charx8 = char, @brcst	// charx8 = char in every byte
(p6)	br.cond.sptk .dest_aligned
	sub	len = len, loopcnt	// len -= -dest % 8
	adds	loopcnt = -1, loopcnt	// --loopcnt
	;;
	mov	ar.lc = loopcnt
.l1:					// copy -dest % 8 bytes
	ld1	value = [src], 1	// value = *src++
	;;
	st1	[dest] = value, 1	// *dest++ = value
	cmp.eq	p6, p0 = value, char
(p6)	br.cond.spnt .foundit		// char copied: return dest
	br.cloop.dptk .l1
.dest_aligned:
	and	sh1 = 7, src 		// sh1 = src % 8
	and	tmp = -8, len   	// tmp = len & -OPSIZ
	and	asrc = -8, src		// asrc = src & -OPSIZ  -- align src
	shr.u	loopcnt = len, 3	// loopcnt = len / 8
	and	len = 7, len ;;		// len = len % 8
	shl	sh1 = sh1, 3		// sh1 = 8 * (src % 8)
	adds	loopcnt = -1, loopcnt	// --loopcnt
	mov     pr.rot = 1 << 16 ;;	// set rotating predicates
	sub	sh2 = 64, sh1		// sh2 = 64 - sh1
	mov	ar.lc = loopcnt		// set LC
	cmp.eq  p6, p0 = sh1, r0 	// is the src aligned?
(p6)    br.cond.sptk .src_aligned ;;
	add	src = src, tmp		// src += len & -OPSIZ
	mov	ar.ec = MEMLAT + 6 + 1 	// six more passes needed
	ld8	r[1] = [asrc], 8 	// r[1] = w0
	cmp.ne	p6, p0 = r0, r0	;;	// clear p6
	ALIGN(32)
	// Word loop for a source that is not 8-byte aligned.  The
	// instructions below are the stages of one software-pipelined
	// iteration, each guarded by its own rotating predicate.
.l2:
(p[0])		ld8.s	r[0] = [asrc], 8		// r[0] = w1
(p[MEMLAT])	shr.u	tmp1[0] = r[1 + MEMLAT], sh1	// tmp1 = w0 >> sh1
(p[MEMLAT])	shl	tmp2[0] = r[0 + MEMLAT], sh2  	// tmp2 = w1 << sh2
(p[MEMLAT+4])	xor	tmp3[0] = val[1], charx8	// zero byte where val has char
(p[MEMLAT+5])	czx1.r	pos0[0] = tmp3[1]		// position of that byte
(p[MEMLAT+6])	chk.s	r[6 + MEMLAT], .recovery1	// our data isn't
							// valid - rollback!
(p[MEMLAT+6])	cmp.ne	p6, p0 = 8, pos0[1]		// p6 = char found
(p6)		br.cond.spnt	.gotit
(p[MEMLAT+6])	st8	[dest] = val[3], 8		// store val to dest
(p[MEMLAT+3])	or	val[0] = tmp1[3], tmp2[3] 	// val = tmp1 | tmp2
		br.ctop.sptk    .l2
		br.cond.sptk .cpyfew			// words done: copy the tail

	// Word loop for an 8-byte aligned source; signals a match in
	// p7 (the unaligned loop above uses p6).
.src_aligned:
		cmp.ne  p6, p0 = r0, r0			// clear p6
		mov     ar.ec = MEMLAT + 2 + 1 ;;	// set EC
.l3:
(p[0])		ld8.s	r[0] = [src], 8
(p[MEMLAT])	xor	tmp3[0] = r[MEMLAT], charx8	// zero byte where word has char
(p[MEMLAT+1])	czx1.r	pos0[0] = tmp3[1]		// position of that byte
(p[MEMLAT+2])	cmp.ne	p7, p0 = 8, pos0[1]		// p7 = char found
(p[MEMLAT+2])	chk.s	r[MEMLAT+2], .recovery2
(p7)		br.cond.spnt	.gotit
.back2:							// .recovery2 resumes here
(p[MEMLAT+2])	st8	[dest] = r[MEMLAT+2], 8
		br.ctop.dptk .l3
	// Byte-by-byte copy of the remaining len bytes.  Falls through
	// from .l3, and is also reached from the entry code, from .l2
	// and from .recovery1.
.cpyfew:
	cmp.eq	p6, p0 = len, r0	// is len == 0 ?
	adds	len = -1, len		// --len;
(p6)	br.cond.spnt	.restore_and_exit ;;
	mov	ar.lc = len
.l4:
	ld1	value = [src], 1	// value = *src++
	;;
	st1	[dest] = value, 1	// *dest++ = value
	cmp.eq	p6, p0 = value, char
(p6)	br.cond.spnt .foundit
	br.cloop.dptk	.l4 ;;
	// Reached by branch with p6 set (char was copied) or by falling
	// out of .l4 with p6 clear, in which case ret0 keeps the NULL
	// set on entry.
.foundit:
(p6)	mov	ret0 = dest
.restore_and_exit:
	mov     pr = saved_pr, -1    	// restore the predicate registers
	mov 	ar.lc = saved_lc	// restore the loop counter
	mov 	ar.ec = saved_ec ;;	// restore the epilog counter
	br.ret.sptk.many b0
	// A word loop found char inside the current word.  pos0[1] is
	// loaded into ar.lc, so .l5 stores the low-order bytes of the
	// word one at a time, stopping after the byte at that position.
.gotit:
	.pred.rel "mutex" p6, p7
(p6)	mov	value = val[3]		// if coming from l2
(p7)	mov	value = r[MEMLAT+2]	// if coming from l3
	mov	ar.lc = pos0[1] ;;
.l5:
	extr.u	tmp = value, 0, 8 ;;	// tmp = lowest byte of value
	st1	[dest] = tmp, 1		// *dest++ = tmp
	shr.u	value = value, 8	// move the next byte into place
	br.cloop.sptk .l5 ;;
	mov 	ret0 = dest		// return pointer past the copied char
	mov	pr = saved_pr, -1	// restore the predicate registers
	mov	ar.lc = saved_lc	// restore the loop counter
	// NOTE(review): unlike .restore_and_exit, this exit does not
	// write saved_ec back to ar.ec -- confirm this is intentional.
	br.ret.sptk.many b0

	// Target of the chk.s in .l2.  Rebuilds src and len from asrc,
	// ar.lc, ar.ec, sh2 and the rotating predicates p[0]..p[11],
	// then finishes the copy byte by byte in .cpyfew.  The number
	// of predicated steps is hard-wired, hence the MEMLAT check.
.recovery1:
#if MEMLAT != 6
# error "MEMLAT must be 6!"
#endif
	adds	src = -8, asrc		// src = asrc - 8
	mov	loopcnt = ar.lc		// loopcnt = remaining LC
	mov	tmp = ar.ec		// tmp = remaining EC
	;;
(p[0])	adds	src = -8, src		// src -= 8 for every predicate
	;;				// p[0]..p[11] that is set
(p[1])	adds	src = -8, src
	sub	sh1 = (MEMLAT + 6 + 1), tmp	// sh1 = initial EC - EC
	;;
(p[2])	adds	src = -8, src
	;;
(p[3])	adds	src = -8, src
	shl	loopcnt = loopcnt, 3	// loopcnt = 8 * LC
	;;
(p[4])	adds	src = -8, src
	;;
(p[5])	adds	src = -8, src
	shl	sh1 = sh1, 3		// sh1 *= 8
	;;
(p[6])	adds	src = -8, src
	;;
(p[7])	adds	src = -8, src
	shl	tmp = tmp, 3		// tmp = 8 * EC
	;;
(p[8])	adds	src = -8, src
	;;
(p[9])	adds	src = -8, src
	shr.u	sh2 = sh2, 3		// sh2 = sh2 / 8
	;;
(p[10])	adds	src = -8, src
	;;
(p[11])	adds	src = -8, src
	add	len = len, loopcnt	// len += 8 * LC
	;;
	sub	src = src, sh2		// src -= sh2
	;;
	add	len = tmp, len		// len += 8 * EC
	add	src = sh1, src		// src += sh1
	br.cond.sptk .cpyfew

	// Target of the chk.s in .l3.  If p7 already signals a match,
	// go to .gotit.  Otherwise step back from src by 8 bytes plus
	// 8 for every set predicate p[0]..p[7], reload that word with a
	// non-speculative ld8, redo the char test, and either go to
	// .gotit or resume the loop at .back2.
.recovery2:
#if MEMLAT != 6
# error "MEMLAT must be 6!"
#endif
	add	tmp = -8, src		// tmp = src - 8
(p7)	br.cond.spnt	.gotit
	;;
(p[0])	add	tmp = -8, tmp ;;	// tmp -= 8 for every predicate
(p[1])	add	tmp = -8, tmp ;;	// p[0]..p[7] that is set
(p[2])	add	tmp = -8, tmp ;;
(p[3])	add	tmp = -8, tmp ;;
(p[4])	add	tmp = -8, tmp ;;
(p[5])	add	tmp = -8, tmp ;;
(p[6])	add	tmp = -8, tmp ;;
(p[7])	add	tmp = -8, tmp ;;
	ld8	r[MEMLAT+2] = [tmp] ;;		// non-speculative reload
	xor	pos0[1] = r[MEMLAT+2], charx8 ;;	// zero byte where word has char
	czx1.r	pos0[1] = pos0[1] ;;		// position of that byte
	cmp.ne	p7, p6 = 8, pos0[1]		// p7 = char found, p6 = not found
(p7)	br.cond.spnt	.gotit
	br.cond.sptk	.back2
END(memccpy)