1/* Optimized version of the standard bzero() function. 2 This file is part of the GNU C Library. 3 Copyright (C) 2000-2021 Free Software Foundation, Inc. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19/* Return: dest 20 21 Inputs: 22 in0: dest 23 in1: count 24 25 The algorithm is fairly straightforward: set byte by byte until we 26 we get to a 16B-aligned address, then loop on 128 B chunks using an 27 early store as prefetching, then loop on 32B chucks, then clear remaining 28 words, finally clear remaining bytes. 29 Since a stf.spill f0 can store 16B in one go, we use this instruction 30 to get peak speed. */ 31 32#include <sysdep.h> 33#undef ret 34 35#define dest in0 36#define cnt in1 37 38#define tmp r31 39#define save_lc r30 40#define ptr0 r29 41#define ptr1 r28 42#define ptr2 r27 43#define ptr3 r26 44#define ptr9 r24 45#define loopcnt r23 46#define linecnt r22 47#define bytecnt r21 48 49// This routine uses only scratch predicate registers (p6 - p15) 50#define p_scr p6 // default register for same-cycle branches 51#define p_unalgn p9 52#define p_y p11 53#define p_n p12 54#define p_yy p13 55#define p_nn p14 56 57#define movi0 mov 58 59#define MIN1 15 60#define MIN1P1HALF 8 61#define LINE_SIZE 128 62#define LSIZE_SH 7 // shift amount 63#define PREF_AHEAD 8 64 65#define USE_FLP 66#if defined(USE_INT) 67#define store st8 68#define myval r0 69#elif defined(USE_FLP) 70#define store stf8 71#define myval f0 72#endif 73 74.align 64 75ENTRY(bzero) 76{ .mmi 77 .prologue 78 alloc tmp = ar.pfs, 2, 0, 0, 0 79 lfetch.nt1 [dest] 80 .save ar.lc, save_lc 81 movi0 save_lc = ar.lc 82} { .mmi 83 .body 84 mov ret0 = dest // return value 85 nop.m 0 86 cmp.eq p_scr, p0 = cnt, r0 87;; } 88{ .mmi 89 and ptr2 = -(MIN1+1), dest // aligned address 90 and tmp = MIN1, dest // prepare to check for alignment 91 tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U) 92} { .mib 93 mov ptr1 = dest 94 nop.i 0 95(p_scr) br.ret.dpnt.many rp // return immediately if count = 0 96;; } 97{ .mib 98 cmp.ne p_unalgn, p0 = tmp, r0 99} { .mib // NB: # of bytes to move is 1 100 sub bytecnt = (MIN1+1), tmp // higher than loopcnt 101 cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task? 102(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U) 103;; } 104{ .mmi 105(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment 106(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment 107(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ? 108;; } 109{ .mib 110(p_y) add cnt = -8, cnt 111(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ? 112} { .mib 113(p_y) st8 [ptr2] = r0,-4 114(p_n) add ptr2 = 4, ptr2 115;; } 116{ .mib 117(p_yy) add cnt = -4, cnt 118(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ? 119} { .mib 120(p_yy) st4 [ptr2] = r0,-2 121(p_nn) add ptr2 = 2, ptr2 122;; } 123{ .mmi 124 mov tmp = LINE_SIZE+1 // for compare 125(p_y) add cnt = -2, cnt 126(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ? 127} { .mmi 128 nop.m 0 129(p_y) st2 [ptr2] = r0,-1 130(p_n) add ptr2 = 1, ptr2 131;; } 132 133{ .mmi 134(p_yy) st1 [ptr2] = r0 135 cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task? 136} { .mbb 137(p_yy) add cnt = -1, cnt 138(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few 139;; } 140{ .mib 141 nop.m 0 142 shr.u linecnt = cnt, LSIZE_SH 143 nop.b 0 144;; } 145 146 .align 32 147.l1b: // ------------------// L1B: store ahead into cache lines; fill later 148{ .mmi 149 and tmp = -(LINE_SIZE), cnt // compute end of range 150 mov ptr9 = ptr1 // used for prefetching 151 and cnt = (LINE_SIZE-1), cnt // remainder 152} { .mmi 153 mov loopcnt = PREF_AHEAD-1 // default prefetch loop 154 cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value 155;; } 156{ .mmi 157(p_scr) add loopcnt = -1, linecnt 158 add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores) 159 add ptr1 = tmp, ptr1 // first address beyond total range 160;; } 161{ .mmi 162 add tmp = -1, linecnt // next loop count 163 movi0 ar.lc = loopcnt 164;; } 165.pref_l1b: 166{ .mib 167 stf.spill [ptr9] = f0, 128 // Do stores one cache line apart 168 nop.i 0 169 br.cloop.dptk.few .pref_l1b 170;; } 171{ .mmi 172 add ptr0 = 16, ptr2 // Two stores in parallel 173 movi0 ar.lc = tmp 174;; } 175.l1bx: 176 { .mmi 177 stf.spill [ptr2] = f0, 32 178 stf.spill [ptr0] = f0, 32 179 ;; } 180 { .mmi 181 stf.spill [ptr2] = f0, 32 182 stf.spill [ptr0] = f0, 32 183 ;; } 184 { .mmi 185 stf.spill [ptr2] = f0, 32 186 stf.spill [ptr0] = f0, 64 187 cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching? 188 ;; } 189{ .mmb 190 stf.spill [ptr2] = f0, 32 191(p_scr) stf.spill [ptr9] = f0, 128 192 br.cloop.dptk.few .l1bx 193;; } 194{ .mib 195 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? 196(p_scr) br.cond.dpnt.many .move_bytes_from_alignment 197;; } 198 199.fraction_of_line: 200{ .mib 201 add ptr2 = 16, ptr1 202 shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32 203;; } 204{ .mib 205 cmp.eq p_scr, p0 = loopcnt, r0 206 add loopcnt = -1, loopcnt 207(p_scr) br.cond.dpnt.many .store_words 208;; } 209{ .mib 210 and cnt = 0x1f, cnt // compute the remaining cnt 211 movi0 ar.lc = loopcnt 212;; } 213 .align 32 214.l2: // -----------------------------// L2A: store 32B in 2 cycles 215{ .mmb 216 store [ptr1] = myval, 8 217 store [ptr2] = myval, 8 218;; } { .mmb 219 store [ptr1] = myval, 24 220 store [ptr2] = myval, 24 221 br.cloop.dptk.many .l2 222;; } 223.store_words: 224{ .mib 225 cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ? 226(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch 227;; } 228 229{ .mmi 230 store [ptr1] = myval, 8 // store 231 cmp.le p_y, p_n = 16, cnt // 232 add cnt = -8, cnt // subtract 233;; } 234{ .mmi 235(p_y) store [ptr1] = myval, 8 // store 236(p_y) cmp.le.unc p_yy, p_nn = 16, cnt 237(p_y) add cnt = -8, cnt // subtract 238;; } 239{ .mmi // store 240(p_yy) store [ptr1] = myval, 8 241(p_yy) add cnt = -8, cnt // subtract 242;; } 243 244.move_bytes_from_alignment: 245{ .mib 246 cmp.eq p_scr, p0 = cnt, r0 247 tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ? 248(p_scr) br.cond.dpnt.few .restore_and_exit 249;; } 250{ .mib 251(p_y) st4 [ptr1] = r0,4 252 tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ? 253;; } 254{ .mib 255(p_yy) st2 [ptr1] = r0,2 256 tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ? 257;; } 258 259{ .mib 260(p_y) st1 [ptr1] = r0 261;; } 262.restore_and_exit: 263{ .mib 264 nop.m 0 265 movi0 ar.lc = save_lc 266 br.ret.sptk.many rp 267;; } 268 269.move_bytes_unaligned: 270{ .mmi 271 .pred.rel "mutex",p_y, p_n 272 .pred.rel "mutex",p_yy, p_nn 273(p_n) cmp.le p_yy, p_nn = 4, cnt 274(p_y) cmp.le p_yy, p_nn = 5, cnt 275(p_n) add ptr2 = 2, ptr1 276} { .mmi 277(p_y) add ptr2 = 3, ptr1 278(p_y) st1 [ptr1] = r0, 1 // fill 1 (odd-aligned) byte 279(p_y) add cnt = -1, cnt // [15, 14 (or less) left] 280;; } 281{ .mmi 282(p_yy) cmp.le.unc p_y, p0 = 8, cnt 283 add ptr3 = ptr1, cnt // prepare last store 284 movi0 ar.lc = save_lc 285} { .mmi 286(p_yy) st2 [ptr1] = r0, 4 // fill 2 (aligned) bytes 287(p_yy) st2 [ptr2] = r0, 4 // fill 2 (aligned) bytes 288(p_yy) add cnt = -4, cnt // [11, 10 (o less) left] 289;; } 290{ .mmi 291(p_y) cmp.le.unc p_yy, p0 = 8, cnt 292 add ptr3 = -1, ptr3 // last store 293 tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ? 294} { .mmi 295(p_y) st2 [ptr1] = r0, 4 // fill 2 (aligned) bytes 296(p_y) st2 [ptr2] = r0, 4 // fill 2 (aligned) bytes 297(p_y) add cnt = -4, cnt // [7, 6 (or less) left] 298;; } 299{ .mmi 300(p_yy) st2 [ptr1] = r0, 4 // fill 2 (aligned) bytes 301(p_yy) st2 [ptr2] = r0, 4 // fill 2 (aligned) bytes 302 // [3, 2 (or less) left] 303 tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ? 304} { .mmi 305(p_yy) add cnt = -4, cnt 306;; } 307{ .mmb 308(p_scr) st2 [ptr1] = r0 // fill 2 (aligned) bytes 309(p_y) st1 [ptr3] = r0 // fill last byte (using ptr3) 310 br.ret.sptk.many rp 311;; } 312END(bzero) 313