/* Set a block of memory to some byte value.  For SUN4V M7.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef XCC
# define XCC xcc
#endif
	.register	%g2, #scratch
	.register	%g3, #scratch

/* The algorithm is as follows:
 *
 * For counts of 7 or fewer bytes, store individual bytes.
 *
 * For counts below 32 bytes, align the address on a 4-byte boundary,
 * then store as many 4-byte chunks as possible, followed by any
 * trailing bytes.
 *
 * For counts of 32 bytes or more, align the address on an 8-byte
 * boundary.
 * if (count >= 64) {
 *   store 8-byte chunks to align the address on a 64-byte boundary
 *   if (value to be set is zero && count >= MIN_ZERO) {
 *     Using BIS stores, set the first long word of each
 *     64-byte cache line to zero, which also clears the
 *     other seven long words of the cache line.
 *   }
 *   else if (count >= MIN_LOOP) {
 *     Using BIS stores, set the first long word of each of
 *     ST_CHUNK cache lines (64 bytes each) before the main
 *     loop is entered.
 *     In the main loop, continue pre-setting the first long
 *     word of each cache line ST_CHUNK lines in advance while
 *     setting the other seven long words (56 bytes) of each
 *     cache line, until fewer than ST_CHUNK*64 bytes remain.
 *     Then set the remaining seven long words of each cache
 *     line that has already had its first long word set.
 *   }
 *   store remaining data in 64-byte chunks until fewer than
 *   64 bytes remain.
 * }
 * Store as many 8-byte chunks as possible, followed by any trailing
 * bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a
 *   single instruction in the pipeline.  That avoids various pipeline
 *   delays, such as filling the miss buffer.  The performance effect
 *   is similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer
 *   instruction cycles than the normal memset loop.
 *
 * We only use BIS for memsets of at least MIN_LOOP bytes because a
 * sequence of BIS stores must be followed by a membar #StoreStore.
 * The benefit of the BIS stores must be balanced against the cost of
 * that membar operation.
 */

/* ASI_STBI_P marks the cache line as "least recently used",
   which means that if many threads are active, it has a high chance
   of being pushed out of the cache between the first initializing
   store and the final stores.
   Thus, we use ASI_STBIMRU_P, which marks the cache line as
   "most recently used", for all but the last store to the cache
   line.  */
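/* For orientation, a plain C sketch of the size dispatch described
   above.  This is purely illustrative: ordinary stores stand in for
   the BIS machinery, and memset_sketch is a hypothetical name that
   exists nowhere else in glibc.

     #include <stddef.h>
     #include <stdint.h>

     static void *
     memset_sketch (void *dst, int c, size_t n)
     {
       unsigned char *p = dst;
       unsigned char b = (unsigned char) c;

       if (n <= 7)                        // tiny: byte stores only
         {
           while (n--)
             *p++ = b;
           return dst;
         }
       if (n < 32)                        // small: 4-byte chunks
         {
           while ((uintptr_t) p & 3)      // word-align first
             *p++ = b, n--;
           uint32_t w = b * 0x01010101u;
           for (; n >= 4; n -= 4, p += 4)
             *(uint32_t *) p = w;
         }
       else                               // large: 8-byte chunks; the
         {                                // BIS line loops live here
           while ((uintptr_t) p & 7)      // long-word-align first
             *p++ = b, n--;
           uint64_t v = b * 0x0101010101010101ULL;
           for (; n >= 8; n -= 8, p += 8)
             *(uint64_t *) p = v;
         }
       while (n--)                        // trailing bytes
         *p++ = b;
       return dst;
     }
 */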
#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_ST_BLK_INIT_MRU_P	0xf2

#define ASI_STBI_P	ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P	ASI_ST_BLK_INIT_MRU_P

#define ST_CHUNK	24	/* multiple of 4 due to loop unrolling */
#define MIN_LOOP	((ST_CHUNK) * 64)
#define MIN_ZERO	256

#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define STORE_ASI(src,addr)	stxa src, [addr] ASI_STBIMRU_P
#define STORE_INIT(src,addr)	stxa src, [addr] ASI_STBI_P

#if IS_IN (libc)

	.text
	.align	32

ENTRY(__bzero_niagara7)
	/* bzero (dst, size)  */
	mov	%o1, %o2
	mov	0, %o1
	/* fall through into memset code */
END(__bzero_niagara7)

ENTRY(__memset_niagara7)
	/* memset (dst, c, size)  */
	mov	%o0, %o5	/* copy sp1 before using it */
	cmp	%o2, 7		/* if small counts, just write bytes */
	bleu,pn	%XCC, .Lwrchar
	 and	%o1, 0xff, %o1	/* o1 is (char)c */

	sll	%o1, 8, %o3
	or	%o1, %o3, %o1	/* now o1 has 2 bytes of c */
	sll	%o1, 16, %o3
	cmp	%o2, 32
	blu,pn	%XCC, .Lwdalign
	 or	%o1, %o3, %o1	/* now o1 has 4 bytes of c */

	sllx	%o1, 32, %o3
	or	%o1, %o3, %o1	/* now o1 has 8 bytes of c */
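/* The shift/or sequence above replicates the low byte of %o1 into all
   eight bytes of the register by repeated doubling.  The same trick in
   self-contained C (replicate_byte is an illustrative name only):

     #include <stdint.h>

     static uint64_t
     replicate_byte (unsigned char c)
     {
       uint64_t v = c;
       v |= v << 8;	// 2 bytes of c
       v |= v << 16;	// 4 bytes of c
       v |= v << 32;	// 8 bytes of c
       return v;	// equals c * 0x0101010101010101ULL
     }
 */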
.Ldbalign:
	andcc	%o5, 7, %o3		/* is sp1 aligned on an 8-byte bound?  */
	bz,pt	%XCC, .Lblkalign	/* already long word aligned */
	 sub	%o3, 8, %o3		/* -(bytes till long word aligned) */

	add	%o2, %o3, %o2		/* update o2 with new count */
	/* Set -(%o3) bytes till sp1 long word aligned.  */
1:	stb	%o1, [%o5]		/* there is at least 1 byte to set */
	inccc	%o3			/* byte clearing loop */
	bl,pt	%XCC, 1b
	 inc	%o5

	/* Now sp1 is long word aligned (sp1 is found in %o5).  */
.Lblkalign:
	cmp	%o2, 64			/* check if there are 64 bytes to set */
	blu,pn	%XCC, .Lwrshort
	 mov	%o2, %o3

	andcc	%o5, 63, %o3		/* is sp1 block aligned?  */
	bz,pt	%XCC, .Lblkwr		/* already block aligned */
	 sub	%o3, 64, %o3		/* o3 is -(bytes till block aligned) */
	add	%o2, %o3, %o2		/* o2 is the remainder */

	/* Store -(%o3) bytes till dst is block (64 byte) aligned.
	   Use long word stores; recall that dst is already long word
	   aligned.  */
1:
	addcc	%o3, 8, %o3
	stx	%o1, [%o5]
	bl,pt	%XCC, 1b
	 add	%o5, 8, %o5

	/* Now sp1 is block aligned.  */
.Lblkwr:
	andn	%o2, 63, %o4		/* calculate size of blocks in bytes */
	brz,pn	%o1, .Lwrzero		/* special case if c == 0 */
	 and	%o2, 63, %o3		/* %o3 = bytes left after blk stores */

	cmp	%o4, MIN_LOOP		/* check for enough bytes to set */
	blu,pn	%XCC, .Lshort_set	/* to justify cost of membar */
	 nop				/* must be > pre-cleared lines */

	/* Initial cache-clearing stores get the store pipeline moving.  */

/* Primary memset loop for large memsets.  */
.Lwr_loop:
	mov	ST_CHUNK, %g1
.Lwr_loop_start:
	subcc	%g1, 4, %g1
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	add	%o5, 64, %o5
	EX_ST(STORE_ASI(%o1,%o5))
	bgu	%XCC, .Lwr_loop_start
	 add	%o5, 64, %o5

	sub	%o5, ST_CHUNK*64, %o5	/* reset %o5 */
	mov	ST_CHUNK, %g1
	sub	%o5, 8, %o5		/* adjust %o5 for ASI store */

.Lwr_loop_rest:
	stx	%o1, [%o5+8+8]
	sub	%o4, 64, %o4
	stx	%o1, [%o5+16+8]
	subcc	%g1, 1, %g1
	stx	%o1, [%o5+24+8]
	stx	%o1, [%o5+32+8]
	stx	%o1, [%o5+40+8]
	add	%o5, 64, %o5
	stx	%o1, [%o5-8]
	bgu	%XCC, .Lwr_loop_rest
	 EX_ST(STORE_INIT(%o1,%o5))

	add	%o5, 8, %o5		/* restore %o5 offset */

	/* If more than ST_CHUNK*64 bytes remain to set, continue
	   setting the first long word of each cache line in advance
	   to keep the store pipeline moving.  */

	cmp	%o4, ST_CHUNK*64
	bge,pt	%XCC, .Lwr_loop_start
	 mov	ST_CHUNK, %g1

	brz,a,pn %o4, .Lasi_done
	 nop

	sub	%o5, 8, %o5		/* adjust %o5 for ASI store */
.Lwr_loop_small:
	add	%o5, 8, %o5		/* adjust %o5 for ASI store */
	EX_ST(STORE_ASI(%o1,%o5))
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	stx	%o1, [%o5+32]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+40]
	add	%o5, 56, %o5
	stx	%o1, [%o5-8]
	bgu,pt	%XCC, .Lwr_loop_small
	 EX_ST(STORE_INIT(%o1,%o5))

	ba	.Lasi_done
	 add	%o5, 8, %o5		/* restore %o5 offset */

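/* The zero-fill path below relies on the BIS semantics: a single
   initializing store to the first long word of a 64-byte line clears
   the whole line.  A C rendering of the loop shape only; in plain C
   the single store would NOT clear the other seven words, so this is
   strictly illustrative (zero_lines is not a real function):

     #include <stdint.h>

     static void
     zero_lines (uint64_t *line, uint64_t nlines)  // 64-byte aligned
     {
       while (nlines--)
         {
           line[0] = 0;	// BIS store; hardware clears line[1..7]
           line += 8;	// advance one 64-byte cache line
         }
     }

   The assembly version unrolls this four lines (256 bytes) per
   iteration and issues the required membar #StoreStore afterwards.  */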
/* Special case loop for zero fill memsets.
   For each 64-byte cache line, a single STBI to the first element
   clears the whole line.  */
.Lwrzero:
	cmp	%o4, MIN_ZERO		/* check if enough bytes to set */
					/* to pay %asi + membar cost */
	blu	%XCC, .Lshort_set
	 nop
	sub	%o4, 256, %o4

.Lwrzero_loop:
	mov	64, %g3
	EX_ST(STORE_INIT(%o1,%o5))
	subcc	%o4, 256, %o4
	EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%o5, 256, %o5
	sub	%g3, 192, %g3
	EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%g3, 64, %g3
	bge,pt	%XCC, .Lwrzero_loop
	 EX_ST(STORE_INIT(%o1,%o5+%g3))
	add	%o4, 256, %o4

	brz,pn	%o4, .Lbsi_done
	 nop
.Lwrzero_small:
	EX_ST(STORE_INIT(%o1,%o5))
	subcc	%o4, 64, %o4
	bgu,pt	%XCC, .Lwrzero_small
	 add	%o5, 64, %o5

.Lasi_done:
.Lbsi_done:
	membar	#StoreStore		/* required by use of BIS */

.Lshort_set:
	cmp	%o4, 64			/* check if 64 bytes to set */
	blu	%XCC, 5f
	 nop
4:	/* set final blocks of 64 bytes */
	stx	%o1, [%o5]
	stx	%o1, [%o5+8]
	stx	%o1, [%o5+16]
	stx	%o1, [%o5+24]
	subcc	%o4, 64, %o4
	stx	%o1, [%o5+32]
	stx	%o1, [%o5+40]
	add	%o5, 64, %o5
	stx	%o1, [%o5-16]
	bgu,pt	%XCC, 4b
	 stx	%o1, [%o5-8]

5:
	/* Set the remaining long words.  */
.Lwrshort:
	subcc	%o3, 8, %o3		/* can we store any long words?  */
	blu,pn	%XCC, .Lwrchars
	 and	%o2, 7, %o2		/* calc bytes left after long words */
6:
	subcc	%o3, 8, %o3
	stx	%o1, [%o5]		/* store the long words */
	bgeu,pt	%XCC, 6b
	 add	%o5, 8, %o5

.Lwrchars:				/* check for extra chars */
	brnz	%o2, .Lwrfin
	 nop
	retl
	 nop

.Lwdalign:
	andcc	%o5, 3, %o3		/* is sp1 aligned on a word boundary?  */
	bz,pn	%XCC, .Lwrword
	 andn	%o2, 3, %o3		/* create word sized count in %o3 */

	dec	%o2			/* decrement count */
	stb	%o1, [%o5]		/* clear a byte */
	b	.Lwdalign
	 inc	%o5			/* next byte */

.Lwrword:
	subcc	%o3, 4, %o3
	st	%o1, [%o5]		/* 4-byte writing loop */
	bnz,pt	%XCC, .Lwrword
	 add	%o5, 4, %o5
	and	%o2, 3, %o2		/* leftover count, if any */

.Lwrchar:
	/* Set the remaining bytes, if any.  */
	brz	%o2, .Lexit
	 nop
.Lwrfin:
	deccc	%o2
	stb	%o1, [%o5]
	bgu,pt	%XCC, .Lwrfin
	 inc	%o5
.Lexit:
	retl				/* %o0 was preserved */
	 nop
END(__memset_niagara7)
#endif