/* Copyright (C) 2018-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "memset-reg.h"

#ifndef MEMSET
# define MEMSET __memset_base64
#endif

/* To disable DC ZVA, set this threshold to 0.  */
#ifndef DC_ZVA_THRESHOLD
# define DC_ZVA_THRESHOLD 512
#endif

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 *
 */

ENTRY_ALIGN (MEMSET, 6)

	PTR_ARG (0)
	SIZE_ARG (2)

	/* Replicate the fill byte across all eight bytes of val.  */
	bfi	valw, valw, 8, 8
	bfi	valw, valw, 16, 16
	bfi	val, val, 32, 32
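	/* Worked example of the broadcast above (value chosen for
	   illustration): for a fill byte of 0xAB in the low byte of
	   valw, the three bfi steps produce 0x0000ABAB, then
	   0xABABABAB, and finally 0xABABABABABABABAB in val.  */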

	add	dstend, dstin, count

	cmp	count, 96
	b.hi	L(set_long)
	cmp	count, 16
	b.hs	L(set_medium)

	/* Set 0..15 bytes.  */
	tbz	count, 3, 1f
	str	val, [dstin]
	str	val, [dstend, -8]
	ret

	.p2align 3
1:	tbz	count, 2, 2f
	str	valw, [dstin]
	str	valw, [dstend, -4]
	ret
2:	cbz	count, 3f
	strb	valw, [dstin]
	tbz	count, 1, 3f
	strh	valw, [dstend, -2]
3:	ret

	.p2align 3
	/* Set 16..96 bytes.  */
L(set_medium):
	stp	val, val, [dstin]
	tbnz	count, 6, L(set96)
	stp	val, val, [dstend, -16]
	tbz	count, 5, 1f
	stp	val, val, [dstin, 16]
	stp	val, val, [dstend, -32]
1:	ret

	.p2align 4
	/* Set 64..96 bytes.  Write 64 bytes from the start and
	   32 bytes from the end.  */
L(set96):
	stp	val, val, [dstin, 16]
	stp	val, val, [dstin, 32]
	stp	val, val, [dstin, 48]
	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

	.p2align 4
L(set_long):
	stp	val, val, [dstin]
	bic	dst, dstin, 15
#if DC_ZVA_THRESHOLD
	/* Take the DC ZVA path only for zero memsets of at least
	   DC_ZVA_THRESHOLD bytes.  */
	cmp	count, DC_ZVA_THRESHOLD
	ccmp	val, 0, 0, cs
	b.eq	L(zva_64)
#endif
	/* A small or non-zero memset does not use DC ZVA.  */
	sub	count, dstend, dst

	/*
	 * Adjust count and bias it for the loop.  By subtracting an
	 * extra 1 from count, it is easy to use the tbz instruction to
	 * check whether the tail left after the loop is less than
	 * 33 bytes, so as to bypass two unnecessary stps.  (After the
	 * loop, 1..64 bytes remain; the final two stps always cover the
	 * last 32 bytes, so the extra pair is needed only when the tail
	 * exceeds 32 bytes.)
	 */
	sub	count, count, 64+16+1

#if DC_ZVA_THRESHOLD
	/* Align the loop on a 16-byte boundary, which may be friendlier
	   to the instruction cache.  */
	nop
#endif

1:	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	stp	val, val, [dst, 64]!
	subs	count, count, 64
	b.hs	1b

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes?  */
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret

#if DC_ZVA_THRESHOLD
	.p2align 3
L(zva_64):
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]
	/* Align dst down to a 64-byte (cache line) boundary; DC ZVA
	   operates on whole cache lines.  */
	bic	dst, dst, 63

	/*
	 * The preceding memory writes might cross a cache line boundary
	 * and leave a cache line partially dirty.  Zeroing such a line
	 * with DC ZVA incurs extra cost, because the untouched part of
	 * the line must be loaded from memory before zeroing.
	 *
	 * So, write the first 64-byte-aligned block using stp to force
	 * a fully dirty cache line.
	 */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
	stp	val, val, [dst, 96]
	stp	val, val, [dst, 112]

	sub	count, dstend, dst
	/*
	 * Adjust count and bias it for the loop.  By subtracting an
	 * extra 1 from count, it is easy to use the tbz instruction to
	 * check whether the tail left after the loop is less than
	 * 33 bytes, so as to bypass two unnecessary stps.
	 */
	sub	count, count, 128+64+64+1
	add	dst, dst, 128
	nop

	/* DC ZVA zeroes one 64-byte cache line at a time.  */
1:	dc	zva, dst
	add	dst, dst, 64
	subs	count, count, 64
	b.hs	1b

	/*
	 * Write the last 64-byte-aligned block using stp to force a
	 * fully dirty cache line.
	 */
	stp	val, val, [dst, 0]
	stp	val, val, [dst, 16]
	stp	val, val, [dst, 32]
	stp	val, val, [dst, 48]

	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes?  */
	stp	val, val, [dst, 64]
	stp	val, val, [dst, 80]
1:	stp	val, val, [dstend, -32]
	stp	val, val, [dstend, -16]
	ret
#endif

END (MEMSET)
libc_hidden_builtin_def (MEMSET)
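
/* Worked example of the L(zva_64) path above (numbers chosen for
   illustration): zeroing 512 bytes from a 64-byte-aligned dstin.  The
   stps before the dc zva loop fill [dstin, dstin+128), the biased
   count (512 - 257 = 255) drives four dc zva iterations that clear
   [dstin+128, dstin+384), and the stps after the loop fill the
   remaining [dstin+384, dstin+512).  */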