/* Optimized memcpy for Qualcomm Falkor processor.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:

   ARMv8-a, AArch64, falkor, unaligned accesses.  */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define tmp1    x14
#define A_x     x6
#define B_x     x7
#define A_w     w6
#define B_w     w7

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7
#define Q_q     q6
#define S_q     q22

/* Copies are split into 3 main cases:

   1. Small copies of up to 32 bytes
   2. Medium copies of 33..128 bytes which are fully unrolled
   3. Large copies of more than 128 bytes.

   Large copies align the source to a quad word and use an unrolled loop
   processing 64 bytes per iteration.

   FALKOR-SPECIFIC DESIGN:

   The smallest copies (32 bytes or less) focus on optimal pipeline usage,
   which is why the redundant copies of 0-3 bytes have been replaced with
   conditionals, since the former would unnecessarily break across multiple
   issue groups.  The medium copy group has been enlarged to 128 bytes since
   bumping the small copies up to 32 bytes allows us to do that without cost
   and also lets us reduce the size of the prep code before loop64.

   The copy loop uses only one register, q0.  This is to ensure that all
   loads hit a single hardware prefetcher, which can then be trained
   correctly to prefetch a single stream.

   The non-temporal stores help optimize cache utilization.  */

#if IS_IN (libc)
ENTRY_ALIGN (__memcpy_falkor, 6)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 32
        add     srcend, src, count
        add     dstend, dstin, count
        b.ls    L(copy32)
        cmp     count, 128
        b.hi    L(copy_long)

        /* Medium copies: 33..128 bytes.  */
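        /* A_q..D_q always cover the first 32 and the last 32 bytes.  tmp1 is
           count - 1, so its bit 6 is set exactly when count >= 65; only then
           are another 32 bytes copied from each end via E_q..H_q, which is
           enough to cover any length up to 128.  In C terms this is roughly
           (illustrative sketch only; copy16 stands for a hypothetical
           16-byte load/store helper):

             copy16 (dstin, src);               copy16 (dstin + 16, src + 16);
             if (count > 64)
               {
                 copy16 (dstin + 32, src + 32);      copy16 (dstin + 48, src + 48);
                 copy16 (dstend - 64, srcend - 64);  copy16 (dstend - 48, srcend - 48);
               }
             copy16 (dstend - 32, srcend - 32); copy16 (dstend - 16, srcend - 16);

           Stores from the two ends may overlap, which removes the need for
           any separate tail handling; in the actual code all loads are issued
           before the stores, which also keeps this block safe for memmove.  */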
L(copy128):
        sub     tmp1, count, 1
        ldr     A_q, [src]
        ldr     B_q, [src, 16]
        ldr     C_q, [srcend, -32]
        ldr     D_q, [srcend, -16]
        tbz     tmp1, 6, 1f
        ldr     E_q, [src, 32]
        ldr     F_q, [src, 48]
        ldr     G_q, [srcend, -64]
        ldr     H_q, [srcend, -48]
        str     G_q, [dstend, -64]
        str     H_q, [dstend, -48]
        str     E_q, [dstin, 32]
        str     F_q, [dstin, 48]
1:
        str     A_q, [dstin]
        str     B_q, [dstin, 16]
        str     C_q, [dstend, -32]
        str     D_q, [dstend, -16]
        ret

        .p2align 4
        /* Small copies: 0..32 bytes.  */
L(copy32):
        /* 16-32 */
        cmp     count, 16
        b.lo    1f
        ldr     A_q, [src]
        ldr     B_q, [srcend, -16]
        str     A_q, [dstin]
        str     B_q, [dstend, -16]
        ret
        .p2align 4
1:
        /* 8-15 */
        tbz     count, 3, 1f
        ldr     A_x, [src]
        ldr     B_x, [srcend, -8]
        str     A_x, [dstin]
        str     B_x, [dstend, -8]
        ret
        .p2align 4
1:
        /* 4-7 */
        tbz     count, 2, 1f
        ldr     A_w, [src]
        ldr     B_w, [srcend, -4]
        str     A_w, [dstin]
        str     B_w, [dstend, -4]
        ret
        .p2align 4
1:
        /* 2-3 */
        tbz     count, 1, 1f
        ldrh    A_w, [src]
        ldrh    B_w, [srcend, -2]
        strh    A_w, [dstin]
        strh    B_w, [dstend, -2]
        ret
        .p2align 4
1:
        /* 0-1 */
        tbz     count, 0, 1f
        ldrb    A_w, [src]
        strb    A_w, [dstin]
1:
        ret

        /* Align SRC to 16 bytes and copy; that way at least one of the
           accesses is aligned throughout the copy sequence.

           The count is off by 0 to 15 bytes, but this is OK because the
           last 64 bytes are always copied from the end.  Due to this the
           loop never runs out of bounds.  */

        .p2align 4
        nop     /* Align loop64 below.  */
L(copy_long):
        ldr     A_q, [src]
        sub     count, count, 64 + 16
        and     tmp1, src, 15
        str     A_q, [dstin]
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1

L(loop64):
        ldr     A_q, [src, 16]!
        str     A_q, [dst, 16]
        ldr     A_q, [src, 16]!
        subs    count, count, 64
        str     A_q, [dst, 32]
        ldr     A_q, [src, 16]!
        str     A_q, [dst, 48]
        ldr     A_q, [src, 16]!
        str     A_q, [dst, 64]!
        b.hi    L(loop64)

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the end even if
           there is just 1 byte left.  */
        ldr     E_q, [srcend, -64]
        str     E_q, [dstend, -64]
        ldr     D_q, [srcend, -48]
        str     D_q, [dstend, -48]
        ldr     C_q, [srcend, -32]
        str     C_q, [dstend, -32]
        ldr     B_q, [srcend, -16]
        str     B_q, [dstend, -16]
        ret

END (__memcpy_falkor)
libc_hidden_builtin_def (__memcpy_falkor)


/* RATIONALE:

   The move has 4 distinct parts:
   * Small moves of 32 bytes and under.
   * Medium-sized moves of 33-128 bytes (fully unrolled).
   * Large moves where the source address is higher than the destination
     (forward copies).
   * Large moves where the destination address is higher than the source
     (copy backward, or move).

   We use only two registers, q6 and q22, for the moves and move 32 bytes at
   a time to correctly train the hardware prefetcher for better throughput.

   For small and medium cases the memcpy code above is used.  */

ENTRY_ALIGN (__memmove_falkor, 6)

        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 32
        add     srcend, src, count
        add     dstend, dstin, count
        b.ls    L(copy32)
        cmp     count, 128
        b.ls    L(copy128)
        sub     tmp1, dstin, src
        ccmp    tmp1, count, 2, hi
        b.lo    L(move_long)

        /* CASE: Copy Forwards

           Align src to 16 bytes so that we don't cross cache line boundaries
           on both loads and stores.  There are at least 128 bytes to copy,
           so copy 16 bytes unaligned and then align.  The loop copies 32
           bytes per iteration and prefetches one iteration ahead.  */

        ldr     S_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldr     Q_q, [src, 16]!
        str     S_q, [dstin]
        ldr     S_q, [src, 16]!
        sub     count, count, 32 + 32 + 16      /* Test and readjust count.  */
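        /* The 32 + 32 + 16 adjustment makes the loop below exit once no more
           than 80 bytes remain between dst and dstend: 16 of those have
           already been stored (by the unaligned store at dstin or by the
           previous iteration), 32 are in flight in Q_q/S_q, and the last 32
           are copied straight from the end after the loop, so neither the
           loads nor the stores can run past the ends of the buffers.  */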
        .p2align 4
1:
        subs    count, count, 32
        str     Q_q, [dst, 16]
        ldr     Q_q, [src, 16]!
        str     S_q, [dst, 32]!
        ldr     S_q, [src, 16]!
        b.hi    1b

        /* Copy 32 bytes from the end before writing the data prefetched in
           the last loop iteration.  */
2:
        ldr     B_q, [srcend, -32]
        ldr     C_q, [srcend, -16]
        str     Q_q, [dst, 16]
        str     S_q, [dst, 32]
        str     B_q, [dstend, -32]
        str     C_q, [dstend, -16]
        ret

        /* CASE: Copy Backwards

           Align srcend to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 128 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 32 bytes per iteration and prefetches one iteration ahead.  */

        .p2align 4
        nop
        nop
L(move_long):
        cbz     tmp1, 3f        /* Return early if src == dstin.  */
        ldr     S_q, [srcend, -16]
        and     tmp1, srcend, 15
        sub     srcend, srcend, tmp1
        ldr     Q_q, [srcend, -16]!
        str     S_q, [dstend, -16]
        sub     count, count, tmp1
        ldr     S_q, [srcend, -16]!
        sub     dstend, dstend, tmp1
        sub     count, count, 32 + 32

1:
        subs    count, count, 32
        str     Q_q, [dstend, -16]
        ldr     Q_q, [srcend, -16]!
        str     S_q, [dstend, -32]!
        ldr     S_q, [srcend, -16]!
        b.hi    1b

        /* Copy 32 bytes from the start before writing the data prefetched in
           the last loop iteration.  */

        ldr     B_q, [src, 16]
        ldr     C_q, [src]
        str     Q_q, [dstend, -16]
        str     S_q, [dstend, -32]
        str     B_q, [dstin, 16]
        str     C_q, [dstin]
3:      ret

END (__memmove_falkor)
libc_hidden_builtin_def (__memmove_falkor)
#endif