/* Optimized memcpy implementation for CELL BE PowerPC.
   Copyright (C) 2010-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 6	/* number of SRC cache lines prefetched ahead */
#define ZERO_AHEAD 4		/* number of DST cache lines zeroed ahead */

/* memcpy routine optimized for CELL-BE-PPC	v2.0
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL:
 * 1st level data cache = 32K
 * 2nd level data cache = 512K
 * 3rd level data cache = 0K
 * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
 * latency to memory is >400 clocks
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 * The below code is loop unrolled for the CELL cache line of 128 bytes
 *
 * Register usage as visible in the code below:
 *   r3   DST on entry; never written, so it is still DST at every blr
 *   r4   SRC on entry, then the running source pointer
 *   r5   byte count on entry, then the number of bytes still to copy
 *   r6   running destination pointer (starts as a copy of r3)
 *   r7   SRC - DST offset in the byte-wise paths; alignment count and
 *        data temporary elsewhere
 *   r8   bytes to the 16-byte DST boundary; data temporary elsewhere
 *   r10  cache lines copied WITHOUT further prefetch (.Lloop2)
 *   r11  cache lines copied WITH prefetch (.Lloop); reused inside that
 *        loop as the dcbz offset
 *   r12  dcbt offset relative to r4
 *   r0, r9  data temporaries
 *   cr1, cr5, cr6, cr7  hold decisions across several instructions
 *
 * From .Ldst_aligned up to .Ldo_lt16 both r4 and r6 are biased by -8 so
 * that the update forms (ldu/stdu) can be used; every displacement in
 * that region therefore starts at 0x08.
 */

.align  7

ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3		/* macro supplied by sysdep.h */

	dcbt	0,r4		/* Prefetch ONE SRC cacheline */
	cmpldi	cr1,r5,16	/* is size < 16 ? */
	mr	r6,r3		/* work on a copy; r3 stays the return value */
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry */
	clrldi  r8,r8,64-4	/* align to 16byte boundary */
	sub     r7,r4,r3	/* r7 = SRC - DST, used as index for the l*zx loads */
	cmpldi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:
	/* Copy r8 (1..15) bytes so that r6 becomes 16-byte aligned.  Each
	   bit of r8 selects one power-of-two sized move.  */
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7 */
	subf	r5,r8,r5	/* those bytes are accounted for */

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* rebuild the source pointer from the offset */

.Ldst_aligned:

	cmpdi	cr5,r5,128-1	/* fewer than 128 bytes left ? (signed compare) */

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stdu */
	addi	r4,r4,-8	/* prepare for ldu */

	clrldi  r7,r7,64-7	/* align to cacheline boundary */
	ble+	cr5,.Llessthancacheline

	cmpldi	cr6,r7,0	/* cr6.eq: DST already cacheline aligned */
	subf	r5,r7,r5	/* r5 = bytes left once DST is cacheline aligned */
	srdi	r7,r7,4		/* # of 16-byte chunks up to the cacheline bdry */
	srdi	r10,r5,7	/* number of cache lines to copy */

	cmpldi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch */
	beq	.Lnocacheprefetch

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance */
	ble	.Llessthanmaxprefetch

	/* More than PREFETCH_AHEAD lines: the first r11 lines are copied
	   with prefetching (.Lloop), the last PREFETCH_AHEAD lines without
	   (.Lloop2).  */
	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

.Llessthanmaxprefetch:
	mtctr	r10

.LprefetchSRC:
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC

.Lnocacheprefetch:
	mtctr	r7		/* 16-byte chunks needed to align DST */
	cmpldi	cr1,r5,128	/* cr1 is consumed at .Lcachelinealigned */
	clrldi	r5,r5,64-7	/* r5 = tail bytes after the whole cache lines */
	beq	cr6,.Lcachelinealigned	/* nothing to align; ctr would be 0 */

.Laligntocacheline:
	ld	r9,0x08(r4)
	ldu	r7,0x10(r4)
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy while cache lines  */

	blt-	cr1,.Llessthancacheline	/* size <128 */

.Louterloop:
	cmpdi   r11,0
	mtctr	r11
	beq-	.Lendloop	/* no lines to copy with prefetch */

	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist */

.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline */
.Lloop:				/* Copy aligned body */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead */
	ld	r9, 0x08(r4)
	dcbz	r11,r6
	ld	r7, 0x10(r4)	/* 4 register stride copy is optimal */
	ld	r8, 0x18(r4)	/* to hide 1st level cache latency.  */
	ld	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	std	r0, 0x20(r6)
	ld	r9, 0x28(r4)
	ld	r7, 0x30(r4)
	ld	r8, 0x38(r4)
	ld	r0, 0x40(r4)
	std	r9, 0x28(r6)
	std	r7, 0x30(r6)
	std	r8, 0x38(r6)
	std	r0, 0x40(r6)
	ld	r9, 0x48(r4)
	ld	r7, 0x50(r4)
	ld	r8, 0x58(r4)
	ld	r0, 0x60(r4)
	std	r9, 0x48(r6)
	std	r7, 0x50(r6)
	std	r8, 0x58(r6)
	std	r0, 0x60(r6)
	ld	r9, 0x68(r4)
	ld	r7, 0x70(r4)
	ld	r8, 0x78(r4)
	ldu	r0, 0x80(r4)	/* update form: advance SRC by one cache line */
	std	r9, 0x68(r6)
	std	r7, 0x70(r6)
	std	r8, 0x78(r6)
	stdu	r0, 0x80(r6)	/* update form: advance DST by one cache line */

	bdnz	.Lloop

.Lendloop:
	cmpdi	r10,0
	sldi	r10,r10,2	/* adjust from 128 to 32 byte stride */
	beq-	.Lendloop2
	mtctr	r10

.Lloop2:			/* Copy aligned body */
	ld	r9, 0x08(r4)
	ld	r7, 0x10(r4)
	ld	r8, 0x18(r4)
	ldu	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	stdu	r0, 0x20(r6)

	bdnz	.Lloop2
.Lendloop2:

.Llessthancacheline:		/* less than cache to do ? */
	cmpldi	cr0,r5,16
	srdi	r7,r5,4		/* divide size by 16 */
	blt-	.Ldo_lt16
	mtctr	r7

.Lcopy_remaining:
	ld	r8,0x08(r4)
	ldu	r7,0x10(r4)
	std	r8,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Lcopy_remaining

.Ldo_lt16:			/* less than 16 ? */
	/* r5 is not reduced by .Lcopy_remaining; only its low 4 bits are
	   consumed by the mtcrf in .Lshortcopy.  */
	cmpldi	cr0,r5,0	/* copy remaining bytes (0-15) */
	beqlr+			/* no rest to copy */
	addi	r4,r4,8		/* undo the ldu bias */
	addi	r6,r6,8		/* undo the stdu bias */

.Lshortcopy:			/* SIMPLE COPY to handle size =< 15 bytes */
	mtcrf	0x01,r5		/* low 4 bits of the count into cr7 */
	sub	r7,r4,r6	/* r7 = SRC - DST, used as index for the l*zx loads */
	bf-	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte */
	stb	r0,0(r6)
1:	blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)