/* Copyright (C) 2012-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#ifdef ANDROID_CHANGES
# include "machine/asm.h"
# include "machine/regdef.h"
# define USE_MEMMOVE_FOR_OVERLAP
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif _LIBC
# include <sysdep.h>
# include <regdef.h>
# include <sys/asm.h>
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif defined _COMPILING_NEWLIB
# include "machine/asm.h"
# include "machine/regdef.h"
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#else
# include <regdef.h>
# include <sys/asm.h>
#endif

#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
    (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
# ifndef DISABLE_PREFETCH
#  define USE_PREFETCH
# endif
#endif

#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
# ifndef DISABLE_DOUBLE
#  define USE_DOUBLE
# endif
#endif

/* Some asm.h files do not have the L macro definition.  */
#ifndef L
# if _MIPS_SIM == _ABIO32
#  define L(label) $L ## label
# else
#  define L(label) .L ## label
# endif
#endif

/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
#ifndef PTR_ADDIU
# ifdef USE_DOUBLE
#  define PTR_ADDIU daddiu
# else
#  define PTR_ADDIU addiu
# endif
#endif

/* Some asm.h files do not have the PTR_SRA macro definition.  */
#ifndef PTR_SRA
# ifdef USE_DOUBLE
#  define PTR_SRA dsra
# else
#  define PTR_SRA sra
# endif
#endif

/* New R6 instructions that may not be in asm.h.  */
#ifndef PTR_LSA
# if _MIPS_SIM == _ABI64
#  define PTR_LSA dlsa
# else
#  define PTR_LSA lsa
# endif
#endif

/*
 * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
 * prefetches appears to offer a slight performance advantage.
 *
 * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
 * or PREFETCH_STORE_STREAMED offers a large performance advantage
 * but PREPAREFORSTORE has some special restrictions to consider.
 *
 * Prefetch with the 'prepare for store' hint does not copy a memory
 * location into the cache, it just allocates a cache line and zeroes
 * it out.  This means that if you do not write to the entire cache
 * line before writing it out to memory some data will get zeroed out
 * when the cache line is written back to memory and data will be lost.
 *
 * Also if you are using this memcpy to copy overlapping buffers it may
 * not behave correctly when using the 'prepare for store' hint.
 * If you use the 'prepare for store' prefetch on a memory area that is
 * in the memcpy source (as well as the memcpy destination), then you will
 * get some data zeroed out before you have a chance to read it and data
 * will be lost.
 *
 * If you are going to use this memcpy routine with the 'prepare for store'
 * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
 * the problem of running memcpy on overlapping buffers.
 *
 * There are ifdef'ed sections of this memcpy to make sure that it does not
 * do prefetches on cache lines that are not going to be completely written.
 * This code is only needed and only used when PREFETCH_STORE_HINT is set to
 * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
 * 32 bytes and if the cache line is larger it will not work correctly.
 */

#ifdef USE_PREFETCH
# define PREFETCH_HINT_LOAD            0
# define PREFETCH_HINT_STORE           1
# define PREFETCH_HINT_LOAD_STREAMED   4
# define PREFETCH_HINT_STORE_STREAMED  5
# define PREFETCH_HINT_LOAD_RETAINED   6
# define PREFETCH_HINT_STORE_RETAINED  7
# define PREFETCH_HINT_WRITEBACK_INVAL 25
# define PREFETCH_HINT_PREPAREFORSTORE 30

/*
 * If we have not picked out what hints to use at this point use the
 * standard load and store prefetch hints.
 */
# ifndef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
# endif
# ifndef PREFETCH_LOAD_HINT
#  define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
# endif

/*
 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
 * get 64 bytes in that case.  The assumption is that each individual
 * prefetch brings in 32 bytes.
 */

# ifdef USE_DOUBLE
#  define PREFETCH_CHUNK 64
#  define PREFETCH_FOR_LOAD(chunk, reg) \
        pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
        pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
#  define PREFETCH_FOR_STORE(chunk, reg) \
        pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
        pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
# else
#  define PREFETCH_CHUNK 32
#  define PREFETCH_FOR_LOAD(chunk, reg) \
        pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
#  define PREFETCH_FOR_STORE(chunk, reg) \
        pref PREFETCH_STORE_HINT, (chunk)*32(reg)
# endif
/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
 * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
 * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
 * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
 * used then MAX_PREFETCH_SIZE does not matter.  */
# define MAX_PREFETCH_SIZE 128
/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
 * than 5 on a STORE prefetch and that a single prefetch can never be larger
 * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
 * we actually do two prefetches in that case, one 32 bytes after the other.  */
# ifdef USE_DOUBLE
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
# else
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
# endif
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
/* We cannot handle this because the initial prefetches may fetch bytes that
 * are before the buffer being copied.
 * We start copies with an offset of 4 to avoid this situation when using
 * PREPAREFORSTORE.  */
#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
# endif
#else /* USE_PREFETCH not defined */
# define PREFETCH_FOR_LOAD(offset, reg)
# define PREFETCH_FOR_STORE(offset, reg)
#endif

#if __mips_isa_rev > 5
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
#  undef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
# endif
# define R6_CODE
#endif

/* Allow the routine to be named something else if desired.  */
#ifndef MEMCPY_NAME
# define MEMCPY_NAME memcpy
#endif

/* We use these 32/64 bit registers as temporaries to do the copying.  */
#define REG0 t0
#define REG1 t1
#define REG2 t2
#define REG3 t3
#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABIO32) || (_MIPS_SIM == _ABIO64))
# define REG4 t4
# define REG5 t5
# define REG6 t6
# define REG7 t7
#else
# define REG4 ta0
# define REG5 ta1
# define REG6 ta2
# define REG7 ta3
#endif

/* We load/store 64 bits at a time when USE_DOUBLE is true.
 * The C_ prefix stands for CHUNK and is used to avoid macro name
 * conflicts with system header files.  */

#ifdef USE_DOUBLE
# define C_ST   sd
# define C_LD   ld
# ifdef __MIPSEB
#  define C_LDHI ldl    /* high part is left in big-endian      */
#  define C_STHI sdl    /* high part is left in big-endian      */
#  define C_LDLO ldr    /* low part is right in big-endian      */
#  define C_STLO sdr    /* low part is right in big-endian      */
# else
#  define C_LDHI ldr    /* high part is right in little-endian  */
#  define C_STHI sdr    /* high part is right in little-endian  */
#  define C_LDLO ldl    /* low part is left in little-endian    */
#  define C_STLO sdl    /* low part is left in little-endian    */
# endif
# define C_ALIGN dalign /* r6 align instruction */
#else
# define C_ST   sw
# define C_LD   lw
# ifdef __MIPSEB
#  define C_LDHI lwl    /* high part is left in big-endian      */
#  define C_STHI swl    /* high part is left in big-endian      */
#  define C_LDLO lwr    /* low part is right in big-endian      */
#  define C_STLO swr    /* low part is right in big-endian      */
# else
#  define C_LDHI lwr    /* high part is right in little-endian  */
#  define C_STHI swr    /* high part is right in little-endian  */
#  define C_LDLO lwl    /* low part is left in little-endian    */
#  define C_STLO swl    /* low part is left in little-endian    */
# endif
# define C_ALIGN align  /* r6 align instruction */
#endif

/* Bookkeeping values for 32 vs. 64 bit mode.  */
#ifdef USE_DOUBLE
# define NSIZE 8
# define NSIZEMASK 0x3f
# define NSIZEDMASK 0x7f
#else
# define NSIZE 4
# define NSIZEMASK 0x1f
# define NSIZEDMASK 0x3f
#endif
#define UNIT(unit) ((unit)*NSIZE)
#define UNITM1(unit) (((unit)*NSIZE)-1)

#ifdef ANDROID_CHANGES
LEAF(MEMCPY_NAME, 0)
#else
LEAF(MEMCPY_NAME)
#endif
        .set    nomips16
        .set    noreorder
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android
 * like Skia rely on such usage.  We call memmove to handle such cases.
 */
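/* The test below computes the magnitude of dst - src without a branch:
   subtract, arithmetic-shift to extract the sign, then xor and subtract
   to strip it.  sltu compares that distance with the byte count in a2;
   when the distance is smaller than the count the buffers overlap and we
   tail-call memmove through t9 instead of copying here.  */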
#ifdef USE_MEMMOVE_FOR_OVERLAP
        PTR_SUBU t0,a0,a1
        PTR_SRA  t2,t0,31
        xor      t1,t0,t2
        PTR_SUBU t0,t1,t2
        sltu     t2,t0,a2
        beq      t2,zero,L(memcpy)
        la       t9,memmove
        jr       t9
        nop
L(memcpy):
#endif
/*
 * If the size is less than 2*NSIZE (8 or 16), go to L(lasts).  Regardless of
 * size, copy dst pointer to v0 for the return value.
 */
        slti    t2,a2,(2 * NSIZE)
        bne     t2,zero,L(lasts)
#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
        move    v0,zero
#else
        move    v0,a0
#endif

#ifndef R6_CODE

/*
 * If src and dst have different alignments, go to L(unaligned); if they
 * have the same alignment (but are not actually aligned) do a partial
 * load/store to make them aligned.  If they are both already aligned
 * we can start copying at L(aligned).
 */
        xor     t8,a1,a0
        andi    t8,t8,(NSIZE-1)         /* t8 is a0/a1 word-displacement */
        bne     t8,zero,L(unaligned)
        PTR_SUBU a3, zero, a0

        andi    a3,a3,(NSIZE-1)         /* copy a3 bytes to align a0/a1   */
        beq     a3,zero,L(aligned)      /* if a3=0, it is already aligned */
        PTR_SUBU a2,a2,a3               /* a2 is the remaining bytes count */

        C_LDHI  t8,0(a1)
        PTR_ADDU a1,a1,a3
        C_STHI  t8,0(a0)
        PTR_ADDU a0,a0,a3

#else /* R6_CODE */

/*
 * Align the destination and hope that the source gets aligned too.  If it
 * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
 * align instruction.  The lapc/PTR_LSA/jrc sequence below indexes a table
 * of compact branches by the destination misalignment (a0 & 7); the entry
 * for misalignment N branches to L(lb(8-N)), which copies 8-N bytes one at
 * a time by falling through the lb/sb pairs that follow it.
 */
        andi    t8,a0,7
        lapc    t9,L(atable)
        PTR_LSA t9,t8,t9,2
        jrc     t9
L(atable):
        bc      L(lb0)
        bc      L(lb7)
        bc      L(lb6)
        bc      L(lb5)
        bc      L(lb4)
        bc      L(lb3)
        bc      L(lb2)
        bc      L(lb1)
L(lb7):
        lb      a3, 6(a1)
        sb      a3, 6(a0)
L(lb6):
        lb      a3, 5(a1)
        sb      a3, 5(a0)
L(lb5):
        lb      a3, 4(a1)
        sb      a3, 4(a0)
L(lb4):
        lb      a3, 3(a1)
        sb      a3, 3(a0)
L(lb3):
        lb      a3, 2(a1)
        sb      a3, 2(a0)
L(lb2):
        lb      a3, 1(a1)
        sb      a3, 1(a0)
L(lb1):
        lb      a3, 0(a1)
        sb      a3, 0(a0)

        li      t9,8
        subu    t8,t9,t8
        PTR_SUBU a2,a2,t8
        PTR_ADDU a0,a0,t8
        PTR_ADDU a1,a1,t8
L(lb0):

        andi    t8,a1,(NSIZE-1)
        lapc    t9,L(jtable)
        PTR_LSA t9,t8,t9,2
        jrc     t9
L(jtable):
        bc      L(aligned)
        bc      L(r6_unaligned1)
        bc      L(r6_unaligned2)
        bc      L(r6_unaligned3)
# ifdef USE_DOUBLE
        bc      L(r6_unaligned4)
        bc      L(r6_unaligned5)
        bc      L(r6_unaligned6)
        bc      L(r6_unaligned7)
# endif
#endif /* R6_CODE */

L(aligned):

/*
 * Now dst/src are both aligned to (word or double word) boundaries.  Set
 * a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */

        andi    t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
        beq     a2,t8,L(chkw)    /* if a2==t8, no 64-byte/128-byte chunks */
        PTR_SUBU a3,a2,t8        /* subtract from a2 the remainder */
        PTR_ADDU a3,a0,a3        /* Now a3 is the final dst after loop */

/* When in the loop we may prefetch with the 'prepare to store' hint,
 * in this case the a0+x should not be past the "t0-32" address.  This
 * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
 * for x=64 the last "safe" a0 address is "t0-96".  In the current version we
 * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
 */
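/* With the values defined earlier in this file (PREFETCH_CHUNK of 32 or 64
   and MAX_PREFETCH_SIZE of 128), PREFETCH_LIMIT evaluates to 288 bytes for
   the 32-bit version and 480 bytes when USE_DOUBLE is set, so t9 below is
   the last destination address at which a PREPAREFORSTORE prefetch is
   still issued.  */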
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        PTR_ADDU t0,a0,a2               /* t0 is the "past the end" address */
        PTR_SUBU t9,t0,PREFETCH_LIMIT   /* t9 is the "last safe pref" address */
#endif
        PREFETCH_FOR_LOAD  (0, a1)
        PREFETCH_FOR_LOAD  (1, a1)
        PREFETCH_FOR_LOAD  (2, a1)
        PREFETCH_FOR_LOAD  (3, a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
        PREFETCH_FOR_STORE (1, a0)
        PREFETCH_FOR_STORE (2, a0)
        PREFETCH_FOR_STORE (3, a0)
#endif
#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
        sltu    v1,t9,a0
        bgtz    v1,L(skip_set)
        nop
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(skip_set):
# else
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
# endif
#endif
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
# ifdef USE_DOUBLE
        PTR_ADDIU v0,v0,32
# endif
#endif
L(loop16w):
        C_LD    t0,UNIT(0)(a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        sltu    v1,t9,a0                /* If a0 > t9 don't use next prefetch */
        bgtz    v1,L(skip_pref)
#endif
        C_LD    t1,UNIT(1)(a1)
#ifdef R6_CODE
        PREFETCH_FOR_STORE (2, a0)
#else
        PREFETCH_FOR_STORE (4, a0)
        PREFETCH_FOR_STORE (5, a0)
#endif
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
# ifdef USE_DOUBLE
        PTR_ADDIU v0,v0,32
# endif
#endif
L(skip_pref):
        C_LD    REG2,UNIT(2)(a1)
        C_LD    REG3,UNIT(3)(a1)
        C_LD    REG4,UNIT(4)(a1)
        C_LD    REG5,UNIT(5)(a1)
        C_LD    REG6,UNIT(6)(a1)
        C_LD    REG7,UNIT(7)(a1)
#ifdef R6_CODE
        PREFETCH_FOR_LOAD (3, a1)
#else
        PREFETCH_FOR_LOAD (4, a1)
#endif
        C_ST    t0,UNIT(0)(a0)
        C_ST    t1,UNIT(1)(a0)
        C_ST    REG2,UNIT(2)(a0)
        C_ST    REG3,UNIT(3)(a0)
        C_ST    REG4,UNIT(4)(a0)
        C_ST    REG5,UNIT(5)(a0)
        C_ST    REG6,UNIT(6)(a0)
        C_ST    REG7,UNIT(7)(a0)

        C_LD    t0,UNIT(8)(a1)
        C_LD    t1,UNIT(9)(a1)
        C_LD    REG2,UNIT(10)(a1)
        C_LD    REG3,UNIT(11)(a1)
        C_LD    REG4,UNIT(12)(a1)
        C_LD    REG5,UNIT(13)(a1)
        C_LD    REG6,UNIT(14)(a1)
        C_LD    REG7,UNIT(15)(a1)
#ifndef R6_CODE
        PREFETCH_FOR_LOAD (5, a1)
#endif
        C_ST    t0,UNIT(8)(a0)
        C_ST    t1,UNIT(9)(a0)
        C_ST    REG2,UNIT(10)(a0)
        C_ST    REG3,UNIT(11)(a0)
        C_ST    REG4,UNIT(12)(a0)
        C_ST    REG5,UNIT(13)(a0)
        C_ST    REG6,UNIT(14)(a0)
        C_ST    REG7,UNIT(15)(a0)
        PTR_ADDIU a0,a0,UNIT(16)        /* adding 64/128 to dest */
        bne     a0,a3,L(loop16w)
        PTR_ADDIU a1,a1,UNIT(16)        /* adding 64/128 to src */
        move    a2,t8

/* Here we have src and dest word-aligned but less than 64-bytes or
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy if there
 * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
 * the copy.
 */

L(chkw):
        PREFETCH_FOR_LOAD (0, a1)
        andi    t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk.  */
                                /* t8 is the remainder count past 32 bytes */
        beq     a2,t8,L(chk1w)  /* When a2==t8, no 32-byte chunk */
        nop
        C_LD    t0,UNIT(0)(a1)
        C_LD    t1,UNIT(1)(a1)
        C_LD    REG2,UNIT(2)(a1)
        C_LD    REG3,UNIT(3)(a1)
        C_LD    REG4,UNIT(4)(a1)
        C_LD    REG5,UNIT(5)(a1)
        C_LD    REG6,UNIT(6)(a1)
        C_LD    REG7,UNIT(7)(a1)
        PTR_ADDIU a1,a1,UNIT(8)
        C_ST    t0,UNIT(0)(a0)
        C_ST    t1,UNIT(1)(a0)
        C_ST    REG2,UNIT(2)(a0)
        C_ST    REG3,UNIT(3)(a0)
        C_ST    REG4,UNIT(4)(a0)
        C_ST    REG5,UNIT(5)(a0)
        C_ST    REG6,UNIT(6)(a0)
        C_ST    REG7,UNIT(7)(a0)
        PTR_ADDIU a0,a0,UNIT(8)

/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.  Set a2 to count how many
 * bytes we have to copy after all the word (or double word) chunks are
 * copied and a3 to the dst pointer after all the (d)word chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */
L(chk1w):
        andi    a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
        beq     a2,t8,L(lastw)
        PTR_SUBU a3,t8,a2       /* a3 is count of bytes in one (d)word chunks */
        PTR_ADDU a3,a0,a3       /* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(wordCopy_loop):
        C_LD    REG3,UNIT(0)(a1)
        PTR_ADDIU a0,a0,UNIT(1)
        PTR_ADDIU a1,a1,UNIT(1)
        bne     a0,a3,L(wordCopy_loop)
        C_ST    REG3,UNIT(-1)(a0)

/* If we have been copying double words, see if we can copy a single word
   before doing byte copies.  We can have, at most, one word to copy.  */

L(lastw):
#ifdef USE_DOUBLE
        andi    t8,a2,3         /* t8 is the remainder past 4-byte chunks.  */
        beq     t8,a2,L(lastb)
        move    a2,t8
        lw      REG3,0(a1)
        sw      REG3,0(a0)
        PTR_ADDIU a0,a0,4
        PTR_ADDIU a1,a1,4
#endif

/* Copy the last 8 (or 16) bytes */
L(lastb):
        blez    a2,L(leave)
        PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
L(lastbloop):
        lb      v1,0(a1)
        PTR_ADDIU a0,a0,1
        PTR_ADDIU a1,a1,1
        bne     a0,a3,L(lastbloop)
        sb      v1,-1(a0)
L(leave):
        j       ra
        nop

/* We jump here with a memcpy of less than 8 or 16 bytes, depending on
   whether or not USE_DOUBLE is defined.  Instead of just doing byte
   copies, check the alignment and size and use lw/sw if possible.
   Otherwise, do byte copies.  */

L(lasts):
        andi    t8,a2,3
        beq     t8,a2,L(lastb)

        andi    t9,a0,3
        bne     t9,zero,L(lastb)
        andi    t9,a1,3
        bne     t9,zero,L(lastb)

        PTR_SUBU a3,a2,t8
        PTR_ADDU a3,a0,a3

L(wcopy_loop):
        lw      REG3,0(a1)
        PTR_ADDIU a0,a0,4
        PTR_ADDIU a1,a1,4
        bne     a0,a3,L(wcopy_loop)
        sw      REG3,-4(a0)

        b       L(lastb)
        move    a2,t8

#ifndef R6_CODE
/*
 * UNALIGNED case, got here with a3 = "negu a0"
 * This code is nearly identical to the aligned code above
 * but only the destination (not the source) gets aligned
 * so we need to do partial loads of the source followed
 * by normal stores to the destination (once we have aligned
 * the destination).
 */
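/* In the code below each unaligned (d)word of the source is assembled
   with a C_LDHI/C_LDLO pair (lwl/lwr, or ldl/ldr when USE_DOUBLE is set):
   the two partial loads together fill one register with the NSIZE bytes
   starting at the unaligned source address, which is then written to the
   aligned destination with an ordinary store.  */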

L(unaligned):
        andi    a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
        beqz    a3,L(ua_chk16w) /* if a3=0, it is already aligned */
        PTR_SUBU a2,a2,a3       /* a2 is the remaining bytes count */

        C_LDHI  v1,UNIT(0)(a1)
        C_LDLO  v1,UNITM1(1)(a1)
        PTR_ADDU a1,a1,a3
        C_STHI  v1,UNIT(0)(a0)
        PTR_ADDU a0,a0,a3

/*
 * Now the destination (but not the source) is aligned.  Set a2 to count
 * how many bytes we have to copy after all the 64/128 byte chunks are
 * copied and a3 to the dst pointer after all the 64/128 byte chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */

L(ua_chk16w):
        andi    t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
        beq     a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
        PTR_SUBU a3,a2,t8        /* subtract from a2 the remainder */
        PTR_ADDU a3,a0,a3        /* Now a3 is the final dst after loop */

# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        PTR_ADDU t0,a0,a2             /* t0 is the "past the end" address */
        PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
# endif
        PREFETCH_FOR_LOAD  (0, a1)
        PREFETCH_FOR_LOAD  (1, a1)
        PREFETCH_FOR_LOAD  (2, a1)
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
        PREFETCH_FOR_STORE (1, a0)
        PREFETCH_FOR_STORE (2, a0)
        PREFETCH_FOR_STORE (3, a0)
# endif
# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
#  if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        sltu    v1,t9,a0
        bgtz    v1,L(ua_skip_set)
        nop
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(ua_skip_set):
#  else
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
#  endif
# endif
L(ua_loop16w):
        PREFETCH_FOR_LOAD  (3, a1)
        C_LDHI  t0,UNIT(0)(a1)
        C_LDHI  t1,UNIT(1)(a1)
        C_LDHI  REG2,UNIT(2)(a1)
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        sltu    v1,t9,a0
        bgtz    v1,L(ua_skip_pref)
# endif
        C_LDHI  REG3,UNIT(3)(a1)
        PREFETCH_FOR_STORE (4, a0)
        PREFETCH_FOR_STORE (5, a0)
L(ua_skip_pref):
        C_LDHI  REG4,UNIT(4)(a1)
        C_LDHI  REG5,UNIT(5)(a1)
        C_LDHI  REG6,UNIT(6)(a1)
        C_LDHI  REG7,UNIT(7)(a1)
        C_LDLO  t0,UNITM1(1)(a1)
        C_LDLO  t1,UNITM1(2)(a1)
        C_LDLO  REG2,UNITM1(3)(a1)
        C_LDLO  REG3,UNITM1(4)(a1)
        C_LDLO  REG4,UNITM1(5)(a1)
        C_LDLO  REG5,UNITM1(6)(a1)
        C_LDLO  REG6,UNITM1(7)(a1)
        C_LDLO  REG7,UNITM1(8)(a1)
        PREFETCH_FOR_LOAD (4, a1)
        C_ST    t0,UNIT(0)(a0)
        C_ST    t1,UNIT(1)(a0)
        C_ST    REG2,UNIT(2)(a0)
        C_ST    REG3,UNIT(3)(a0)
        C_ST    REG4,UNIT(4)(a0)
        C_ST    REG5,UNIT(5)(a0)
        C_ST    REG6,UNIT(6)(a0)
        C_ST    REG7,UNIT(7)(a0)
        C_LDHI  t0,UNIT(8)(a1)
        C_LDHI  t1,UNIT(9)(a1)
        C_LDHI  REG2,UNIT(10)(a1)
        C_LDHI  REG3,UNIT(11)(a1)
        C_LDHI  REG4,UNIT(12)(a1)
        C_LDHI  REG5,UNIT(13)(a1)
        C_LDHI  REG6,UNIT(14)(a1)
        C_LDHI  REG7,UNIT(15)(a1)
        C_LDLO  t0,UNITM1(9)(a1)
        C_LDLO  t1,UNITM1(10)(a1)
        C_LDLO  REG2,UNITM1(11)(a1)
        C_LDLO  REG3,UNITM1(12)(a1)
        C_LDLO  REG4,UNITM1(13)(a1)
        C_LDLO  REG5,UNITM1(14)(a1)
        C_LDLO  REG6,UNITM1(15)(a1)
        C_LDLO  REG7,UNITM1(16)(a1)
        PREFETCH_FOR_LOAD (5, a1)
        C_ST    t0,UNIT(8)(a0)
        C_ST    t1,UNIT(9)(a0)
        C_ST    REG2,UNIT(10)(a0)
        C_ST    REG3,UNIT(11)(a0)
        C_ST    REG4,UNIT(12)(a0)
        C_ST    REG5,UNIT(13)(a0)
        C_ST    REG6,UNIT(14)(a0)
        C_ST    REG7,UNIT(15)(a0)
        PTR_ADDIU a0,a0,UNIT(16)        /* adding 64/128 to dest */
        bne     a0,a3,L(ua_loop16w)
        PTR_ADDIU a1,a1,UNIT(16)        /* adding 64/128 to src */
        move    a2,t8

/* Here we have src and dest word-aligned but less than 64-bytes or
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy if there
 * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
 * the copy.  */

L(ua_chkw):
        PREFETCH_FOR_LOAD (0, a1)
        andi    t8,a2,NSIZEMASK   /* Is there a 32-byte/64-byte chunk.  */
                                  /* t8 is the remainder count past 32 bytes */
        beq     a2,t8,L(ua_chk1w) /* When a2==t8, no 32-byte chunk */
        nop
        C_LDHI  t0,UNIT(0)(a1)
        C_LDHI  t1,UNIT(1)(a1)
        C_LDHI  REG2,UNIT(2)(a1)
        C_LDHI  REG3,UNIT(3)(a1)
        C_LDHI  REG4,UNIT(4)(a1)
        C_LDHI  REG5,UNIT(5)(a1)
        C_LDHI  REG6,UNIT(6)(a1)
        C_LDHI  REG7,UNIT(7)(a1)
        C_LDLO  t0,UNITM1(1)(a1)
        C_LDLO  t1,UNITM1(2)(a1)
        C_LDLO  REG2,UNITM1(3)(a1)
        C_LDLO  REG3,UNITM1(4)(a1)
        C_LDLO  REG4,UNITM1(5)(a1)
        C_LDLO  REG5,UNITM1(6)(a1)
        C_LDLO  REG6,UNITM1(7)(a1)
        C_LDLO  REG7,UNITM1(8)(a1)
        PTR_ADDIU a1,a1,UNIT(8)
        C_ST    t0,UNIT(0)(a0)
        C_ST    t1,UNIT(1)(a0)
        C_ST    REG2,UNIT(2)(a0)
        C_ST    REG3,UNIT(3)(a0)
        C_ST    REG4,UNIT(4)(a0)
        C_ST    REG5,UNIT(5)(a0)
        C_ST    REG6,UNIT(6)(a0)
        C_ST    REG7,UNIT(7)(a0)
        PTR_ADDIU a0,a0,UNIT(8)
/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.
 */
L(ua_chk1w):
        andi    a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
        beq     a2,t8,L(ua_smallCopy)
        PTR_SUBU a3,t8,a2       /* a3 is count of bytes in one (d)word chunks */
        PTR_ADDU a3,a0,a3       /* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(ua_wordCopy_loop):
        C_LDHI  v1,UNIT(0)(a1)
        C_LDLO  v1,UNITM1(1)(a1)
        PTR_ADDIU a0,a0,UNIT(1)
        PTR_ADDIU a1,a1,UNIT(1)
        bne     a0,a3,L(ua_wordCopy_loop)
        C_ST    v1,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(ua_smallCopy):
        beqz    a2,L(leave)
        PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
L(ua_smallCopy_loop):
        lb      v1,0(a1)
        PTR_ADDIU a0,a0,1
        PTR_ADDIU a1,a1,1
        bne     a0,a3,L(ua_smallCopy_loop)
        sb      v1,-1(a0)

        j       ra
        nop

#else /* R6_CODE */

# ifdef __MIPSEB
#  define SWAP_REGS(X,Y) X, Y
#  define ALIGN_OFFSET(N) (N)
# else
#  define SWAP_REGS(X,Y) Y, X
#  define ALIGN_OFFSET(N) (NSIZE-N)
# endif
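/* R6_UNALIGNED_WORD_COPY copies (d)words when the source is misaligned by
   BYTEOFFSET bytes: each iteration loads two adjacent aligned (d)words and
   uses the R6 align/dalign instruction to extract the NSIZE bytes that span
   them, then stores the result to the aligned destination.  Any remaining
   bytes are finished at L(lastb).  */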
# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
        andi    REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy byte by byte. */ \
        beq     REG7, a2, L(lastb); /* Check for bytes to copy by word */ \
        PTR_SUBU a3, a2, REG7;  /* a3 is number of bytes to be copied in */ \
                                /* (d)word chunks.  */ \
        move    a2, REG7;       /* a2 is # of bytes to copy byte by byte */ \
                                /* after word loop is finished.  */ \
        PTR_ADDU REG6, a0, a3;  /* REG6 is the dst address after loop.  */ \
        PTR_SUBU REG2, a1, t8;  /* REG2 is the aligned src address.  */ \
        PTR_ADDU a1, a1, a3;    /* a1 is addr of source after word loop.  */ \
        C_LD    t0, UNIT(0)(REG2);  /* Load first part of source.  */ \
L(r6_ua_wordcopy##BYTEOFFSET): \
        C_LD    t1, UNIT(1)(REG2);  /* Load second part of source.  */ \
        C_ALIGN REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET); \
        PTR_ADDIU a0, a0, UNIT(1);  /* Increment destination pointer.  */ \
        PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.  */ \
        move    t0, t1;         /* Move second part of source to first.  */ \
        bne     a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET); \
        C_ST    REG3, UNIT(-1)(a0); \
        j       L(lastb); \
        nop

        /* We are generating R6 code, the destination is 4 byte aligned
           and the source is not 4 byte aligned.  t8 is 1, 2, or 3,
           depending on the alignment of the source.  */

L(r6_unaligned1):
        R6_UNALIGNED_WORD_COPY(1)
L(r6_unaligned2):
        R6_UNALIGNED_WORD_COPY(2)
L(r6_unaligned3):
        R6_UNALIGNED_WORD_COPY(3)
# ifdef USE_DOUBLE
L(r6_unaligned4):
        R6_UNALIGNED_WORD_COPY(4)
L(r6_unaligned5):
        R6_UNALIGNED_WORD_COPY(5)
L(r6_unaligned6):
        R6_UNALIGNED_WORD_COPY(6)
L(r6_unaligned7):
        R6_UNALIGNED_WORD_COPY(7)
# endif
#endif /* R6_CODE */

        .set    at
        .set    reorder
END(MEMCPY_NAME)
#ifndef ANDROID_CHANGES
# ifdef _LIBC
libc_hidden_builtin_def (MEMCPY_NAME)
# endif
#endif