/* Optimized memcpy implementation for PowerPC64.
   Copyright (C) 2003-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.

   Memcpy handles short copies (< 32 bytes) using binary move blocks
   (no loops) of lwz/stw.  The tail (remaining 1-3 bytes) is handled
   with the appropriate combination of byte and halfword load/stores.
   There is minimal effort to optimize the alignment of short moves.
   The 64-bit implementations of POWER3 and POWER4 do a reasonable job
   of handling unaligned load/stores that do not cross 32-byte boundaries.

   Longer moves (>= 32 bytes) justify the effort to get at least the
   destination doubleword (8-byte) aligned.  Further optimization is
   possible when both source and destination are doubleword aligned.
   Each case has an optimized unrolled loop.

   For POWER6 unaligned loads will take a 20+ cycle hiccup for any
   L1 cache miss that crosses a 32- or 128-byte boundary.  Store
   is more forgiving and does not take a hiccup until page or
   segment boundaries.  So we require doubleword alignment for
   the source but may take a risk and only require word alignment
   for the destination.  */

#ifndef MEMCPY
# define MEMCPY memcpy
#endif
	.machine "power6"
ENTRY_TOCLESS (MEMCPY, 7)
	CALL_MCOUNT 3

	cmpldi cr1,5,31
	neg 0,3
	std 3,-16(1)
	std 31,-8(1)
	andi. 11,3,7	/* check alignment of dst.  */
	clrldi 0,0,61	/* Number of bytes until the 1st doubleword of dst.  */
	clrldi 10,4,61	/* check alignment of src.  */
	cmpldi cr6,5,8
	ble- cr1,.L2	/* If move < 32 bytes use short move code.  */
	mtcrf 0x01,0
	cmpld cr6,10,11
	srdi 9,5,3	/* Number of full double words remaining.  */
	beq .L0

	subf 5,0,5
	/* Move 0-7 bytes as needed to get the destination doubleword aligned.
	   Duplicate some code to maximize fall-through and minimize agen delays.  */
1:	bf 31,2f
	lbz 6,0(4)
	stb 6,0(3)
	bf 30,5f
	lhz 6,1(4)
	sth 6,1(3)
	bf 29,0f
	lwz 6,3(4)
	stw 6,3(3)
	b 0f
5:
	bf 29,0f
	lwz 6,1(4)
	stw 6,1(3)
	b 0f

2:	bf 30,4f
	lhz 6,0(4)
	sth 6,0(3)
	bf 29,0f
	lwz 6,2(4)
	stw 6,2(3)
	b 0f

4:	bf 29,0f
	lwz 6,0(4)
	stw 6,0(3)
0:
/* Add the number of bytes until the 1st doubleword of dst to src and dst.  */
	add 4,4,0
	add 3,3,0

	clrldi 10,4,61	/* check alignment of src again.  */
	srdi 9,5,3	/* Number of full double words remaining.  */

	/* Copy doublewords from source to destination, assuming the
	   destination is aligned on a doubleword boundary.

	   At this point we know there are at least 25 bytes left (32-7) to copy.
	   The next step is to determine if the source is also doubleword aligned.
	   If not, branch to the unaligned move code at .L6, which uses
	   a load, shift, store strategy.

	   Otherwise source and destination are doubleword aligned, and we can
	   use the optimized doubleword copy loop.  */
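
/* For orientation, the dispatch logic above corresponds roughly to the
   following hedged C sketch.  It is illustrative only; the helper names
   (short_copy, copy_dw_aligned, copy_dw_unaligned) are hypothetical and
   do not exist in this file:

     void *memcpy_sketch (char *dst, const char *src, size_t len)
     {
       char *ret = dst;                          // memcpy returns dst
       if (len < 32)
         return short_copy (dst, src, len);      // .L2, no loops
       size_t pad = (-(uintptr_t) dst) & 7;      // bytes to next DW of dst
       for (size_t i = 0; i < pad; i++)          // 0-7 byte prologue above
         dst[i] = src[i];
       dst += pad; src += pad; len -= pad;
       if (((uintptr_t) src & 7) == 0)
         copy_dw_aligned (dst, src, len);        // falls through to .L0
       else
         copy_dw_unaligned (dst, src, len);      // branches to .L6
       return ret;
     }  */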
	.align 4
.L0:
	clrldi 11,5,61
	andi. 0,5,0x78
	srdi 12,5,7	/* Number of 128-byte blocks to move.  */
	cmpldi cr1,11,0	/* If the tail is 0 bytes  */
	bne- cr6,.L6	/* If source is not DW aligned.  */

	/* Move doublewords where destination and source are DW aligned.
	   Use an unrolled loop to copy 16 doublewords (128 bytes) per iteration.
	   If the copy is not an exact multiple of 128 bytes, 1-15
	   doublewords are copied as needed to set up the main loop.  After
	   the main loop exits there may be a tail of 1-7 bytes.  These bytes
	   are copied a word/halfword/byte at a time as needed to preserve
	   alignment.

	   For POWER6 the L1 is store-through and the L2 is store-in.  The
	   L2 is clocked at half CPU clock so we can store 16 bytes every
	   other cycle.  POWER6 also has a load/store bypass so we can do
	   load, load, store, store every 2 cycles.

	   The following code is sensitive to cache line alignment.  Do not
	   make any changes without first making sure they don't result in
	   splitting ld/std pairs across a cache line.  */

	mtcrf 0x02,5
	mtcrf 0x01,5
	cmpldi cr5,12,1
	beq L(das_loop)

	bf 25,4f
	.align 3
	ld 6,0(4)
	ld 7,8(4)
	mr 11,4
	mr 10,3
	std 6,0(3)
	std 7,8(3)
	ld 6,16(4)
	ld 7,24(4)
	std 6,16(3)
	std 7,24(3)
	ld 6,0+32(4)
	ld 7,8+32(4)
	addi 4,4,64
	addi 3,3,64
	std 6,0+32(10)
	std 7,8+32(10)
	ld 6,16+32(11)
	ld 7,24+32(11)
	std 6,16+32(10)
	std 7,24+32(10)
4:
	mr 10,3
	bf 26,2f
	ld 6,0(4)
	ld 7,8(4)
	mr 11,4
	nop
	std 6,0(3)
	std 7,8(3)
	ld 6,16(4)
	ld 7,24(4)
	addi 4,4,32
	std 6,16(3)
	std 7,24(3)
	addi 3,3,32
6:
	nop
	bf 27,5f
	ld 6,0+32(11)
	ld 7,8+32(11)
	addi 4,4,16
	addi 3,3,16
	std 6,0+32(10)
	std 7,8+32(10)
	bf 28,L(das_loop_s)
	ld 0,16+32(11)
	addi 4,4,8
	addi 3,3,8
	std 0,16+32(10)
	blt cr5,L(das_tail)
	b L(das_loop)
	.align 3
5:
	nop
	bf 28,L(das_loop_s)
	ld 6,32(11)
	addi 4,4,8
	addi 3,3,8
	std 6,32(10)
	blt cr5,L(das_tail)
	b L(das_loop)
	.align 3
2:
	mr 11,4
	bf 27,1f
	ld 6,0(4)
	ld 7,8(4)
	addi 4,4,16
	addi 3,3,16
	std 6,0(10)
	std 7,8(10)
	bf 28,L(das_loop_s)
	ld 0,16(11)
	addi 4,11,24
	addi 3,10,24
	std 0,16(10)
	blt cr5,L(das_tail)
	b L(das_loop)
	.align 3
1:
	nop
	bf 28,L(das_loop_s)
	ld 6,0(4)
	addi 4,4,8
	addi 3,3,8
	std 6,0(10)
L(das_loop_s):
	nop
	blt cr5,L(das_tail)
	.align 4
L(das_loop):
	ld 6,0(4)
	ld 7,8(4)
	mr 10,3
	mr 11,4
	std 6,0(3)
	std 7,8(3)
	addi 12,12,-1
	nop
	ld 8,16(4)
	ld 0,24(4)
	std 8,16(3)
	std 0,24(3)

	ld 6,0+32(4)
	ld 7,8+32(4)
	std 6,0+32(3)
	std 7,8+32(3)
	ld 8,16+32(4)
	ld 0,24+32(4)
	std 8,16+32(3)
	std 0,24+32(3)

	ld 6,0+64(11)
	ld 7,8+64(11)
	std 6,0+64(10)
	std 7,8+64(10)
	ld 8,16+64(11)
	ld 0,24+64(11)
	std 8,16+64(10)
	std 0,24+64(10)

	ld 6,0+96(11)
	ld 7,8+96(11)
	addi 4,4,128
	addi 3,3,128
	std 6,0+96(10)
	std 7,8+96(10)
	ld 8,16+96(11)
	ld 0,24+96(11)
	std 8,16+96(10)
	std 0,24+96(10)
	ble cr5,L(das_loop_e)

	mtctr 12
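
/* A hedged C sketch of one iteration of the 128-byte loop that follows
   (uint64_t accesses stand in for the ld/std doublewords; both pointers
   are doubleword aligned here).  The load/load then store/store pairing
   mirrors the POWER6 load/store-bypass scheduling described above:

     while (blocks--)                    // blocks is the ctr value (r12)
       {
         for (int i = 0; i < 16; i += 2)
           {
             uint64_t a = src64[i];      // ld, ld
             uint64_t b = src64[i + 1];
             dst64[i] = a;               // std, std
             dst64[i + 1] = b;
           }
         src64 += 16;                    // advance one 128-byte block
         dst64 += 16;
       }  */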
	.align 4
L(das_loop2):
	ld 6,0(4)
	ld 7,8(4)
	mr 10,3
	mr 11,4
	std 6,0(3)
	std 7,8(3)
	ld 8,16(4)
	ld 0,24(4)
	std 8,16(3)
	std 0,24(3)

	ld 6,0+32(4)
	ld 7,8+32(4)
	std 6,0+32(3)
	std 7,8+32(3)
	ld 8,16+32(4)
	ld 0,24+32(4)
	std 8,16+32(3)
	std 0,24+32(3)

	ld 6,0+64(11)
	ld 7,8+64(11)
	std 6,0+64(10)
	std 7,8+64(10)
	ld 8,16+64(11)
	ld 0,24+64(11)
	std 8,16+64(10)
	std 0,24+64(10)

	ld 6,0+96(11)
	ld 7,8+96(11)
	addi 4,4,128
	addi 3,3,128
	std 6,0+96(10)
	std 7,8+96(10)
	ld 8,16+96(11)
	ld 0,24+96(11)
	std 8,16+96(10)
	std 0,24+96(10)
	bdnz L(das_loop2)
L(das_loop_e):
/* Check for a 1-7 byte tail; return if none.  */
	bne cr1,L(das_tail2)
/* Return original dst pointer.  */
	ld 3,-16(1)
	blr
	.align 4
L(das_tail):
	beq cr1,0f

L(das_tail2):
/* At this point we have a tail of 0-7 bytes and we know that the
   destination is doubleword aligned.  */
4:	bf 29,2f
	lwz 6,0(4)
	stw 6,0(3)
	bf 30,5f
	lhz 6,4(4)
	sth 6,4(3)
	bf 31,0f
	lbz 6,6(4)
	stb 6,6(3)
	b 0f
5:	bf 31,0f
	lbz 6,4(4)
	stb 6,4(3)
	b 0f

2:	bf 30,1f
	lhz 6,0(4)
	sth 6,0(3)
	bf 31,0f
	lbz 6,2(4)
	stb 6,2(3)
	b 0f

1:	bf 31,0f
	lbz 6,0(4)
	stb 6,0(3)
0:
	/* Return original dst pointer.  */
	ld 3,-16(1)
	blr

/* Copy up to 31 bytes.  This is divided into two cases: 0-8 bytes and
   9-31 bytes.  Each case is handled without loops, using binary
   (1,2,4,8) tests.

   In the short (0-8 byte) case no attempt is made to force alignment
   of either source or destination.  The hardware will handle the
   unaligned load/stores with small delays for crossing 32-, 128-,
   and 4096-byte boundaries.  Since these short moves are unlikely to be
   unaligned or cross these boundaries, the overhead to force
   alignment is not justified.

   The longer (9-31 byte) move is more likely to cross 32- or 128-byte
   boundaries.  Since only loads are sensitive to the 32-/128-byte
   boundaries, it is more important to align the source than the
   destination.  If the source is not already word aligned, we first
   move 1-3 bytes as needed.  Since we are only word aligned we don't
   use doubleword load/stores to ensure that all loads are aligned.
   While the destination and stores may still be unaligned, this
   is only an issue for page (4096-byte boundary) crossing, which
   should be rare for these short moves.  The hardware handles this
   case automatically with a small (~20 cycle) delay.  */
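
/* The word/halfword/byte tails in this file all use the same binary
   decomposition: each set bit of the remaining length selects exactly one
   move, so no loop is needed.  A hedged C sketch (the unaligned uint32_t
   and uint16_t accesses are an illustration only; the assembly uses
   lwz/lhz/lbz with explicit offsets and condition-register bit tests):

     if (len & 4) { *(uint32_t *) dst = *(const uint32_t *) src;
                    dst += 4; src += 4; }        // bit 29 (bf 29,...)
     if (len & 2) { *(uint16_t *) dst = *(const uint16_t *) src;
                    dst += 2; src += 2; }        // bit 30 (bf 30,...)
     if (len & 1)                                // bit 31 (bf 31,...)
       *dst = *src;  */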
	.align 4
.L2:
	mtcrf 0x01,5
	neg 8,4
	clrrdi 11,4,2
	andi. 0,8,3
	ble cr6,.LE8	/* Handle moves of 0-8 bytes.  */
/* At least 9 bytes left.  Get the source word aligned.  */
	cmpldi cr1,5,16
	mr 10,5
	mr 12,4
	cmpldi cr6,0,2
	beq L(dus_tail)	/* If the source is already word aligned skip this.  */
/* Copy 1-3 bytes to get source address word aligned.  */
	lwz 6,0(11)
	subf 10,0,5
	add 12,4,0
	blt cr6,5f
	srdi 7,6,16
	bgt cr6,3f
#ifdef __LITTLE_ENDIAN__
	sth 7,0(3)
#else
	sth 6,0(3)
#endif
	b 7f
	.align 4
3:
#ifdef __LITTLE_ENDIAN__
	rotlwi 6,6,24
	stb 6,0(3)
	sth 7,1(3)
#else
	stb 7,0(3)
	sth 6,1(3)
#endif
	b 7f
	.align 4
5:
#ifdef __LITTLE_ENDIAN__
	rotlwi 6,6,8
#endif
	stb 6,0(3)
7:
	cmpldi cr1,10,16
	add 3,3,0
	mtcrf 0x01,10
	.align 4
L(dus_tail):
/* At least 6 bytes left and the source is word aligned.  This allows
   some speculative loads up front.  */
/* We need to special case the fall-through because the biggest delays
   are due to address computation not being ready in time for the
   AGEN.  */
	lwz 6,0(12)
	lwz 7,4(12)
	blt cr1,L(dus_tail8)
	cmpldi cr0,10,24
L(dus_tail16): /* Move 16 bytes.  */
	stw 6,0(3)
	stw 7,4(3)
	lwz 6,8(12)
	lwz 7,12(12)
	stw 6,8(3)
	stw 7,12(3)
/* Move 8 bytes more.  */
	bf 28,L(dus_tail16p8)
	cmpldi cr1,10,28
	lwz 6,16(12)
	lwz 7,20(12)
	stw 6,16(3)
	stw 7,20(3)
/* Move 4 bytes more.  */
	bf 29,L(dus_tail16p4)
	lwz 6,24(12)
	stw 6,24(3)
	addi 12,12,28
	addi 3,3,28
	bgt cr1,L(dus_tail2)
	/* exactly 28 bytes.  Return original dst pointer and exit.  */
	ld 3,-16(1)
	blr
	.align 4
L(dus_tail16p8):  /* less than 8 bytes left.  */
	beq cr1,L(dus_tailX)	/* exactly 16 bytes, early exit.  */
	cmpldi cr1,10,20
	bf 29,L(dus_tail16p2)
/* Move 4 bytes more.  */
	lwz 6,16(12)
	stw 6,16(3)
	addi 12,12,20
	addi 3,3,20
	bgt cr1,L(dus_tail2)
	/* exactly 20 bytes.  Return original dst pointer and exit.  */
	ld 3,-16(1)
	blr
	.align 4
L(dus_tail16p4):  /* less than 4 bytes left.  */
	addi 12,12,24
	addi 3,3,24
	bgt cr0,L(dus_tail2)
	/* exactly 24 bytes.  Return original dst pointer and exit.  */
	ld 3,-16(1)
	blr
	.align 4
L(dus_tail16p2):  /* 16 bytes moved, less than 4 bytes left.  */
	addi 12,12,16
	addi 3,3,16
	b L(dus_tail2)

	.align 4
L(dus_tail8):  /* Move 8 bytes.  */
/* r6, r7 already loaded speculatively.  */
	cmpldi cr1,10,8
	cmpldi cr0,10,12
	bf 28,L(dus_tail4)
	.align 2
	stw 6,0(3)
	stw 7,4(3)
/* Move 4 bytes more.  */
	bf 29,L(dus_tail8p4)
	lwz 6,8(12)
	stw 6,8(3)
	addi 12,12,12
	addi 3,3,12
	bgt cr0,L(dus_tail2)
	/* exactly 12 bytes.  Return original dst pointer and exit.  */
	ld 3,-16(1)
	blr
	.align 4
L(dus_tail8p4):  /* less than 4 bytes left.  */
	addi 12,12,8
	addi 3,3,8
	bgt cr1,L(dus_tail2)
	/* exactly 8 bytes.  Return original dst pointer and exit.  */
	ld 3,-16(1)
	blr

	.align 4
L(dus_tail4):  /* Move 4 bytes.  */
/* r6 already loaded speculatively.  If we are here we know there are
   more than 4 bytes left.  So there is no need to test.  */
	addi 12,12,4
	stw 6,0(3)
	addi 3,3,4
L(dus_tail2):  /* Move 2-3 bytes.  */
	bf 30,L(dus_tail1)
	lhz 6,0(12)
	sth 6,0(3)
	bf 31,L(dus_tailX)
	lbz 7,2(12)
	stb 7,2(3)
	ld 3,-16(1)
	blr
L(dus_tail1):  /* Move 1 byte.  */
	bf 31,L(dus_tailX)
	lbz 6,0(12)
	stb 6,0(3)
L(dus_tailX):
	/* Return original dst pointer.  */
	ld 3,-16(1)
	blr
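
/* A hedged C sketch of the 9-31 byte path above: realign the source to a
   word boundary by extracting the 1-3 lead bytes from one aligned word
   load, then move aligned words, finishing with the binary halfword/byte
   tail sketched earlier (illustrative only; the real code special-cases
   lengths to avoid AGEN stalls and never re-reads the lead word):

     size_t lead = (-(uintptr_t) src) & 3;      // 0-3 lead bytes
     for (size_t i = 0; i < lead; i++)          // all from a single lwz
       dst[i] = src[i];
     dst += lead; src += lead; len -= lead;
     while (len >= 4)                           // aligned word moves
       {
         *(uint32_t *) dst = *(const uint32_t *) src;
         dst += 4; src += 4; len -= 4;
       }
     // 0-3 bytes remain; see the halfword/byte tail sketch above.  */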

/* Special case to copy 0-8 bytes.  */
	.align 4
.LE8:
	mr 12,4
	bne cr6,L(dus_4)
/* Exactly 8 bytes.  We may cross a 32-/128-byte boundary and take a ~20
   cycle delay.  This case should be rare and any attempt to avoid this
   would take most of 20 cycles anyway.  */
	ld 6,0(4)
	std 6,0(3)
	/* Return original dst pointer.  */
	ld 3,-16(1)
	blr
	.align 4
L(dus_4):
	bf 29,L(dus_tail2)
	lwz 6,0(4)
	stw 6,0(3)
	bf 30,L(dus_5)
	lhz 7,4(4)
	sth 7,4(3)
	bf 31,L(dus_0)
	lbz 8,6(4)
	stb 8,6(3)
	ld 3,-16(1)
	blr
	.align 4
L(dus_5):
	bf 31,L(dus_0)
	lbz 6,4(4)
	stb 6,4(3)
L(dus_0):
	/* Return original dst pointer.  */
	ld 3,-16(1)
	blr

	.align 4
.L6:
	cfi_offset(31,-8)
	mr 12,4
	mr 31,5
	/* Copy doublewords where the destination is aligned but the source is
	   not.  Use aligned doubleword loads from the source, shifted to realign
	   the data, to allow aligned destination stores.  */
	addi 11,9,-1	/* loop DW count is one less than total */
	subf 5,10,12	/* Move source addr to previous full double word.  */
	cmpldi cr5, 10, 2
	cmpldi cr0, 10, 4
	mr 4,3
	srdi 8,11,2	/* calculate the 32 byte loop count */
	ld 6,0(5)	/* pre load 1st full doubleword.  */
	mtcrf 0x01,11
	cmpldi cr6,9,4
	mtctr 8
	ld 7,8(5)	/* pre load 2nd full doubleword.  */
	bge cr0, L(du4_do)
	blt cr5, L(du1_do)
	beq cr5, L(du2_do)
	b L(du3_do)

	.align 4
L(du1_do):
	bf 30,L(du1_1dw)

	/* there are at least two DWs to copy */
	/* FIXME: can combine last shift and "or" into "rldimi" */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 8
	sldi 8,7, 64-8
#else
	sldi 0,6, 8
	srdi 8,7, 64-8
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 8
	sldi 8,6, 64-8
#else
	sldi 0,7, 8
	srdi 8,6, 64-8
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,8(4)
	addi 4,4,16
	addi 5,5,32
	blt cr6,L(du1_fini)	/* if total DWs = 3, then bypass loop */
	bf 31,L(du1_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 8
	sldi 8,7, 64-8
#else
	sldi 0,6, 8
	srdi 8,7, 64-8
#endif
	or 0,0,8
	std 0,0(4)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	addi 4,4,8
	beq cr6,L(du1_fini)	/* if total DWs = 4, then bypass loop */
	b L(du1_loop)
	.align 4
L(du1_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 8
	sldi 8,7, 64-8
#else
	sldi 0,6, 8
	srdi 8,7, 64-8
#endif
	addi 5,5,16
	or 0,0,8
	bf 31,L(du1_loop)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	std 0,0(4)
	addi 4,4,8
	.align 4
/* copy 32 bytes at a time */
L(du1_loop):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 8
	sldi 8,7, 64-8
#else
	sldi 0,6, 8
	srdi 8,7, 64-8
#endif
	or 0,0,8
	ld 6,0(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 8
	sldi 8,6, 64-8
#else
	sldi 0,7, 8
	srdi 8,6, 64-8
#endif
	or 0,0,8
	ld 7,8(5)
	std 0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 8
	sldi 8,7, 64-8
#else
	sldi 0,6, 8
	srdi 8,7, 64-8
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 8
	sldi 8,6, 64-8
#else
	sldi 0,7, 8
	srdi 8,6, 64-8
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,24(4)
	addi 5,5,32
	addi 4,4,32
	bdnz+ L(du1_loop)
	.align 4
L(du1_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 8
	sldi 8,7, 64-8
#else
	sldi 0,6, 8
	srdi 8,7, 64-8
#endif
	or 0,0,8
	std 0,0(4)
	b L(du_done)
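
/* L(du1_do) above and the L(du2_do)..L(du7_do) variants below all apply
   the same load-shift-store realignment, specialized for a source
   misalignment of 1..7 bytes.  A hedged C sketch for big-endian with
   sh = 8 * misalignment (little-endian swaps the shift directions, as the
   #ifdef pairs do; the real code peels the first and last doublewords
   rather than reading past either end of the source):

     // s = source rounded down to a doubleword boundary; d = aligned dst.
     uint64_t w0 = s[0], w1 = s[1];             // the two preloads (r6/r7)
     for (size_t i = 0; i < ndw; i++)
       {
         d[i] = (w0 << sh) | (w1 >> (64 - sh)); // merge two aligned DWs
         w0 = w1;                               // slide the window
         w1 = s[i + 2];                         // one new aligned load per DW
       }  */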

	.align 4
L(du2_do):
	bf 30,L(du2_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 16
	sldi 8,7, 64-16
#else
	sldi 0,6, 16
	srdi 8,7, 64-16
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 16
	sldi 8,6, 64-16
#else
	sldi 0,7, 16
	srdi 8,6, 64-16
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,8(4)
	addi 4,4,16
	addi 5,5,32
	blt cr6,L(du2_fini)	/* if total DWs = 3, then bypass loop */
	bf 31,L(du2_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 16
	sldi 8,7, 64-16
#else
	sldi 0,6, 16
	srdi 8,7, 64-16
#endif
	or 0,0,8
	std 0,0(4)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	addi 4,4,8
	beq cr6,L(du2_fini)	/* if total DWs = 4, then bypass loop */
	b L(du2_loop)
	.align 4
L(du2_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 16
	sldi 8,7, 64-16
#else
	sldi 0,6, 16
	srdi 8,7, 64-16
#endif
	addi 5,5,16
	or 0,0,8
	bf 31,L(du2_loop)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	std 0,0(4)
	addi 4,4,8
	.align 4
/* copy 32 bytes at a time */
L(du2_loop):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 16
	sldi 8,7, 64-16
#else
	sldi 0,6, 16
	srdi 8,7, 64-16
#endif
	or 0,0,8
	ld 6,0(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 16
	sldi 8,6, 64-16
#else
	sldi 0,7, 16
	srdi 8,6, 64-16
#endif
	or 0,0,8
	ld 7,8(5)
	std 0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 16
	sldi 8,7, 64-16
#else
	sldi 0,6, 16
	srdi 8,7, 64-16
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 16
	sldi 8,6, 64-16
#else
	sldi 0,7, 16
	srdi 8,6, 64-16
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,24(4)
	addi 5,5,32
	addi 4,4,32
	bdnz+ L(du2_loop)
	.align 4
L(du2_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 16
	sldi 8,7, 64-16
#else
	sldi 0,6, 16
	srdi 8,7, 64-16
#endif
	or 0,0,8
	std 0,0(4)
	b L(du_done)

	.align 4
L(du3_do):
	bf 30,L(du3_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 24
	sldi 8,7, 64-24
#else
	sldi 0,6, 24
	srdi 8,7, 64-24
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 24
	sldi 8,6, 64-24
#else
	sldi 0,7, 24
	srdi 8,6, 64-24
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,8(4)
	addi 4,4,16
	addi 5,5,32
	blt cr6,L(du3_fini)	/* if total DWs = 3, then bypass loop */
	bf 31,L(du3_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 24
	sldi 8,7, 64-24
#else
	sldi 0,6, 24
	srdi 8,7, 64-24
#endif
	or 0,0,8
	std 0,0(4)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	addi 4,4,8
	beq cr6,L(du3_fini)	/* if total DWs = 4, then bypass loop */
	b L(du3_loop)
	.align 4
L(du3_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 24
	sldi 8,7, 64-24
#else
	sldi 0,6, 24
	srdi 8,7, 64-24
#endif
	addi 5,5,16
	or 0,0,8
	bf 31,L(du3_loop)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	std 0,0(4)
	addi 4,4,8
	.align 4
/* copy 32 bytes at a time */
L(du3_loop):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 24
	sldi 8,7, 64-24
#else
	sldi 0,6, 24
	srdi 8,7, 64-24
#endif
	or 0,0,8
	ld 6,0(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 24
	sldi 8,6, 64-24
#else
	sldi 0,7, 24
	srdi 8,6, 64-24
#endif
	or 0,0,8
	ld 7,8(5)
	std 0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 24
	sldi 8,7, 64-24
#else
	sldi 0,6, 24
	srdi 8,7, 64-24
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 24
	sldi 8,6, 64-24
#else
	sldi 0,7, 24
	srdi 8,6, 64-24
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,24(4)
	addi 5,5,32
	addi 4,4,32
	bdnz+ L(du3_loop)
	.align 4
L(du3_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 24
	sldi 8,7, 64-24
#else
	sldi 0,6, 24
	srdi 8,7, 64-24
#endif
	or 0,0,8
	std 0,0(4)
	b L(du_done)

	.align 4
L(du4_do):
	cmpldi cr5, 10, 6
	beq cr0, L(du4_dox)
	blt cr5, L(du5_do)
	beq cr5, L(du6_do)
	b L(du7_do)
L(du4_dox):
	bf 30,L(du4_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 32
	sldi 8,7, 64-32
#else
	sldi 0,6, 32
	srdi 8,7, 64-32
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 32
	sldi 8,6, 64-32
#else
	sldi 0,7, 32
	srdi 8,6, 64-32
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,8(4)
	addi 4,4,16
	addi 5,5,32
	blt cr6,L(du4_fini)	/* if total DWs = 3, then bypass loop */
	bf 31,L(du4_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 32
	sldi 8,7, 64-32
#else
	sldi 0,6, 32
	srdi 8,7, 64-32
#endif
	or 0,0,8
	std 0,0(4)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	addi 4,4,8
	beq cr6,L(du4_fini)	/* if total DWs = 4, then bypass loop */
	b L(du4_loop)
	.align 4
L(du4_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 32
	sldi 8,7, 64-32
#else
	sldi 0,6, 32
	srdi 8,7, 64-32
#endif
	addi 5,5,16
	or 0,0,8
	bf 31,L(du4_loop)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	std 0,0(4)
	addi 4,4,8
	.align 4
/* copy 32 bytes at a time */
L(du4_loop):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 32
	sldi 8,7, 64-32
#else
	sldi 0,6, 32
	srdi 8,7, 64-32
#endif
	or 0,0,8
	ld 6,0(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 32
	sldi 8,6, 64-32
#else
	sldi 0,7, 32
	srdi 8,6, 64-32
#endif
	or 0,0,8
	ld 7,8(5)
	std 0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 32
	sldi 8,7, 64-32
#else
	sldi 0,6, 32
	srdi 8,7, 64-32
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 32
	sldi 8,6, 64-32
#else
	sldi 0,7, 32
	srdi 8,6, 64-32
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,24(4)
	addi 5,5,32
	addi 4,4,32
	bdnz+ L(du4_loop)
	.align 4
L(du4_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 32
	sldi 8,7, 64-32
#else
	sldi 0,6, 32
	srdi 8,7, 64-32
#endif
	or 0,0,8
	std 0,0(4)
	b L(du_done)

	.align 4
L(du5_do):
	bf 30,L(du5_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 40
	sldi 8,7, 64-40
#else
	sldi 0,6, 40
	srdi 8,7, 64-40
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 40
	sldi 8,6, 64-40
#else
	sldi 0,7, 40
	srdi 8,6, 64-40
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,8(4)
	addi 4,4,16
	addi 5,5,32
	blt cr6,L(du5_fini)	/* if total DWs = 3, then bypass loop */
	bf 31,L(du5_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 40
	sldi 8,7, 64-40
#else
	sldi 0,6, 40
	srdi 8,7, 64-40
#endif
	or 0,0,8
	std 0,0(4)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	addi 4,4,8
	beq cr6,L(du5_fini)	/* if total DWs = 4, then bypass loop */
	b L(du5_loop)
	.align 4
L(du5_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 40
	sldi 8,7, 64-40
#else
	sldi 0,6, 40
	srdi 8,7, 64-40
#endif
	addi 5,5,16
	or 0,0,8
	bf 31,L(du5_loop)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	std 0,0(4)
	addi 4,4,8
	.align 4
/* copy 32 bytes at a time */
L(du5_loop):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 40
	sldi 8,7, 64-40
#else
	sldi 0,6, 40
	srdi 8,7, 64-40
#endif
	or 0,0,8
	ld 6,0(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 40
	sldi 8,6, 64-40
#else
	sldi 0,7, 40
	srdi 8,6, 64-40
#endif
	or 0,0,8
	ld 7,8(5)
	std 0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 40
	sldi 8,7, 64-40
#else
	sldi 0,6, 40
	srdi 8,7, 64-40
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 40
	sldi 8,6, 64-40
#else
	sldi 0,7, 40
	srdi 8,6, 64-40
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,24(4)
	addi 5,5,32
	addi 4,4,32
	bdnz+ L(du5_loop)
	.align 4
L(du5_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 40
	sldi 8,7, 64-40
#else
	sldi 0,6, 40
	srdi 8,7, 64-40
#endif
	or 0,0,8
	std 0,0(4)
	b L(du_done)

	.align 4
L(du6_do):
	bf 30,L(du6_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 48
	sldi 8,7, 64-48
#else
	sldi 0,6, 48
	srdi 8,7, 64-48
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 48
	sldi 8,6, 64-48
#else
	sldi 0,7, 48
	srdi 8,6, 64-48
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,8(4)
	addi 4,4,16
	addi 5,5,32
	blt cr6,L(du6_fini)	/* if total DWs = 3, then bypass loop */
	bf 31,L(du6_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 48
	sldi 8,7, 64-48
#else
	sldi 0,6, 48
	srdi 8,7, 64-48
#endif
	or 0,0,8
	std 0,0(4)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	addi 4,4,8
	beq cr6,L(du6_fini)	/* if total DWs = 4, then bypass loop */
	b L(du6_loop)
	.align 4
L(du6_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 48
	sldi 8,7, 64-48
#else
	sldi 0,6, 48
	srdi 8,7, 64-48
#endif
	addi 5,5,16
	or 0,0,8
	bf 31,L(du6_loop)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	std 0,0(4)
	addi 4,4,8
	.align 4
/* copy 32 bytes at a time */
L(du6_loop):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 48
	sldi 8,7, 64-48
#else
	sldi 0,6, 48
	srdi 8,7, 64-48
#endif
	or 0,0,8
	ld 6,0(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 48
	sldi 8,6, 64-48
#else
	sldi 0,7, 48
	srdi 8,6, 64-48
#endif
	or 0,0,8
	ld 7,8(5)
	std 0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 48
	sldi 8,7, 64-48
#else
	sldi 0,6, 48
	srdi 8,7, 64-48
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 48
	sldi 8,6, 64-48
#else
	sldi 0,7, 48
	srdi 8,6, 64-48
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,24(4)
	addi 5,5,32
	addi 4,4,32
	bdnz+ L(du6_loop)
	.align 4
L(du6_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 48
	sldi 8,7, 64-48
#else
	sldi 0,6, 48
	srdi 8,7, 64-48
#endif
	or 0,0,8
	std 0,0(4)
	b L(du_done)

	.align 4
L(du7_do):
	bf 30,L(du7_1dw)

	/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 56
	sldi 8,7, 64-56
#else
	sldi 0,6, 56
	srdi 8,7, 64-56
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 56
	sldi 8,6, 64-56
#else
	sldi 0,7, 56
	srdi 8,6, 64-56
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,8(4)
	addi 4,4,16
	addi 5,5,32
	blt cr6,L(du7_fini)	/* if total DWs = 3, then bypass loop */
	bf 31,L(du7_loop)
	/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 56
	sldi 8,7, 64-56
#else
	sldi 0,6, 56
	srdi 8,7, 64-56
#endif
	or 0,0,8
	std 0,0(4)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	addi 4,4,8
	beq cr6,L(du7_fini)	/* if total DWs = 4, then bypass loop */
	b L(du7_loop)
	.align 4
L(du7_1dw):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 56
	sldi 8,7, 64-56
#else
	sldi 0,6, 56
	srdi 8,7, 64-56
#endif
	addi 5,5,16
	or 0,0,8
	bf 31,L(du7_loop)
	mr 6,7
	ld 7,0(5)
	addi 5,5,8
	std 0,0(4)
	addi 4,4,8
	.align 4
/* copy 32 bytes at a time */
L(du7_loop):
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 56
	sldi 8,7, 64-56
#else
	sldi 0,6, 56
	srdi 8,7, 64-56
#endif
	or 0,0,8
	ld 6,0(5)
	std 0,0(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 56
	sldi 8,6, 64-56
#else
	sldi 0,7, 56
	srdi 8,6, 64-56
#endif
	or 0,0,8
	ld 7,8(5)
	std 0,8(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 56
	sldi 8,7, 64-56
#else
	sldi 0,6, 56
	srdi 8,7, 64-56
#endif
	or 0,0,8
	ld 6,16(5)
	std 0,16(4)
#ifdef __LITTLE_ENDIAN__
	srdi 0,7, 56
	sldi 8,6, 64-56
#else
	sldi 0,7, 56
	srdi 8,6, 64-56
#endif
	or 0,0,8
	ld 7,24(5)
	std 0,24(4)
	addi 5,5,32
	addi 4,4,32
	bdnz+ L(du7_loop)
	.align 4
L(du7_fini):
	/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
	srdi 0,6, 56
	sldi 8,7, 64-56
#else
	sldi 0,6, 56
	srdi 8,7, 64-56
#endif
	or 0,0,8
	std 0,0(4)
	b L(du_done)

	.align 4
L(du_done):
	rldicr 0,31,0,60
	mtcrf 0x01,31
	beq cr1,0f	/* If the tail is 0 bytes we are done!  */

	add 3,3,0
	add 12,12,0
/* At this point we have a tail of 0-7 bytes and we know that the
   destination is doubleword aligned.  */
4:	bf 29,2f
	lwz 6,0(12)
	addi 12,12,4
	stw 6,0(3)
	addi 3,3,4
2:	bf 30,1f
	lhz 6,0(12)
	addi 12,12,2
	sth 6,0(3)
	addi 3,3,2
1:	bf 31,0f
	lbz 6,0(12)
	stb 6,0(3)
0:
	/* Return original dst pointer.  */
	ld 31,-8(1)
	ld 3,-16(1)
	blr
END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)