/* Optimized memmove implementation for PowerPC64/POWER7.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This implementation checks whether memory 'dest' overlaps with 'src'.
   If it does not, an optimized forward memcpy is used (similar to the
   POWER7 memcpy, embedded here to save some cycles).
   If source and destination overlap, an optimized backward copy is used
   instead.  */
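
/* Overview of the dispatch below (illustrative sketch only; the C fragment
   is hypothetical and exists only in this comment):

       if ((uintptr_t) dest - (uintptr_t) src >= len)
	 return memcpy (dest, src, len);   // forward copy cannot clobber src
       else
	 ...copy backward, starting from dest + len and src + len...

   When dest <= src the unsigned subtraction wraps around to a large value,
   so the forward path is also taken in that case.  */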

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
	.machine power7
ENTRY_TOCLESS (MEMMOVE, 5)
	CALL_MCOUNT 3

L(_memmove):
	subf	r9,r4,r3
	cmpld	cr7,r9,r5
	blt	cr7,L(memmove_bwd)

	cmpldi	cr1,r5,31
	neg	0,3
	ble	cr1, L(copy_LT_32)	/* If move < 32 bytes, use short move
					   code.  */

	andi.	10,3,15
	clrldi	11,4,60
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	r11,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
	clrldi	0,0,60

/* Get the DST and SRC aligned to 16 bytes.  */
1:
	bf	31,2f
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1
2:
	bf	30,4f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
4:
	bf	29,8f
	lwz	6,0(r4)
	addi	r4,r4,4
	stw	6,0(r11)
	addi	r11,r11,4
8:
	bf	28,16f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
16:
	subf	r5,0,r5

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
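/* Note on the loop below: r6/r7/r8 hold the constant byte offsets 16/32/48
   used by the indexed lvx/stvx forms, CTR is set to len / 128, and the
   first 32 bytes are preloaded into v6/v7 before entering the loop;
   L(aligned_128head) reloads them for the second and later iterations.  */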
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,r5
	srdi	12,r5,7
	cmpdi	12,0
	beq	L(aligned_tail)
	lvx	6,0,r4
	lvx	7,r4,6
	mtctr	12
	b	L(aligned_128loop)

	.align	4
L(aligned_128head):
	/* For the 2nd+ iteration of this loop.  */
	lvx	6,0,r4
	lvx	7,r4,6
L(aligned_128loop):
	lvx	8,r4,7
	lvx	9,r4,8
	stvx	6,0,r11
	addi	r4,r4,64
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	lvx	6,0,r4
	lvx	7,r4,6
	addi	r11,r11,64
	lvx	8,r4,7
	lvx	9,r4,8
	addi	r4,r4,64
	stvx	6,0,r11
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	addi	r11,r11,64
	bdnz	L(aligned_128head)

L(aligned_tail):
	mtocrf	0x01,r5
	bf	25,32f
	lvx	6,0,r4
	lvx	7,r4,6
	lvx	8,r4,7
	lvx	9,r4,8
	addi	r4,r4,64
	stvx	6,0,r11
	stvx	7,r11,6
	stvx	8,r11,7
	stvx	9,r11,8
	addi	r11,r11,64
32:
	bf	26,16f
	lvx	6,0,r4
	lvx	7,r4,6
	addi	r4,r4,32
	stvx	6,0,r11
	stvx	7,r11,6
	addi	r11,r11,32
16:
	bf	27,8f
	lvx	6,0,r4
	addi	r4,r4,16
	stvx	6,0,r11
	addi	r11,r11,16
8:
	bf	28,4f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	mr	r11,3
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,r5,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	r5,0,r5
2:
	bf	30,1f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(r4)
	lwz	7,4(r4)
	stw	6,0(r11)
	lwz	8,8(r4)
	stw	7,4(r11)
	lwz	6,12(r4)
	addi	r4,r4,16
	stw	8,8(r11)
	stw	6,12(r11)
	addi	r11,r11,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(r4)
	lwz	7,4(r4)
	addi	r4,r4,8
	stw	6,0(r11)
	stw	7,4(r11)
	addi	r11,r11,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(r4)
	sth	6,0(r11)
	bflr	31
	lbz	7,2(r4)
	stb	7,2(r11)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(r4)
	stb	6,4(r11)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(r4)
	stb	6,0(r11)
	/* Return original DST pointer.  */
	blr

/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,L(tail4)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(r4)
	lwz	7,4(r4)
	stw	6,0(r11)
	stw	7,4(r11)
	blr


/* Handle copies of 32+ bytes where SRC and DST alignments differ.  DST is
   first brought to quadword alignment; aligned quadword loads from SRC are
   then shifted/permuted to realign the data, allowing for aligned DST
   stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st r11 quadword.  */
	srdi	9,r5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	r5,0,r5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(r4)
	addi	r4,r4,1
	stb	6,0(r11)
	addi	r11,r11,1
2:
	bf	30,4f
	lhz	6,0(r4)
	addi	r4,r4,2
	sth	6,0(r11)
	addi	r11,r11,2
4:
	bf	29,8f
	lwz	6,0(r4)
	addi	r4,r4,4
	stw	6,0(r11)
	addi	r11,r11,4
8:
	bf	28,0f
	ld	6,0(r4)
	addi	r4,r4,8
	std	6,0(r11)
	addi	r11,r11,8
0:
	srdi	9,r5,4	      /* Number of full quadwords remaining.  */

	/* Proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):
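
	/* Realignment note: lvsl/lvsr build a permute control vector from
	   the low four bits of the SRC address, and vperm combines two
	   adjacent aligned quadwords into the 16 unaligned bytes actually
	   wanted, so every lvx load and stvx store below stays 16-byte
	   aligned.  The little-endian variant uses lvsr with swapped vperm
	   inputs to account for the reversed byte order.  */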

	/* Set up two indices to speed up the indexed vector operations.  */
	clrldi	10,r5,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,10,0
	srdi	8,r5,5	      /* Set up the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,r4
#else
	lvsl	5,0,r4
#endif
	lvx	3,0,r4
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* The quadword count is odd: copy 16 bytes so the main loop below
	   can work in 32-byte chunks.  */
	lvx	4,r4,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	r4,r4,16
	stvx	6,0,r11
	addi	r11,r11,16
	vor	3,4,4
	clrrdi	0,r4,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	4,r4,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,r4,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	r4,r4,32
	stvx	6,0,r11
	stvx	10,r11,6
	addi	r11,r11,32
	bdnz	L(unaligned_loop)

	clrrdi	0,r4,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,r5
	beqlr	cr1

	add	r4,r4,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(r4)
	lwz	7,4(r4)
	addi	r4,r4,8
	stw	6,0(r11)
	stw	7,4(r11)
	addi	r11,r11,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(r4)
	stw	6,0(r11)
	bf	30,L(tail5)
	lhz	7,4(r4)
	sth	7,4(r11)
	bflr	31
	lbz	8,6(r4)
	stb	8,6(r11)
	/* Return original DST pointer.  */
	blr

	/* The backward copy: the algorithm first checks whether src and dest
	   share the same alignment; if they do, it aligns both to 16 bytes
	   and copies using aligned vector instructions.
	   If they do not, it aligns dest to 16 bytes and uses VMX (Altivec)
	   instructions to read two 16-byte blocks at a time, shift/permute
	   the bytes read, and write them aligned to dest.  */
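	/* Descriptive note: the backward path mirrors the forward one.  r11
	   and r4 are first advanced to dest + len and src + len, the copy
	   then walks downward using negative displacements and negative
	   index registers (-16/-32/-48/-64), and r3 is never modified so it
	   can be returned unchanged as the original DST pointer.  */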
L(memmove_bwd):
	cmpldi	cr1,r5,31
	/* Copy is done backwards: update the pointers and check alignment.  */
	add	r11,r3,r5
	add	r4,r4,r5
	mr	r0,r11
	ble	cr1, L(copy_LT_32_bwd)	/* If move < 32 bytes, use short move
					   code.  */

	andi.	r10,r11,15	/* Check if r11 is aligned to 16 bytes.  */
	clrldi	r9,r4,60	/* Check if r4 is aligned to 16 bytes.  */
	cmpld	cr6,r10,r9	/* SRC and DST alignments match?  */

	bne	cr6,L(copy_GE_32_unaligned_bwd)
	beq	L(aligned_copy_bwd)

	mtocrf	0x01,r0
	clrldi	r0,r0,60

/* Get the DST and SRC aligned to 16 bytes.  */
1:
	bf	31,2f
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
	subi	r11,r11,1
2:
	bf	30,4f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
4:
	bf	29,8f
	lwz	r6,-4(r4)
	subi	r4,r4,4
	stw	r6,-4(r11)
	subi	r11,r11,4
8:
	bf	28,16f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
16:
	subf	r5,0,r5

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
L(aligned_copy_bwd):
	li	r6,-16
	li	r7,-32
	li	r8,-48
	li	r9,-64
	mtocrf	0x02,r5
	srdi	r12,r5,7
	cmpdi	r12,0
	beq	L(aligned_tail_bwd)
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	mtctr	r12
	b	L(aligned_128loop_bwd)

	.align	4
L(aligned_128head_bwd):
	/* For the 2nd+ iteration of this loop.  */
	lvx	v6,r4,r6
	lvx	v7,r4,r7
L(aligned_128loop_bwd):
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	stvx	v6,r11,r6
	subi	r4,r4,64
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	subi	r11,r11,64
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	subi	r4,r4,64
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	subi	r11,r11,64
	bdnz	L(aligned_128head_bwd)

L(aligned_tail_bwd):
	mtocrf	0x01,r5
	bf	25,32f
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	lvx	v8,r4,r8
	lvx	v9,r4,r9
	subi	r4,r4,64
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	stvx	v8,r11,r8
	stvx	v9,r11,r9
	subi	r11,r11,64
32:
	bf	26,16f
	lvx	v6,r4,r6
	lvx	v7,r4,r7
	subi	r4,r4,32
	stvx	v6,r11,r6
	stvx	v7,r11,r7
	subi	r11,r11,32
16:
	bf	27,8f
	lvx	v6,r4,r6
	subi	r4,r4,16
	stvx	v6,r11,r6
	subi	r11,r11,16
8:
	bf	28,4f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
4:	/* Copies 4~7 bytes.  */
	bf	29,L(tail2_bwd)
	lwz	r6,-4(r4)
	stw	r6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	r7,-6(r4)
	sth	r7,-6(r11)
	bflr	31
	lbz	r8,-7(r4)
	stb	r8,-7(r11)
	/* Return original DST pointer.  */
	blr

/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32_bwd):
	cmpldi	cr6,r5,8
	mtocrf	0x01,r5
	ble	cr6,L(copy_LE_8_bwd)

	/* At least 9 bytes to go.  */
	neg	r8,r4
	andi.	r0,r8,3
	cmpldi	cr1,r5,16
	beq	L(copy_LT_32_aligned_bwd)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	r5,0,r5
2:
	bf	30,1f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
1:
	bf	31,L(end_4bytes_alignment_bwd)
	lbz	6,-1(r4)
	subi	r4,r4,1
	stb	6,-1(r11)
	subi	r11,r11,1

	.align	4
L(end_4bytes_alignment_bwd):
	cmpldi	cr1,r5,16
	mtocrf	0x01,r5

L(copy_LT_32_aligned_bwd):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	stw	r6,-4(r11)
	lwz	r8,-12(r4)
	stw	r7,-8(r11)
	lwz	r6,-16(r4)
	subi	r4,r4,16
	stw	r8,-12(r11)
	stw	r6,-16(r11)
	subi	r11,r11,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4_bwd)
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	subi	r4,r4,8
	stw	r6,-4(r11)
	stw	r7,-8(r11)
	subi	r11,r11,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4_bwd):
	bf	29,L(tail2_bwd)
	lwz	6,-4(r4)
	stw	6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	7,-6(r4)
	sth	7,-6(r11)
	bflr	31
	lbz	8,-7(r4)
	stb	8,-7(r11)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2_bwd):
	bf	30,1f
	lhz	6,-2(r4)
	sth	6,-2(r11)
	bflr	31
	lbz	7,-3(r4)
	stb	7,-3(r11)
	blr

	.align	4
L(tail5_bwd):
	bflr	31
	lbz	6,-5(r4)
	stb	6,-5(r11)
	blr

	.align	4
1:
	bflr	31
	lbz	6,-1(r4)
	stb	6,-1(r11)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8_bwd):
	bne	cr6,L(tail4_bwd)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */
	lwz	6,-8(r4)
	lwz	7,-4(r4)
	stw	6,-8(r11)
	stw	7,-4(r11)
	blr


/* Handle copies of 32+ bytes where SRC and DST alignments differ.  DST is
   first brought to quadword alignment; aligned quadword loads from SRC are
   then shifted/permuted to realign the data, allowing for aligned DST
   stores.  */
	.align	4
L(copy_GE_32_unaligned_bwd):
	andi.	r10,r11,15	/* Check alignment of DST against 16 bytes.  */
	srdi	r9,r5,4		/* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont_bwd)

	/* DST is not quadword aligned and r10 holds its low four bits
	   (the misalignment).  */
	mtocrf	0x01,r10
	subf	r5,r10,r5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	r6,-1(r4)
	subi	r4,r4,1
	stb	r6,-1(r11)
	subi	r11,r11,1
2:
	bf	30,4f
	lhz	r6,-2(r4)
	subi	r4,r4,2
	sth	r6,-2(r11)
	subi	r11,r11,2
4:
	bf	29,8f
	lwz	r6,-4(r4)
	subi	r4,r4,4
	stw	r6,-4(r11)
	subi	r11,r11,4
8:
	bf	28,0f
	ld	r6,-8(r4)
	subi	r4,r4,8
	std	r6,-8(r11)
	subi	r11,r11,8
0:
	srdi	r9,r5,4	      /* Number of full quadwords remaining.  */

	/* Proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont_bwd):

	/* Set up two indices to speed up the indexed vector operations.  */
	clrldi	r10,r5,60
	li	r6,-16	      /* Index for 16-byte offsets.  */
	li	r7,-32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,r10,0
	srdi	r8,r5,5	      /* Set up the loop counter.  */
	mtocrf	0x01,r9
	cmpldi	cr6,r9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	v5,r0,r4
#else
	lvsl	v5,r0,r4
#endif
	lvx	v3,0,r4
	li	r0,0
	bf	31,L(setup_unaligned_loop_bwd)

	/* The quadword count is odd: copy 16 bytes so the main loop below
	   can work in 32-byte chunks.  */
	lvx	v4,r4,r6
#ifdef __LITTLE_ENDIAN__
	vperm	v6,v3,v4,v5
#else
	vperm	v6,v4,v3,v5
#endif
	subi	r4,r4,16
	stvx	v6,r11,r6
	subi	r11,r11,16
	vor	v3,v4,v4
	clrrdi	r0,r4,60

L(setup_unaligned_loop_bwd):
	mtctr	r8
	ble	cr6,L(end_unaligned_loop_bwd)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop_bwd):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	v4,r4,r6
#ifdef __LITTLE_ENDIAN__
	vperm	v6,v3,v4,v5
#else
	vperm	v6,v4,v3,v5
#endif
	lvx	v3,r4,r7
#ifdef __LITTLE_ENDIAN__
	vperm	v10,v4,v3,v5
#else
	vperm	v10,v3,v4,v5
#endif
	subi	r4,r4,32
	stvx	v6,r11,r6
	stvx	v10,r11,r7
	subi	r11,r11,32
	bdnz	L(unaligned_loop_bwd)

	clrrdi	r0,r4,60

	.align	4
L(end_unaligned_loop_bwd):

	/* Check for tail bytes.  */
	mtocrf	0x01,r5
	beqlr	cr1

	add	r4,r4,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	r6,-4(r4)
	lwz	r7,-8(r4)
	subi	r4,r4,8
	stw	r6,-4(r11)
	stw	r7,-8(r11)
	subi	r11,r11,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2_bwd)
	lwz	r6,-4(r4)
	stw	r6,-4(r11)
	bf	30,L(tail5_bwd)
	lhz	r7,-6(r4)
	sth	r7,-6(r11)
	bflr	31
	lbz	r8,-7(r4)
	stb	r8,-7(r11)
	/* Return original DST pointer.  */
	blr
END_GEN_TB (MEMMOVE, TB_TOCLESS)
libc_hidden_builtin_def (memmove)