/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

/* Thumb cannot encode negative immediate offsets in memory operations.  */
#ifndef NO_THUMB
#define NO_THUMB
#endif
#include <sysdep.h>
#include <arm-features.h>

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef MEMCPY_NEON

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif defined (MEMCPY_VFP)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif

#define ALIGN(addr, align) addr:align

#define INSN_SIZE       4

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r8

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */
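
/* A worked example of the add-to-pc form used below may help.  In ARM
   state a read of PC yields the address of the current instruction
   plus 8 (which is what PC_OFS accounts for), and each dispatch_step
   expands to one load plus one store, i.e. 8 bytes of code per 8-byte
   unit, so the byte count in TMP1 maps one-to-one onto a code offset.
   With A the address of the "add pc, pc, tmp1" in dispatch_7_dword,
   the rsb constant is (7 * 8) - 8 + 4 = 52:

     TMP1 = 56 (all 7 dwords): tmp1 := 52 - 56 = -4, pc := (A + 8) - 4
            = A + 4, the first step;
     TMP1 = 8 (one dword):     tmp1 := 52 - 8 = 44,  pc := A + 52,
            the last step;
     TMP1 = 0:                 tmp1 := 52,           pc := A + 60,
            past all seven steps.

   dispatch_15_word works the same way, except that each step copies a
   4-byte unit with 8 bytes of code, hence the "lsl #1" scaling and the
   halved PC_OFS and INSN_SIZE terms in its rsb constant.  */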
#ifndef ARM_ALWAYS_BX
# if ARM_BX_ALIGN_LOG2 != 2
#  error case not handled
# endif
        .macro dispatch_7_dword
        rsb     tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
        add     pc, pc, tmp1
        dispatch_step 7
        dispatch_step 6
        dispatch_step 5
        dispatch_step 4
        dispatch_step 3
        dispatch_step 2
        dispatch_step 1
        .purgem dispatch_step
        .endm

        .macro dispatch_15_word
        rsb     tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
        add     pc, pc, tmp1, lsl #1
        dispatch_step 15
        dispatch_step 14
        dispatch_step 13
        dispatch_step 12
        dispatch_step 11
        dispatch_step 10
        dispatch_step 9
        dispatch_step 8
        dispatch_step 7
        dispatch_step 6
        dispatch_step 5
        dispatch_step 4
        dispatch_step 3
        dispatch_step 2
        dispatch_step 1
        .purgem dispatch_step
        .endm
#else
# if ARM_BX_ALIGN_LOG2 < 3
#  error case not handled
# endif
        .macro dispatch_helper steps, log2_bytes_per_step
        /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
           (STEPS << LOG2_BYTES_PER_STEP).
           So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
           Then it needs further adjustment to compensate for the
           distance between the PC value taken below (0f + PC_OFS)
           and the first step's instructions (1f).  */
        rsb     tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
                              + ((1f - PC_OFS - 0f) \
                                 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
        /* Shifting down LOG2_BYTES_PER_STEP gives us the number of
           steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
           the (byte) distance to add to the PC.  */
0:      add     tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
        bx      tmp1
        .p2align ARM_BX_ALIGN_LOG2
1:
        .endm

        .macro dispatch_7_dword
        dispatch_helper 7, 3
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 7
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 6
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 5
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 4
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 3
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 2
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 1
        .p2align ARM_BX_ALIGN_LOG2
        .purgem dispatch_step
        .endm

        .macro dispatch_15_word
        dispatch_helper 15, 2
        dispatch_step 15
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 14
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 13
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 12
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 11
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 10
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 9
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 8
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 7
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 6
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 5
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 4
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 3
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 2
        .p2align ARM_BX_ALIGN_LOG2
        dispatch_step 1
        .p2align ARM_BX_ALIGN_LOG2
        .purgem dispatch_step
        .endm

#endif

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2      /* Call-clobbered.  */
#define A_h     r3      /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
#define D_l     r10
#define D_h     r11
#endif
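
/* Note that each of the pairs above is an even register and its odd
   successor, which is what the ARM-state LDRD/STRD encodings used in
   the bulk-copy loops require.  A_l/A_h (r2/r3) are call-clobbered and
   never saved; B, C and D are callee-saved and are spilled to the
   frame at [sp, #8], [sp, #16] and [sp, #24] by the long-copy paths,
   which appears to be why FRAME_SIZE is 32 for the non-NEON builds but
   only 4 (just the slot for tmp2) when NEON is available.  */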
/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */
#define prefetch_lines  5

#ifdef USE_VFP
        .macro cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

        .p2align 6
ENTRY(memcpy)

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bhs     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
        /* These need an extra layer of macro just to work around a
           bug in the assembler's parser when an operand starts with
           a {...}.  https://sourceware.org/bugzilla/show_bug.cgi?id=15647
           tracks that bug; it was not fixed as of binutils-2.23.2.  */
        .macro neon_load_d0 reg
        vld1.8  {d0}, [\reg]!
        .endm
        .macro neon_store_d0 reg
        vst1.8  {d0}, [\reg]!
        .endm

        and     tmp1, count, #0x38
        .macro dispatch_step i
        neon_load_d0 src
        neon_store_d0 dst
        .endm
        dispatch_7_dword

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        /* Jump directly into the sequence below at the correct offset.  */
        .macro dispatch_step i
        ldr     tmp1, [src, #-(\i * 4)]
        str     tmp1, [dst, #-(\i * 4)]
        .endm
        dispatch_15_word
#endif

        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        cfi_adjust_cfa_offset (FRAME_SIZE)
        cfi_rel_offset (tmp2, 0)
        cfi_remember_state
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32 s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment.  */
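        /* The flag games below decode DST's low three bits without a
           separate compare: after the "rsbs tmp2, tmp2, #0", TMP2 is
           (bytes needed to reach 8-byte alignment) << 29, so N is that
           count's bit 2 (copy a word) and, after "lsls tmp2, tmp2, #2",
           C is its bit 1 (copy a halfword) and Z is clear when its
           bit 0 is set (copy a byte).  E.g. DST & 7 == 5 needs 3 bytes:
           no word, one halfword, one byte, and COUNT is reduced by 3 by
           the "sub count, count, tmp2, lsr #29".  */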
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blo     .Ltail63aligned

        cmp     tmp2, #512
        bhs     .Lcpy_body_long

.Lcpy_body_medium:                      /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bhs     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:                        /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        .macro dispatch_step i
        vldr    d0, [src, #-(\i * 8)]
        vstr    d0, [dst, #-(\i * 8)]
        .endm
        dispatch_7_dword
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bhs     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bx      lr

        cfi_restore_state
        cfi_remember_state
1:
        add     src, src, #8
        add     dst, dst, #8

.Ltail63aligned:                        /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        .macro dispatch_step i
        ldrd    A_l, A_h, [src, #-(\i * 8)]
        strd    A_l, A_h, [dst, #-(\i * 8)]
        .endm
        dispatch_7_dword
#endif

        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bx      lr

        cfi_restore_state
        cfi_remember_state

.Lcpy_body_long:                        /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
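        /* Concretely, d3..d7 are primed with one double-word from each
           of the next prefetch_lines (five) 64-byte lines, at offsets
           0, 64, 128, 192 and 256, while d0..d2 carry the next three
           double-words of the current line.  Each cpy_line_vfp then
           stores its prefetch register to the destination line and
           reloads it from #\base + prefetch_lines * 64 - 32, so data
           pulled in as a "prefetch" is never transferred twice.  */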
        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blo     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bhs     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        cfi_rel_offset (B_l, 8)
        cfi_rel_offset (B_h, 12)
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        cfi_rel_offset (C_l, 16)
        cfi_rel_offset (C_h, 20)
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        cfi_rel_offset (D_l, 24)
        cfi_rel_offset (D_h, 28)
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        cfi_restore (B_l)
        cfi_restore (B_h)
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        cfi_restore (C_l)
        cfi_restore (C_h)
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        cfi_restore (D_l)
        cfi_restore (D_h)
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bx      lr
#endif

        cfi_restore_state
        cfi_remember_state

.Lcpy_notaligned:
        pld     [src, #0]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
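        /* Only DST is aligned here; SRC keeps whatever alignment it
           arrived with, which the Assumptions above allow.  The loops
           below therefore load with forms that tolerate an unaligned
           source (plain LDR in the GP path, VLD1.8 in the NEON path),
           while the stores to the now 8-byte-aligned DST can use STRD,
           or VST1 with the :64 alignment hint supplied via the ALIGN
           macro.  */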
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrlo   tmp2, [sp], #FRAME_SIZE
        blo     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        /* These need an extra layer of macro just to work around a
           bug in the assembler's parser when an operand starts with
           a {...}.  */
        .macro neon_load_multi reglist, basereg
        vld1.8  {\reglist}, [\basereg]!
        .endm
        .macro neon_store_multi reglist, basereg
        vst1.8  {\reglist}, [ALIGN (\basereg, 64)]!
        .endm

        neon_load_multi d0-d3, src
        neon_load_multi d4-d7, src
        subs    count, count, #64
        blo     2f
1:
        pld     [src, #(4 * 64)]
        neon_store_multi d0-d3, dst
        neon_load_multi d0-d3, src
        neon_store_multi d4-d7, dst
        neon_load_multi d4-d7, src
        subs    count, count, #64
        bhs     1b
2:
        neon_store_multi d0-d3, dst
        neon_store_multi d4-d7, dst
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        cfi_rel_offset (B_l, 8)
        cfi_rel_offset (B_h, 12)
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        cfi_rel_offset (C_l, 16)
        cfi_rel_offset (C_h, 20)
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        cfi_rel_offset (D_l, 24)
        cfi_rel_offset (D_h, 28)
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        cfi_restore (B_l)
        cfi_restore (B_h)
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        cfi_restore (C_l)
        cfi_restore (C_h)
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        cfi_restore (D_l)
        cfi_restore (D_h)
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        cfi_restore (tmp2)
        bne     .Ltail63unaligned
        bx      lr

END(memcpy)
libc_hidden_builtin_def (memcpy)