/* Optimized memcpy implementation for PowerPC32/POWER7.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.  */

        .machine  power7
EALIGN (memcpy, 5, 0)
        CALL_MCOUNT

        stwu    1,-32(1)
        cfi_adjust_cfa_offset(32)
        stw     30,20(1)
        cfi_offset(30,(20-32))
        stw     31,24(1)
        mr      30,3
        cmplwi  cr1,5,31
        neg     0,3
        cfi_offset(31,-8)
        ble     cr1, L(copy_LT_32)   /* If move < 32 bytes use short move
                                        code.  */

        andi.   11,3,15              /* Check alignment of DST.  */
        clrlwi  10,4,28              /* Check alignment of SRC.  */
        cmplw   cr6,10,11            /* SRC and DST alignments match?  */
        mr      12,4
        mr      31,5
        bne     cr6,L(copy_GE_32_unaligned)

        srwi    9,5,3                /* Number of full doublewords remaining.  */

        beq     L(copy_GE_32_aligned_cont)

        clrlwi  0,0,29
        mtcrf   0x01,0
        subf    31,0,5

        /* Get the SRC aligned to 8 bytes.  */

1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrlwi  10,12,29             /* Check alignment of SRC again.  */
        srwi    9,31,3               /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

        clrlwi  11,31,29
        mtcrf   0x01,9

        srwi    8,31,5
        cmplwi  cr1,9,4
        cmplwi  cr6,11,0
        mr      11,12

        /* Copy 1~3 doublewords so the main loop starts
           at a multiple of 32 bytes.  */

        bf      30,1f
        lfd     6,0(12)
        lfd     7,8(12)
        addi    11,12,16
        mtctr   8
        stfd    6,0(3)
        stfd    7,8(3)
        addi    10,3,16
        bf      31,4f
        lfd     0,16(12)
        stfd    0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f

        .align  4
1:      /* Copy 1 doubleword and set the counter.  */
        mr      10,3
        mtctr   8
        bf      31,4f
        lfd     6,0(12)
        addi    11,12,8
        stfd    6,0(3)
        addi    10,3,8

L(aligned_copy):
        /* Main aligned copy loop.  Copies up to 128 bytes at a time.  */
        .align  4
4:
        /* Check for any 32-byte or 64-byte lumps that are outside of a
           nice 128-byte range.  R8 contains the number of 32-byte
           lumps, so drop this into the CR, and use the SO/EQ bits to help
           handle the 32- or 64-byte lumps.  Then handle the rest with an
           unrolled 128-bytes-at-a-time copy loop.  */
        mtocrf  1,8
        li      6,16    # 16() index
        li      7,32    # 32() index
        li      8,48    # 48() index
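        /* With FXM=1, mtocrf copies the low four bits of r8 into cr7:
           cr7[SO] receives the 1-bit of the 32-byte-lump count and
           cr7[EQ] the 2-bit.  For example, r8 = 5 (five 32-byte lumps,
           160 bytes) copies one 32-byte lump below (SO=1), skips the
           64-byte lump (EQ=0), and leaves 5>>2 = 1 pass of the 128-byte
           loop, which copies the remaining 128 bytes.  */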
L(aligned_32byte):
        /* if the SO bit (indicating a 32-byte lump) is not set, move along.  */
        bns     cr7,L(aligned_64byte)
        lxvd2x  6,0,11
        lxvd2x  7,11,6
        addi    11,11,32
        stxvd2x 6,0,10
        stxvd2x 7,10,6
        addi    10,10,32

L(aligned_64byte):
        /* if the EQ bit (indicating a 64-byte lump) is not set, move along.  */
        bne     cr7,L(aligned_128setup)
        lxvd2x  6,0,11
        lxvd2x  7,11,6
        lxvd2x  8,11,7
        lxvd2x  9,11,8
        addi    11,11,64
        stxvd2x 6,0,10
        stxvd2x 7,10,6
        stxvd2x 8,10,7
        stxvd2x 9,10,8
        addi    10,10,64

L(aligned_128setup):
        /* Set up for the 128-byte at a time copy loop.  */
        srwi    8,31,7
        cmpwi   8,0     # Any 4x lumps left?
        beq     3f      # if not, move along.
        lxvd2x  6,0,11
        lxvd2x  7,11,6
        mtctr   8       # otherwise, load the ctr and begin.
        li      8,48    # 48() index
        b       L(aligned_128loop)

L(aligned_128head):
        /* for the 2nd + iteration of this loop.  */
        lxvd2x  6,0,11
        lxvd2x  7,11,6
L(aligned_128loop):
        lxvd2x  8,11,7
        lxvd2x  9,11,8
        stxvd2x 6,0,10
        addi    11,11,64
        stxvd2x 7,10,6
        stxvd2x 8,10,7
        stxvd2x 9,10,8
        lxvd2x  6,0,11
        lxvd2x  7,11,6
        addi    10,10,64
        lxvd2x  8,11,7
        lxvd2x  9,11,8
        addi    11,11,64
        stxvd2x 6,0,10
        stxvd2x 7,10,6
        stxvd2x 8,10,7
        stxvd2x 9,10,8
        addi    10,10,64
        bdnz    L(aligned_128head)

3:
        /* Check for tail bytes.  */
        clrrwi  0,31,3
        mtcrf   0x01,31
        beq     cr6,0f

.L9:
        add     3,3,0
        add     12,12,0

        /* At this point we have a tail of 0-7 bytes and we know that the
           destination is doubleword-aligned.  */
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr
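        /* In the tail code above and in the short-copy paths below,
           mtcrf 0x01,<reg> moves the low four bits of <reg> into cr7;
           bf 28/29/30/31 then falls through to the 8-, 4-, 2- or 1-byte
           copy only when the corresponding bit of the remaining length
           is set.  */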
        /* Handle copies of 0~31 bytes.  */
        .align  4
L(copy_LT_32):
        cmplwi  cr6,5,8
        mr      12,4
        mtcrf   0x01,5
        ble     cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg     8,4
        clrrwi  11,4,2
        andi.   0,8,3
        cmplwi  cr1,5,16
        mr      10,5
        beq     L(copy_LT_32_aligned)

        /* Force 4-byte alignment for SRC.  */
        mtocrf  0x01,0
        subf    10,0,5
2:      bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,L(end_4bytes_alignment)

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1

        .align  4
L(end_4bytes_alignment):
        cmplwi  cr1,10,16
        mtcrf   0x01,10

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt     cr1,8f

        /* Copy 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     8,8(12)
        stw     7,4(3)
        lwz     6,12(12)
        addi    12,12,16
        stw     8,8(3)
        stw     6,12(3)
        addi    3,3,16
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2-3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)

        /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        /* Handles copies of 0~8 bytes.  */
        .align  4
L(copy_LE_8):
        bne     cr6,4f

        /* Though we could've used lfd/stfd here, they are still
           slow for unaligned cases.  */

        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)

        /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
4:      /* Copies 4~7 bytes.  */
        bf      29,2b

        lwz     6,0(4)
        stw     6,0(3)
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)

        /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
5:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,4(4)
        stb     6,4(3)

0:      /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
           SRC is not.  Use aligned quadword loads from SRC, shifted to realign
           the data, allowing for aligned DST stores.  */
        .align  4
L(copy_GE_32_unaligned):
        andi.   11,3,15              /* Check alignment of DST.  */
        clrlwi  0,0,28               /* Number of bytes until the 1st
                                        quadword of DST.  */
        srwi    9,5,4                /* Number of full quadwords remaining.  */

        beq     L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned, get it aligned.  */

        mtcrf   0x01,0
        subf    31,0,5

        /* Vector instructions work best when proper alignment (16 bytes)
           is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:      /* Copy 1 byte.  */
        bf      31,2f

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      /* Copy 2 bytes.  */
        bf      30,4f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      /* Copy 4 bytes.  */
        bf      29,8f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
8:      /* Copy 8 bytes.  */
        bf      28,0f

        lfd     6,0(12)
        addi    12,12,8
        stfd    6,0(3)
        addi    3,3,8
0:
        clrlwi  10,12,28             /* Check alignment of SRC.  */
        srwi    9,31,4               /* Number of full quadwords remaining.  */

        /* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

        /* Setup two indexes to speed up the indexed vector operations.  */
        clrlwi  11,31,28
        li      6,16                 /* Index for 16-byte offsets.  */
        li      7,32                 /* Index for 32-byte offsets.  */
        cmplwi  cr1,11,0
        srwi    8,31,5               /* Setup the loop counter.  */
        mr      10,3
        mr      11,12
        mtcrf   0x01,9
        cmplwi  cr6,9,1
#ifdef __LITTLE_ENDIAN__
        lvsr    5,0,12
#else
        lvsl    5,0,12
#endif
        lvx     3,0,12
        bf      31,L(setup_unaligned_loop)

        /* Copy another 16 bytes to align to 32 bytes for the loop.  */
        lvx     4,12,6
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        addi    11,12,16
        addi    10,3,16
        stvx    6,0,3
        vor     3,4,4

L(setup_unaligned_loop):
        mtctr   8
        ble     cr6,L(end_unaligned_loop)
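        /* The loop below is the usual AltiVec software-realignment scheme:
           only 16-byte-aligned loads and stores are issued, and vperm
           stitches each misaligned 16-byte window back together.  Roughly,
           one iteration of the big-endian (lvsl) path behaves like this
           illustrative C sketch (variable names are descriptive only):

               vector unsigned char ctl  = vec_lvsl (0, src);
               vector unsigned char prev = vec_ld (0, src);
               for (; n >= 32; n -= 32, src += 32, dst += 32)
                 {
                   vector unsigned char v0 = vec_ld (16, src);
                   vector unsigned char v1 = vec_ld (32, src);
                   vec_st (vec_perm (prev, v0, ctl), 0, dst);
                   vec_st (vec_perm (v0, v1, ctl), 16, dst);
                   prev = v1;
                 }

           The little-endian path uses lvsr and swaps the vperm inputs to
           get the same effect.  */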
        /* Copy 32 bytes at a time using vector instructions.  */
        .align  4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is faster than having unaligned
           vector instructions though.  */

        lvx     4,11,6               /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        lvx     3,11,7               /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
        vperm   10,3,4,5
#else
        vperm   10,4,3,5
#endif
        addi    11,11,32
        stvx    6,0,10
        stvx    10,10,6
        addi    10,10,32

        bdnz    L(unaligned_loop)

        .align  4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
        clrrwi  0,31,4
        mtcrf   0x01,31
        beq     cr1,0f

        add     3,3,0
        add     12,12,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2~3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

END (memcpy)
libc_hidden_builtin_def (memcpy)
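/* END closes the function opened by EALIGN above, and
   libc_hidden_builtin_def adds the internal hidden alias (__GI_memcpy)
   so that calls to memcpy from within libc bind locally and avoid the
   PLT.  */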