/* Optimized memcpy implementation for PowerPC A2.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <rtld-global-offsets.h>

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define PREFETCH_AHEAD 4	/* # of SRC cache lines to prefetch ahead  */
#define ZERO_AHEAD 2		/* # of DST cache lines to dcbz ahead  */

	.section	".toc","aw"
__GLRO_DEF(dl_cache_line_size)


	.section	".text"
	.align 2


	.machine  a2
ENTRY (MEMCPY, 5)
	CALL_MCOUNT 3

	dcbt	0,r4		/* Prefetch ONE SRC cacheline  */
	cmpldi	cr1,r5,16	/* is size < 16 ?  */
	mr	r6,r3		/* Copy dest reg to r6  */
	blt+	cr1,L(shortcopy)


	/* Big copy (16 bytes or more)

	   Figure out how far to the nearest quadword boundary, or if we are
	   on one already.  Also get the cache line size.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	*/

	neg	r8,r3		/* LS 4 bits = # bytes to 16-byte dest bdry  */
	/* Get the cache line size.  */
	__GLRO (r9, dl_cache_line_size,
		RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET)
	clrldi	r8,r8,64-4	/* # bytes to next 16-byte boundary  */
	sub	r7,r4,r3	/* compute offset to src from dest  */
	cmpldi	cr0,r8,0	/* Were we aligned on a 16 byte bdy?  */
	addi	r10,r9,-1	/* Cache line mask  */
	beq+	L(dst_aligned)



	/* Destination is not aligned on quadword boundary.  Get us to one.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	   r7 - offset to src from dest
	   r8 - number of bytes to quadword boundary
	*/

	mtcrf	0x01,r8		/* put #bytes to boundary into cr7  */
	subf	r5,r8,r5	/* adjust remaining len  */

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
	addi	r6,r6,1
1:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 bytes  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 bytes  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 bytes  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6	/* update src addr  */
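
	/* The four conditional copies above test the low bits of r8 (the
	   distance in bytes to the next 16-byte boundary), which mtcrf has
	   placed in cr7.  Roughly, in C terms (illustrative sketch only,
	   not part of the build; n stands for the byte count in r8):

	     if (n & 1) { *(uint8_t  *) dst = *(uint8_t  *) src; dst += 1; src += 1; }
	     if (n & 2) { *(uint16_t *) dst = *(uint16_t *) src; dst += 2; src += 2; }
	     if (n & 4) { *(uint32_t *) dst = *(uint32_t *) src; dst += 4; src += 4; }
	     if (n & 8) { *(uint64_t *) dst = *(uint64_t *) src; dst += 8; src += 8; }

	   The indexed loads (lbzx/lhzx/lwzx/ldx) use r7 = src - dest, so only
	   the destination pointer in r6 has to be advanced; r4 is recomputed
	   from r7 + r6 once the destination is aligned.  */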


	/* Dest is quadword aligned now.

	   Lots of decisions to make.  If we are copying less than a cache
	   line we won't be here long.  If we are not on a cache line
	   boundary we need to get there.  And then we need to figure out
	   how many cache lines ahead to pre-touch.

	   r3 - return value (always)
	   r4 - current source addr
	   r5 - copy length
	   r6 - current dest addr
	*/


	.align	4
L(dst_aligned):
	cmpdi	cr0,r9,0	/* Cache line size set?  */
	bne+	cr0,L(cachelineset)

/* Cache line size not set: generic byte copy without much optimization  */
	clrldi.	r0,r5,63	/* If length is odd copy one byte  */
	beq	L(cachelinenotset_align)
	lbz	r7,0(r4)	/* Read one byte from source  */
	addi	r5,r5,-1	/* Update length  */
	addi	r4,r4,1		/* Update source pointer address  */
	stb	r7,0(r6)	/* Store one byte at dest  */
	addi	r6,r6,1		/* Update dest pointer address  */
L(cachelinenotset_align):
	cmpdi	cr7,r5,0	/* If length is 0 return  */
	beqlr	cr7
	ori	r2,r2,0		/* Force a new dispatch group  */
L(cachelinenotset_loop):
	addic.	r5,r5,-2	/* Update length  */
	lbz	r7,0(r4)	/* Load 2 bytes from source  */
	lbz	r8,1(r4)
	addi	r4,r4,2		/* Update source pointer address  */
	stb	r7,0(r6)	/* Store 2 bytes on dest  */
	stb	r8,1(r6)
	addi	r6,r6,2		/* Update dest pointer address  */
	bne	L(cachelinenotset_loop)
	blr


L(cachelineset):
	cmpd	cr5,r5,r10	/* Less than a cacheline to go?  */

	neg	r7,r6		/* How far to next cacheline bdy?  */

	addi	r6,r6,-8	/* prepare for stdu  */
	cmpdi	cr0,r9,128
	addi	r4,r4,-8	/* prepare for ldu  */


	ble+	cr5,L(lessthancacheline)

	beq-	cr0,L(big_lines) /* 128 byte line code  */



	/* More than a cacheline left to go, and using 64 byte cachelines  */

	clrldi	r7,r7,64-6	/* How far to next cacheline bdy?  */

	cmpldi	cr6,r7,0	/* Are we on a cacheline bdy already?  */

	/* Reduce total len by what it takes to get to the next cache line  */
	subf	r5,r7,r5
	srdi	r7,r7,4		/* How many qws to get to the line bdy?  */

	/* How many full cache lines to copy after getting to a line bdy?  */
	srdi	r10,r5,6

	cmpldi	r10,0		/* If no full cache lines to copy ...  */
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	L(nocacheprefetch)


	/* We are here because we have at least one full cache line to copy,
	   and therefore some pre-touching to do.  */

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,64+8	/* prefetch distance  */
	ble	L(lessthanmaxprefetch)

	/* We can only do so much pre-fetching.  R11 will have the count of
	   lines left to prefetch after the initial batch of prefetches
	   are executed.  */

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

L(lessthanmaxprefetch):
	mtctr	r10

	/* At this point r10/ctr hold the number of lines to prefetch in this
	   initial batch, and r11 holds any remainder.  */

L(prefetchSRC):
	dcbt	r12,r4
	addi	r12,r12,64
	bdnz	L(prefetchSRC)
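
	/* Summary of the setup above, roughly in C (illustrative sketch
	   only, not part of the build; the variable names are invented):

	     head   = (-(uintptr_t) dst) & 63; // bytes to next 64-byte line (r7)
	     len   -= head;                    // r5: bytes left once we reach it
	     qwords = head >> 4;               // 16-byte chunks to the line bdy (r7)
	     lines  = len >> 6;                // full cache lines to copy (r10)
	     rest   = 0;                       // r11
	     if (lines > PREFETCH_AHEAD)
	       {
		 rest  = lines - PREFETCH_AHEAD; // lines left after the first batch
		 lines = PREFETCH_AHEAD;
	       }
	     for (i = 0; i < lines; i++)          // the dcbt loop just above
	       prefetch (src + 64 + 8 + i * 64);  // r12 starts at 64+8

	   The extra +8 in the prefetch offset compensates for r4 having been
	   pre-decremented by 8 to set up the ldu addressing used below.  */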


	/* Prefetching is done, or was not needed.

	   cr6 - are we on a cacheline boundary already?
	   r7  - number of quadwords to the next cacheline boundary
	*/

L(nocacheprefetch):
	mtctr	r7

	cmpldi	cr1,r5,64	/* Less than a cache line to copy?  */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get?  */
	clrldi	r5,r5,64-6

	beq	cr6,L(cachelinealigned)


	/* Copy quadwords up to the next cacheline boundary  */

L(aligntocacheline):
	ld	r9,0x08(r4)
	ld	r7,0x10(r4)
	addi	r4,r4,0x10
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	L(aligntocacheline)


	.align	4
L(cachelinealigned):		/* copy whole cache lines  */

	blt-	cr1,L(lessthancacheline)	/* size <64  */
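
	/* Steady state for the 64-byte-line case.  L(loop) below copies one
	   cache line per iteration for r11 lines, keeping the source
	   prefetched a few lines ahead with dcbt (governed by PREFETCH_AHEAD)
	   and establishing the destination line ZERO_AHEAD lines ahead with
	   dcbz, so destination lines that will be fully overwritten are never
	   read from memory.  L(loop2) then copies the final r10 lines (whose
	   source lines were already prefetched) without further dcbt/dcbz.
	   Roughly, in C terms (illustrative sketch only, not part of the
	   build; rest/lines match r11/r10 in the sketch further up):

	     for (i = 0; i < rest; i++)        // r11 iterations
	       {
		 prefetch (src + ahead);       // dcbt r12,r4
		 zero_dest_line (dst + 2*64);  // dcbz r11,r6
		 copy_64 (dst, src);           // eight ld/std pairs
		 src += 64; dst += 64;
	       }
	     for (i = 0; i < lines; i++)       // r10 iterations, no dcbt/dcbz
	       {
		 copy_64 (dst, src);
		 src += 64; dst += 64;
	       }

	   The 128-byte-line code at L(big_lines) follows the same pattern
	   with sixteen ld/std pairs per line.  */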

L(outerloop):
	cmpdi	r11,0
	mtctr	r11
	beq-	L(endloop)

	li	r11,64*ZERO_AHEAD+8	/* DCBZ dist  */

	.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop):			/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	ld	r9, 0x08(r4)
	dcbz	r11,r6
	ld	r7, 0x10(r4)
	ld	r8, 0x18(r4)
	ld	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	std	r0, 0x20(r6)
	ld	r9, 0x28(r4)
	ld	r7, 0x30(r4)
	ld	r8, 0x38(r4)
	ld	r0, 0x40(r4)
	addi	r4, r4,0x40
	std	r9, 0x28(r6)
	std	r7, 0x30(r6)
	std	r8, 0x38(r6)
	stdu	r0, 0x40(r6)

	bdnz	L(loop)


L(endloop):
	cmpdi	r10,0
	beq-	L(endloop2)
	mtctr	r10

L(loop2):			/* Copy aligned body  */
	ld	r9, 0x08(r4)
	ld	r7, 0x10(r4)
	ld	r8, 0x18(r4)
	ld	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	std	r0, 0x20(r6)
	ld	r9, 0x28(r4)
	ld	r7, 0x30(r4)
	ld	r8, 0x38(r4)
	ld	r0, 0x40(r4)
	addi	r4, r4,0x40
	std	r9, 0x28(r6)
	std	r7, 0x30(r6)
	std	r8, 0x38(r6)
	stdu	r0, 0x40(r6)

	bdnz	L(loop2)
L(endloop2):


	.align	4
L(lessthancacheline):		/* Was there less than a cache line to do?  */
	cmpldi	cr0,r5,16
	srdi	r7,r5,4		/* divide size by 16  */
	blt-	L(do_lt16)
	mtctr	r7

L(copy_remaining):
	ld	r8,0x08(r4)
	ld	r7,0x10(r4)
	addi	r4,r4,0x10
	std	r8,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	L(copy_remaining)

L(do_lt16):			/* less than 16 ?  */
	cmpldi	cr0,r5,0	/* copy remaining bytes (0-15)  */
	beqlr+			/* no rest to copy  */
	addi	r4,r4,8
	addi	r6,r6,8

L(shortcopy):			/* SIMPLE COPY to handle size <= 15 bytes  */
	mtcrf	0x01,r5
	sub	r7,r4,r6
	bf-	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte  */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte  */
	stw	r0,0(r6)
	addi	r6,r6,4
4:
	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte  */
	sth	r0,0(r6)
	addi	r6,r6,2
2:
	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte  */
	stb	r0,0(r6)
1:
	blr




	/* Similar to above, but for use with 128 byte lines.  */


L(big_lines):

	clrldi	r7,r7,64-7	/* How far to next cacheline bdy?  */

	cmpldi	cr6,r7,0	/* Are we on a cacheline bdy already?  */

	/* Reduce total len by what it takes to get to the next cache line  */
	subf	r5,r7,r5
	srdi	r7,r7,4		/* How many qws to get to the line bdy?  */

	/* How many full cache lines to copy after getting to a line bdy?  */
	srdi	r10,r5,7

	cmpldi	r10,0		/* If no full cache lines to copy ...  */
	li	r11,0		/* number cachelines to copy with prefetch  */
	beq	L(nocacheprefetch_128)


	/* We are here because we have at least one full cache line to copy,
	   and therefore some pre-touching to do.  */

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance  */
	ble	L(lessthanmaxprefetch_128)

	/* We can only do so much pre-fetching.  R11 will have the count of
	   lines left to prefetch after the initial batch of prefetches
	   are executed.  */

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

L(lessthanmaxprefetch_128):
	mtctr	r10

	/* At this point r10/ctr hold the number of lines to prefetch in this
	   initial batch, and r11 holds any remainder.  */

L(prefetchSRC_128):
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	L(prefetchSRC_128)


	/* Prefetching is done, or was not needed.

	   cr6 - are we on a cacheline boundary already?
	   r7  - number of quadwords to the next cacheline boundary
	*/

L(nocacheprefetch_128):
	mtctr	r7

	cmpldi	cr1,r5,128	/* Less than a cache line to copy?  */

	/* How many bytes are left after we copy whatever full
	   cache lines we can get?  */
	clrldi	r5,r5,64-7

	beq	cr6,L(cachelinealigned_128)


	/* Copy quadwords up to the next cacheline boundary  */

L(aligntocacheline_128):
	ld	r9,0x08(r4)
	ld	r7,0x10(r4)
	addi	r4,r4,0x10
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	L(aligntocacheline_128)


L(cachelinealigned_128):	/* copy whole cache lines  */

	blt-	cr1,L(lessthancacheline)	/* size <128  */

L(outerloop_128):
	cmpdi	r11,0
	mtctr	r11
	beq-	L(endloop_128)

	li	r11,128*ZERO_AHEAD+8	/* DCBZ dist  */

	.align	4
	/* Copy whole cachelines, optimized by prefetching SRC cacheline  */
L(loop_128):			/* Copy aligned body  */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead  */
	ld	r9, 0x08(r4)
	dcbz	r11,r6
	ld	r7, 0x10(r4)
	ld	r8, 0x18(r4)
	ld	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	std	r0, 0x20(r6)
	ld	r9, 0x28(r4)
	ld	r7, 0x30(r4)
	ld	r8, 0x38(r4)
	ld	r0, 0x40(r4)
	std	r9, 0x28(r6)
	std	r7, 0x30(r6)
	std	r8, 0x38(r6)
	std	r0, 0x40(r6)
	ld	r9, 0x48(r4)
	ld	r7, 0x50(r4)
	ld	r8, 0x58(r4)
	ld	r0, 0x60(r4)
	std	r9, 0x48(r6)
	std	r7, 0x50(r6)
	std	r8, 0x58(r6)
	std	r0, 0x60(r6)
	ld	r9, 0x68(r4)
	ld	r7, 0x70(r4)
	ld	r8, 0x78(r4)
	ld	r0, 0x80(r4)
	addi	r4, r4,0x80
	std	r9, 0x68(r6)
	std	r7, 0x70(r6)
	std	r8, 0x78(r6)
	stdu	r0, 0x80(r6)

	bdnz	L(loop_128)


L(endloop_128):
	cmpdi	r10,0
	beq-	L(endloop2_128)
	mtctr	r10

L(loop2_128):			/* Copy aligned body  */
	ld	r9, 0x08(r4)
	ld	r7, 0x10(r4)
	ld	r8, 0x18(r4)
	ld	r0, 0x20(r4)
	std	r9, 0x08(r6)
	std	r7, 0x10(r6)
	std	r8, 0x18(r6)
	std	r0, 0x20(r6)
	ld	r9, 0x28(r4)
	ld	r7, 0x30(r4)
	ld	r8, 0x38(r4)
	ld	r0, 0x40(r4)
	std	r9, 0x28(r6)
	std	r7, 0x30(r6)
	std	r8, 0x38(r6)
	std	r0, 0x40(r6)
	ld	r9, 0x48(r4)
	ld	r7, 0x50(r4)
	ld	r8, 0x58(r4)
	ld	r0, 0x60(r4)
	std	r9, 0x48(r6)
	std	r7, 0x50(r6)
	std	r8, 0x58(r6)
	std	r0, 0x60(r6)
	ld	r9, 0x68(r4)
	ld	r7, 0x70(r4)
	ld	r8, 0x78(r4)
	ld	r0, 0x80(r4)
	addi	r4, r4,0x80
	std	r9, 0x68(r6)
	std	r7, 0x70(r6)
	std	r8, 0x78(r6)
	stdu	r0, 0x80(r6)

	bdnz	L(loop2_128)
L(endloop2_128):

	b	L(lessthancacheline)


END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)