1/* Optimized memcpy implementation for PowerPC A2. 2 Copyright (C) 2010-2022 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, see 17 <https://www.gnu.org/licenses/>. */ 18 19#include <sysdep.h> 20#include <rtld-global-offsets.h> 21 22#define PREFETCH_AHEAD 4 /* no cache lines SRC prefetching ahead */ 23#define ZERO_AHEAD 2 /* no cache lines DST zeroing ahead */ 24 25 .machine a2 26EALIGN (memcpy, 5, 0) 27 CALL_MCOUNT 28 29 dcbt 0,r4 /* Prefetch ONE SRC cacheline */ 30 cmplwi cr1,r5,16 /* is size < 16 ? */ 31 mr r6,r3 /* Copy dest reg to r6; */ 32 blt+ cr1,L(shortcopy) 33 34 35 /* Big copy (16 bytes or more) 36 37 Figure out how far to the nearest quadword boundary, or if we are 38 on one already. 39 40 r3 - return value (always) 41 r4 - current source addr 42 r5 - copy length 43 r6 - current dest addr 44 */ 45 46 neg r8,r3 /* LS 4 bits = # bytes to 8-byte dest bdry */ 47 clrlwi r8,r8,32-4 /* align to 16byte boundary */ 48 sub r7,r4,r3 /* compute offset to src from dest */ 49 cmplwi cr0,r8,0 /* Were we aligned on a 16 byte bdy? */ 50 beq+ L(dst_aligned) 51 52 53 54 /* Destination is not aligned on quadword boundary. Get us to one. 55 56 r3 - return value (always) 57 r4 - current source addr 58 r5 - copy length 59 r6 - current dest addr 60 r7 - offset to src from dest 61 r8 - number of bytes to quadword boundary 62 */ 63 64 mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */ 65 subf r5,r8,r5 /* adjust remaining len */ 66 67 bf cr7*4+3,1f 68 lbzx r0,r7,r6 /* copy 1 byte addr */ 69 stb r0,0(r6) 70 addi r6,r6,1 711: 72 bf cr7*4+2,2f 73 lhzx r0,r7,r6 /* copy 2 byte addr */ 74 sth r0,0(r6) 75 addi r6,r6,2 762: 77 bf cr7*4+1,4f 78 lwzx r0,r7,r6 /* copy 4 byte addr */ 79 stw r0,0(r6) 80 addi r6,r6,4 814: 82 bf cr7*4+0,8f 83 lfdx r0,r7,r6 /* copy 8 byte addr */ 84 stfd r0,0(r6) 85 addi r6,r6,8 868: 87 add r4,r7,r6 /* update src addr */ 88 89 90 91 /* Dest is quadword aligned now. 92 93 Lots of decisions to make. If we are copying less than a cache 94 line we won't be here long. If we are not on a cache line 95 boundary we need to get there. And then we need to figure out 96 how many cache lines ahead to pre-touch. 97 98 r3 - return value (always) 99 r4 - current source addr 100 r5 - copy length 101 r6 - current dest addr 102 */ 103 104 105 .align 4 106L(dst_aligned): 107 108 109#ifdef PIC 110 mflr r0 111/* Establishes GOT addressability so we can load the cache line size 112 from rtld_global_ro. This value was set from the aux vector during 113 startup. */ 114 SETUP_GOT_ACCESS(r9,got_label) 115 addis r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@ha 116 addi r9,r9,_GLOBAL_OFFSET_TABLE_-got_label@l 117 mtlr r0 118#endif 119 __GLRO(r9, r9, _dl_cache_line_size, 120 RTLD_GLOBAL_RO_DL_CACHE_LINE_SIZE_OFFSET) 121 122 cmplwi cr5, r9, 0 123 bne+ cr5,L(cachelineset) 124 125/* Cache line size not set: generic byte copy without much optimization */ 126 andi. r0,r5,1 /* If length is odd copy one byte. */ 127 beq L(cachelinenotset_align) 128 lbz r7,0(r4) /* Read one byte from source. */ 129 addi r5,r5,-1 /* Update length. */ 130 addi r4,r4,1 /* Update source pointer address. */ 131 stb r7,0(r6) /* Store one byte on dest. */ 132 addi r6,r6,1 /* Update dest pointer address. */ 133L(cachelinenotset_align): 134 cmpwi cr7,r5,0 /* If length is 0 return. */ 135 beqlr cr7 136 ori r2,r2,0 /* Force a new dispatch group. */ 137L(cachelinenotset_loop): 138 addic. r5,r5,-2 /* Update length. */ 139 lbz r7,0(r4) /* Load 2 bytes from source. */ 140 lbz r8,1(r4) 141 addi r4,r4,2 /* Update source pointer address. */ 142 stb r7,0(r6) /* Store 2 bytes on dest. */ 143 stb r8,1(r6) 144 addi r6,r6,2 /* Update dest pointer address. */ 145 bne L(cachelinenotset_loop) 146 blr 147 148 149L(cachelineset): 150 151 addi r10,r9,-1 152 153 cmpw cr5,r5,r10 /* Less than a cacheline to go? */ 154 155 neg r7,r6 /* How far to next cacheline bdy? */ 156 157 addi r6,r6,-8 /* prepare for stdu */ 158 cmpwi cr0,r9,128 159 addi r4,r4,-8 /* prepare for ldu */ 160 161 162 ble+ cr5,L(lessthancacheline) 163 164 beq- cr0,L(big_lines) /* 128 byte line code */ 165 166 167 168 169 /* More than a cacheline left to go, and using 64 byte cachelines */ 170 171 clrlwi r7,r7,32-6 /* How far to next cacheline bdy? */ 172 173 cmplwi cr6,r7,0 /* Are we on a cacheline bdy already? */ 174 175 /* Reduce total len by what it takes to get to the next cache line */ 176 subf r5,r7,r5 177 srwi r7,r7,4 /* How many qws to get to the line bdy? */ 178 179 /* How many full cache lines to copy after getting to a line bdy? */ 180 srwi r10,r5,6 181 182 cmplwi r10,0 /* If no full cache lines to copy ... */ 183 li r11,0 /* number cachelines to copy with prefetch */ 184 beq L(nocacheprefetch) 185 186 187 /* We are here because we have at least one full cache line to copy, 188 and therefore some pre-touching to do. */ 189 190 cmplwi r10,PREFETCH_AHEAD 191 li r12,64+8 /* prefetch distance */ 192 ble L(lessthanmaxprefetch) 193 194 /* We can only do so much pre-fetching. R11 will have the count of 195 lines left to prefetch after the initial batch of prefetches 196 are executed. */ 197 198 subi r11,r10,PREFETCH_AHEAD 199 li r10,PREFETCH_AHEAD 200 201L(lessthanmaxprefetch): 202 mtctr r10 203 204 /* At this point r10/ctr hold the number of lines to prefetch in this 205 initial batch, and r11 holds any remainder. */ 206 207L(prefetchSRC): 208 dcbt r12,r4 209 addi r12,r12,64 210 bdnz L(prefetchSRC) 211 212 213 /* Prefetching is done, or was not needed. 214 215 cr6 - are we on a cacheline boundary already? 216 r7 - number of quadwords to the next cacheline boundary 217 */ 218 219L(nocacheprefetch): 220 mtctr r7 221 222 cmplwi cr1,r5,64 /* Less than a cache line to copy? */ 223 224 /* How many bytes are left after we copy whatever full 225 cache lines we can get? */ 226 clrlwi r5,r5,32-6 227 228 beq cr6,L(cachelinealigned) 229 230 231 /* Copy quadwords up to the next cacheline boundary */ 232 233L(aligntocacheline): 234 lfd fp9,0x08(r4) 235 lfdu fp10,0x10(r4) 236 stfd fp9,0x08(r6) 237 stfdu fp10,0x10(r6) 238 bdnz L(aligntocacheline) 239 240 241 .align 4 242L(cachelinealigned): /* copy while cache lines */ 243 244 blt- cr1,L(lessthancacheline) /* size <64 */ 245 246L(outerloop): 247 cmpwi r11,0 248 mtctr r11 249 beq- L(endloop) 250 251 li r11,64*ZERO_AHEAD +8 /* DCBZ dist */ 252 253 .align 4 254 /* Copy whole cachelines, optimized by prefetching SRC cacheline */ 255L(loop): /* Copy aligned body */ 256 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */ 257 lfd fp9, 0x08(r4) 258 dcbz r11,r6 259 lfd fp10, 0x10(r4) 260 lfd fp11, 0x18(r4) 261 lfd fp12, 0x20(r4) 262 stfd fp9, 0x08(r6) 263 stfd fp10, 0x10(r6) 264 stfd fp11, 0x18(r6) 265 stfd fp12, 0x20(r6) 266 lfd fp9, 0x28(r4) 267 lfd fp10, 0x30(r4) 268 lfd fp11, 0x38(r4) 269 lfdu fp12, 0x40(r4) 270 stfd fp9, 0x28(r6) 271 stfd fp10, 0x30(r6) 272 stfd fp11, 0x38(r6) 273 stfdu fp12, 0x40(r6) 274 275 bdnz L(loop) 276 277 278L(endloop): 279 cmpwi r10,0 280 beq- L(endloop2) 281 mtctr r10 282 283L(loop2): /* Copy aligned body */ 284 lfd fp9, 0x08(r4) 285 lfd fp10, 0x10(r4) 286 lfd fp11, 0x18(r4) 287 lfd fp12, 0x20(r4) 288 stfd fp9, 0x08(r6) 289 stfd fp10, 0x10(r6) 290 stfd fp11, 0x18(r6) 291 stfd fp12, 0x20(r6) 292 lfd fp9, 0x28(r4) 293 lfd fp10, 0x30(r4) 294 lfd fp11, 0x38(r4) 295 lfdu fp12, 0x40(r4) 296 stfd fp9, 0x28(r6) 297 stfd fp10, 0x30(r6) 298 stfd fp11, 0x38(r6) 299 stfdu fp12, 0x40(r6) 300 301 bdnz L(loop2) 302L(endloop2): 303 304 305 .align 4 306L(lessthancacheline): /* Was there less than cache to do ? */ 307 cmplwi cr0,r5,16 308 srwi r7,r5,4 /* divide size by 16 */ 309 blt- L(do_lt16) 310 mtctr r7 311 312L(copy_remaining): 313 lfd fp9, 0x08(r4) 314 lfdu fp10, 0x10(r4) 315 stfd fp9, 0x08(r6) 316 stfdu fp10, 0x10(r6) 317 bdnz L(copy_remaining) 318 319L(do_lt16): /* less than 16 ? */ 320 cmplwi cr0,r5,0 /* copy remaining bytes (0-15) */ 321 beqlr+ /* no rest to copy */ 322 addi r4,r4,8 323 addi r6,r6,8 324 325L(shortcopy): /* SIMPLE COPY to handle size =< 15 bytes */ 326 mtcrf 0x01,r5 327 sub r7,r4,r6 328 bf- cr7*4+0,8f 329 lfdx fp9,r7,r6 /* copy 8 byte */ 330 stfd fp9,0(r6) 331 addi r6,r6,8 3328: 333 bf cr7*4+1,4f 334 lwzx r0,r7,r6 /* copy 4 byte */ 335 stw r0,0(r6) 336 addi r6,r6,4 3374: 338 bf cr7*4+2,2f 339 lhzx r0,r7,r6 /* copy 2 byte */ 340 sth r0,0(r6) 341 addi r6,r6,2 3422: 343 bf cr7*4+3,1f 344 lbzx r0,r7,r6 /* copy 1 byte */ 345 stb r0,0(r6) 3461: 347 blr 348 349 350 351 352 353 /* Similar to above, but for use with 128 byte lines. */ 354 355 356L(big_lines): 357 358 clrlwi r7,r7,32-7 /* How far to next cacheline bdy? */ 359 360 cmplwi cr6,r7,0 /* Are we on a cacheline bdy already? */ 361 362 /* Reduce total len by what it takes to get to the next cache line */ 363 subf r5,r7,r5 364 srwi r7,r7,4 /* How many qw to get to the line bdy? */ 365 366 /* How many full cache lines to copy after getting to a line bdy? */ 367 srwi r10,r5,7 368 369 cmplwi r10,0 /* If no full cache lines to copy ... */ 370 li r11,0 /* number cachelines to copy with prefetch */ 371 beq L(nocacheprefetch_128) 372 373 374 /* We are here because we have at least one full cache line to copy, 375 and therefore some pre-touching to do. */ 376 377 cmplwi r10,PREFETCH_AHEAD 378 li r12,128+8 /* prefetch distance */ 379 ble L(lessthanmaxprefetch_128) 380 381 /* We can only do so much pre-fetching. R11 will have the count of 382 lines left to prefetch after the initial batch of prefetches 383 are executed. */ 384 385 subi r11,r10,PREFETCH_AHEAD 386 li r10,PREFETCH_AHEAD 387 388L(lessthanmaxprefetch_128): 389 mtctr r10 390 391 /* At this point r10/ctr hold the number of lines to prefetch in this 392 initial batch, and r11 holds any remainder. */ 393 394L(prefetchSRC_128): 395 dcbt r12,r4 396 addi r12,r12,128 397 bdnz L(prefetchSRC_128) 398 399 400 /* Prefetching is done, or was not needed. 401 402 cr6 - are we on a cacheline boundary already? 403 r7 - number of quadwords to the next cacheline boundary 404 */ 405 406L(nocacheprefetch_128): 407 mtctr r7 408 409 cmplwi cr1,r5,128 /* Less than a cache line to copy? */ 410 411 /* How many bytes are left after we copy whatever full 412 cache lines we can get? */ 413 clrlwi r5,r5,32-7 414 415 beq cr6,L(cachelinealigned_128) 416 417 418 /* Copy quadwords up to the next cacheline boundary */ 419 420L(aligntocacheline_128): 421 lfd fp9,0x08(r4) 422 lfdu fp10,0x10(r4) 423 stfd fp9,0x08(r6) 424 stfdu fp10,0x10(r6) 425 bdnz L(aligntocacheline_128) 426 427 428L(cachelinealigned_128): /* copy while cache lines */ 429 430 blt- cr1,L(lessthancacheline) /* size <128 */ 431 432L(outerloop_128): 433 cmpwi r11,0 434 mtctr r11 435 beq- L(endloop_128) 436 437 li r11,128*ZERO_AHEAD +8 /* DCBZ dist */ 438 439 .align 4 440 /* Copy whole cachelines, optimized by prefetching SRC cacheline */ 441L(loop_128): /* Copy aligned body */ 442 dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */ 443 lfd fp9, 0x08(r4) 444 dcbz r11,r6 445 lfd fp10, 0x10(r4) 446 lfd fp11, 0x18(r4) 447 lfd fp12, 0x20(r4) 448 stfd fp9, 0x08(r6) 449 stfd fp10, 0x10(r6) 450 stfd fp11, 0x18(r6) 451 stfd fp12, 0x20(r6) 452 lfd fp9, 0x28(r4) 453 lfd fp10, 0x30(r4) 454 lfd fp11, 0x38(r4) 455 lfd fp12, 0x40(r4) 456 stfd fp9, 0x28(r6) 457 stfd fp10, 0x30(r6) 458 stfd fp11, 0x38(r6) 459 stfd fp12, 0x40(r6) 460 lfd fp9, 0x48(r4) 461 lfd fp10, 0x50(r4) 462 lfd fp11, 0x58(r4) 463 lfd fp12, 0x60(r4) 464 stfd fp9, 0x48(r6) 465 stfd fp10, 0x50(r6) 466 stfd fp11, 0x58(r6) 467 stfd fp12, 0x60(r6) 468 lfd fp9, 0x68(r4) 469 lfd fp10, 0x70(r4) 470 lfd fp11, 0x78(r4) 471 lfdu fp12, 0x80(r4) 472 stfd fp9, 0x68(r6) 473 stfd fp10, 0x70(r6) 474 stfd fp11, 0x78(r6) 475 stfdu fp12, 0x80(r6) 476 477 bdnz L(loop_128) 478 479 480L(endloop_128): 481 cmpwi r10,0 482 beq- L(endloop2_128) 483 mtctr r10 484 485L(loop2_128): /* Copy aligned body */ 486 lfd fp9, 0x08(r4) 487 lfd fp10, 0x10(r4) 488 lfd fp11, 0x18(r4) 489 lfd fp12, 0x20(r4) 490 stfd fp9, 0x08(r6) 491 stfd fp10, 0x10(r6) 492 stfd fp11, 0x18(r6) 493 stfd fp12, 0x20(r6) 494 lfd fp9, 0x28(r4) 495 lfd fp10, 0x30(r4) 496 lfd fp11, 0x38(r4) 497 lfd fp12, 0x40(r4) 498 stfd fp9, 0x28(r6) 499 stfd fp10, 0x30(r6) 500 stfd fp11, 0x38(r6) 501 stfd fp12, 0x40(r6) 502 lfd fp9, 0x48(r4) 503 lfd fp10, 0x50(r4) 504 lfd fp11, 0x58(r4) 505 lfd fp12, 0x60(r4) 506 stfd fp9, 0x48(r6) 507 stfd fp10, 0x50(r6) 508 stfd fp11, 0x58(r6) 509 stfd fp12, 0x60(r6) 510 lfd fp9, 0x68(r4) 511 lfd fp10, 0x70(r4) 512 lfd fp11, 0x78(r4) 513 lfdu fp12, 0x80(r4) 514 stfd fp9, 0x68(r6) 515 stfd fp10, 0x70(r6) 516 stfd fp11, 0x78(r6) 517 stfdu fp12, 0x80(r6) 518 bdnz L(loop2_128) 519L(endloop2_128): 520 521 b L(lessthancacheline) 522 523 524END (memcpy) 525libc_hidden_builtin_def (memcpy) 526