/* Optimized mempcpy implementation for POWER7.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] __mempcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst' + 'len'.  */

#ifndef MEMPCPY
# define MEMPCPY __mempcpy
#endif
	.machine power7
ENTRY_TOCLESS (MEMPCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,5,31
	neg	0,3
	std	3,-16(1)
	std	31,-8(1)
	cfi_offset(31,-8)
	ble	cr1,L(copy_LT_32)  /* If move < 32 bytes use short move
				      code.  */

	andi.	11,3,7	      /* Check alignment of DST.  */


	clrldi	10,4,61	      /* Check alignment of SRC.  */
	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
	mr	12,4
	mr	31,5
	bne	cr6,L(copy_GE_32_unaligned)

	srdi	9,5,3	      /* Number of full doublewords remaining.  */

	beq	L(copy_GE_32_aligned_cont)

	clrldi	0,0,61
	mtcrf	0x01,0
	subf	31,0,5

	/* Get the SRC aligned to 8 bytes.  */

1:	bf	31,2f
	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	bf	30,4f
	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	bf	29,0f
	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
0:
	clrldi	10,12,61      /* Check alignment of SRC again.  */
	srdi	9,31,3	      /* Number of full doublewords remaining.  */

L(copy_GE_32_aligned_cont):

	clrldi	11,31,61
	mtcrf	0x01,9

	srdi	8,31,5
	cmpldi	cr1,9,4
	cmpldi	cr6,11,0
	mr	11,12

	/* Copy 1~3 doublewords so the main loop starts
	   at a multiple of 32 bytes.  */

	bf	30,1f
	ld	6,0(12)
	ld	7,8(12)
	addi	11,12,16
	mtctr	8
	std	6,0(3)
	std	7,8(3)
	addi	10,3,16
	bf	31,4f
	ld	0,16(12)
	std	0,16(3)
	blt	cr1,3f
	addi	11,12,24
	addi	10,3,24
	b	4f

	.align	4
1:	/* Copy 1 doubleword and set the counter.  */
	mr	10,3
	mtctr	8
	bf	31,4f
	ld	6,0(12)
	addi	11,12,8
	std	6,0(3)
	addi	10,3,8

	/* Main aligned copy loop.  Copies 32 bytes at a time.  */
	.align	4
4:
	ld	6,0(11)
	ld	7,8(11)
	ld	8,16(11)
	ld	0,24(11)
	addi	11,11,32

	std	6,0(10)
	std	7,8(10)
	std	8,16(10)
	std	0,24(10)
	addi	10,10,32
	bdnz	4b
3:

	/* Check for tail bytes.  */
	rldicr	0,31,0,60
	mtcrf	0x01,31
	beq	cr6,0f

.L9:
	add	3,3,0
	add	12,12,0

	/* At this point we have a tail of 0-7 bytes and we know that the
	   destination is doubleword-aligned.  */
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

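	/* Exit-path convention used throughout: the original DST was saved
	   at -16(r1) on entry, so every return sequence reloads it and adds
	   the original LEN (still live in r5) to form the DST + LEN return
	   value.  r31 is used as the remaining-length counter only on the
	   >= 32 byte paths (the aligned one above, the vector one further
	   below), which is why those exits also reload it from -8(r1); the
	   short paths below leave r31 untouched.  */
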
	/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	cmpldi	cr6,5,8
	mr	12,4
	mtcrf	0x01,5
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	clrrdi	11,4,2
	andi.	0,8,3
	cmpldi	cr1,5,16
	mr	10,5
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	10,0,5
2:	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	bf	31,L(end_4bytes_alignment)

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,10,16
	mtcrf	0x01,10

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(12)
	lwz	7,4(12)
	stw	6,0(3)
	lwz	8,8(12)
	stw	7,4(3)
	lwz	6,12(12)
	addi	12,12,16
	stw	8,8(3)
	stw	6,12(3)
	addi	3,3,16
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	sth	6,0(3)
	bf	31,0f
	lbz	7,2(12)
	stb	7,2(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,4f

	/* Though we could have used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(4)
	lwz	7,4(4)
	stw	6,0(3)
	stw	7,4(3)
	ld	3,-16(1)      /* Return DST + LEN pointer.  */
	add	3,3,5
	blr

	.align	4
4:	/* Copies 4~7 bytes.  */
	bf	29,2b

	lwz	6,0(4)
	stw	6,0(3)
	bf	30,5f
	lhz	7,4(4)
	sth	7,4(3)
	bf	31,0f
	lbz	8,6(4)
	stb	8,6(3)
	ld	3,-16(1)
	add	3,3,5
	blr

	.align	4
5:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,4(4)
	stb	6,4(3)

0:	/* Return DST + LEN pointer.  */
	ld	3,-16(1)
	add	3,3,5
	blr

	/* Handle copies of 32+ bytes where the SRC and DST alignments do not
	   match.  DST is first brought to quadword (16-byte) alignment, then
	   aligned quadword loads from SRC are shifted to realign the data,
	   allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st
				 quadword.  */
	andi.	11,3,15	      /* Check alignment of DST (against
				 quadwords).  */
	srdi	9,5,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtcrf	0x01,0
	subf	31,0,5

	/* Vector instructions work best when proper alignment (16 bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:	/* Copy 1 byte.  */
	bf	31,2f

	lbz	6,0(12)
	addi	12,12,1
	stb	6,0(3)
	addi	3,3,1
2:	/* Copy 2 bytes.  */
	bf	30,4f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
4:	/* Copy 4 bytes.  */
	bf	29,8f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
8:	/* Copy 8 bytes.  */
	bf	28,0f

	ld	6,0(12)
	addi	12,12,8
	std	6,0(3)
	addi	3,3,8
0:
	clrldi	10,12,60      /* Check alignment of SRC.  */
	srdi	9,31,4	      /* Number of full quadwords remaining.  */

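	/* The loop below realigns data in registers rather than issuing
	   unaligned vector accesses: lvsl/lvsr build a permute control from
	   the SRC address, and each vperm merges two consecutive aligned
	   16-byte loads into the 16 bytes that start at the unaligned SRC
	   position, so every stvx to the (now quadword-aligned) DST is an
	   aligned store.  The little-endian variant uses lvsr with the
	   vperm inputs swapped to make the same byte selection.  */
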
	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Set up two indexes to speed up the indexed vector operations.  */
	clrldi	11,31,60
	li	6,16	      /* Index for 16-byte offsets.  */
	li	7,32	      /* Index for 32-byte offsets.  */
	cmpldi	cr1,11,0
	srdi	8,31,5	      /* Set up the loop counter.  */
	mr	10,3
	mr	11,12
	mtcrf	0x01,9
	cmpldi	cr6,9,1
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,12
#else
	lvsl	5,0,12
#endif
	lvx	3,0,12
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes to align to 32 bytes due to the loop.  */
	lvx	4,12,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	11,12,16
	addi	10,3,16
	stvx	6,0,3
	vor	3,4,4

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	4,11,6	      /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,11,7	      /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	11,11,32
	stvx	6,0,10
	stvx	10,10,6
	addi	10,10,32

	bdnz	L(unaligned_loop)

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	rldicr	0,31,0,59
	mtcrf	0x01,31
	beq	cr1,0f

	add	3,3,0
	add	12,12,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:	/* Copy 8 bytes.  */
	bf	28,4f

	lwz	6,0(12)
	lwz	7,4(12)
	addi	12,12,8
	stw	6,0(3)
	stw	7,4(3)
	addi	3,3,8
4:	/* Copy 4 bytes.  */
	bf	29,2f

	lwz	6,0(12)
	addi	12,12,4
	stw	6,0(3)
	addi	3,3,4
2:	/* Copy 2~3 bytes.  */
	bf	30,1f

	lhz	6,0(12)
	addi	12,12,2
	sth	6,0(3)
	addi	3,3,2
1:	/* Copy 1 byte.  */
	bf	31,0f

	lbz	6,0(12)
	stb	6,0(3)
0:	/* Return DST + LEN pointer.  */
	ld	31,-8(1)
	ld	3,-16(1)
	add	3,3,5
	blr

END_GEN_TB (MEMPCPY,TB_TOCLESS)
libc_hidden_def (__mempcpy)
weak_alias (__mempcpy, mempcpy)
libc_hidden_builtin_def (mempcpy)