/* Optimized memcpy implementation for PowerPC64/POWER7.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.  */

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#define dst 11		/* Use r11 so r3 kept unchanged.  */
#define src 4
#define cnt 5

	.machine power7
ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

	cmpldi	cr1,cnt,31
	neg	0,3
	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
				       code.  */

/* Align copies using VSX instructions to quadword.  It is to avoid alignment
   traps when memcpy is used on non-cacheable memory (for instance, memory
   mapped I/O).  */
	andi.	10,3,15
	clrldi	11,4,60
	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

	mr	dst,3
	bne	cr6,L(copy_GE_32_unaligned)
	beq	L(aligned_copy)

	mtocrf	0x01,0
	clrldi	0,0,60

/* Get the DST and SRC aligned to 16 bytes.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,16f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
16:
	subf	cnt,0,cnt

/* Main aligned copy loop.  Copies 128 bytes at a time.  */
L(aligned_copy):
	li	6,16
	li	7,32
	li	8,48
	mtocrf	0x02,cnt
	srdi	12,cnt,7
	cmpdi	12,0
	beq	L(aligned_tail)
	lvx	6,0,src
	lvx	7,src,6
	mtctr	12
	b	L(aligned_128loop)

	.align	4
L(aligned_128head):
	/* for the 2nd + iteration of this loop.  */
	lvx	6,0,src
	lvx	7,src,6
L(aligned_128loop):
	lvx	8,src,7
	lvx	9,src,8
	stvx	6,0,dst
	addi	src,src,64
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	lvx	6,0,src
	lvx	7,src,6
	addi	dst,dst,64
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
	bdnz	L(aligned_128head)

L(aligned_tail):
	mtocrf	0x01,cnt
	bf	25,32f
	lvx	6,0,src
	lvx	7,src,6
	lvx	8,src,7
	lvx	9,src,8
	addi	src,src,64
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	dst,dst,64
32:
	bf	26,16f
	lvx	6,0,src
	lvx	7,src,6
	addi	src,src,32
	stvx	6,0,dst
	stvx	7,dst,6
	addi	dst,dst,32
16:
	bf	27,8f
	lvx	6,0,src
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
8:
	bf	28,4f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
4:	/* Copies 4~7 bytes.  */
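	/* mtocrf 0x01,cnt at L(aligned_tail) placed the low bits of cnt
	   into CR field 7, so the bf 29/30/31 tests below select the
	   remaining 4-, 2- and 1-byte moves.  */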
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr


/* Handle copies of 0~31 bytes.  */
	.align	4
L(copy_LT_32):
	mr	dst,3
	cmpldi	cr6,cnt,8
	mtocrf	0x01,cnt
	ble	cr6,L(copy_LE_8)

	/* At least 9 bytes to go.  */
	neg	8,4
	andi.	0,8,3
	cmpldi	cr1,cnt,16
	beq	L(copy_LT_32_aligned)

	/* Force 4-byte alignment for SRC.  */
	mtocrf	0x01,0
	subf	cnt,0,cnt
2:
	bf	30,1f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
1:
	bf	31,L(end_4bytes_alignment)
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1

	.align	4
L(end_4bytes_alignment):
	cmpldi	cr1,cnt,16
	mtocrf	0x01,cnt

L(copy_LT_32_aligned):
	/* At least 6 bytes to go, and SRC is word-aligned.  */
	blt	cr1,8f

	/* Copy 16 bytes.  */
	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	lwz	8,8(src)
	stw	7,4(dst)
	lwz	6,12(src)
	addi	src,src,16
	stw	8,8(dst)
	stw	6,12(dst)
	addi	dst,dst,16
8:	/* Copy 8 bytes.  */
	bf	28,L(tail4)
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8

	.align	4
/* Copies 4~7 bytes.  */
L(tail4):
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

	.align	4
/* Copies 2~3 bytes.  */
L(tail2):
	bf	30,1f
	lhz	6,0(src)
	sth	6,0(dst)
	bflr	31
	lbz	7,2(src)
	stb	7,2(dst)
	blr

	.align	4
L(tail5):
	bflr	31
	lbz	6,4(src)
	stb	6,4(dst)
	blr

	.align	4
1:
	bflr	31
	lbz	6,0(src)
	stb	6,0(dst)
	/* Return original DST pointer.  */
	blr


/* Handles copies of 0~8 bytes.  */
	.align	4
L(copy_LE_8):
	bne	cr6,L(tail4)

	/* Though we could've used ld/std here, they are still
	   slow for unaligned cases.  */

	lwz	6,0(src)
	lwz	7,4(src)
	stw	6,0(dst)
	stw	7,4(dst)
	blr


/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
   SRC is not.  Use aligned quadword loads from SRC, shifted to realign
   the data, allowing for aligned DST stores.  */
	.align	4
L(copy_GE_32_unaligned):
	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	beq	L(copy_GE_32_unaligned_cont)

	/* DST is not quadword aligned, get it aligned.  */

	mtocrf	0x01,0
	subf	cnt,0,cnt

	/* Vector instructions work best when proper alignment (16-bytes)
	   is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:
	bf	31,2f
	lbz	6,0(src)
	addi	src,src,1
	stb	6,0(dst)
	addi	dst,dst,1
2:
	bf	30,4f
	lhz	6,0(src)
	addi	src,src,2
	sth	6,0(dst)
	addi	dst,dst,2
4:
	bf	29,8f
	lwz	6,0(src)
	addi	src,src,4
	stw	6,0(dst)
	addi	dst,dst,4
8:
	bf	28,0f
	ld	6,0(src)
	addi	src,src,8
	std	6,0(dst)
	addi	dst,dst,8
0:
	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

	/* The proper alignment is present, it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

	/* Setup two indexes to speed up the indexed vector operations.  */
	clrldi	10,cnt,60
	li	6,16	      /* Index for 16-bytes offsets.  */
	li	7,32	      /* Index for 32-bytes offsets.  */
	cmpldi	cr1,10,0
	srdi	8,cnt,5	      /* Setup the loop counter.  */
	mtocrf	0x01,9
	cmpldi	cr6,9,1
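	/* The permute control vector from lvsl (lvsr plus swapped vperm
	   operands for little-endian) encodes SRC's byte offset within a
	   quadword, so vperm can extract a contiguous 16-byte run from
	   two adjacent aligned lvx loads.  */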
#ifdef __LITTLE_ENDIAN__
	lvsr	5,0,src
#else
	lvsl	5,0,src
#endif
	lvx	3,0,src
	li	0,0
	bf	31,L(setup_unaligned_loop)

	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	addi	src,src,16
	stvx	6,0,dst
	addi	dst,dst,16
	vor	3,4,4
	clrrdi	0,src,60

L(setup_unaligned_loop):
	mtctr	8
	ble	cr6,L(end_unaligned_loop)

	/* Copy 32 bytes at a time using vector instructions.  */
	.align	4
L(unaligned_loop):

	/* Note: vr6/vr10 may contain data that was already copied,
	   but in order to get proper alignment, we may have to copy
	   some portions again.  This is faster than having unaligned
	   vector instructions though.  */

	lvx	4,src,6
#ifdef __LITTLE_ENDIAN__
	vperm	6,4,3,5
#else
	vperm	6,3,4,5
#endif
	lvx	3,src,7
#ifdef __LITTLE_ENDIAN__
	vperm	10,3,4,5
#else
	vperm	10,4,3,5
#endif
	addi	src,src,32
	stvx	6,0,dst
	stvx	10,dst,6
	addi	dst,dst,32
	bdnz	L(unaligned_loop)

	clrrdi	0,src,60

	.align	4
L(end_unaligned_loop):

	/* Check for tail bytes.  */
	mtocrf	0x01,cnt
	beqlr	cr1

	add	src,src,0

	/* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
	/* Copy 8 bytes.  */
	bf	28,4f
	lwz	6,0(src)
	lwz	7,4(src)
	addi	src,src,8
	stw	6,0(dst)
	stw	7,4(dst)
	addi	dst,dst,8
4:	/* Copy 4~7 bytes.  */
	bf	29,L(tail2)
	lwz	6,0(src)
	stw	6,0(dst)
	bf	30,L(tail5)
	lhz	7,4(src)
	sth	7,4(dst)
	bflr	31
	lbz	8,6(src)
	stb	8,6(dst)
	/* Return original DST pointer.  */
	blr

END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
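
/* For reference, the source realignment performed by L(unaligned_loop)
   above corresponds to the classic AltiVec idiom sketched below in C
   intrinsics.  This is only an illustrative sketch (big-endian semantics;
   the identifiers are made up and the function is not part of this file;
   the little-endian variant would use vec_lvsr with the vec_perm operands
   swapped, as the #ifdef blocks above do):

     #include <altivec.h>

     static void
     copy_quadwords_from_unaligned_src (unsigned char *dst,
                                        const unsigned char *src,
                                        size_t quadwords)
     {
       vector unsigned char perm = vec_lvsl (0, src);   permute control
       vector unsigned char prev = vec_ld (0, src);     aligned load
       size_t i;
       for (i = 0; i < quadwords; i++)
         {
           vector unsigned char next = vec_ld (16 * i + 15, src);
           vec_st (vec_perm (prev, next, perm), 16 * i, dst);
           prev = next;
         }
     }  */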