/* Optimized memcpy implementation for POWER10.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>


#ifndef MEMCPY
# define MEMCPY memcpy
#endif

/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.  */

	.machine power9
ENTRY_TOCLESS (MEMCPY, 5)
	CALL_MCOUNT 3

	/* Copy up to 16 bytes.  */
	sldi	r6,r5,56	/* Prepare the [l|st]xvl counter: the byte
				   count lives in bits 0:7 (the top byte)
				   of the register.  */
	lxvl	v10,r4,r6
	stxvl	v10,r3,r6
	subic.	r6,r5,16	/* Return if len <= 16.  */
	blelr

	/* If len >= 256, assume nothing got copied before and copy
	   again.  This may rewrite bytes already copied when the buffers
	   overlap, but memcpy is not required to handle overlapping
	   memory.  */
	cmpdi	r5,256
	bge	L(copy_ge_256)
	/* 16 < len < 256 and the first 16 bytes have already been copied.  */
	addi	r10,r3,16	/* Keep r3 intact as return value.  */
	addi	r4,r4,16
	subi	r5,r5,16
	b	L(copy_lt_256)	/* Avoid the main loop if len < 256.  */

	.p2align 5
L(copy_ge_256):
	mr	r10,r3		/* Keep r3 intact as return value.  */
	/* Align dst to 16 bytes.  */
	andi.	r9,r10,0xf
	beq	L(dst_is_align_16)
	lxv	v10,0(r4)
	subfic	r12,r9,16
	subf	r5,r12,r5
	add	r4,r4,r12
	stxv	v10,0(r3)
	add	r10,r3,r12

L(dst_is_align_16):
	srdi	r9,r5,7		/* Divide by 128.  */
	mtctr	r9
	addi	r6,r4,64
	addi	r7,r10,64


	/* Main loop, copy 128 bytes per iteration.
	   Use r6=src+64 and r7=dest+64 in order to reduce the dependency on
	   r4 and r10.  */
	.p2align 5
L(copy_128):

	lxv	v10, 0(r4)
	lxv	v11, 16(r4)
	lxv	v12, 32(r4)
	lxv	v13, 48(r4)

	addi	r4,r4,128

	stxv	v10, 0(r10)
	stxv	v11, 16(r10)
	stxv	v12, 32(r10)
	stxv	v13, 48(r10)

	addi	r10,r10,128

	lxv	v10, 0(r6)
	lxv	v11, 16(r6)
	lxv	v12, 32(r6)
	lxv	v13, 48(r6)

	addi	r6,r6,128

	stxv	v10, 0(r7)
	stxv	v11, 16(r7)
	stxv	v12, 32(r7)
	stxv	v13, 48(r7)

	addi	r7,r7,128

	bdnz	L(copy_128)

	clrldi.	r5,r5,64-7	/* Have we copied everything?  */
	beqlr

	.p2align 5
L(copy_lt_256):
	cmpdi	r5,16
	ble	L(copy_le_16)
	srdi.	r9,r5,5		/* Divide by 32.  */
	beq	L(copy_lt_32)
	mtctr	r9
	/* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce
	   the dependency on r4 and r10.  */
	addi	r6,r4,32
	addi	r7,r10,32
	addi	r8,r4,64
	addi	r9,r10,64

	.p2align 5
	/* Copy 32 bytes at a time, unaligned.
	   The loop is unrolled 3 times in order to reduce the dependency on
	   r4 and r10, copying up to 96 bytes per iteration.  */
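	/* Illustrative note on the counter bookkeeping below: ctr counts
	   32-byte blocks, while r4/r10 (and r6/r7, r8/r9) advance by 96
	   only once per unrolled pass, so the bdz exits must re-synchronize
	   the pointers.  As a worked example (the numbers are illustrative
	   only), with 160 bytes left ctr = 5: the first pass copies three
	   blocks, and the second exits through bdz after its second block
	   at L(end_copy_32b), where r4/r10 (already advanced by 96) are
	   pulled back by 32 before the tail code runs.  */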
L(copy_32):
	lxv	v10, 0(r4)
	lxv	v11, 16(r4)
	stxv	v10, 0(r10)
	stxv	v11, 16(r10)
	bdz	L(end_copy_32a)
	addi	r4,r4,96
	addi	r10,r10,96

	lxv	v10, 0(r6)
	lxv	v11, 16(r6)
	addi	r6,r6,96
	stxv	v10, 0(r7)
	stxv	v11, 16(r7)
	bdz	L(end_copy_32b)
	addi	r7,r7,96

	lxv	v12, 0(r8)
	lxv	v13, 16(r8)
	addi	r8,r8,96
	stxv	v12, 0(r9)
	stxv	v13, 16(r9)
	addi	r9,r9,96
	bdnz	L(copy_32)

	clrldi.	r5,r5,64-5	/* Have we copied everything?  */
	beqlr
	cmpdi	r5,16
	ble	L(copy_le_16)
	b	L(copy_lt_32)

	.p2align 5
L(end_copy_32a):
	clrldi.	r5,r5,64-5	/* Have we copied everything?  */
	beqlr
	/* 32 bytes have been copied since the last update of r4 and r10.  */
	addi	r4,r4,32
	addi	r10,r10,32
	cmpdi	r5,16
	ble	L(copy_le_16)
	b	L(copy_lt_32)

	.p2align 5
L(end_copy_32b):
	clrldi.	r5,r5,64-5	/* Have we copied everything?  */
	beqlr
	/* The last iteration of the loop copied 64 bytes.  Update r4 and r10
	   accordingly.  */
	addi	r4,r4,-32
	addi	r10,r10,-32
	cmpdi	r5,16
	ble	L(copy_le_16)

	.p2align 5
L(copy_lt_32):
	lxv	v10, 0(r4)
	stxv	v10, 0(r10)
	addi	r4,r4,16
	addi	r10,r10,16
	subi	r5,r5,16

	.p2align 5
L(copy_le_16):
	sldi	r6,r5,56
	lxvl	v10,r4,r6
	stxvl	v10,r10,r6
	blr


END_GEN_TB (MEMCPY,TB_TOCLESS)
libc_hidden_builtin_def (memcpy)
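	/* A worked example of the lxvl/stxvl tail copies above (the numbers
	   are illustrative): for len = 5, "sldi r6,r5,56" yields
	   r6 = 0x0500000000000000, so lxvl/stxvl transfer exactly 5 bytes;
	   a count of 16 or more transfers a full 16-byte vector, which is
	   what the prologue relies on for its first 16-byte copy.  */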