/* Optimized memcpy for Fujitsu A64FX processor.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#undef BTI_C
#define BTI_C

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */

#define dstin	x0
#define src	x1
#define n	x2
#define dst	x3
#define dstend	x4
#define srcend	x5
#define tmp	x6
#define vlen	x7
#define vlen8	x8

#if HAVE_AARCH64_SVE_ASM
# if IS_IN (libc)
#  define MEMCPY __memcpy_a64fx
#  define MEMMOVE __memmove_a64fx

	.arch armv8.2-a+sve

	.macro ld1b_unroll8
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

	.macro stld1b_unroll4a
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	.endm

	.macro stld1b_unroll4b
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

	.macro stld1b_unroll8
	stld1b_unroll4a
	stld1b_unroll4b
	.endm

	.macro st1b_unroll8
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	.endm

ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	L(copy_small)
	whilelo	p1.b, vlen, n
	whilelo	p0.b, xzr, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	.p2align 4

L(copy_small):
	cmp	n, vlen, lsl 3
	b.hi	L(copy_large)
	add	dstend, dstin, n
	add	srcend, src, n
	cmp	n, vlen, lsl 2
	b.hi	1f

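	/* Note: both small-copy paths below issue all of their loads
	   before any of their stores, so they are also safe when the
	   source and destination buffers overlap.  The memmove entry
	   point further down relies on this and reuses L(copy_small)
	   for overlapping moves of up to 8 vectors.  */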
	/* Copy 2-4 vectors.  */
	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* Copy 4-8 vectors.  */
1:	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstin, 2, mul vl]
	st1b	z3.b, p0, [dstin, 3, mul vl]
	st1b	z4.b, p0, [dstend, -4, mul vl]
	st1b	z5.b, p0, [dstend, -3, mul vl]
	st1b	z6.b, p0, [dstend, -2, mul vl]
	st1b	z7.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* At least 8 vectors - always align to vector length for
	   higher and consistent write performance.  */
L(copy_large):
	sub	tmp, vlen, 1
	and	tmp, dstin, tmp
	sub	tmp, vlen, tmp
	whilelo	p1.b, xzr, tmp
	ld1b	z1.b, p1/z, [src]
	st1b	z1.b, p1, [dstin]
	add	dst, dstin, tmp
	add	src, src, tmp
	sub	n, n, tmp
	ptrue	p0.b

	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	ld1b_unroll8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined loop.  */
1:	stld1b_unroll8
	add	dst, dst, vlen8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.hi	1b
2:	st1b_unroll8
	add	dst, dst, vlen8
3:	add	n, n, vlen8

	/* Move last 0-8 vectors.  */
L(last_bytes):
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p1, [dst, 1, mul vl]
	ret

	.p2align 4

1:	add	srcend, src, n
	add	dstend, dst, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	cmp	n, vlen, lsl 2
	b.hi	1f

	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
	ld1b	z5.b, p0/z, [src, 3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z4.b, p0, [dst, 2, mul vl]
	st1b	z5.b, p0, [dst, 3, mul vl]
	st1b	z6.b, p0, [dstend, -4, mul vl]
	st1b	z7.b, p0, [dstend, -3, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)


ENTRY_ALIGN (MEMMOVE, 4)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	/* Fast case for up to 2 vectors.  */
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
L(full_overlap):
	ret

	.p2align 4
	/* Check for overlapping moves.  Return if there is a full overlap.
	   Small moves up to 8 vectors use the overlap-safe copy_small code.
	   Non-overlapping or overlapping moves with dst < src use memcpy.
	   Overlapping moves with dst > src use a backward copy loop.  */
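	/* Illustrative C-like sketch of the dispatch below, added as
	   commentary only; it mirrors the branches that follow (vlen is
	   the SVE vector length in bytes):

	     diff = (dstin - src) & 0x00ffffffffffffff;  // clear tag bits
	     if (diff == 0)      return dstin;            // full overlap
	     if (n <= 8 * vlen)  goto copy_small;         // overlap-safe small copy
	     if (diff >= n)      goto copy_large;         // dst < src or no overlap
	     // otherwise dst > src and the ranges overlap: fall through
	     // to the backward copy loop.  */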
1:	sub	tmp, dstin, src
	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
	b.eq	L(full_overlap)
	cmp	n, vlen, lsl 3
	b.ls	L(copy_small)
	cmp	tmp, n
	b.hs	L(copy_large)

	/* Align to vector length.  */
	add	dst, dstin, n
	sub	tmp, vlen, 1
	ands	tmp, dst, tmp
	csel	tmp, tmp, vlen, ne
	whilelo	p1.b, xzr, tmp
	sub	n, n, tmp
	ld1b	z1.b, p1/z, [src, n]
	st1b	z1.b, p1, [dstin, n]
	add	src, src, n
	add	dst, dstin, n

	ptrue	p0.b
	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
	sub	src, src, vlen8
	ld1b_unroll8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined backward copy loop.  */
1:	sub	src, src, vlen8
	sub	dst, dst, vlen8
	stld1b_unroll8
	subs	n, n, vlen8
	b.hi	1b
2:	sub	dst, dst, vlen8
	st1b_unroll8
3:	add	n, n, vlen8

	/* Adjust src/dst for last 0-8 vectors.  */
	sub	src, src, n
	mov	dst, dstin
	b	L(last_bytes)

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
# endif /* IS_IN (libc) */
#endif /* HAVE_AARCH64_SVE_ASM */