/* Copyright (C) 2012-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */
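/* Illustrative sketch only -- not part of the build.  The dispatch
   described above corresponds roughly to the following C, where
   copy_small, copy_medium and copy_long are hypothetical names for
   the labels below:

     void *sketch_memcpy (void *dst, const void *src, size_t n)
     {
       if (n > 128)
         return copy_long (dst, src, n);   // 64-byte pipelined loop
       if (n > 32)
         return copy_medium (dst, src, n); // 33..128 bytes
       return copy_small (dst, src, n);    // 0..32 bytes
     }

   memmove differs only in that copy_long is reached through an overlap
   check (see L(move_long) below).  */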
ENTRY_ALIGN (MEMCPY, 6)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes.  */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        ldp     D_l, D_h, [srcend, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend, -64]
        ldp     H_l, H_h, [srcend, -48]
        stp     G_l, G_h, [dstend, -64]
        stp     H_l, H_h, [dstend, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Copy 16 bytes and then align dst to 16-byte alignment.  */
        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)

L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
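/* Illustrative sketch only -- not part of the build.  The alignment setup
   in L(copy_long) above reads in C roughly as follows (treating the
   pointers as char *; "skew" is a hypothetical name for tmp1):

     uintptr_t skew = (uintptr_t) dstin & 15;  // and tmp1, dstin, 15
     dst = dstin - skew;                       // bic dst, dstin, 15
     src = src - skew;                         // keep src/dst skew equal
     count += skew;                            // count is now 16 too large

   The first 16 bytes are copied unconditionally before this, so the loop
   may start at the aligned dst, and the tail is finished by copying the
   last 64 bytes from srcend - 64; those stores may overlap the final loop
   iteration, which is harmless for a forward copy.  */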
ENTRY_ALIGN (MEMMOVE, 4)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(move_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
L(move_long):
        /* Only use backward copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.hs    L(copy_long)

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dst to 16-byte alignment.  */
        ldp     D_l, D_h, [srcend, -16]
        and     tmp1, dstend, 15
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
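/* Illustrative sketch only -- not part of the build.  The overlap test in
   L(move_long) is a single unsigned subtract-and-compare; in C it would
   read roughly (names hypothetical):

     // Nonzero if dst starts inside [src, src + n), i.e. only a
     // backward copy is safe.  diff == 0 (src == dst) and dst below
     // src both take the forward path, the latter via unsigned
     // wraparound making diff >= n.
     static int needs_backward_copy (char *dst, const char *src, size_t n)
     {
       uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
       return diff != 0 && diff < n;
     }

   This matches "sub tmp1, dstin, src; cmp tmp1, count; b.hs L(copy_long)"
   above, with the cbz handling the src == dst case separately.  */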