/* Generic optimized memcpy using SIMD.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7


/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */

ENTRY (__memcpy_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Align loop64 below to 16 bytes.  */
	nop

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Copy 16 bytes and then align src to 16-byte alignment.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

END (__memcpy_simd)
libc_hidden_builtin_def (__memcpy_simd)
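
/* Illustration only, not part of the build: the L(copy4) sequence above
   copies 1..3 bytes without branching on the exact size.  With
   tmp1 = count >> 1, the three byte accesses cover every position for
   each possible count:

     count = 1: tmp1 = 0 -> bytes [0], [0], [0] (all three alias byte 0)
     count = 2: tmp1 = 1 -> bytes [0], [1], [1]
     count = 3: tmp1 = 1 -> bytes [0], [1], [2]

   A rough C sketch of the same idea (s, d and n are illustrative names;
   all loads happen before any store, so it also tolerates overlap):

     unsigned char a = s[0], b = s[n >> 1], c = s[n - 1];
     d[0] = a;  d[n >> 1] = b;  d[n - 1] = c;
*/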

ENTRY (__memmove_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(move_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small moves: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

L(move_long):
	/* Only use backward copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(move0)
	cmp	tmp1, count
	b.hs	L(copy_long)

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
L(copy_long_backwards):
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(move0):
	ret

END (__memmove_simd)
libc_hidden_builtin_def (__memmove_simd)
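
/* Illustration only, not part of the build: the overlap test at
   L(move_long) above relies on unsigned wraparound.  A forward copy is
   safe whenever the destination does not start inside the source buffer,
   i.e. when dstin - src >= count holds as an unsigned comparison.  A
   rough C sketch, with forward_copy and backward_copy as hypothetical
   helpers:

     if ((uintptr_t) dst - (uintptr_t) src >= (uintptr_t) n)
       forward_copy (dst, src, n);   // no harmful overlap: memcpy path
     else
       backward_copy (dst, src, n);  // dst is inside [src, src + n)

   When dst < src the subtraction wraps to a large value, so the forward
   path is taken, which is also correct.  */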