/* A Thunderx Optimized memcpy implementation for AARCH64.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* The actual code in this memcpy and memmove should be identical to the
   generic version except for the code under '#ifdef THUNDERX'.  This is
   to make it easier to keep this version and the generic version in sync
   for changes that are not specific to thunderx.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled, and large
   copies of more than 96 bytes which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.
*/

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#if IS_IN (libc)

# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx

ENTRY_ALIGN (MEMMOVE, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)

	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
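/* Illustrative only, not part of the build: a rough C sketch of the
   small-copy strategy used at L(copy16) below.  Sizes 8..16 are handled
   with two possibly overlapping 8-byte accesses, 4..7 with two 4-byte
   accesses, and 0..3 with the branchless byte sequence described further
   down.  All data is read before anything is written, so the same code
   serves overlapping memmoves.  The helper name copy16_sketch is made up
   for this comment.

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   static void copy16_sketch (char *dst, const char *src, size_t n)
   {
     if (n >= 8)                        // 8..16 bytes: two 8-byte chunks.
       {
         uint64_t a, b;
         memcpy (&a, src, 8);           // first 8 bytes
         memcpy (&b, src + n - 8, 8);   // last 8 bytes (may overlap a)
         memcpy (dst, &a, 8);
         memcpy (dst + n - 8, &b, 8);
       }
     else if (n >= 4)                   // 4..7 bytes: two 4-byte chunks.
       {
         uint32_t a, b;
         memcpy (&a, src, 4);
         memcpy (&b, src + n - 4, 4);
         memcpy (dst, &a, 4);
         memcpy (dst + n - 4, &b, 4);
       }
     else if (n > 0)                    // 0..3 bytes, branchless in the asm:
       {                                // n==1 stores one byte 3 times,
         char first = src[0];           // n==2 stores the 2nd byte twice.
         char mid = src[n >> 1];
         char last = src[n - 1];
         dst[0] = first;
         dst[n >> 1] = mid;
         dst[n - 1] = last;
       }
   }  */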
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

	.p2align 4
L(copy_long):

	/* On thunderx, large memcpys are helped by software prefetching.
	   This loop is identical to the one below it but with prefetching
	   instructions included.  For copies of less than 32768 bytes the
	   prefetching does not help and slows the code down, so we only
	   use the prefetching loop for the largest memcpys.  */

	cmp	count, #32768
	b.lo	L(copy_long_without_prefetch)
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	prfm	pldl1strm, [src, 384]
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */

L(prefetch_loop64):
	tbz	src, #6, 1f
	prfm	pldl1strm, [src, 512]
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(prefetch_loop64)
	b	L(last64)

L(copy_long_without_prefetch):

	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
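/* Illustrative only, not part of the build: a rough C sketch of the
   large-copy strategy used by L(copy_long) above and the tail handling
   at L(last64) below.  The helper name copy_large_sketch is made up for
   this comment, and it omits the software-prefetch variant taken for
   copies of 32768 bytes or more.

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   // count > 96 and the buffers do not overlap, as on the memcpy path.
   static void copy_large_sketch (char *dstin, const char *src, size_t count)
   {
     char *dstend = dstin + count;
     const char *srcend = src + count;

     // Copy the first 16 bytes unaligned, then rewind dst to a 16-byte
     // boundary (and src by the same amount) so the loop's stores never
     // split across a cache-line boundary.
     memcpy (dstin, src, 16);
     size_t skew = (uintptr_t) dstin & 15;
     char *dst = dstin - skew;
     src -= skew;
     count = count + skew - 16;       // bytes left beyond the 16 stored above

     // Main loop: 64 bytes per iteration.
     while (count > 64)
       {
         memcpy (dst + 16, src + 16, 64);
         dst += 64;
         src += 64;
         count -= 64;
       }

     // 1..64 bytes remain: copying the final 64 bytes of the buffers is
     // always safe (total size > 96), even though it may rewrite bytes
     // the loop already stored.
     memcpy (dstend - 64, srcend - 64, 64);
   }  */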
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */

	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	nop
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)

#endif
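/* Illustrative only, not part of the build: a rough C sketch of the
   overlapping-forward memmove handling above (MEMMOVE falling into
   L(move_long)).  The entry test computes dstin - src and, with a single
   ccmp, falls through into memcpy unless the destination starts inside
   the source buffer and the copy is larger than 96 bytes; in that case
   the data is copied backwards so later source bytes are read before
   they are overwritten.  The helper name move_long_sketch is made up for
   this comment, and the byte loop at the end stands in for the real
   code's trick of copying the first 64 bytes wholesale with carefully
   ordered loads and stores.

   #include <stddef.h>
   #include <stdint.h>
   #include <string.h>

   static void move_long_sketch (char *dstin, const char *src, size_t count)
   {
     if ((uintptr_t) dstin - (uintptr_t) src >= count || count <= 96)
       {
         // No overlap, overlap from below, or a small/medium copy: the
         // real code falls through into the memcpy entry, whose small
         // and medium paths read all data before writing.
         memmove (dstin, src, count);  // stand-in for that fall-through
         return;
       }
     if (dstin == src)                 // matches the 'cbz tmp1, 3f' early exit
       return;

     // dst overlaps src from above: walk backwards in 64-byte blocks,
     // reading each block in full before storing it.
     while (count >= 64)
       {
         char block[64];
         count -= 64;
         memcpy (block, src + count, 64);
         memcpy (dstin + count, block, 64);
       }

     // 0..63 bytes remain at the very start of the buffers.
     while (count--)
       dstin[count] = src[count];
   }  */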