/* Copyright (C) 2012-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.
*/
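
/* Illustrative sketch, not part of the build: the small and medium cases
   copy fixed-size chunks from both ends of the buffer so the chunks overlap
   in the middle and no length-exact loop is needed.  A simplified C model of
   these cases follows; the helper name copy_up_to_128 and the use of memcpy
   for the individual chunks are assumptions for documentation only, the
   assembly implements the chunks directly with LDR/STR and LDP/STP.

	#include <stddef.h>
	#include <string.h>

	static void
	copy_up_to_128 (void *dstv, const void *srcv, size_t count)
	{
	  unsigned char *dst = dstv, *dstend = dst + count;
	  const unsigned char *src = srcv;
	  const unsigned char *srcend = src + count;

	  if (count <= 32)
	    {
	      if (count >= 16)
		{
		  // Two 16-byte chunks, one from each end; they overlap in
		  // the middle for any length in 16..32.
		  memcpy (dst, src, 16);
		  memcpy (dstend - 16, srcend - 16, 16);
		}
	      else if (count >= 8)
		{
		  memcpy (dst, src, 8);
		  memcpy (dstend - 8, srcend - 8, 8);
		}
	      else if (count >= 4)
		{
		  memcpy (dst, src, 4);
		  memcpy (dstend - 4, srcend - 4, 4);
		}
	      else if (count > 0)
		{
		  // Branchless 0..3 byte copy; see L(copy4) below.
		  dst[0] = src[0];
		  dst[count >> 1] = src[count >> 1];
		  dstend[-1] = srcend[-1];
		}
	      return;
	    }

	  // 33..128 bytes: 32-byte chunks from both ends, plus up to 64
	  // bytes from the middle when count > 64; chunks may overlap.
	  memcpy (dst, src, 32);
	  if (count > 64)
	    {
	      memcpy (dst + 32, src + 32, 32);
	      if (count > 96)
		memcpy (dstend - 64, srcend - 64, 32);
	    }
	  memcpy (dstend - 32, srcend - 32, 32);
	}
*/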

ENTRY_ALIGN (MEMCPY, 6)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret
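
	/* Worked example for the branchless sequence above: tmp1 = count / 2
	   and three bytes are always written.
	     count == 1: bytes 0, 0 and 0 are copied (all three are byte 0).
	     count == 2: bytes 0, 1 and 1 are copied.
	     count == 3: bytes 0, 1 and 2 are copied.
	   Redundant stores write the same value to the same address, so the
	   result is correct for every count in 1..3 without a branch.  */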

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret
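
	/* The 33..128-byte paths above perform all loads before any store, so
	   they remain correct when the buffers overlap; memmove below
	   branches into L(copy32_128) directly for these sizes.  */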

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

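	/* The setup above rounds dst down to a 16-byte boundary, adjusts src
	   and count by the same offset, and preloads 64 bytes into A..D.
	   Subtracting 128 + 16 removes the 16-byte bias noted in the comment
	   above and reserves 64 bytes for the data already in A..D plus 64
	   bytes for the tail always copied from the end, so the loop below
	   runs only while more than that remains.

	   Illustrative C model, not part of the build: the helper name
	   copy_long_sketch and the use of memcpy for the 16- and 64-byte
	   blocks are assumptions for documentation only; the register-level
	   software pipelining is not modelled.

		#include <stddef.h>
		#include <stdint.h>
		#include <string.h>

		// Assumes count > 128, the L(copy_long) precondition.
		static void
		copy_long_sketch (void *dstv, const void *srcv, size_t count)
		{
		  unsigned char *dst = dstv, *dstend = dst + count;
		  const unsigned char *src = srcv;
		  const unsigned char *srcend = src + count;

		  // Copy the first 16 bytes, then continue from the next
		  // 16-byte-aligned destination address.
		  memcpy (dst, src, 16);
		  size_t skew = 16 - ((uintptr_t) dst & 15);
		  dst += skew;
		  src += skew;
		  count -= skew;

		  // Main loop: 64 bytes per iteration while more than 64
		  // bytes remain.
		  while (count > 64)
		    {
		      memcpy (dst, src, 64);
		      dst += 64;
		      src += 64;
		      count -= 64;
		    }

		  // Tail: always copy the last 64 bytes from the end; the
		  // overlap with the last iteration is harmless.
		  memcpy (dstend - 64, srcend - 64, 64);
		}
	*/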
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)

ENTRY_ALIGN (MEMMOVE, 4)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(move_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
	/* Only use backward copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.hs	L(copy_long)
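
	/* The check above relies on unsigned arithmetic.  In C terms
	   (forward_copy and backward_copy are hypothetical names for the two
	   paths):

		if ((uintptr_t) dstin - (uintptr_t) src >= (uintptr_t) count)
		  forward_copy ();	// memcpy's L(copy_long) above
		else
		  backward_copy ();	// L(loop64_backwards) below

	   When dstin is below src the subtraction wraps to a large value, so
	   both "dst below src" and "dst at least count bytes above src" take
	   the forward path; a forward copy is safe in both cases.  Only a
	   destination that starts inside [src, src + count) falls through to
	   the backward copy.  */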

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
