/* A Thunderx Optimized memcpy implementation for AARCH64.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* The actual code in this memcpy and memmove should be identical to the
   generic version except for the code under '#ifdef THUNDERX'.  This is
   to make it easier to keep this version and the generic version in sync
   for changes that are not specific to thunderx.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define G_l	count
#define G_h	dst
#define tmp1	x14

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes, which are fully unrolled, and large
   copies of more than 96 bytes, which align the destination and use an
   unrolled loop processing 64 bytes per iteration.
   In order to share code with memmove, small and medium copies read all
   data before writing, allowing any kind of overlap.  So small, medium
   and large backwards memmoves are handled by falling through into memcpy.
   Overlapping large forward memmoves use a loop that copies backwards.  */
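
/* A rough C rendering of the dispatch at the top of MEMCPY, for
   illustration only (copy16 and copy_long are the labels used below):

     if (count <= 16)
       goto copy16;		// small: read all, then write all
     if (count > 96)
       goto copy_long;		// large: align dst, 64-byte loop
     // otherwise fall through to the fully unrolled 17..96 byte code
*/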

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#if IS_IN (libc)

# undef MEMCPY
# define MEMCPY __memcpy_thunderx
# undef MEMMOVE
# define MEMMOVE __memmove_thunderx

ENTRY_ALIGN (MEMMOVE, 6)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	sub	tmp1, dstin, src
	cmp	count, 96
	ccmp	tmp1, count, 2, hi
	b.lo	L(move_long)
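
	/* Illustrative sketch, not build code: with tmp1 = dstin - src as
	   computed above, the cmp/ccmp/b.lo sequence is equivalent to

	     if (count > 96 && (uint64_t) (dstin - src) < count)
	       goto move_long;	// dstin is within [src, src + count)

	   i.e. only large moves whose destination lies inside the source
	   range take the backwards path; everything else falls through
	   into memcpy.  */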

	/* Common case falls through into memcpy.  */
END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16)
	cmp	count, 96
	b.hi	L(copy_long)

	/* Medium copies: 17..96 bytes.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]
	tbz	tmp1, 5, 1f
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret
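
	/* A rough C model of the 17..96 byte path above, for exposition
	   only (dstin, src and count are the routine's arguments; A, B, C
	   and D model the register pairs of the same names):

	     unsigned char A[16], B[16], C[16], D[16];
	     memcpy (A, src, 16);
	     if ((count - 1) & 64)			// 65..96 bytes
	       goto copy96;
	     memcpy (D, src + count - 16, 16);
	     if ((count - 1) & 32)			// 33..64 bytes
	       {
	         memcpy (B, src + 16, 16);
	         memcpy (C, src + count - 32, 16);
	         memcpy (dstin + 16, B, 16);
	         memcpy (dstin + count - 32, C, 16);
	       }
	     memcpy (dstin, A, 16);
	     memcpy (dstin + count - 16, D, 16);

	   All of the source is read before any of the destination is
	   written, which is what lets memmove fall through into this
	   code for overlapping buffers.  */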

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16):
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret
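
	/* The 0..3 byte tail above is branchless apart from the count == 0
	   check.  As a C model (exposition only, using the argument names):

	     if (count == 0)
	       return;
	     tmp = count >> 1;		// 0 if count == 1, else 1
	     a = src[0];
	     c = src[count - 1];
	     b = src[tmp];
	     dstin[0]         = a;
	     dstin[tmp]       = b;
	     dstin[count - 1] = c;

	   For count == 1 all three stores hit the same byte; for count == 2
	   the second byte is written twice, exactly as noted above.  */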

	.p2align 4
	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
	   32 bytes from the end.  */
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

	/* Align DST to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
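
	/* In C-like terms the alignment step at L(copy_long) is (sketch
	   only; the names mirror the register aliases defined above):

	     tmp1   = dstin & 15;		// destination misalignment
	     dst    = dstin & ~15UL;		// 16-byte aligned store base
	     src   -= tmp1;			// keep src and dst in lockstep
	     count += tmp1;			// count is now 16 too large

	   The first 16 bytes are copied unaligned from the original src,
	   then the 64-byte loop runs from the adjusted pointers.  */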

	.p2align 4
L(copy_long):

	/* On thunderx, large memcpys are helped by software prefetching.
	   This loop is identical to the one below it but with prefetching
	   instructions included.  For copies of less than 32768 bytes the
	   prefetching does not help and slows the code down, so we only
	   use the prefetching loop for the largest memcpys.  */
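
	/* Selection sketch (exposition only):

	     if (count < 32768)
	       goto copy_long_without_prefetch;

	   In the prefetching loop, PRFM PLDL1STRM is issued 512 bytes
	   ahead of src, but only on iterations where bit 6 of src is set,
	   i.e. once per 128 bytes of source data.  */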

	cmp	count, #32768
	b.lo	L(copy_long_without_prefetch)
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	prfm	pldl1strm, [src, 384]
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */

L(prefetch_loop64):
	tbz	src, #6, 1f
	prfm	pldl1strm, [src, 512]
1:
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(prefetch_loop64)
	b	L(last64)

L(copy_long_without_prefetch):

	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	ldp	D_l, D_h, [src]
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(last64)
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
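
	/* Tail sketch (illustrative only; store16 and copy64 are
	   hypothetical helpers standing for single stp / ldp+stp pairs):

	     store16 (dst + 16, A);
	     store16 (dst + 32, B);
	     store16 (dst + 48, C);
	     store16 (dst + 64, D);
	     copy64 (dstend - 64, srcend - 64);

	   The first four stores finish the 64-byte block already held in
	   registers; the final copy64 then covers the remaining 1..64
	   bytes by re-copying the end of the buffer, as the comment above
	   explains.  */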
L(last64):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4
L(move_long):
	cbz	tmp1, 3f

	add	srcend, src, count
	add	dstend, dstin, count

	/* Align dstend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 96 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
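
	/* Mirror of the forward alignment step, sketched in C (exposition
	   only; names mirror the register aliases):

	     tmp1    = dstend & 15;		// misalignment of the end
	     srcend -= tmp1;
	     dstend -= tmp1;
	     count  -= tmp1;

	   The last 16 bytes are copied unaligned first, then the loop
	   walks backwards in 64-byte blocks from the aligned end.  */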

	and	tmp1, dstend, 15
	ldp	D_l, D_h, [srcend, -16]
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	2f

	nop
1:
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
2:
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
3:	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)

#endif