/* Optimized memcpy for Fujitsu A64FX processor.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#undef BTI_C
#define BTI_C

/* Assumptions:
 *
 * ARMv8.2-a, AArch64, unaligned accesses, sve
 *
 */
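
/* The copy is dispatched on size: up to 2 vectors are copied with
   predicated loads/stores, 2-8 vectors go through L(copy_small), and
   larger copies align the destination to the vector length and run an
   8x unrolled, software pipelined loop in L(copy_large).  */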

#define dstin	x0
#define src	x1
#define n	x2
#define dst	x3
#define dstend	x4
#define srcend	x5
#define tmp	x6
#define vlen	x7
#define vlen8	x8

#if HAVE_AARCH64_SVE_ASM
# if IS_IN (libc)
#  define MEMCPY __memcpy_a64fx
#  define MEMMOVE __memmove_a64fx

	.arch armv8.2-a+sve

	.macro ld1b_unroll8
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

	.macro stld1b_unroll4a
	st1b	z0.b, p0,   [dst, 0, mul vl]
	st1b	z1.b, p0,   [dst, 1, mul vl]
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	st1b	z2.b, p0,   [dst, 2, mul vl]
	st1b	z3.b, p0,   [dst, 3, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	.endm

	.macro stld1b_unroll4b
	st1b	z4.b, p0,   [dst, 4, mul vl]
	st1b	z5.b, p0,   [dst, 5, mul vl]
	ld1b	z4.b, p0/z, [src, 4, mul vl]
	ld1b	z5.b, p0/z, [src, 5, mul vl]
	st1b	z6.b, p0,   [dst, 6, mul vl]
	st1b	z7.b, p0,   [dst, 7, mul vl]
	ld1b	z6.b, p0/z, [src, 6, mul vl]
	ld1b	z7.b, p0/z, [src, 7, mul vl]
	.endm

	.macro stld1b_unroll8
	stld1b_unroll4a
	stld1b_unroll4b
	.endm

	.macro st1b_unroll8
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dst, 2, mul vl]
	st1b	z3.b, p0, [dst, 3, mul vl]
	st1b	z4.b, p0, [dst, 4, mul vl]
	st1b	z5.b, p0, [dst, 5, mul vl]
	st1b	z6.b, p0, [dst, 6, mul vl]
	st1b	z7.b, p0, [dst, 7, mul vl]
	.endm

ENTRY (MEMCPY)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

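	/* Fast path for 0-2 vectors: whilelo builds predicates that enable
	   only the remaining bytes, so a pair of predicated load/store
	   instructions copies exactly n bytes with no scalar tail.  */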
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	L(copy_small)
	whilelo	p1.b, vlen, n
	whilelo	p0.b, xzr, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
	ret

	.p2align 4

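	/* 2-8 vectors: copy whole vectors from both the start and the end
	   of the buffer; the two halves may overlap in the middle.  All
	   loads are issued before any store, so this is also the
	   overlap-safe path used by memmove.  */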
L(copy_small):
	cmp	n, vlen, lsl 3
	b.hi	L(copy_large)
	add	dstend, dstin, n
	add	srcend, src, n
	cmp	n, vlen, lsl 2
	b.hi	1f

	/* Copy 2-4 vectors.  */
	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* Copy 4-8 vectors.  */
1:	ptrue	p0.b
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [src, 2, mul vl]
	ld1b	z3.b, p0/z, [src, 3, mul vl]
	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p0, [dstin, 1, mul vl]
	st1b	z2.b, p0, [dstin, 2, mul vl]
	st1b	z3.b, p0, [dstin, 3, mul vl]
	st1b	z4.b, p0, [dstend, -4, mul vl]
	st1b	z5.b, p0, [dstend, -3, mul vl]
	st1b	z6.b, p0, [dstend, -2, mul vl]
	st1b	z7.b, p0, [dstend, -1, mul vl]
	ret

	.p2align 4
	/* At least 8 vectors - always align to vector length for
	   higher and consistent write performance.  */
L(copy_large):
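	/* tmp = vlen - (dstin % vlen) is the distance to the next vector
	   aligned destination address (a full vector if dstin is already
	   aligned).  Copy it with a predicated load/store, then advance
	   src/dst and reduce n so the main loop stores are aligned.  */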
	sub	tmp, vlen, 1
	and	tmp, dstin, tmp
	sub	tmp, vlen, tmp
	whilelo	p1.b, xzr, tmp
	ld1b	z1.b, p1/z, [src]
	st1b	z1.b, p1, [dstin]
	add	dst, dstin, tmp
	add	src, src, tmp
	sub	n, n, tmp
	ptrue	p0.b

	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
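	/* Software pipelining: load the first 8 vectors here; each loop
	   iteration stores the 8 vectors loaded previously while loading
	   the next 8, and the epilogue at 2: drains the final 8 stores.  */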
	ld1b_unroll8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined loop.  */
1:	stld1b_unroll8
	add	dst, dst, vlen8
	add	src, src, vlen8
	subs	n, n, vlen8
	b.hi	1b
2:	st1b_unroll8
	add	dst, dst, vlen8
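	/* Undo the final subs so n is once more the number of bytes left
	   to copy, at most 8 vectors.  */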
3:	add	n, n, vlen8

	/* Move last 0-8 vectors.  */
L(last_bytes):
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p1, [dst, 1, mul vl]
	ret

	.p2align 4

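	/* 2-8 remaining vectors: copy whole vectors from the start and from
	   the end of the region; all loads are issued before the stores,
	   which keeps this tail overlap-safe for memmove.  */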
1:	add	srcend, src, n
	add	dstend, dst, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p0/z, [src, 1, mul vl]
	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
	cmp	n, vlen, lsl 2
	b.hi	1f

	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
	ld1b	z5.b, p0/z, [src, 3, mul vl]
	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
	st1b	z0.b, p0, [dst, 0, mul vl]
	st1b	z1.b, p0, [dst, 1, mul vl]
	st1b	z4.b, p0, [dst, 2, mul vl]
	st1b	z5.b, p0, [dst, 3, mul vl]
	st1b	z6.b, p0, [dstend, -4, mul vl]
	st1b	z7.b, p0, [dstend, -3, mul vl]
	st1b	z2.b, p0, [dstend, -2, mul vl]
	st1b	z3.b, p0, [dstend, -1, mul vl]
	ret

END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)


ENTRY_ALIGN (MEMMOVE, 4)

	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	/* Fast case for up to 2 vectors.  */
	cntb	vlen
	cmp	n, vlen, lsl 1
	b.hi	1f
	whilelo	p0.b, xzr, n
	whilelo	p1.b, vlen, n
	ld1b	z0.b, p0/z, [src, 0, mul vl]
	ld1b	z1.b, p1/z, [src, 1, mul vl]
	st1b	z0.b, p0, [dstin, 0, mul vl]
	st1b	z1.b, p1, [dstin, 1, mul vl]
L(full_overlap):
	ret

	.p2align 4
	/* Check for overlapping moves. Return if there is a full overlap.
	   Small moves up to 8 vectors use the overlap-safe copy_small code.
	   Non-overlapping or overlapping moves with dst < src use memcpy.
	   Overlapping moves with dst > src use a backward copy loop.  */
1:	sub	tmp, dstin, src
	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
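	/* Masking off the top byte ignores address tag bits in the pointer
	   difference, so differently tagged pointers to the same buffer are
	   still treated as a full overlap.  */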
	b.eq	L(full_overlap)
	cmp	n, vlen, lsl 3
	b.ls	L(copy_small)
	cmp	tmp, n
	b.hs	L(copy_large)

	/* Align to vector length.  */
	add	dst, dstin, n
	sub	tmp, vlen, 1
	ands	tmp, dst, tmp
	csel	tmp, tmp, vlen, ne
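	/* tmp is now the distance from the last vector boundary to the end
	   of the destination, or a full vector if the end is already
	   aligned; copy those trailing bytes first so the backward loop
	   works on aligned stores.  */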
	whilelo	p1.b, xzr, tmp
	sub	n, n, tmp
	ld1b	z1.b, p1/z, [src, n]
	st1b	z1.b, p1, [dstin, n]
	add	src, src, n
	add	dst, dstin, n

	ptrue	p0.b
	lsl	vlen8, vlen, 3
	subs	n, n, vlen8
	b.ls	3f
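	/* Pre-load the topmost 8 vectors; the backward loop stores the
	   previously loaded vectors while loading the next lower 8,
	   mirroring the forward pipeline in memcpy.  */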
	sub	src, src, vlen8
	ld1b_unroll8
	subs	n, n, vlen8
	b.ls	2f

	.p2align 4
	/* 8x unrolled and software pipelined backward copy loop.  */
1:	sub	src, src, vlen8
	sub	dst, dst, vlen8
	stld1b_unroll8
	subs	n, n, vlen8
	b.hi	1b
2:	sub	dst, dst, vlen8
	st1b_unroll8
3:	add	n, n, vlen8

	/* Move the last 0-8 vectors: the backward loop leaves the first n
	   bytes of the move uncopied, so step src back by n, reset dst to
	   dstin and reuse the forward tail code.  */
	sub	src, src, n
	mov	dst, dstin
	b	L(last_bytes)

END (MEMMOVE)
libc_hidden_builtin_def (MEMMOVE)
# endif /* IS_IN (libc) */
#endif /* HAVE_AARCH64_SVE_ASM */