/* Generic optimized memcpy using SIMD.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 *
 */

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_lw	w10
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7
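
/* The x/w pairs above name 64-bit general registers and their low 32-bit
   halves; A_q..H_q name the 128-bit SIMD registers q0..q7 that carry the
   wide copies.  */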


/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per
   iteration.  The source pointer is 16-byte aligned to minimize unaligned
   accesses.  The loop tail is handled by always copying 64 bytes from the
   end.  */
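
/* Dispatch overview, sketched as C-like pseudocode (illustrative only, not
   part of the build; the boundaries come from the comparisons below):

     if (count <= 32)        small: at most two loads and two stores
     else if (count <= 128)  medium: 32-byte blocks from both ends
     else                    large: align src, 64-byte loop, then copy
                                    the last 64 bytes from the end  */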
65
66ENTRY (__memcpy_simd)
67	PTR_ARG (0)
68	PTR_ARG (1)
69	SIZE_ARG (2)
70
71	add	srcend, src, count
72	add	dstend, dstin, count
73	cmp	count, 128
74	b.hi	L(copy_long)
75	cmp	count, 32
76	b.hi	L(copy32_128)
77
78	/* Small copies: 0..32 bytes.  */
79	cmp	count, 16
80	b.lo	L(copy16)
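	/* 16..32 bytes: copy the first and last 16 bytes; for counts below
	   32 the two accesses overlap in the middle, which is harmless.  */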
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
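	/* Bit 3 set means 8 <= count < 16: copy two possibly overlapping
	   8-byte words from each end.  */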
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
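	/* The bytes at offsets 0, count/2 and count - 1 cover every byte of
	   a 1..3 byte copy; the offsets simply coincide when count < 3.  */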
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
	ldp	A_q, B_q, [src]
	ldp	C_q, D_q, [srcend, -32]
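	/* Both ends are loaded before any store; for 33..63 bytes the two
	   32-byte blocks overlap in the middle, which is harmless.  */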
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_q, B_q, [dstin]
	stp	C_q, D_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
	ldp	E_q, F_q, [src, 32]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_q, H_q, [srcend, -64]
	stp	G_q, H_q, [dstend, -64]
L(copy96):
	stp	A_q, B_q, [dstin]
	stp	E_q, F_q, [dstin, 32]
	stp	C_q, D_q, [dstend, -32]
	ret

	/* Align loop64 below to 16 bytes.  */
	nop

	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Copy 16 bytes and then align src to a 16-byte boundary.  */
	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_q, B_q, [src, 16]
	str	D_q, [dstin]
	ldp	C_q, D_q, [src, 48]
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)
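	/* Software-pipelined loop: each iteration stores the 64 bytes loaded
	   by the previous iteration while fetching the next 64.  */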
L(loop64):
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [src, 80]
	stp	C_q, D_q, [dst, 48]
	ldp	C_q, D_q, [src, 112]
	add	src, src, 64
	add	dst, dst, 64
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_q, F_q, [srcend, -64]
	stp	A_q, B_q, [dst, 16]
	ldp	A_q, B_q, [srcend, -32]
	stp	C_q, D_q, [dst, 48]
	stp	E_q, F_q, [dstend, -64]
	stp	A_q, B_q, [dstend, -32]
	ret

END (__memcpy_simd)
libc_hidden_builtin_def (__memcpy_simd)


ENTRY (__memmove_simd)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)

	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(move_long)
	cmp	count, 32
	b.hi	L(copy32_128)
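	/* The shared small and medium paths load all data before storing any
	   of it, so they are safe for overlapping buffers without an
	   explicit overlap check.  */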

	/* Small moves: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	ldr	A_q, [src]
	ldr	B_q, [srcend, -16]
	str	A_q, [dstin]
	str	B_q, [dstend, -16]
	ret

L(move_long):
	/* Only use backward copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(move0)
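	/* Unsigned compare: dstin - src is >= count both when dst precedes
	   src and when the buffers are disjoint; in either case copying
	   forwards is safe.  */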
	cmp	tmp1, count
	b.hs	L(copy_long)

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align srcend to a 16-byte boundary.  */
L(copy_long_backwards):
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	bic	srcend, srcend, 15
	sub	count, count, tmp1
	ldp	A_q, B_q, [srcend, -32]
	str	D_q, [dstend, -16]
	ldp	C_q, D_q, [srcend, -64]
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

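	/* Mirror image of loop64: store the 64 bytes loaded previously while
	   loading the next block below; the pre-indexed store to
	   [dstend, -64]! also steps dstend down by 64.  */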
L(loop64_backwards):
	str	B_q, [dstend, -16]
	str	A_q, [dstend, -32]
	ldp	A_q, B_q, [srcend, -96]
	str	D_q, [dstend, -48]
	str	C_q, [dstend, -64]!
	ldp	C_q, D_q, [srcend, -128]
	sub	srcend, srcend, 64
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
	ldp	E_q, F_q, [src, 32]
	stp	A_q, B_q, [dstend, -32]
	ldp	A_q, B_q, [src]
	stp	C_q, D_q, [dstend, -64]
	stp	E_q, F_q, [dstin, 32]
	stp	A_q, B_q, [dstin]
L(move0):
	ret

END (__memmove_simd)
libc_hidden_builtin_def (__memmove_simd)