/* memmove/memcpy/mempcpy optimized for aligned access with SSSE3.
   All versions must be listed in ifunc-impl-list.c.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */


#include <isa-level.h>

#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>
# ifndef MEMMOVE
#  define MEMMOVE	__memmove_ssse3
#  define MEMMOVE_CHK	__memmove_chk_ssse3
#  define MEMCPY	__memcpy_ssse3
#  define MEMCPY_CHK	__memcpy_chk_ssse3
#  define MEMPCPY	__mempcpy_ssse3
#  define MEMPCPY_CHK	__mempcpy_chk_ssse3
# endif

	.section .text.ssse3, "ax", @progbits
# if defined SHARED
ENTRY(MEMPCPY_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET(__chk_fail)
END(MEMPCPY_CHK)
# endif

ENTRY(MEMPCPY)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END(MEMPCPY)

# if defined SHARED
ENTRY(MEMMOVE_CHK)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET(__chk_fail)
END(MEMMOVE_CHK)
# endif

ENTRY_P2ALIGN(MEMMOVE, 6)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	movq	%rdi, %rax
L(start):
	cmpq	$16, %rdx
	jb	L(copy_0_15)

	/* These loads are always useful: they cover the entire copy
	   for sizes 16..32 and provide the head/tail vectors
	   otherwise.  */
	movups	0(%rsi), %xmm0
	movups	-16(%rsi, %rdx), %xmm7
	cmpq	$32, %rdx
	ja	L(more_2x_vec)

	movups	%xmm0, 0(%rdi)
	movups	%xmm7, -16(%rdi, %rdx)
	ret

	.p2align 4,, 4
L(copy_0_15):
	cmpl	$4, %edx
	jb	L(copy_0_3)
	cmpl	$8, %edx
	jb	L(copy_4_7)
	movq	0(%rsi), %rcx
	movq	-8(%rsi, %rdx), %rsi
	movq	%rcx, 0(%rdi)
	movq	%rsi, -8(%rdi, %rdx)
	ret

	.p2align 4,, 4
L(copy_4_7):
	movl	0(%rsi), %ecx
	movl	-4(%rsi, %rdx), %esi
	movl	%ecx, 0(%rdi)
	movl	%esi, -4(%rdi, %rdx)
	ret

	.p2align 4,, 4
97L(copy_0_3):
	decl	%edx
	jl	L(copy_0_0)
	movb	(%rsi), %cl
	je	L(copy_1_1)

	movzwl	-1(%rsi, %rdx), %esi
	movw	%si, -1(%rdi, %rdx)
L(copy_1_1):
	movb	%cl, (%rdi)
L(copy_0_0):
	ret

	.p2align 4,, 4
L(copy_4x_vec):
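	/* 32 < len <= 64: copy with four (possibly overlapping)
	   16-byte loads/stores.  */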
	movups	16(%rsi), %xmm1
	movups	-32(%rsi, %rdx), %xmm2

	movups	%xmm0, 0(%rdi)
	movups	%xmm1, 16(%rdi)
	movups	%xmm2, -32(%rdi, %rdx)
	movups	%xmm7, -16(%rdi, %rdx)
L(nop):
	ret

	.p2align 4
L(more_2x_vec):
	cmpq	$64, %rdx
	jbe	L(copy_4x_vec)

	/* rcx is used later to compute the `palignr` shift amount.  */
	movq	%rdi, %rcx

	/* Copy backward when dst is above src and within len bytes of
	   it (overlapping), as required for memmove safety.  */
	subq	%rsi, %rcx
	cmpq	%rdx, %rcx
	jb	L(copy_backward)

	/* Load tail.  */

	/* -16(%rsi, %rdx) already loaded into xmm7.  */
	movups	-32(%rsi, %rdx), %xmm8
	movups	-48(%rsi, %rdx), %xmm9

	/* Get misalignment.  */
	andl	$0xf, %ecx

	movq	%rsi, %r9
	addq	%rcx, %rsi
	andq	$-16, %rsi
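	/* Advance rsi by the misalignment and round it down so the
	   aligned loads in the loop, recombined with `palignr $ecx`,
	   yield exactly the source bytes needed for the aligned stores
	   once rdi is rounded down below.  */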
	/* Get first vec for `palignr`.  */
	movaps	(%rsi), %xmm1

	/* The first 16 bytes of src are already in xmm0, so this store
	   is safe before the loop even if it overwrites source bytes.  */
	movups	%xmm0, (%rdi)

# ifdef SHARED_CACHE_SIZE_HALF
	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
# else
	cmp	__x86_shared_cache_size_half(%rip), %rdx
# endif
	ja	L(large_memcpy)

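	/* Dispatch on the misalignment: every forward loop below is
	   padded to exactly 64 bytes, so the jump target is
	   L(loop_fwd_start) + ecx * 64.  r8 points 64 bytes before the
	   end of dst (loop bound and base for the tail stores) and rdx
	   becomes the 48-byte per-iteration stride.  */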
	leaq	-64(%rdi, %rdx), %r8
	andq	$-16, %rdi
	movl	$48, %edx

	leaq	L(loop_fwd_start)(%rip), %r9
	sall	$6, %ecx
	addq	%r9, %rcx
	jmp	* %rcx

	.p2align 4,, 8
L(copy_backward):
	testq	%rcx, %rcx
	jz	L(nop)

	/* Preload the first 48 bytes of src; the backward loop stores
	   them last, at the head of dst.  */

	/* (%rsi) already loaded into xmm0.  */
	movups	16(%rsi), %xmm4
	movups	32(%rsi), %xmm5

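	/* r8 preserves the original dst for the final head/tail
	   stores.  rdi is moved to the highest 16-byte aligned address
	   whose 48-byte block still fits below the end of dst; rsi is
	   moved by the same amount and then aligned down, the
	   remaining offset being handled by `palignr $ecx` in the
	   loop.  */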
	movq	%rdi, %r8
	subq	%rdi, %rsi
	leaq	-49(%rdi, %rdx), %rdi
	andq	$-16, %rdi
	addq	%rdi, %rsi
	andq	$-16, %rsi

	movaps	48(%rsi), %xmm6


	leaq	L(loop_bkwd_start)(%rip), %r9
	andl	$0xf, %ecx
	sall	$6, %ecx
	addq	%r9, %rcx
	jmp	* %rcx

	.p2align 4,, 8
L(large_memcpy):
	movups	-64(%r9, %rdx), %xmm10
	movups	-80(%r9, %rdx), %xmm11

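	/* The large loops below are spaced exactly 96 bytes apart, so
	   the dispatch offset is misalignment * 96 (ecx * 32 * 3).
	   rcx becomes the loop bound, 96 bytes before the end of dst,
	   and the base for the five tail stores.  */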
	sall	$5, %ecx
	leal	(%rcx, %rcx, 2), %r8d
	leaq	-96(%rdi, %rdx), %rcx
	andq	$-16, %rdi
	leaq	L(large_loop_fwd_start)(%rip), %rdx
	addq	%r8, %rdx
	jmp	* %rdx


	/* Instead of a typical jump table, all 16 loops are exactly 64
	   bytes in size, so we can jump straight to the first loop +
	   misalignment * 64.  Before modifying any loop ensure all
	   their sizes match!  */
	.p2align 6
L(loop_fwd_start):
L(loop_fwd_0x0):
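	/* 0x0: dst and src are mutually aligned; no `palignr` needed.  */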
	movaps	16(%rsi), %xmm1
	movaps	32(%rsi), %xmm2
	movaps	48(%rsi), %xmm3
	movaps	%xmm1, 16(%rdi)
	movaps	%xmm2, 32(%rdi)
	movaps	%xmm3, 48(%rdi)
	addq	%rdx, %rdi
	addq	%rdx, %rsi
	cmpq	%rdi, %r8
	ja	L(loop_fwd_0x0)
L(end_loop_fwd):
	movups	%xmm9, 16(%r8)
	movups	%xmm8, 32(%r8)
	movups	%xmm7, 48(%r8)
	ret

	/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
	   60 bytes otherwise.  */
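	/* xmm1 carries the aligned vector at (%rsi) across iterations
	   so `palignr` can combine adjacent source vectors into
	   dst-aligned data.  */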
# define ALIGNED_LOOP_FWD(align_by);	\
	.p2align 6;	\
L(loop_fwd_ ## align_by):	\
	movaps	16(%rsi), %xmm0;	\
	movaps	32(%rsi), %xmm2;	\
	movaps	48(%rsi), %xmm3;	\
	movaps	%xmm3, %xmm4;	\
	palignr	$align_by, %xmm2, %xmm3;	\
	palignr	$align_by, %xmm0, %xmm2;	\
	palignr	$align_by, %xmm1, %xmm0;	\
	movaps	%xmm4, %xmm1;	\
	movaps	%xmm0, 16(%rdi);	\
	movaps	%xmm2, 32(%rdi);	\
	movaps	%xmm3, 48(%rdi);	\
	addq	%rdx, %rdi;	\
	addq	%rdx, %rsi;	\
	cmpq	%rdi, %r8;	\
	ja	L(loop_fwd_ ## align_by);	\
	jmp	L(end_loop_fwd);

	/* Must be in descending order.  */
	ALIGNED_LOOP_FWD (0xf)
	ALIGNED_LOOP_FWD (0xe)
	ALIGNED_LOOP_FWD (0xd)
	ALIGNED_LOOP_FWD (0xc)
	ALIGNED_LOOP_FWD (0xb)
	ALIGNED_LOOP_FWD (0xa)
	ALIGNED_LOOP_FWD (0x9)
	ALIGNED_LOOP_FWD (0x8)
	ALIGNED_LOOP_FWD (0x7)
	ALIGNED_LOOP_FWD (0x6)
	ALIGNED_LOOP_FWD (0x5)
	ALIGNED_LOOP_FWD (0x4)
	ALIGNED_LOOP_FWD (0x3)
	ALIGNED_LOOP_FWD (0x2)
	ALIGNED_LOOP_FWD (0x1)

	.p2align 6
L(large_loop_fwd_start):
L(large_loop_fwd_0x0):
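	/* Copies larger than half the shared cache size use
	   non-temporal stores to avoid displacing useful cache
	   lines.  */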
	movaps	16(%rsi), %xmm1
	movaps	32(%rsi), %xmm2
	movaps	48(%rsi), %xmm3
	movaps	64(%rsi), %xmm4
	movaps	80(%rsi), %xmm5
	movntps	%xmm1, 16(%rdi)
	movntps	%xmm2, 32(%rdi)
	movntps	%xmm3, 48(%rdi)
	movntps	%xmm4, 64(%rdi)
	movntps	%xmm5, 80(%rdi)
	addq	$80, %rdi
	addq	$80, %rsi
	cmpq	%rdi, %rcx
	ja	L(large_loop_fwd_0x0)

	/* Ensure no icache line split on tail.  */
	.p2align 4
L(end_large_loop_fwd):
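	/* The movntps stores above are weakly ordered; fence them
	   before the ordinary tail stores below.  */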
	sfence
	movups	%xmm11, 16(%rcx)
	movups	%xmm10, 32(%rcx)
	movups	%xmm9, 48(%rcx)
	movups	%xmm8, 64(%rcx)
	movups	%xmm7, 80(%rcx)
	ret


	/* Each loop body is more than 64 bytes and at most 96 bytes;
	   the 32-byte alignment between them ensures exactly 96-byte
	   spacing between each.  */
# define ALIGNED_LARGE_LOOP_FWD(align_by);	\
	.p2align 5;	\
L(large_loop_fwd_ ## align_by):	\
	movaps	16(%rsi), %xmm0;	\
	movaps	32(%rsi), %xmm2;	\
	movaps	48(%rsi), %xmm3;	\
	movaps	64(%rsi), %xmm4;	\
	movaps	80(%rsi), %xmm5;	\
	movaps	%xmm5, %xmm6;	\
	palignr	$align_by, %xmm4, %xmm5;	\
	palignr	$align_by, %xmm3, %xmm4;	\
	palignr	$align_by, %xmm2, %xmm3;	\
	palignr	$align_by, %xmm0, %xmm2;	\
	palignr	$align_by, %xmm1, %xmm0;	\
	movaps	%xmm6, %xmm1;	\
	movntps	%xmm0, 16(%rdi);	\
	movntps	%xmm2, 32(%rdi);	\
	movntps	%xmm3, 48(%rdi);	\
	movntps	%xmm4, 64(%rdi);	\
	movntps	%xmm5, 80(%rdi);	\
	addq	$80, %rdi;	\
	addq	$80, %rsi;	\
	cmpq	%rdi, %rcx;	\
	ja	L(large_loop_fwd_ ## align_by);	\
	jmp	L(end_large_loop_fwd);

	/* Must be in descending order.  */
	ALIGNED_LARGE_LOOP_FWD (0xf)
	ALIGNED_LARGE_LOOP_FWD (0xe)
	ALIGNED_LARGE_LOOP_FWD (0xd)
	ALIGNED_LARGE_LOOP_FWD (0xc)
	ALIGNED_LARGE_LOOP_FWD (0xb)
	ALIGNED_LARGE_LOOP_FWD (0xa)
	ALIGNED_LARGE_LOOP_FWD (0x9)
	ALIGNED_LARGE_LOOP_FWD (0x8)
	ALIGNED_LARGE_LOOP_FWD (0x7)
	ALIGNED_LARGE_LOOP_FWD (0x6)
	ALIGNED_LARGE_LOOP_FWD (0x5)
	ALIGNED_LARGE_LOOP_FWD (0x4)
	ALIGNED_LARGE_LOOP_FWD (0x3)
	ALIGNED_LARGE_LOOP_FWD (0x2)
	ALIGNED_LARGE_LOOP_FWD (0x1)


	.p2align 6
L(loop_bkwd_start):
L(loop_bkwd_0x0):
	movaps	32(%rsi), %xmm1
	movaps	16(%rsi), %xmm2
	movaps	0(%rsi), %xmm3
	movaps	%xmm1, 32(%rdi)
	movaps	%xmm2, 16(%rdi)
	movaps	%xmm3, 0(%rdi)
	subq	$48, %rdi
	subq	$48, %rsi
	cmpq	%rdi, %r8
	jb	L(loop_bkwd_0x0)
L(end_loop_bkwd):
	movups	%xmm7, -16(%r8, %rdx)
	movups	%xmm0, 0(%r8)
	movups	%xmm4, 16(%r8)
	movups	%xmm5, 32(%r8)

	ret


	/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
	   60 bytes otherwise.  */
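	/* xmm6 carries the aligned vector at 48(%rsi) across
	   iterations of the backward `palignr` chain; it is preloaded
	   before the dispatch.  */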
# define ALIGNED_LOOP_BKWD(align_by);	\
	.p2align 6;	\
L(loop_bkwd_ ## align_by):	\
	movaps	32(%rsi), %xmm1;	\
	movaps	16(%rsi), %xmm2;	\
	movaps	0(%rsi), %xmm3;	\
	palignr	$align_by, %xmm1, %xmm6;	\
	palignr	$align_by, %xmm2, %xmm1;	\
	palignr	$align_by, %xmm3, %xmm2;	\
	movaps	%xmm6, 32(%rdi);	\
	movaps	%xmm1, 16(%rdi);	\
	movaps	%xmm2, 0(%rdi);	\
	subq	$48, %rdi;	\
	subq	$48, %rsi;	\
	movaps	%xmm3, %xmm6;	\
	cmpq	%rdi, %r8;	\
	jb	L(loop_bkwd_ ## align_by);	\
	jmp	L(end_loop_bkwd);

	/* Must be in descending order.  */
	ALIGNED_LOOP_BKWD (0xf)
	ALIGNED_LOOP_BKWD (0xe)
	ALIGNED_LOOP_BKWD (0xd)
	ALIGNED_LOOP_BKWD (0xc)
	ALIGNED_LOOP_BKWD (0xb)
	ALIGNED_LOOP_BKWD (0xa)
	ALIGNED_LOOP_BKWD (0x9)
	ALIGNED_LOOP_BKWD (0x8)
	ALIGNED_LOOP_BKWD (0x7)
	ALIGNED_LOOP_BKWD (0x6)
	ALIGNED_LOOP_BKWD (0x5)
	ALIGNED_LOOP_BKWD (0x4)
	ALIGNED_LOOP_BKWD (0x3)
	ALIGNED_LOOP_BKWD (0x2)
	ALIGNED_LOOP_BKWD (0x1)
END(MEMMOVE)

strong_alias (MEMMOVE, MEMCPY)
# if defined SHARED
strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
# endif
#endif