/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include "asm-syntax.h"

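/* This variant never executes VZEROUPPER: on Knights Landing the
   SSE/AVX transition penalty that VZEROUPPER avoids does not apply,
   and the instruction itself is expensive there.  */
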
	.section .text.avx512,"ax",@progbits
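/* Each __*_chk entry checks that the copy length fits in the
   destination buffer (size in %rcx) and then falls through into the
   corresponding unchecked entry point.  */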
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (__mempcpy_avx512_no_vzeroupper)

ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)

ENTRY (__memmove_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
	add	%RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits of the length.  */
	mov	%edx, %edx
# endif
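	/* %rcx and %r9 point one past the end of the source and the
	   destination; the tail copies below address memory backwards
	   relative to them.  */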
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)

L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
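	/* 257..512 bytes: load both the first and the last 256 bytes
	   before storing anything, so overlapping source and
	   destination are handled.  */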
	vmovups	(%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups -0xC0(%rcx), %zmm5
	vmovups -0x80(%rcx), %zmm6
	vmovups -0x40(%rcx), %zmm7
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups %zmm5, -0xC0(%r9)
	vmovups %zmm6, -0x80(%r9)
	vmovups %zmm7, -0x40(%r9)
	ret

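/* Each case below copies the first and the last part of the block
   with the widest registers that fit, issuing all loads before any
   store so that overlapping buffers are copied correctly.  */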
L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups -0x80(%rcx), %zmm2
	vmovups -0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, -0x80(%r9)
	vmovups %zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu (%rsi), %ymm0
	vmovdqu 0x20(%rsi), %ymm1
	vmovdqu -0x40(%rcx), %ymm2
	vmovdqu -0x20(%rcx), %ymm3
	vmovdqu %ymm0, (%rdi)
	vmovdqu %ymm1, 0x20(%rdi)
	vmovdqu %ymm2, -0x40(%r9)
	vmovdqu %ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu -0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu (%rsi), %xmm0
	vmovdqu -0x10(%rcx), %xmm1
	vmovdqu %xmm0, (%rdi)
	vmovdqu %xmm1, -0x10(%r9)
	ret

L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

L(512bytesormore):
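	/* Copies of at least half the shared cache size use
	   non-temporal stores to avoid polluting the cache.  */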
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
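	/* 513..1024 bytes: prefetch both halves, then copy the first
	   and the last 512 bytes; all loads complete before the first
	   store.  */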
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups 0x140(%rsi), %zmm5
	vmovups 0x180(%rsi), %zmm6
	vmovups 0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups -0x1C0(%rcx), %zmm9
	vmovups -0x180(%rcx), %zmm10
	vmovups -0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups -0xC0(%rcx), %zmm13
	vmovups -0x80(%rcx), %zmm14
	vmovups -0x40(%rcx), %zmm15
	vmovups %zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups %zmm4, 0x100(%rdi)
	vmovups %zmm5, 0x140(%rdi)
	vmovups %zmm6, 0x180(%rdi)
	vmovups %zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups %zmm9, -0x1C0(%r9)
	vmovups %zmm10, -0x180(%r9)
	vmovups %zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups %zmm13, -0xC0(%r9)
	vmovups %zmm14, -0x80(%r9)
	vmovups %zmm15, -0x40(%r9)
	ret

L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
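	/* Forward copy (destination below source).  Preload the final
	   512 source bytes into %zmm8-%zmm15: when the buffers overlap
	   closely, the loop's stores could clobber them before the
	   tail is copied.  */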
	sub	$512, %r9
	vmovups -0x200(%rcx), %zmm8
	vmovups -0x1C0(%rcx), %zmm9
	vmovups -0x180(%rcx), %zmm10
	vmovups -0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups -0xC0(%rcx), %zmm13
	vmovups -0x80(%rcx), %zmm14
	vmovups -0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups 0x40(%rsi), %zmm1
	vmovups 0x80(%rsi), %zmm2
	vmovups 0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups 0x140(%rsi), %zmm5
	vmovups 0x180(%rsi), %zmm6
	vmovups 0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups %zmm1, 0x40(%rdi)
	vmovups %zmm2, 0x80(%rdi)
	vmovups %zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups %zmm5, 0x140(%rdi)
	vmovups %zmm6, 0x180(%rdi)
	vmovups %zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	vmovups %zmm8, (%r9)
	vmovups %zmm9, 0x40(%r9)
	vmovups %zmm10, 0x80(%r9)
	vmovups %zmm11, 0xC0(%r9)
	vmovups %zmm12, 0x100(%r9)
	vmovups %zmm13, 0x140(%r9)
	vmovups %zmm14, 0x180(%r9)
	vmovups %zmm15, 0x1C0(%r9)
	ret

L(1024bytesormore_bkw):
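	/* Backward copy (destination above source).  Preload the first
	   512 source bytes into %zmm8-%zmm15 for the same reason and
	   store them last.  */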
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups 0x180(%rsi), %zmm9
	vmovups 0x140(%rsi), %zmm10
	vmovups 0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups 0x80(%rsi), %zmm13
	vmovups 0x40(%rsi), %zmm14
	vmovups (%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups -0x40(%rcx), %zmm0
	vmovups -0x80(%rcx), %zmm1
	vmovups -0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups -0x140(%rcx), %zmm4
	vmovups -0x180(%rcx), %zmm5
	vmovups -0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups %zmm0, -0x40(%r9)
	vmovups %zmm1, -0x80(%r9)
	vmovups %zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups %zmm4, -0x140(%r9)
	vmovups %zmm5, -0x180(%r9)
	vmovups %zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	vmovups %zmm8, -0x40(%rdi)
	vmovups %zmm9, -0x80(%rdi)
	vmovups %zmm10, -0xC0(%rdi)
	vmovups %zmm11, -0x100(%rdi)
	vmovups %zmm12, -0x140(%rdi)
	vmovups %zmm13, -0x180(%rdi)
	vmovups %zmm14, -0x1C0(%rdi)
	vmovups %zmm15, -0x200(%rdi)
	ret

L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
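	/* Forward path.  Save the first 128 source bytes in %zmm4 and
	   %zmm5; storing them after the loop covers the bytes skipped
	   when the destination is rounded up to a 128-byte boundary
	   below.  */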
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	mov	%rdi, %r11
/* Align destination for access with non-temporal stores in the loop.  */
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
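	/* Non-temporal stores are weakly ordered; fence before the
	   ordinary stores that fill in the saved head and the tail
	   handled by L(check).  */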
	sfence
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)

L(preloop_large_bkw):
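	/* Backward path.  Save the last 128 source bytes in %zmm4 and
	   %zmm5; storing them at the end covers the bytes skipped when
	   the destination end is rounded down to a 128-byte
	   boundary.  */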
	vmovups -0x80(%rcx), %zmm4
	vmovups -0x40(%rcx), %zmm5

/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0,	-0x100(%r9)
	vmovntdq %zmm1,	-0xC0(%r9)
	vmovntdq %zmm2,	-0x80(%r9)
	vmovntdq %zmm3,	-0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (__memmove_avx512_no_vzeroupper)

strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
#endif