/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store, copying from either 2 or 4 pages at
      once.
   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  */
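
/* An illustrative C-level sketch of the size dispatch above, for
   orientation only (this is not the exact control flow; thresholds
   are runtime tunables):

	if (size < VEC_SIZE)                   small byte/word/xmm copy
	else if (size <= 2 * VEC_SIZE)         two overlapping VEC moves
	else if (have_erms
		 && size > __x86_rep_movsb_threshold
		 && size < __x86_rep_movsb_stop_threshold)  rep movsb
	else if (size <= 8 * VEC_SIZE)         copy via 4-8 VEC registers
	else if (size >= __x86_shared_non_temporal_threshold && !overlap)
		non-temporal 2-page or 4-page copy
	else
		4 * VEC loop, forward or backward depending on overlap
		and 4k aliasing.  */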

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0				xmm0
#endif

#ifndef YMM0
# define YMM0				ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Whether to align before movsb.  Ultimately we want 64-byte
   alignment, and it is not worth loading 4x VEC to get there when
   VEC_SIZE == 16.  */
#define ALIGN_MOVSB	(VEC_SIZE > 16)
/* Number of bytes to align movsb to.  */
#define MOVSB_ALIGN_TO	64

#define SMALL_MOV_SIZE	(MOV_SIZE <= 4)
#define LARGE_MOV_SIZE	(MOV_SIZE > 4)

#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif

#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET	(4)
#else
# define SMALL_SIZE_OFFSET	(0)
#endif
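
/* With MOV_SIZE > 4 the L(less_vec) code below subtracts 4 from the
   length before handling the final 0-3 bytes, so SMALL_SIZE_OFFSET
   compensates for that bias in the comparisons and addressing
   there.  */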

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes loaded from each page per iteration of the large_memcpy
   inner loops.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift __x86_shared_non_temporal_threshold by for the
   bound for memcpy_large_4x.  This is essentially used to indicate
   that the copy is far beyond the scope of L3 (assuming no
   user-configured x86_non_temporal_threshold) and to use a more
   aggressively unrolled loop.  NB: before increasing the value also
   update the initialization of x86_non_temporal_threshold.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif

/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

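/* PREFETCH_ONE_SET (dir, base, offset) prefetches one
   PREFETCHED_LOAD_SIZE-byte load set starting at offset(base),
   stepping PREFETCH_SIZE bytes at a time in direction dir (+1 for
   the forward copies in this file).  */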
#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#   error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

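/* LOAD_ONE_SET loads one LARGE_LOAD_SIZE block starting at
   offset(base) into the given vector registers with unaligned loads;
   STORE_ONE_SET writes such a block back with non-temporal stores
   (VMOVNT).  */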
#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
	VMOVNT  vec0, (offset)base; \
	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVU	(offset)base, vec0; \
	VMOVU	((offset) + VEC_SIZE)base, vec1; \
	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
	VMOVNT	vec0, (offset)base; \
	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
#if !(defined USE_MULTIARCH && IS_IN (libc))
	ZERO_UPPER_VEC_REGISTERS_RETURN
#else
	VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	/* Load regardless.  */
	VMOVU	(%rsi), %VEC(0)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
# if VEC_SIZE > 16
	ZERO_UPPER_VEC_REGISTERS_RETURN
# else
	ret
# endif
#endif

#if LARGE_MOV_SIZE
	/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
	   ENTRY block and L(less_vec).  */
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	(%rsi), %ecx
	movl	(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, (%rdi, %rdx)
	ret
#endif

	.p2align 4
L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpl	$32, %edx
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpl	$16, %edx
	jae	L(between_16_31)
#endif
	cmpl	$8, %edx
	jae	L(between_8_15)
#if SMALL_MOV_SIZE
	cmpl	$4, %edx
#else
	subq	$4, %rdx
#endif
	jae	L(between_4_7)
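	/* The remaining size is 0 to 3 bytes (rdx holds it biased by
	   -SMALL_SIZE_OFFSET when MOV_SIZE > 4).  Copy it with a byte
	   move plus an overlapping word move.  */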
	cmpl	$(1 - SMALL_SIZE_OFFSET), %edx
	jl	L(copy_0)
	movb	(%rsi), %cl
	je	L(copy_1)
	movzwl	(-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
	movw	%si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
	movb	%cl, (%rdi)
L(copy_0):
	ret

#if SMALL_MOV_SIZE
	.p2align 4,, 8
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi, %rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi, %rdx)
	movl	%esi, (%rdi)
	ret
#endif

#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
	.p2align 4,, 8
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi, %rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi, %rdx)
	/* No ymm registers have been touched.  */
	ret
#endif

#if VEC_SIZE > 32
	.p2align 4,, 10
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	VMOVU	(%rsi), %YMM0
	VMOVU	-32(%rsi, %rdx), %YMM1
	VMOVU	%YMM0, (%rdi)
	VMOVU	%YMM1, -32(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif

	.p2align 4,, 10
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi, %rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rsi, (%rdi)
	movq	%rcx, -8(%rdi, %rdx)
	ret

	.p2align 4,, 10
L(last_4x_vec):
	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */

	/* VEC(0) and VEC(1) have already been loaded.  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmp	__x86_rep_movsb_threshold(%rip), %RDX_LP
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	/* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)
	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi, %rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(more_8x_vec):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* Always use the backward temporal copy if dst > src and the
	   regions overlap: backward REP MOVSB is slow and NT stores
	   must not be used when there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
	/* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
	ja	L(large_memcpy_2x)
#endif
	/* At this point the case of overlap with dst > src has been
	   handled.  The remaining overlap case is src > dst, for which
	   correctness requires a forward copy.  Otherwise decide
	   between backward/forward copy depending on address
	   aliasing.  */

	/* Entry if rdx is greater than or equal to
	   __x86_rep_movsb_stop_threshold but less than
	   __x86_shared_non_temporal_threshold.  */
L(more_8x_vec_check):
	/* rcx contains dst - src.  Add back length (rdx).  */
	leaq	(%rcx, %rdx), %r8
	/* If r8 has a different sign than rcx then there is overlap so
	   we must do a forward copy.  */
	xorq	%rcx, %r8
	/* Isolate just the sign bit of r8.  */
	shrq	$63, %r8
	/* Get 4k difference dst - src.  */
	andl	$(PAGE_SIZE - 256), %ecx
	/* If r8 is non-zero we must copy forward for correctness.
	   Otherwise, if ecx is also zero the page offsets of dst and
	   src collide (4k false aliasing), so the backward copy is
	   used; a non-zero result falls through to the forward
	   copy.  */
	addl	%r8d, %ecx
	jz	L(more_8x_vec_backward)
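	/* For example (at this point dst > src implies no overlap):
	   src = dst + 100 with len = 1000 gives rcx = -100 and r8 =
	   900, the signs differ, so the forward copy is forced.
	   dst = src + 8192 has matching signs and (dst - src) & 0xf00
	   == 0, so the backward copy is taken to dodge 4k aliasing.
	   dst = src + 5000 has (dst - src) & 0xf00 != 0, so it falls
	   through to the forward copy.  */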

	/* Also entered if rdx is greater than
	   __x86_shared_non_temporal_threshold but there is overlap, or
	   from the short-distance movsb check.  */
L(more_8x_vec_forward):
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */

	/* First vec was already loaded into VEC(0).  */
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	/* Save beginning of dst.  */
	movq	%rdi, %rcx
	/* Align dst to VEC_SIZE - 1 (the incq below completes the
	   round-up to VEC_SIZE).  */
	orq	$(VEC_SIZE - 1), %rdi
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)

	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rcx, %rsi
	/* Finish aligning dst.  */
	incq	%rdi
	/* Restore src adjusted with the new value for aligned dst.  */
	addq	%rdi, %rsi
	/* Store end of buffer minus tail in rdx.  */
	leaq	(VEC_SIZE * -4)(%rcx, %rdx), %rdx
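
	/* The loop below stores aligned 4 * VEC blocks; VEC(0) (the
	   unaligned head) and VEC(5)-VEC(8) (the unaligned tail) were
	   loaded above and are stored after the loop, which keeps
	   overlapping copies correct.  */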

	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(1)
	VMOVU	VEC_SIZE(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
	subq	$-(VEC_SIZE * 4), %rsi
	VMOVA	%VEC(1), (%rdi)
	VMOVA	%VEC(2), VEC_SIZE(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(4), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpq	%rdi, %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (VEC_SIZE * 3)(%rdx)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdx)
	VMOVU	%VEC(7), VEC_SIZE(%rdx)
	VMOVU	%VEC(8), (%rdx)
	/* Store the first VEC.  */
	VMOVU	%VEC(0), (%rcx)
	/* Keep L(nop_backward) target close to jmp for 2-byte encoding.
	 */
L(nop_backward):
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(more_8x_vec_backward_check_nop):
	/* rcx contains dst - src.  Test for dst == src to skip all of
	   memmove.  */
	testq	%rcx, %rcx
	jz	L(nop_backward)
L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */

	/* First vec was also loaded into VEC(0).  */
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	/* Beginning of region for 4x backward copy stored in rcx.  */
	leaq	(VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(8)
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Align dst.  */
	andq	$-(VEC_SIZE), %rcx
	/* Restore src.  */
	addq	%rcx, %rsi
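
	/* The loop below stores aligned 4 * VEC blocks downward from
	   rcx toward dst; VEC(0) and VEC(5)-VEC(7) (the head) and
	   VEC(8) (the tail) were loaded above and are stored after the
	   loop so overlapping copies stay correct.  */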

	/* Don't use multi-byte nop to align.  */
	.p2align 4,, 11
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(3)
	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(4)
	addq	$(VEC_SIZE * -4), %rsi
	VMOVA	%VEC(1), (VEC_SIZE * 3)(%rcx)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(3), (VEC_SIZE * 1)(%rcx)
	VMOVA	%VEC(4), (VEC_SIZE * 0)(%rcx)
	addq	$(VEC_SIZE * -4), %rcx
	cmpq	%rcx, %rdi
	jb	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), -VEC_SIZE(%rdx, %rdi)
	VZEROUPPER_RETURN

#if defined USE_MULTIARCH && IS_IN (libc)
	/* L(skip_short_movsb_check) is only used with ERMS.  Not for
	   FSRM.  */
	.p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* If the CPU does not have FSRM there are two options for
	   aligning: align src if dst and src 4k alias, otherwise align
	   dst.  */
	testl	$(PAGE_SIZE - 512), %ecx
	jnz	L(movsb_align_dst)
	/* Fall through.  dst and src 4k alias.  It's better to align
	   src here because the bottleneck will be the loads due to the
	   false dependency on dst.  */

	/* rcx already has dst - src.  */
	movq	%rcx, %r9
	/* Add src to len.  Subtract back after src aligned.  -1 because
	   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
	leaq	-1(%rsi, %rdx), %rcx
	/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
	orq	$(MOVSB_ALIGN_TO - 1), %rsi
	/* Restore dst and len adjusted with the new value for aligned
	   src.  */
	leaq	1(%rsi, %r9), %rdi
	subq	%rsi, %rcx
	/* Finish aligning src.  */
	incq	%rsi

	rep	movsb

	VMOVU	%VEC(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4,, 12
L(movsb):
	movq	%rdi, %rcx
	subq	%rsi, %rcx
	/* Always use the backward temporal copy if dst > src and the
	   regions overlap: backward REP MOVSB is slow and NT stores
	   must not be used when there is overlap.  */
	cmpq	%rdx, %rcx
	/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
	jb	L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
	/* Save dst for storing the aligning VECs later.  */
	movq	%rdi, %r8
# endif
	/* If above __x86_rep_movsb_stop_threshold this is most likely
	   also a candidate for NT moves.  */
	cmp	__x86_rep_movsb_stop_threshold(%rip), %RDX_LP
	jae	L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
	/* Only avoid short movsb if CPU has FSRM.  */
	testl	$X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
	jz	L(skip_short_movsb_check)
#  if AVOID_SHORT_DISTANCE_REP_MOVSB
	/* Avoid "rep movsb" if RCX, the distance between source and
	   destination, is N*4GB + [1..63] with N >= 0.  */

	/* ecx contains dst - src.  Since the backward-copy conditions
	   were checked earlier, the only remaining case of slow movsb,
	   src = dst + [0, 63], has ecx in [-63, 0].  Use an unsigned
	   comparison with -64 to check for it.  */
	cmpl	$-64, %ecx
	ja	L(more_8x_vec_forward)
#  endif
# endif
# if ALIGN_MOVSB
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
#  endif
#  if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
#   error Unsupported MOVSB_ALIGN_TO
#  endif
	/* Falling through means the CPU has FSRM.  In that case align
	   only the destination.  */
L(movsb_align_dst):
	/* Subtract dst from src.  Add back after dst aligned.  */
	subq	%rdi, %rsi
	/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
	addq	$(MOVSB_ALIGN_TO - 1), %rdi
	/* Add dst to len.  Subtract back after dst aligned.  */
	leaq	(%r8, %rdx), %rcx
	/* Finish aligning dst.  */
	andq	$-(MOVSB_ALIGN_TO), %rdi
	/* Restore src and len adjusted with the new value for aligned
	   dst.  */
	addq	%rdi, %rsi
	subq	%rdi, %rcx

	rep	movsb

	/* Store the VECs loaded for aligning.  */
	VMOVU	%VEC(0), (%r8)
#  if MOVSB_ALIGN_TO > VEC_SIZE
	VMOVU	%VEC(1), VEC_SIZE(%r8)
#  endif
	VZEROUPPER_RETURN
# else	/* !ALIGN_MOVSB.  */
L(skip_short_movsb_check):
	mov	%RDX_LP, %RCX_LP
	rep	movsb
	ret
# endif
#endif

	.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
	/* Entry from L(large_memcpy_2x) has a redundant load of
	   __x86_shared_non_temporal_threshold(%rip).
	   L(large_memcpy_2x) is only used for the non-erms memmove,
	   which is generally less common.  */
L(large_memcpy_2x):
	mov	__x86_shared_non_temporal_threshold(%rip), %R11_LP
	cmp	%R11_LP, %RDX_LP
	jb	L(more_8x_vec_check)
	/* To reach this point overlap with dst > src is impossible.
	   The remaining case to check is overlap with src > dst.  rcx
	   already contains dst - src; negate it to get src - dst.  If
	   length > rcx then there is overlap and the forward copy is
	   best.  */
	negq	%rcx
	cmpq	%rcx, %rdx
	ja	L(more_8x_vec_forward)

	/* Cache-align the destination.  First store the first 64
	   bytes, then adjust the alignment.  */

	/* First vec was also loaded into VEC(0).  */
# if VEC_SIZE < 64
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
#  if VEC_SIZE < 32
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
#  endif
# endif
	VMOVU	%VEC(0), (%rdi)
# if VEC_SIZE < 64
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
#  if VEC_SIZE < 32
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
#  endif
# endif

	/* Adjust source, destination, and size.  */
	movq	%rdi, %r8
	andq	$63, %r8
	/* Get the negative of the offset for alignment.  */
	subq	$64, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
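
	/* r8 is (dst & 63) - 64: minus the number of bytes by which dst
	   is advanced to the next 64-byte boundary (a full 64 if it was
	   already aligned).  Those bytes were covered by the stores
	   above, so src moves by the same amount and len shrinks to
	   match.  */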

	/* Test if source and destination addresses will alias.  If
	   they do, the larger pipeline in large_memcpy_4x alleviates
	   the performance drop.  */

	/* ecx contains -(dst - src).  notl %ecx gives dst - src - 1,
	   which works for testing aliasing.  */
	notl	%ecx
	movq	%rdx, %r10
	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
	jz	L(large_memcpy_4x)

	/* r11 has __x86_shared_non_temporal_threshold.  Shift it left
	   by LOG_4X_MEMCPY_THRESH to get the L(large_memcpy_4x)
	   threshold.  */
	shlq	$LOG_4X_MEMCPY_THRESH, %r11
	cmp	%r11, %rdx
	jae	L(large_memcpy_4x)

	/* edx will store the remainder size for copying the tail.  */
	andl	$(PAGE_SIZE * 2 - 1), %edx
	/* r10 stores the outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 1), %r10
	/* Copy 4x VEC at a time from 2 pages.  */
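	/* Each outer-loop iteration interleaves one PAGE_SIZE stream
	   starting at offset 0 and one starting at offset PAGE_SIZE,
	   LARGE_LOAD_SIZE bytes at a time with non-temporal stores, so
	   2 * PAGE_SIZE bytes are copied per iteration; r10 counts
	   those 2-page blocks and edx keeps the tail.  */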
	.p2align 4
L(loop_large_memcpy_2x_outer):
	/* ecx stores the inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_2x_inner)
	addq	$PAGE_SIZE, %rdi
	addq	$PAGE_SIZE, %rsi
	decq	%r10
	jne	L(loop_large_memcpy_2x_outer)
	sfence

	/* Check if only the last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_2x_end)

	/* Handle the last 2 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_2x_tail):
	/* Copy 4 * VEC at a time forward with aligned stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_2x_tail)

L(large_memcpy_2x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN

	.p2align 4
L(large_memcpy_4x):
	/* edx will store the remainder size for copying the tail.  */
	andl	$(PAGE_SIZE * 4 - 1), %edx
	/* r10 stores the outer loop counter.  */
	shrq	$(LOG_PAGE_SIZE + 2), %r10
	/* Copy 4x VEC at a time from 4 pages.  */
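	/* Same scheme as the 2-page loop above but interleaving four
	   PAGE_SIZE streams, so each outer-loop iteration copies
	   4 * PAGE_SIZE bytes.  */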
	.p2align 4
L(loop_large_memcpy_4x_outer):
	/* ecx stores the inner loop counter.  */
	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
	/* Only one prefetch set per page, as doing 4 pages gives the
	   prefetcher more time to keep up.  */
	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
	/* Load vectors from rsi.  */
	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rsi
	/* Non-temporal store vectors to rdi.  */
	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
	subq	$-LARGE_LOAD_SIZE, %rdi
	decl	%ecx
	jnz	L(loop_large_memcpy_4x_inner)
	addq	$(PAGE_SIZE * 3), %rdi
	addq	$(PAGE_SIZE * 3), %rsi
	decq	%r10
	jne	L(loop_large_memcpy_4x_outer)
	sfence
	/* Check if only the last 4 loads are needed.  */
	cmpl	$(VEC_SIZE * 4), %edx
	jbe	L(large_memcpy_4x_end)

	/* Handle the last 4 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_4x_tail):
	/* Copy 4 * VEC at a time forward with aligned stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	subq	$-(VEC_SIZE * 4), %rsi
	addl	$-(VEC_SIZE * 4), %edx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	subq	$-(VEC_SIZE * 4), %rdi
	cmpl	$(VEC_SIZE * 4), %edx
	ja	L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
	/* Store the last 4 * VEC.  */
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)

	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
	VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))