/* strcpy with AVX2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)


# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY  __strcpy_avx2
#  endif

# endif

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

/* zero register */
#define xmmZ	xmm0
#define ymmZ	ymm0

/* mask register */
#define ymmM	ymm1
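
/* The string is scanned a vector at a time: VPCMPEQB against the
   all-zero register %ymmZ marks the null bytes, VPMOVMSKB turns that
   into a bitmask in a general register, and BSF yields the offset of
   the first null byte.  For strncpy the remaining length is tracked
   in %r8.  */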

# ifndef USE_AS_STRCAT

	.section SECTION(.text),"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax      /* save result */
#  endif

# endif

	vpxor	%xmmZ, %xmmZ, %xmmZ

	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

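	/* %rsi was rounded down to a VEC_SIZE boundary above and %ecx
	   holds the offset of the string within that vector; shift the
	   null-byte mask right by %cl so that bit 0 corresponds to the
	   first byte of the string.  */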
	vpcmpeqb (%rsi), %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb %ymm2, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
	vmovdqu %ymm2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
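	/* Rebase %rdi so that the same index register addresses both the
	   VEC_SIZE-aligned source and the destination.  For strncpy the
	   remaining count in %r8 grows by the same amount; the sbb/or
	   pair below saturates it to all ones if that addition
	   overflows.  */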
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	vmovdqa (%rsi, %rcx), %ymm2
	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm4, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm2, (%rdi, %rcx)
	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu %ymm2, (%rdi, %rcx)
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu %ymm3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
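
/* Main loop: process four aligned vectors per iteration.  VPMINUB
   folds them into one vector, so a single compare against %ymmM
   (known to be all zero when this loop is entered) and one mask
   extraction detect a null byte anywhere in the 4 * VEC_SIZE
   bytes.  */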
L(UnalignedFourVecSizeLoop):
	vmovdqa (%rsi), %ymm4
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm5, %ymm4, %ymm2
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa (%rsi), %ymm4
	vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa VEC_SIZE(%rsi), %ymm5
	vpminub %ymm5, %ymm4, %ymm2
	vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu %ymm7, -VEC_SIZE(%rdi)
	vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
	vpminub %ymm7, %ymm6, %ymm3
	vpminub %ymm2, %ymm3, %ymm3
	vpcmpeqb %ymmM, %ymm3, %ymm3
	vpmovmskb %ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %ecx
	bsf	%ecx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu (%rsi), %ymm3
	vmovdqu VEC_SIZE(%rsi), %ymm2
	vpcmpeqb %ymm3, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	vmovdqu %ymm3, (%rdi)
	vpcmpeqb %ymm2, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

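/* For strncpy three situations are distinguished below.  Case 1: a
   null byte terminates the copy before the length limit is reached.
   Case 2: a null byte was found in the vector just examined, but the
   length limit may come first.  Case 3: the length limit is reached
   without having seen a null byte.  */
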
/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	vmovdqu %ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	vmovdqu %ymm4, (%rdi)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
# endif
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu %ymm6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu %ymm5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu %ymm4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu %ymm3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Case2 or Case3,  Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*------------ End of labels for copying 1..VEC_SIZE and 1..(VEC_SIZE * 2) bytes ------------*/

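/* The L(ExitN)/L(ExitM_N) labels below are reached with the offset of
   the null byte in %rdx; they copy %rdx + 1 bytes including the
   terminator (the larger cases use two possibly overlapping loads and
   stores).  For stpcpy, %rax is set to the address of the terminator;
   for strncpy, any remaining destination bytes are then zero-filled
   via L(StrncpyFillTailWithZero).  */
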
	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit16_31):
	vmovdqu (%rsi), %xmm2
	vmovdqu -15(%rsi, %rdx), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(Exit32_63):
	vmovdqu (%rsi), %ymm2
	vmovdqu -31(%rsi, %rdx), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNCPY

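/* The L(StrncpyExit*) labels are reached when the length limit is hit
   before a null byte: exactly %r8 bytes are copied and no terminator
   is appended (the strcat variant does store a trailing zero, and
   stpcpy returns the address just past the last byte copied).  */
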
	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu (%rsi), %xmm2
	vmovdqu -16(%rsi, %r8), %xmm3
	vmovdqu %xmm2, (%rdi)
	vmovdqu %xmm3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit33_64):
	/*  0/32, (%r8 - 32)/32 */
	vmovdqu (%rsi), %ymm2
	vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	vmovdqu (%rsi), %ymm2
	vmovdqu 32(%rsi), %ymm3
	mov	64(%rsi), %cl
	vmovdqu %ymm2, (%rdi)
	vmovdqu %ymm3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	VZEROUPPER_RETURN

#  ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(Fill17_32):
	vmovdqu %xmmZ, (%rdi)
	vmovdqu %xmmZ, -16(%rdi, %r8)
	VZEROUPPER_RETURN

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu %ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#   endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

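/* Zero-fill the remaining strncpy destination bytes: one unaligned
   vector store, then %rdi is rounded down to a VEC_SIZE boundary (the
   bytes skipped were already zeroed) and whole vectors are cleared,
   four per iteration; L(Fill) finishes the last few bytes using the
   zero value in %rdx and %xmmZ.  */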
	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	vmovdqu %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	vmovdqa %ymmZ, (%rdi)
	vmovdqa %ymmZ, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa %ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	VZEROUPPER_RETURN

/* end of ifndef USE_AS_STRCAT */
#  endif

	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	vmovdqu %ymm4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpeqb %ymm4, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpeqb %ymm5, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm6, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb %ymm7, %ymmZ, %ymmM
	vpmovmskb %ymmM, %edx
	vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
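/* %r8 holds the number of bytes that may still be written (at most 65
   here); dispatch to the exit that copies exactly that many bytes.  */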
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	VZEROUPPER_RETURN

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif