/* strcpy with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY  __strcpy_evex
#  endif

# endif

# define VMOVU		vmovdqu64
# define VMOVA		vmovdqa64

/* Number of bytes in a vector register */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# define XMM2		xmm18
# define XMM3		xmm19

# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22
# define YMM7		ymm23
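/* Note: ymm16-ymm23 (and xmm16-xmm19) can only be encoded with EVEX;
   using them instead of ymm0-ymm15 means no VZEROUPPER is needed
   before returning.  */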

# ifndef USE_AS_STRCAT

/* zero register */
#  define XMMZERO	xmm16
#  define YMMZERO	ymm16
#  define YMM1		ymm17

	.section .text.evex,"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax      /* save result */
#  endif

	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# endif

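/* %ecx now holds the source offset within a (VEC_SIZE * 4) block.  If
   it is at most VEC_SIZE * 2, the first two vector loads can be done
   unaligned without crossing that block; otherwise align the source
   down to VEC_SIZE and mask off the bytes before the string.  */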
	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

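	/* Scan the first aligned vector for a null byte; shift the mask
	   right by the source misalignment so that bytes before the
	   start of the string are ignored.  */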
	vpcmpb	$0, (%rsi), %YMMZERO, %k0
	kmovd	%k0, %edx
	shr	%cl, %rdx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
	kmovd	%k1, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
	VMOVU	%YMM2, (%rdi)

/* If source address alignment != destination address alignment */
	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
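	/* For strncpy, add the source misalignment back into the
	   remaining count (the checks below count whole vectors from
	   the aligned source); the sbb/or pair saturates %r8 to all
	   ones if the addition carries.  */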
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	VMOVA	(%rsi, %rcx), %YMM2
	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
	vpcmpb	$0, %YMM4, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM4, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM2, (%rdi, %rcx)
	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
	VMOVU	%YMM2, (%rdi, %rcx)
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	VMOVU	%YMM3, (%rdi, %rcx)
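	/* Rebase for the main loop: round %rsi down to a (VEC_SIZE * 4)
	   boundary, shift %rdi by the same amount, and (for strncpy)
	   adjust the remaining count to match the new source
	   position.  */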
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
L(UnalignedFourVecSizeLoop):
	VMOVA	(%rsi), %YMM4
	VMOVA	VEC_SIZE(%rsi), %YMM5
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
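	/* The byte-wise minimum of the four vectors has a zero byte iff
	   at least one of them does, so a single compare below checks
	   VEC_SIZE * 4 bytes at a time.  */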
	vpminub	%YMM5, %YMM4, %YMM2
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
	VMOVA	(%rsi), %YMM4
	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
	VMOVA	VEC_SIZE(%rsi), %YMM5
	vpminub	%YMM5, %YMM4, %YMM2
	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
	VMOVU	%YMM7, -VEC_SIZE(%rdi)
	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
	vpminub	%YMM7, %YMM6, %YMM3
	vpminub	%YMM2, %YMM3, %YMM2
	/* If K7 != 0, there is a null byte.  */
	vpcmpb	$0, %YMM2, %YMMZERO, %k7
	kmovd	%k7, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

L(UnalignedFourVecSizeLeave):
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %ecx
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* If source address alignment == destination address alignment */

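/* Reached when the source offset within a (VEC_SIZE * 4) block is at
   most VEC_SIZE * 2, so the two unaligned loads below cannot cross
   that block (and hence cannot fault past the string's page).  */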
L(SourceStringAlignmentLessTwoVecSize):
	VMOVU	(%rsi), %YMM3
	VMOVU	VEC_SIZE(%rsi), %YMM2
	vpcmpb	$0, %YMM3, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	VMOVU	%YMM3, (%rdi)
	vpcmpb	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*------End of main part with loops---------------------*/

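/* Exit-path naming used below: Case1 - a null byte ends the copy
   before the strncpy limit; Case2 - a null byte and the limit fall in
   the same vector, so whichever comes first decides; Case3 - the
   limit is reached without finding a null byte.  */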
/* Case1 */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	VMOVU	%YMM4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	VMOVU	%YMM4, (%rdi)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	VMOVU	%YMM6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	VMOVU	%YMM5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	VMOVU	%YMM4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	VMOVU	%YMM3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case2 */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Case2 or Case3,  Case3 */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*-----------End of labels for copying 1..VEC_SIZE bytes and 1..(VEC_SIZE * 2) bytes-----------*/

	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit16_31):
	VMOVU	(%rsi), %XMM2
	VMOVU	-15(%rsi, %rdx), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

	.p2align 4
L(Exit32_63):
	VMOVU	(%rsi), %YMM2
	VMOVU	-31(%rsi, %rdx), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	ret

# ifdef USE_AS_STRNCPY

	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	ret

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	ret

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit17_32):
	VMOVU	(%rsi), %XMM2
	VMOVU	-16(%rsi, %r8), %XMM3
	VMOVU	%XMM2, (%rdi)
	VMOVU	%XMM3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit33_64):
	/* Copy the first VEC_SIZE bytes and the last VEC_SIZE bytes of
	   the %r8-byte tail; the two stores may overlap.  */
	VMOVU	(%rsi), %YMM2
	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	ret

	.p2align 4
L(StrncpyExit65):
	/* 0/32, 32/32, 64/1 */
	VMOVU	(%rsi), %YMM2
	VMOVU	32(%rsi), %YMM3
	mov	64(%rsi), %cl
	VMOVU	%YMM2, (%rdi)
	VMOVU	%YMM3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	ret

#  ifndef USE_AS_STRCAT

	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	ret

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	ret

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	ret

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	ret

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	ret

	.p2align 4
L(Fill17_32):
	VMOVU	%XMMZERO, (%rdi)
	VMOVU	%XMMZERO, -16(%rdi, %r8)
	ret

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	VMOVU	%YMM2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#   endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

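/* The terminated string has been copied; for strncpy, %r8 is the
   number of destination bytes that still have to be zeroed.  Use
   aligned vector stores for the bulk of the fill.  */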
	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	VMOVU	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	VMOVA	%YMMZERO, (%rdi)
	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	VMOVA	%YMMZERO, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
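	/* At most VEC_SIZE bytes remain.  %rdx was cleared above, so
	   the scalar stores below write zeros; each case covers exactly
	   %r8 bytes, the larger ones with two overlapping stores.  */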
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	ret

/* end of ifndef USE_AS_STRCAT */
#  endif

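/* The strncpy limit was hit inside the four-vector loop.  If a null
   byte was also seen (Case2), the four vectors are rechecked one at a
   time below; otherwise (Case3) the whole vectors that still fit are
   stored and L(StrncpyExit) handles the remaining partial vector.  */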
	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	VMOVU	%YMM4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	ret

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpb	$0, %YMM4, %YMMZERO, %k1
	kmovd	%k1, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpb	$0, %YMM5, %YMMZERO, %k2
	kmovd	%k2, %edx
	VMOVU	%YMM4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpb	$0, %YMM6, %YMMZERO, %k3
	kmovd	%k3, %edx
	VMOVU	%YMM5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpb	$0, %YMM7, %YMMZERO, %k4
	kmovd	%k4, %edx
	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
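/* The limit is reached before a null byte: copy exactly %r8 source
   bytes without adding a terminator (strcat still stores its own
   trailing null).  The dispatch below handles 0 to (VEC_SIZE * 2) + 1
   bytes.  */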
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	ret

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	ret

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif