1/* strcpy with SSE2 and unaligned load
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <isa-level.h>
20
21/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
22   so we need this to build for ISA V2 builds. */
23#if ISA_SHOULD_BUILD (2)
24
25
26# ifndef USE_AS_STRCAT
27#  include <sysdep.h>
28
29#  ifndef STRCPY
30#   define STRCPY  __strcpy_sse2_unaligned
31#  endif
32
33# endif
34
35# define JMPTBL(I, B)	I - B
36# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)             \
37	lea	TABLE(%rip), %r11;                              \
38	movslq	(%r11, INDEX, SCALE), %rcx;                     \
39	lea	(%r11, %rcx), %rcx;                             \
40	_CET_NOTRACK jmp *%rcx
41
42# ifndef USE_AS_STRCAT
43
44.text
45ENTRY (STRCPY)
46#  ifdef USE_AS_STRNCPY
47	mov	%RDX_LP, %R8_LP
48	test	%R8_LP, %R8_LP
49	jz	L(ExitZero)
50#  endif
51	mov	%rsi, %rcx
52#  ifndef USE_AS_STPCPY
53	mov	%rdi, %rax      /* save result */
54#  endif
55
56# endif
57
58	and	$63, %rcx
59	cmp	$32, %rcx
60	jbe	L(SourceStringAlignmentLess32)
61
62	and	$-16, %rsi
63	and	$15, %rcx
64	pxor	%xmm0, %xmm0
65	pxor	%xmm1, %xmm1
66
67	pcmpeqb	(%rsi), %xmm1
68	pmovmskb %xmm1, %rdx
69	shr	%cl, %rdx
70
71# ifdef USE_AS_STRNCPY
72#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
73	mov	$16, %r10
74	sub	%rcx, %r10
75	cmp	%r10, %r8
76#  else
77	mov	$17, %r10
78	sub	%rcx, %r10
79	cmp	%r10, %r8
80#  endif
81	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
82# endif
83	test	%rdx, %rdx
84	jnz	L(CopyFrom1To16BytesTail)
85
86	pcmpeqb	16(%rsi), %xmm0
87	pmovmskb %xmm0, %rdx
88
89# ifdef USE_AS_STRNCPY
90	add	$16, %r10
91	cmp	%r10, %r8
92	jbe	L(CopyFrom1To32BytesCase2OrCase3)
93# endif
94	test	%rdx, %rdx
95	jnz	L(CopyFrom1To32Bytes)
96
97	movdqu	(%rsi, %rcx), %xmm1   /* copy 16 bytes */
98	movdqu	%xmm1, (%rdi)
99
100/* If source address alignment != destination address alignment */
101	.p2align 4
102L(Unalign16Both):
103	sub	%rcx, %rdi
104# ifdef USE_AS_STRNCPY
105	add	%rcx, %r8
106	sbb	%rcx, %rcx
107	or	%rcx, %r8
108# endif
109	mov	$16, %rcx
110	movdqa	(%rsi, %rcx), %xmm1
111	movaps	16(%rsi, %rcx), %xmm2
112	movdqu	%xmm1, (%rdi, %rcx)
113	pcmpeqb	%xmm2, %xmm0
114	pmovmskb %xmm0, %rdx
115	add	$16, %rcx
116# ifdef USE_AS_STRNCPY
117	sub	$48, %r8
118	jbe	L(CopyFrom1To16BytesCase2OrCase3)
119# endif
120	test	%rdx, %rdx
121# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
122	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
123# else
124	jnz	L(CopyFrom1To16Bytes)
125# endif
126
127	movaps	16(%rsi, %rcx), %xmm3
128	movdqu	%xmm2, (%rdi, %rcx)
129	pcmpeqb	%xmm3, %xmm0
130	pmovmskb %xmm0, %rdx
131	add	$16, %rcx
132# ifdef USE_AS_STRNCPY
133	sub	$16, %r8
134	jbe	L(CopyFrom1To16BytesCase2OrCase3)
135# endif
136	test	%rdx, %rdx
137# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
138	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
139# else
140	jnz	L(CopyFrom1To16Bytes)
141# endif
142
143	movaps	16(%rsi, %rcx), %xmm4
144	movdqu	%xmm3, (%rdi, %rcx)
145	pcmpeqb	%xmm4, %xmm0
146	pmovmskb %xmm0, %rdx
147	add	$16, %rcx
148# ifdef USE_AS_STRNCPY
149	sub	$16, %r8
150	jbe	L(CopyFrom1To16BytesCase2OrCase3)
151# endif
152	test	%rdx, %rdx
153# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
154	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
155# else
156	jnz	L(CopyFrom1To16Bytes)
157# endif
158
159	movaps	16(%rsi, %rcx), %xmm1
160	movdqu	%xmm4, (%rdi, %rcx)
161	pcmpeqb	%xmm1, %xmm0
162	pmovmskb %xmm0, %rdx
163	add	$16, %rcx
164# ifdef USE_AS_STRNCPY
165	sub	$16, %r8
166	jbe	L(CopyFrom1To16BytesCase2OrCase3)
167# endif
168	test	%rdx, %rdx
169# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
170	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
171# else
172	jnz	L(CopyFrom1To16Bytes)
173# endif
174
175	movaps	16(%rsi, %rcx), %xmm2
176	movdqu	%xmm1, (%rdi, %rcx)
177	pcmpeqb	%xmm2, %xmm0
178	pmovmskb %xmm0, %rdx
179	add	$16, %rcx
180# ifdef USE_AS_STRNCPY
181	sub	$16, %r8
182	jbe	L(CopyFrom1To16BytesCase2OrCase3)
183# endif
184	test	%rdx, %rdx
185# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
186	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
187# else
188	jnz	L(CopyFrom1To16Bytes)
189# endif
190
191	movaps	16(%rsi, %rcx), %xmm3
192	movdqu	%xmm2, (%rdi, %rcx)
193	pcmpeqb	%xmm3, %xmm0
194	pmovmskb %xmm0, %rdx
195	add	$16, %rcx
196# ifdef USE_AS_STRNCPY
197	sub	$16, %r8
198	jbe	L(CopyFrom1To16BytesCase2OrCase3)
199# endif
200	test	%rdx, %rdx
201# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
202	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
203# else
204	jnz	L(CopyFrom1To16Bytes)
205# endif
206
207	movdqu	%xmm3, (%rdi, %rcx)
208	mov	%rsi, %rdx
209	lea	16(%rsi, %rcx), %rsi
210	and	$-0x40, %rsi
211	sub	%rsi, %rdx
212	sub	%rdx, %rdi
213# ifdef USE_AS_STRNCPY
214	lea	128(%r8, %rdx), %r8
215# endif
216L(Unaligned64Loop):
217	movaps	(%rsi), %xmm2
218	movaps	%xmm2, %xmm4
219	movaps	16(%rsi), %xmm5
220	movaps	32(%rsi), %xmm3
221	movaps	%xmm3, %xmm6
222	movaps	48(%rsi), %xmm7
223	pminub	%xmm5, %xmm2
224	pminub	%xmm7, %xmm3
225	pminub	%xmm2, %xmm3
226	pcmpeqb	%xmm0, %xmm3
227	pmovmskb %xmm3, %rdx
228# ifdef USE_AS_STRNCPY
229	sub	$64, %r8
230	jbe	L(UnalignedLeaveCase2OrCase3)
231# endif
232	test	%rdx, %rdx
233	jnz	L(Unaligned64Leave)
234
235L(Unaligned64Loop_start):
236	add	$64, %rdi
237	add	$64, %rsi
238	movdqu	%xmm4, -64(%rdi)
239	movaps	(%rsi), %xmm2
240	movdqa	%xmm2, %xmm4
241	movdqu	%xmm5, -48(%rdi)
242	movaps	16(%rsi), %xmm5
243	pminub	%xmm5, %xmm2
244	movaps	32(%rsi), %xmm3
245	movdqu	%xmm6, -32(%rdi)
246	movaps	%xmm3, %xmm6
247	movdqu	%xmm7, -16(%rdi)
248	movaps	48(%rsi), %xmm7
249	pminub	%xmm7, %xmm3
250	pminub	%xmm2, %xmm3
251	pcmpeqb	%xmm0, %xmm3
252	pmovmskb %xmm3, %rdx
253# ifdef USE_AS_STRNCPY
254	sub	$64, %r8
255	jbe	L(UnalignedLeaveCase2OrCase3)
256# endif
257	test	%rdx, %rdx
258	jz	L(Unaligned64Loop_start)
259
260L(Unaligned64Leave):
261	pxor	%xmm1, %xmm1
262
263	pcmpeqb	%xmm4, %xmm0
264	pcmpeqb	%xmm5, %xmm1
265	pmovmskb %xmm0, %rdx
266	pmovmskb %xmm1, %rcx
267	test	%rdx, %rdx
268	jnz	L(CopyFrom1To16BytesUnaligned_0)
269	test	%rcx, %rcx
270	jnz	L(CopyFrom1To16BytesUnaligned_16)
271
272	pcmpeqb	%xmm6, %xmm0
273	pcmpeqb	%xmm7, %xmm1
274	pmovmskb %xmm0, %rdx
275	pmovmskb %xmm1, %rcx
276	test	%rdx, %rdx
277	jnz	L(CopyFrom1To16BytesUnaligned_32)
278
279	bsf	%rcx, %rdx
280	movdqu	%xmm4, (%rdi)
281	movdqu	%xmm5, 16(%rdi)
282	movdqu	%xmm6, 32(%rdi)
283# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
284# ifdef USE_AS_STPCPY
285	lea	48(%rdi, %rdx), %rax
286# endif
287	movdqu	%xmm7, 48(%rdi)
288	add	$15, %r8
289	sub	%rdx, %r8
290	lea	49(%rdi, %rdx), %rdi
291	jmp	L(StrncpyFillTailWithZero)
292# else
293	add	$48, %rsi
294	add	$48, %rdi
295	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
296# endif
297
298/* If source address alignment == destination address alignment */
299
300L(SourceStringAlignmentLess32):
301	pxor	%xmm0, %xmm0
302	movdqu	(%rsi), %xmm1
303	movdqu	16(%rsi), %xmm2
304	pcmpeqb	%xmm1, %xmm0
305	pmovmskb %xmm0, %rdx
306
307# ifdef USE_AS_STRNCPY
308#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
309	cmp	$16, %r8
310#  else
311	cmp	$17, %r8
312#  endif
313	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
314# endif
315	test	%rdx, %rdx
316	jnz	L(CopyFrom1To16BytesTail1)
317
318	pcmpeqb	%xmm2, %xmm0
319	movdqu	%xmm1, (%rdi)
320	pmovmskb %xmm0, %rdx
321
322# ifdef USE_AS_STRNCPY
323#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
324	cmp	$32, %r8
325#  else
326	cmp	$33, %r8
327#  endif
328	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
329# endif
330	test	%rdx, %rdx
331	jnz	L(CopyFrom1To32Bytes1)
332
333	and	$-16, %rsi
334	and	$15, %rcx
335	jmp	L(Unalign16Both)
336
337/*------End of main part with loops---------------------*/
338
339/* Case1 */
340
341# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
342	.p2align 4
343L(CopyFrom1To16Bytes):
344	add	%rcx, %rdi
345	add	%rcx, %rsi
346	bsf	%rdx, %rdx
347	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
348# endif
349	.p2align 4
350L(CopyFrom1To16BytesTail):
351	add	%rcx, %rsi
352	bsf	%rdx, %rdx
353	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
354
355	.p2align 4
356L(CopyFrom1To32Bytes1):
357	add	$16, %rsi
358	add	$16, %rdi
359# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
360	sub	$16, %r8
361# endif
362L(CopyFrom1To16BytesTail1):
363	bsf	%rdx, %rdx
364	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
365
366	.p2align 4
367L(CopyFrom1To32Bytes):
368	bsf	%rdx, %rdx
369	add	%rcx, %rsi
370	add	$16, %rdx
371	sub	%rcx, %rdx
372	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
373
374	.p2align 4
375L(CopyFrom1To16BytesUnaligned_0):
376	bsf	%rdx, %rdx
377# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
378# ifdef USE_AS_STPCPY
379	lea	(%rdi, %rdx), %rax
380# endif
381	movdqu	%xmm4, (%rdi)
382	add	$63, %r8
383	sub	%rdx, %r8
384	lea	1(%rdi, %rdx), %rdi
385	jmp	L(StrncpyFillTailWithZero)
386# else
387	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
388# endif
389
390	.p2align 4
391L(CopyFrom1To16BytesUnaligned_16):
392	bsf	%rcx, %rdx
393	movdqu	%xmm4, (%rdi)
394# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
395# ifdef USE_AS_STPCPY
396	lea	16(%rdi, %rdx), %rax
397# endif
398	movdqu	%xmm5, 16(%rdi)
399	add	$47, %r8
400	sub	%rdx, %r8
401	lea	17(%rdi, %rdx), %rdi
402	jmp	L(StrncpyFillTailWithZero)
403# else
404	add	$16, %rsi
405	add	$16, %rdi
406	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
407# endif
408
409	.p2align 4
410L(CopyFrom1To16BytesUnaligned_32):
411	bsf	%rdx, %rdx
412	movdqu	%xmm4, (%rdi)
413	movdqu	%xmm5, 16(%rdi)
414# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
415# ifdef USE_AS_STPCPY
416	lea	32(%rdi, %rdx), %rax
417# endif
418	movdqu	%xmm6, 32(%rdi)
419	add	$31, %r8
420	sub	%rdx, %r8
421	lea	33(%rdi, %rdx), %rdi
422	jmp	L(StrncpyFillTailWithZero)
423# else
424	add	$32, %rsi
425	add	$32, %rdi
426	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
427# endif
428
429# ifdef USE_AS_STRNCPY
430#  ifndef USE_AS_STRCAT
431	.p2align 4
432L(CopyFrom1To16BytesUnalignedXmm6):
433	movdqu	%xmm6, (%rdi, %rcx)
434	jmp	L(CopyFrom1To16BytesXmmExit)
435
436	.p2align 4
437L(CopyFrom1To16BytesUnalignedXmm5):
438	movdqu	%xmm5, (%rdi, %rcx)
439	jmp	L(CopyFrom1To16BytesXmmExit)
440
441	.p2align 4
442L(CopyFrom1To16BytesUnalignedXmm4):
443	movdqu	%xmm4, (%rdi, %rcx)
444	jmp	L(CopyFrom1To16BytesXmmExit)
445
446	.p2align 4
447L(CopyFrom1To16BytesUnalignedXmm3):
448	movdqu	%xmm3, (%rdi, %rcx)
449	jmp	L(CopyFrom1To16BytesXmmExit)
450
451	.p2align 4
452L(CopyFrom1To16BytesUnalignedXmm1):
453	movdqu	%xmm1, (%rdi, %rcx)
454	jmp	L(CopyFrom1To16BytesXmmExit)
455#  endif
456
457	.p2align 4
458L(CopyFrom1To16BytesExit):
459	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
460
461/* Case2 */
462
463	.p2align 4
464L(CopyFrom1To16BytesCase2):
465	add	$16, %r8
466	add	%rcx, %rdi
467	add	%rcx, %rsi
468	bsf	%rdx, %rdx
469	cmp	%r8, %rdx
470	jb	L(CopyFrom1To16BytesExit)
471	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
472
473	.p2align 4
474L(CopyFrom1To32BytesCase2):
475	add	%rcx, %rsi
476	bsf	%rdx, %rdx
477	add	$16, %rdx
478	sub	%rcx, %rdx
479	cmp	%r8, %rdx
480	jb	L(CopyFrom1To16BytesExit)
481	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
482
483L(CopyFrom1To16BytesTailCase2):
484	add	%rcx, %rsi
485	bsf	%rdx, %rdx
486	cmp	%r8, %rdx
487	jb	L(CopyFrom1To16BytesExit)
488	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
489
490L(CopyFrom1To16BytesTail1Case2):
491	bsf	%rdx, %rdx
492	cmp	%r8, %rdx
493	jb	L(CopyFrom1To16BytesExit)
494	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
495
496/* Case2 or Case3,  Case3 */
497
498	.p2align 4
499L(CopyFrom1To16BytesCase2OrCase3):
500	test	%rdx, %rdx
501	jnz	L(CopyFrom1To16BytesCase2)
502L(CopyFrom1To16BytesCase3):
503	add	$16, %r8
504	add	%rcx, %rdi
505	add	%rcx, %rsi
506	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
507
508	.p2align 4
509L(CopyFrom1To32BytesCase2OrCase3):
510	test	%rdx, %rdx
511	jnz	L(CopyFrom1To32BytesCase2)
512	add	%rcx, %rsi
513	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
514
515	.p2align 4
516L(CopyFrom1To16BytesTailCase2OrCase3):
517	test	%rdx, %rdx
518	jnz	L(CopyFrom1To16BytesTailCase2)
519	add	%rcx, %rsi
520	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
521
522	.p2align 4
523L(CopyFrom1To32Bytes1Case2OrCase3):
524	add	$16, %rdi
525	add	$16, %rsi
526	sub	$16, %r8
527L(CopyFrom1To16BytesTail1Case2OrCase3):
528	test	%rdx, %rdx
529	jnz	L(CopyFrom1To16BytesTail1Case2)
530	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
531
532# endif
533
534/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/
535
536	.p2align 4
537L(Exit1):
538	mov	%dh, (%rdi)
539# ifdef USE_AS_STPCPY
540	lea	(%rdi), %rax
541# endif
542# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
543	sub	$1, %r8
544	lea	1(%rdi), %rdi
545	jnz	L(StrncpyFillTailWithZero)
546# endif
547	ret
548
549	.p2align 4
550L(Exit2):
551	mov	(%rsi), %dx
552	mov	%dx, (%rdi)
553# ifdef USE_AS_STPCPY
554	lea	1(%rdi), %rax
555# endif
556# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
557	sub	$2, %r8
558	lea	2(%rdi), %rdi
559	jnz	L(StrncpyFillTailWithZero)
560# endif
561	ret
562
563	.p2align 4
564L(Exit3):
565	mov	(%rsi), %cx
566	mov	%cx, (%rdi)
567	mov	%dh, 2(%rdi)
568# ifdef USE_AS_STPCPY
569	lea	2(%rdi), %rax
570# endif
571# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
572	sub	$3, %r8
573	lea	3(%rdi), %rdi
574	jnz	L(StrncpyFillTailWithZero)
575# endif
576	ret
577
578	.p2align 4
579L(Exit4):
580	mov	(%rsi), %edx
581	mov	%edx, (%rdi)
582# ifdef USE_AS_STPCPY
583	lea	3(%rdi), %rax
584# endif
585# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
586	sub	$4, %r8
587	lea	4(%rdi), %rdi
588	jnz	L(StrncpyFillTailWithZero)
589# endif
590	ret
591
592	.p2align 4
593L(Exit5):
594	mov	(%rsi), %ecx
595	mov	%dh, 4(%rdi)
596	mov	%ecx, (%rdi)
597# ifdef USE_AS_STPCPY
598	lea	4(%rdi), %rax
599# endif
600# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
601	sub	$5, %r8
602	lea	5(%rdi), %rdi
603	jnz	L(StrncpyFillTailWithZero)
604# endif
605	ret
606
607	.p2align 4
608L(Exit6):
609	mov	(%rsi), %ecx
610	mov	4(%rsi), %dx
611	mov	%ecx, (%rdi)
612	mov	%dx, 4(%rdi)
613# ifdef USE_AS_STPCPY
614	lea	5(%rdi), %rax
615# endif
616# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
617	sub	$6, %r8
618	lea	6(%rdi), %rdi
619	jnz	L(StrncpyFillTailWithZero)
620# endif
621	ret
622
623	.p2align 4
624L(Exit7):
625	mov	(%rsi), %ecx
626	mov	3(%rsi), %edx
627	mov	%ecx, (%rdi)
628	mov	%edx, 3(%rdi)
629# ifdef USE_AS_STPCPY
630	lea	6(%rdi), %rax
631# endif
632# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
633	sub	$7, %r8
634	lea	7(%rdi), %rdi
635	jnz	L(StrncpyFillTailWithZero)
636# endif
637	ret
638
639	.p2align 4
640L(Exit8):
641	mov	(%rsi), %rdx
642	mov	%rdx, (%rdi)
643# ifdef USE_AS_STPCPY
644	lea	7(%rdi), %rax
645# endif
646# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
647	sub	$8, %r8
648	lea	8(%rdi), %rdi
649	jnz	L(StrncpyFillTailWithZero)
650# endif
651	ret
652
653	.p2align 4
654L(Exit9):
655	mov	(%rsi), %rcx
656	mov	%dh, 8(%rdi)
657	mov	%rcx, (%rdi)
658# ifdef USE_AS_STPCPY
659	lea	8(%rdi), %rax
660# endif
661# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
662	sub	$9, %r8
663	lea	9(%rdi), %rdi
664	jnz	L(StrncpyFillTailWithZero)
665# endif
666	ret
667
668	.p2align 4
669L(Exit10):
670	mov	(%rsi), %rcx
671	mov	8(%rsi), %dx
672	mov	%rcx, (%rdi)
673	mov	%dx, 8(%rdi)
674# ifdef USE_AS_STPCPY
675	lea	9(%rdi), %rax
676# endif
677# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
678	sub	$10, %r8
679	lea	10(%rdi), %rdi
680	jnz	L(StrncpyFillTailWithZero)
681# endif
682	ret
683
684	.p2align 4
685L(Exit11):
686	mov	(%rsi), %rcx
687	mov	7(%rsi), %edx
688	mov	%rcx, (%rdi)
689	mov	%edx, 7(%rdi)
690# ifdef USE_AS_STPCPY
691	lea	10(%rdi), %rax
692# endif
693# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
694	sub	$11, %r8
695	lea	11(%rdi), %rdi
696	jnz	L(StrncpyFillTailWithZero)
697# endif
698	ret
699
700	.p2align 4
701L(Exit12):
702	mov	(%rsi), %rcx
703	mov	8(%rsi), %edx
704	mov	%rcx, (%rdi)
705	mov	%edx, 8(%rdi)
706# ifdef USE_AS_STPCPY
707	lea	11(%rdi), %rax
708# endif
709# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
710	sub	$12, %r8
711	lea	12(%rdi), %rdi
712	jnz	L(StrncpyFillTailWithZero)
713# endif
714	ret
715
716	.p2align 4
717L(Exit13):
718	mov	(%rsi), %rcx
719	mov	5(%rsi), %rdx
720	mov	%rcx, (%rdi)
721	mov	%rdx, 5(%rdi)
722# ifdef USE_AS_STPCPY
723	lea	12(%rdi), %rax
724# endif
725# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
726	sub	$13, %r8
727	lea	13(%rdi), %rdi
728	jnz	L(StrncpyFillTailWithZero)
729# endif
730	ret
731
732	.p2align 4
733L(Exit14):
734	mov	(%rsi), %rcx
735	mov	6(%rsi), %rdx
736	mov	%rcx, (%rdi)
737	mov	%rdx, 6(%rdi)
738# ifdef USE_AS_STPCPY
739	lea	13(%rdi), %rax
740# endif
741# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
742	sub	$14, %r8
743	lea	14(%rdi), %rdi
744	jnz	L(StrncpyFillTailWithZero)
745# endif
746	ret
747
748	.p2align 4
749L(Exit15):
750	mov	(%rsi), %rcx
751	mov	7(%rsi), %rdx
752	mov	%rcx, (%rdi)
753	mov	%rdx, 7(%rdi)
754# ifdef USE_AS_STPCPY
755	lea	14(%rdi), %rax
756# endif
757# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
758	sub	$15, %r8
759	lea	15(%rdi), %rdi
760	jnz	L(StrncpyFillTailWithZero)
761# endif
762	ret
763
764	.p2align 4
765L(Exit16):
766	movdqu	(%rsi), %xmm0
767	movdqu	%xmm0, (%rdi)
768# ifdef USE_AS_STPCPY
769	lea	15(%rdi), %rax
770# endif
771# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
772	sub	$16, %r8
773	lea	16(%rdi), %rdi
774	jnz	L(StrncpyFillTailWithZero)
775# endif
776	ret
777
778	.p2align 4
779L(Exit17):
780	movdqu	(%rsi), %xmm0
781	movdqu	%xmm0, (%rdi)
782	mov	%dh, 16(%rdi)
783# ifdef USE_AS_STPCPY
784	lea	16(%rdi), %rax
785# endif
786# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
787	sub	$17, %r8
788	lea	17(%rdi), %rdi
789	jnz	L(StrncpyFillTailWithZero)
790# endif
791	ret
792
793	.p2align 4
794L(Exit18):
795	movdqu	(%rsi), %xmm0
796	mov	16(%rsi), %cx
797	movdqu	%xmm0, (%rdi)
798	mov	%cx, 16(%rdi)
799# ifdef USE_AS_STPCPY
800	lea	17(%rdi), %rax
801# endif
802# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
803	sub	$18, %r8
804	lea	18(%rdi), %rdi
805	jnz	L(StrncpyFillTailWithZero)
806# endif
807	ret
808
809	.p2align 4
810L(Exit19):
811	movdqu	(%rsi), %xmm0
812	mov	15(%rsi), %ecx
813	movdqu	%xmm0, (%rdi)
814	mov	%ecx, 15(%rdi)
815# ifdef USE_AS_STPCPY
816	lea	18(%rdi), %rax
817# endif
818# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
819	sub	$19, %r8
820	lea	19(%rdi), %rdi
821	jnz	L(StrncpyFillTailWithZero)
822# endif
823	ret
824
825	.p2align 4
826L(Exit20):
827	movdqu	(%rsi), %xmm0
828	mov	16(%rsi), %ecx
829	movdqu	%xmm0, (%rdi)
830	mov	%ecx, 16(%rdi)
831# ifdef USE_AS_STPCPY
832	lea	19(%rdi), %rax
833# endif
834# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
835	sub	$20, %r8
836	lea	20(%rdi), %rdi
837	jnz	L(StrncpyFillTailWithZero)
838# endif
839	ret
840
841	.p2align 4
842L(Exit21):
843	movdqu	(%rsi), %xmm0
844	mov	16(%rsi), %ecx
845	movdqu	%xmm0, (%rdi)
846	mov	%ecx, 16(%rdi)
847	mov	%dh, 20(%rdi)
848# ifdef USE_AS_STPCPY
849	lea	20(%rdi), %rax
850# endif
851# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
852	sub	$21, %r8
853	lea	21(%rdi), %rdi
854	jnz	L(StrncpyFillTailWithZero)
855# endif
856	ret
857
858	.p2align 4
859L(Exit22):
860	movdqu	(%rsi), %xmm0
861	mov	14(%rsi), %rcx
862	movdqu	%xmm0, (%rdi)
863	mov	%rcx, 14(%rdi)
864# ifdef USE_AS_STPCPY
865	lea	21(%rdi), %rax
866# endif
867# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
868	sub	$22, %r8
869	lea	22(%rdi), %rdi
870	jnz	L(StrncpyFillTailWithZero)
871# endif
872	ret
873
874	.p2align 4
875L(Exit23):
876	movdqu	(%rsi), %xmm0
877	mov	15(%rsi), %rcx
878	movdqu	%xmm0, (%rdi)
879	mov	%rcx, 15(%rdi)
880# ifdef USE_AS_STPCPY
881	lea	22(%rdi), %rax
882# endif
883# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
884	sub	$23, %r8
885	lea	23(%rdi), %rdi
886	jnz	L(StrncpyFillTailWithZero)
887# endif
888	ret
889
890	.p2align 4
891L(Exit24):
892	movdqu	(%rsi), %xmm0
893	mov	16(%rsi), %rcx
894	movdqu	%xmm0, (%rdi)
895	mov	%rcx, 16(%rdi)
896# ifdef USE_AS_STPCPY
897	lea	23(%rdi), %rax
898# endif
899# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
900	sub	$24, %r8
901	lea	24(%rdi), %rdi
902	jnz	L(StrncpyFillTailWithZero)
903# endif
904	ret
905
906	.p2align 4
907L(Exit25):
908	movdqu	(%rsi), %xmm0
909	mov	16(%rsi), %rcx
910	movdqu	%xmm0, (%rdi)
911	mov	%rcx, 16(%rdi)
912	mov	%dh, 24(%rdi)
913# ifdef USE_AS_STPCPY
914	lea	24(%rdi), %rax
915# endif
916# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
917	sub	$25, %r8
918	lea	25(%rdi), %rdi
919	jnz	L(StrncpyFillTailWithZero)
920# endif
921	ret
922
923	.p2align 4
924L(Exit26):
925	movdqu	(%rsi), %xmm0
926	mov	16(%rsi), %rdx
927	mov	24(%rsi), %cx
928	movdqu	%xmm0, (%rdi)
929	mov	%rdx, 16(%rdi)
930	mov	%cx, 24(%rdi)
931# ifdef USE_AS_STPCPY
932	lea	25(%rdi), %rax
933# endif
934# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
935	sub	$26, %r8
936	lea	26(%rdi), %rdi
937	jnz	L(StrncpyFillTailWithZero)
938# endif
939	ret
940
941	.p2align 4
942L(Exit27):
943	movdqu	(%rsi), %xmm0
944	mov	16(%rsi), %rdx
945	mov	23(%rsi), %ecx
946	movdqu	%xmm0, (%rdi)
947	mov	%rdx, 16(%rdi)
948	mov	%ecx, 23(%rdi)
949# ifdef USE_AS_STPCPY
950	lea	26(%rdi), %rax
951# endif
952# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
953	sub	$27, %r8
954	lea	27(%rdi), %rdi
955	jnz	L(StrncpyFillTailWithZero)
956# endif
957	ret
958
959	.p2align 4
960L(Exit28):
961	movdqu	(%rsi), %xmm0
962	mov	16(%rsi), %rdx
963	mov	24(%rsi), %ecx
964	movdqu	%xmm0, (%rdi)
965	mov	%rdx, 16(%rdi)
966	mov	%ecx, 24(%rdi)
967# ifdef USE_AS_STPCPY
968	lea	27(%rdi), %rax
969# endif
970# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
971	sub	$28, %r8
972	lea	28(%rdi), %rdi
973	jnz	L(StrncpyFillTailWithZero)
974# endif
975	ret
976
977	.p2align 4
978L(Exit29):
979	movdqu	(%rsi), %xmm0
980	movdqu	13(%rsi), %xmm2
981	movdqu	%xmm0, (%rdi)
982	movdqu	%xmm2, 13(%rdi)
983# ifdef USE_AS_STPCPY
984	lea	28(%rdi), %rax
985# endif
986# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
987	sub	$29, %r8
988	lea	29(%rdi), %rdi
989	jnz	L(StrncpyFillTailWithZero)
990# endif
991	ret
992
993	.p2align 4
994L(Exit30):
995	movdqu	(%rsi), %xmm0
996	movdqu	14(%rsi), %xmm2
997	movdqu	%xmm0, (%rdi)
998	movdqu	%xmm2, 14(%rdi)
999# ifdef USE_AS_STPCPY
1000	lea	29(%rdi), %rax
1001# endif
1002# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1003	sub	$30, %r8
1004	lea	30(%rdi), %rdi
1005	jnz	L(StrncpyFillTailWithZero)
1006# endif
1007	ret
1008
1009	.p2align 4
1010L(Exit31):
1011	movdqu	(%rsi), %xmm0
1012	movdqu	15(%rsi), %xmm2
1013	movdqu	%xmm0, (%rdi)
1014	movdqu	%xmm2, 15(%rdi)
1015# ifdef USE_AS_STPCPY
1016	lea	30(%rdi), %rax
1017# endif
1018# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1019	sub	$31, %r8
1020	lea	31(%rdi), %rdi
1021	jnz	L(StrncpyFillTailWithZero)
1022# endif
1023	ret
1024
1025	.p2align 4
1026L(Exit32):
1027	movdqu	(%rsi), %xmm0
1028	movdqu	16(%rsi), %xmm2
1029	movdqu	%xmm0, (%rdi)
1030	movdqu	%xmm2, 16(%rdi)
1031# ifdef USE_AS_STPCPY
1032	lea	31(%rdi), %rax
1033# endif
1034# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
1035	sub	$32, %r8
1036	lea	32(%rdi), %rdi
1037	jnz	L(StrncpyFillTailWithZero)
1038# endif
1039	ret
1040
1041# ifdef USE_AS_STRNCPY
1042
1043	.p2align 4
1044L(StrncpyExit0):
1045#  ifdef USE_AS_STPCPY
1046	mov	%rdi, %rax
1047#  endif
1048#  ifdef USE_AS_STRCAT
1049	xor	%ch, %ch
1050	movb	%ch, (%rdi)
1051#  endif
1052	ret
1053
1054	.p2align 4
1055L(StrncpyExit1):
1056	mov	(%rsi), %dl
1057	mov	%dl, (%rdi)
1058#  ifdef USE_AS_STPCPY
1059	lea	1(%rdi), %rax
1060#  endif
1061#  ifdef USE_AS_STRCAT
1062	xor	%ch, %ch
1063	movb	%ch, 1(%rdi)
1064#  endif
1065	ret
1066
1067	.p2align 4
1068L(StrncpyExit2):
1069	mov	(%rsi), %dx
1070	mov	%dx, (%rdi)
1071#  ifdef USE_AS_STPCPY
1072	lea	2(%rdi), %rax
1073#  endif
1074#  ifdef USE_AS_STRCAT
1075	xor	%ch, %ch
1076	movb	%ch, 2(%rdi)
1077#  endif
1078	ret
1079
1080	.p2align 4
1081L(StrncpyExit3):
1082	mov	(%rsi), %cx
1083	mov	2(%rsi), %dl
1084	mov	%cx, (%rdi)
1085	mov	%dl, 2(%rdi)
1086#  ifdef USE_AS_STPCPY
1087	lea	3(%rdi), %rax
1088#  endif
1089#  ifdef USE_AS_STRCAT
1090	xor	%ch, %ch
1091	movb	%ch, 3(%rdi)
1092#  endif
1093	ret
1094
1095	.p2align 4
1096L(StrncpyExit4):
1097	mov	(%rsi), %edx
1098	mov	%edx, (%rdi)
1099#  ifdef USE_AS_STPCPY
1100	lea	4(%rdi), %rax
1101#  endif
1102#  ifdef USE_AS_STRCAT
1103	xor	%ch, %ch
1104	movb	%ch, 4(%rdi)
1105#  endif
1106	ret
1107
1108	.p2align 4
1109L(StrncpyExit5):
1110	mov	(%rsi), %ecx
1111	mov	4(%rsi), %dl
1112	mov	%ecx, (%rdi)
1113	mov	%dl, 4(%rdi)
1114#  ifdef USE_AS_STPCPY
1115	lea	5(%rdi), %rax
1116#  endif
1117#  ifdef USE_AS_STRCAT
1118	xor	%ch, %ch
1119	movb	%ch, 5(%rdi)
1120#  endif
1121	ret
1122
1123	.p2align 4
1124L(StrncpyExit6):
1125	mov	(%rsi), %ecx
1126	mov	4(%rsi), %dx
1127	mov	%ecx, (%rdi)
1128	mov	%dx, 4(%rdi)
1129#  ifdef USE_AS_STPCPY
1130	lea	6(%rdi), %rax
1131#  endif
1132#  ifdef USE_AS_STRCAT
1133	xor	%ch, %ch
1134	movb	%ch, 6(%rdi)
1135#  endif
1136	ret
1137
1138	.p2align 4
1139L(StrncpyExit7):
1140	mov	(%rsi), %ecx
1141	mov	3(%rsi), %edx
1142	mov	%ecx, (%rdi)
1143	mov	%edx, 3(%rdi)
1144#  ifdef USE_AS_STPCPY
1145	lea	7(%rdi), %rax
1146#  endif
1147#  ifdef USE_AS_STRCAT
1148	xor	%ch, %ch
1149	movb	%ch, 7(%rdi)
1150#  endif
1151	ret
1152
1153	.p2align 4
1154L(StrncpyExit8):
1155	mov	(%rsi), %rdx
1156	mov	%rdx, (%rdi)
1157#  ifdef USE_AS_STPCPY
1158	lea	8(%rdi), %rax
1159#  endif
1160#  ifdef USE_AS_STRCAT
1161	xor	%ch, %ch
1162	movb	%ch, 8(%rdi)
1163#  endif
1164	ret
1165
1166	.p2align 4
1167L(StrncpyExit9):
1168	mov	(%rsi), %rcx
1169	mov	8(%rsi), %dl
1170	mov	%rcx, (%rdi)
1171	mov	%dl, 8(%rdi)
1172#  ifdef USE_AS_STPCPY
1173	lea	9(%rdi), %rax
1174#  endif
1175#  ifdef USE_AS_STRCAT
1176	xor	%ch, %ch
1177	movb	%ch, 9(%rdi)
1178#  endif
1179	ret
1180
1181	.p2align 4
1182L(StrncpyExit10):
1183	mov	(%rsi), %rcx
1184	mov	8(%rsi), %dx
1185	mov	%rcx, (%rdi)
1186	mov	%dx, 8(%rdi)
1187#  ifdef USE_AS_STPCPY
1188	lea	10(%rdi), %rax
1189#  endif
1190#  ifdef USE_AS_STRCAT
1191	xor	%ch, %ch
1192	movb	%ch, 10(%rdi)
1193#  endif
1194	ret
1195
1196	.p2align 4
1197L(StrncpyExit11):
1198	mov	(%rsi), %rcx
1199	mov	7(%rsi), %edx
1200	mov	%rcx, (%rdi)
1201	mov	%edx, 7(%rdi)
1202#  ifdef USE_AS_STPCPY
1203	lea	11(%rdi), %rax
1204#  endif
1205#  ifdef USE_AS_STRCAT
1206	xor	%ch, %ch
1207	movb	%ch, 11(%rdi)
1208#  endif
1209	ret
1210
1211	.p2align 4
1212L(StrncpyExit12):
1213	mov	(%rsi), %rcx
1214	mov	8(%rsi), %edx
1215	mov	%rcx, (%rdi)
1216	mov	%edx, 8(%rdi)
1217#  ifdef USE_AS_STPCPY
1218	lea	12(%rdi), %rax
1219#  endif
1220#  ifdef USE_AS_STRCAT
1221	xor	%ch, %ch
1222	movb	%ch, 12(%rdi)
1223#  endif
1224	ret
1225
1226	.p2align 4
1227L(StrncpyExit13):
1228	mov	(%rsi), %rcx
1229	mov	5(%rsi), %rdx
1230	mov	%rcx, (%rdi)
1231	mov	%rdx, 5(%rdi)
1232#  ifdef USE_AS_STPCPY
1233	lea	13(%rdi), %rax
1234#  endif
1235#  ifdef USE_AS_STRCAT
1236	xor	%ch, %ch
1237	movb	%ch, 13(%rdi)
1238#  endif
1239	ret
1240
1241	.p2align 4
1242L(StrncpyExit14):
1243	mov	(%rsi), %rcx
1244	mov	6(%rsi), %rdx
1245	mov	%rcx, (%rdi)
1246	mov	%rdx, 6(%rdi)
1247#  ifdef USE_AS_STPCPY
1248	lea	14(%rdi), %rax
1249#  endif
1250#  ifdef USE_AS_STRCAT
1251	xor	%ch, %ch
1252	movb	%ch, 14(%rdi)
1253#  endif
1254	ret
1255
1256	.p2align 4
1257L(StrncpyExit15):
1258	mov	(%rsi), %rcx
1259	mov	7(%rsi), %rdx
1260	mov	%rcx, (%rdi)
1261	mov	%rdx, 7(%rdi)
1262#  ifdef USE_AS_STPCPY
1263	lea	15(%rdi), %rax
1264#  endif
1265#  ifdef USE_AS_STRCAT
1266	xor	%ch, %ch
1267	movb	%ch, 15(%rdi)
1268#  endif
1269	ret
1270
1271	.p2align 4
1272L(StrncpyExit16):
1273	movdqu	(%rsi), %xmm0
1274	movdqu	%xmm0, (%rdi)
1275#  ifdef USE_AS_STPCPY
1276	lea	16(%rdi), %rax
1277#  endif
1278#  ifdef USE_AS_STRCAT
1279	xor	%ch, %ch
1280	movb	%ch, 16(%rdi)
1281#  endif
1282	ret
1283
1284	.p2align 4
1285L(StrncpyExit17):
1286	movdqu	(%rsi), %xmm0
1287	mov	16(%rsi), %cl
1288	movdqu	%xmm0, (%rdi)
1289	mov	%cl, 16(%rdi)
1290#  ifdef USE_AS_STPCPY
1291	lea	17(%rdi), %rax
1292#  endif
1293#  ifdef USE_AS_STRCAT
1294	xor	%ch, %ch
1295	movb	%ch, 17(%rdi)
1296#  endif
1297	ret
1298
1299	.p2align 4
1300L(StrncpyExit18):
1301	movdqu	(%rsi), %xmm0
1302	mov	16(%rsi), %cx
1303	movdqu	%xmm0, (%rdi)
1304	mov	%cx, 16(%rdi)
1305#  ifdef USE_AS_STPCPY
1306	lea	18(%rdi), %rax
1307#  endif
1308#  ifdef USE_AS_STRCAT
1309	xor	%ch, %ch
1310	movb	%ch, 18(%rdi)
1311#  endif
1312	ret
1313
1314	.p2align 4
1315L(StrncpyExit19):
1316	movdqu	(%rsi), %xmm0
1317	mov	15(%rsi), %ecx
1318	movdqu	%xmm0, (%rdi)
1319	mov	%ecx, 15(%rdi)
1320#  ifdef USE_AS_STPCPY
1321	lea	19(%rdi), %rax
1322#  endif
1323#  ifdef USE_AS_STRCAT
1324	xor	%ch, %ch
1325	movb	%ch, 19(%rdi)
1326#  endif
1327	ret
1328
1329	.p2align 4
1330L(StrncpyExit20):
1331	movdqu	(%rsi), %xmm0
1332	mov	16(%rsi), %ecx
1333	movdqu	%xmm0, (%rdi)
1334	mov	%ecx, 16(%rdi)
1335#  ifdef USE_AS_STPCPY
1336	lea	20(%rdi), %rax
1337#  endif
1338#  ifdef USE_AS_STRCAT
1339	xor	%ch, %ch
1340	movb	%ch, 20(%rdi)
1341#  endif
1342	ret
1343
1344	.p2align 4
1345L(StrncpyExit21):
1346	movdqu	(%rsi), %xmm0
1347	mov	16(%rsi), %ecx
1348	mov	20(%rsi), %dl
1349	movdqu	%xmm0, (%rdi)
1350	mov	%ecx, 16(%rdi)
1351	mov	%dl, 20(%rdi)
1352#  ifdef USE_AS_STPCPY
1353	lea	21(%rdi), %rax
1354#  endif
1355#  ifdef USE_AS_STRCAT
1356	xor	%ch, %ch
1357	movb	%ch, 21(%rdi)
1358#  endif
1359	ret
1360
1361	.p2align 4
1362L(StrncpyExit22):
1363	movdqu	(%rsi), %xmm0
1364	mov	14(%rsi), %rcx
1365	movdqu	%xmm0, (%rdi)
1366	mov	%rcx, 14(%rdi)
1367#  ifdef USE_AS_STPCPY
1368	lea	22(%rdi), %rax
1369#  endif
1370#  ifdef USE_AS_STRCAT
1371	xor	%ch, %ch
1372	movb	%ch, 22(%rdi)
1373#  endif
1374	ret
1375
1376	.p2align 4
1377L(StrncpyExit23):
1378	movdqu	(%rsi), %xmm0
1379	mov	15(%rsi), %rcx
1380	movdqu	%xmm0, (%rdi)
1381	mov	%rcx, 15(%rdi)
1382#  ifdef USE_AS_STPCPY
1383	lea	23(%rdi), %rax
1384#  endif
1385#  ifdef USE_AS_STRCAT
1386	xor	%ch, %ch
1387	movb	%ch, 23(%rdi)
1388#  endif
1389	ret
1390
1391	.p2align 4
1392L(StrncpyExit24):
1393	movdqu	(%rsi), %xmm0
1394	mov	16(%rsi), %rcx
1395	movdqu	%xmm0, (%rdi)
1396	mov	%rcx, 16(%rdi)
1397#  ifdef USE_AS_STPCPY
1398	lea	24(%rdi), %rax
1399#  endif
1400#  ifdef USE_AS_STRCAT
1401	xor	%ch, %ch
1402	movb	%ch, 24(%rdi)
1403#  endif
1404	ret
1405
1406	.p2align 4
1407L(StrncpyExit25):
1408	movdqu	(%rsi), %xmm0
1409	mov	16(%rsi), %rdx
1410	mov	24(%rsi), %cl
1411	movdqu	%xmm0, (%rdi)
1412	mov	%rdx, 16(%rdi)
1413	mov	%cl, 24(%rdi)
1414#  ifdef USE_AS_STPCPY
1415	lea	25(%rdi), %rax
1416#  endif
1417#  ifdef USE_AS_STRCAT
1418	xor	%ch, %ch
1419	movb	%ch, 25(%rdi)
1420#  endif
1421	ret
1422
1423	.p2align 4
1424L(StrncpyExit26):
1425	movdqu	(%rsi), %xmm0
1426	mov	16(%rsi), %rdx
1427	mov	24(%rsi), %cx
1428	movdqu	%xmm0, (%rdi)
1429	mov	%rdx, 16(%rdi)
1430	mov	%cx, 24(%rdi)
1431#  ifdef USE_AS_STPCPY
1432	lea	26(%rdi), %rax
1433#  endif
1434#  ifdef USE_AS_STRCAT
1435	xor	%ch, %ch
1436	movb	%ch, 26(%rdi)
1437#  endif
1438	ret
1439
1440	.p2align 4
1441L(StrncpyExit27):
1442	movdqu	(%rsi), %xmm0
1443	mov	16(%rsi), %rdx
1444	mov	23(%rsi), %ecx
1445	movdqu	%xmm0, (%rdi)
1446	mov	%rdx, 16(%rdi)
1447	mov	%ecx, 23(%rdi)
1448#  ifdef USE_AS_STPCPY
1449	lea	27(%rdi), %rax
1450#  endif
1451#  ifdef USE_AS_STRCAT
1452	xor	%ch, %ch
1453	movb	%ch, 27(%rdi)
1454#  endif
1455	ret
1456
1457	.p2align 4
1458L(StrncpyExit28):
1459	movdqu	(%rsi), %xmm0
1460	mov	16(%rsi), %rdx
1461	mov	24(%rsi), %ecx
1462	movdqu	%xmm0, (%rdi)
1463	mov	%rdx, 16(%rdi)
1464	mov	%ecx, 24(%rdi)
1465#  ifdef USE_AS_STPCPY
1466	lea	28(%rdi), %rax
1467#  endif
1468#  ifdef USE_AS_STRCAT
1469	xor	%ch, %ch
1470	movb	%ch, 28(%rdi)
1471#  endif
1472	ret
1473
1474	.p2align 4
1475L(StrncpyExit29):
1476	movdqu	(%rsi), %xmm0
1477	movdqu	13(%rsi), %xmm2
1478	movdqu	%xmm0, (%rdi)
1479	movdqu	%xmm2, 13(%rdi)
1480#  ifdef USE_AS_STPCPY
1481	lea	29(%rdi), %rax
1482#  endif
1483#  ifdef USE_AS_STRCAT
1484	xor	%ch, %ch
1485	movb	%ch, 29(%rdi)
1486#  endif
1487	ret
1488
1489	.p2align 4
1490L(StrncpyExit30):
1491	movdqu	(%rsi), %xmm0
1492	movdqu	14(%rsi), %xmm2
1493	movdqu	%xmm0, (%rdi)
1494	movdqu	%xmm2, 14(%rdi)
1495#  ifdef USE_AS_STPCPY
1496	lea	30(%rdi), %rax
1497#  endif
1498#  ifdef USE_AS_STRCAT
1499	xor	%ch, %ch
1500	movb	%ch, 30(%rdi)
1501#  endif
1502	ret
1503
1504	.p2align 4
1505L(StrncpyExit31):
1506	movdqu	(%rsi), %xmm0
1507	movdqu	15(%rsi), %xmm2
1508	movdqu	%xmm0, (%rdi)
1509	movdqu	%xmm2, 15(%rdi)
1510#  ifdef USE_AS_STPCPY
1511	lea	31(%rdi), %rax
1512#  endif
1513#  ifdef USE_AS_STRCAT
1514	xor	%ch, %ch
1515	movb	%ch, 31(%rdi)
1516#  endif
1517	ret
1518
1519	.p2align 4
1520L(StrncpyExit32):
1521	movdqu	(%rsi), %xmm0
1522	movdqu	16(%rsi), %xmm2
1523	movdqu	%xmm0, (%rdi)
1524	movdqu	%xmm2, 16(%rdi)
1525#  ifdef USE_AS_STPCPY
1526	lea	32(%rdi), %rax
1527#  endif
1528#  ifdef USE_AS_STRCAT
1529	xor	%ch, %ch
1530	movb	%ch, 32(%rdi)
1531#  endif
1532	ret
1533
1534	.p2align 4
1535L(StrncpyExit33):
1536	movdqu	(%rsi), %xmm0
1537	movdqu	16(%rsi), %xmm2
1538	mov	32(%rsi), %cl
1539	movdqu	%xmm0, (%rdi)
1540	movdqu	%xmm2, 16(%rdi)
1541	mov	%cl, 32(%rdi)
1542#  ifdef USE_AS_STRCAT
1543	xor	%ch, %ch
1544	movb	%ch, 33(%rdi)
1545#  endif
1546	ret
1547
1548#  ifndef USE_AS_STRCAT
1549
1550	.p2align 4
1551L(Fill0):
1552	ret
1553
1554	.p2align 4
1555L(Fill1):
1556	mov	%dl, (%rdi)
1557	ret
1558
1559	.p2align 4
1560L(Fill2):
1561	mov	%dx, (%rdi)
1562	ret
1563
1564	.p2align 4
1565L(Fill3):
1566	mov	%edx, -1(%rdi)
1567	ret
1568
1569	.p2align 4
1570L(Fill4):
1571	mov	%edx, (%rdi)
1572	ret
1573
1574	.p2align 4
1575L(Fill5):
1576	mov	%edx, (%rdi)
1577	mov	%dl, 4(%rdi)
1578	ret
1579
1580	.p2align 4
1581L(Fill6):
1582	mov	%edx, (%rdi)
1583	mov	%dx, 4(%rdi)
1584	ret
1585
1586	.p2align 4
1587L(Fill7):
1588	mov	%rdx, -1(%rdi)
1589	ret
1590
1591	.p2align 4
1592L(Fill8):
1593	mov	%rdx, (%rdi)
1594	ret
1595
1596	.p2align 4
1597L(Fill9):
1598	mov	%rdx, (%rdi)
1599	mov	%dl, 8(%rdi)
1600	ret
1601
1602	.p2align 4
1603L(Fill10):
1604	mov	%rdx, (%rdi)
1605	mov	%dx, 8(%rdi)
1606	ret
1607
1608	.p2align 4
1609L(Fill11):
1610	mov	%rdx, (%rdi)
1611	mov	%edx, 7(%rdi)
1612	ret
1613
1614	.p2align 4
1615L(Fill12):
1616	mov	%rdx, (%rdi)
1617	mov	%edx, 8(%rdi)
1618	ret
1619
1620	.p2align 4
1621L(Fill13):
1622	mov	%rdx, (%rdi)
1623	mov	%rdx, 5(%rdi)
1624	ret
1625
1626	.p2align 4
1627L(Fill14):
1628	mov	%rdx, (%rdi)
1629	mov	%rdx, 6(%rdi)
1630	ret
1631
1632	.p2align 4
1633L(Fill15):
1634	movdqu	%xmm0, -1(%rdi)
1635	ret
1636
1637	.p2align 4
1638L(Fill16):
1639	movdqu	%xmm0, (%rdi)
1640	ret
1641
1642	.p2align 4
1643L(CopyFrom1To16BytesUnalignedXmm2):
1644	movdqu	%xmm2, (%rdi, %rcx)
1645
1646	.p2align 4
1647L(CopyFrom1To16BytesXmmExit):
1648	bsf	%rdx, %rdx
1649	add	$15, %r8
1650	add	%rcx, %rdi
1651#   ifdef USE_AS_STPCPY
1652	lea	(%rdi, %rdx), %rax
1653#   endif
1654	sub	%rdx, %r8
1655	lea	1(%rdi, %rdx), %rdi
1656
1657	.p2align 4
1658L(StrncpyFillTailWithZero):
1659	pxor	%xmm0, %xmm0
1660	xor	%rdx, %rdx
1661	sub	$16, %r8
1662	jbe	L(StrncpyFillExit)
1663
1664	movdqu	%xmm0, (%rdi)
1665	add	$16, %rdi
1666
1667	mov	%rdi, %rsi
1668	and	$0xf, %rsi
1669	sub	%rsi, %rdi
1670	add	%rsi, %r8
1671	sub	$64, %r8
1672	jb	L(StrncpyFillLess64)
1673
1674L(StrncpyFillLoopMovdqa):
1675	movdqa	%xmm0, (%rdi)
1676	movdqa	%xmm0, 16(%rdi)
1677	movdqa	%xmm0, 32(%rdi)
1678	movdqa	%xmm0, 48(%rdi)
1679	add	$64, %rdi
1680	sub	$64, %r8
1681	jae	L(StrncpyFillLoopMovdqa)
1682
1683L(StrncpyFillLess64):
1684	add	$32, %r8
1685	jl	L(StrncpyFillLess32)
1686	movdqa	%xmm0, (%rdi)
1687	movdqa	%xmm0, 16(%rdi)
1688	add	$32, %rdi
1689	sub	$16, %r8
1690	jl	L(StrncpyFillExit)
1691	movdqa	%xmm0, (%rdi)
1692	add	$16, %rdi
1693	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
1694
1695L(StrncpyFillLess32):
1696	add	$16, %r8
1697	jl	L(StrncpyFillExit)
1698	movdqa	%xmm0, (%rdi)
1699	add	$16, %rdi
1700	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
1701
1702L(StrncpyFillExit):
1703	add	$16, %r8
1704	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
1705
1706/* end of ifndef USE_AS_STRCAT */
1707#  endif
1708
1709	.p2align 4
1710L(UnalignedLeaveCase2OrCase3):
1711	test	%rdx, %rdx
1712	jnz	L(Unaligned64LeaveCase2)
1713L(Unaligned64LeaveCase3):
1714	lea	64(%r8), %rcx
1715	and	$-16, %rcx
1716	add	$48, %r8
1717	jl	L(CopyFrom1To16BytesCase3)
1718	movdqu	%xmm4, (%rdi)
1719	sub	$16, %r8
1720	jb	L(CopyFrom1To16BytesCase3)
1721	movdqu	%xmm5, 16(%rdi)
1722	sub	$16, %r8
1723	jb	L(CopyFrom1To16BytesCase3)
1724	movdqu	%xmm6, 32(%rdi)
1725	sub	$16, %r8
1726	jb	L(CopyFrom1To16BytesCase3)
1727	movdqu	%xmm7, 48(%rdi)
1728#  ifdef USE_AS_STPCPY
1729	lea	64(%rdi), %rax
1730#  endif
1731#  ifdef USE_AS_STRCAT
1732	xor	%ch, %ch
1733	movb	%ch, 64(%rdi)
1734#  endif
1735	ret
1736
1737	.p2align 4
1738L(Unaligned64LeaveCase2):
1739	xor	%rcx, %rcx
1740	pcmpeqb	%xmm4, %xmm0
1741	pmovmskb %xmm0, %rdx
1742	add	$48, %r8
1743	jle	L(CopyFrom1To16BytesCase2OrCase3)
1744	test	%rdx, %rdx
1745#  ifndef USE_AS_STRCAT
1746	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
1747#  else
1748	jnz	L(CopyFrom1To16Bytes)
1749#  endif
1750	pcmpeqb	%xmm5, %xmm0
1751	pmovmskb %xmm0, %rdx
1752	movdqu	%xmm4, (%rdi)
1753	add	$16, %rcx
1754	sub	$16, %r8
1755	jbe	L(CopyFrom1To16BytesCase2OrCase3)
1756	test	%rdx, %rdx
1757#  ifndef USE_AS_STRCAT
1758	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
1759#  else
1760	jnz	L(CopyFrom1To16Bytes)
1761#  endif
1762
1763	pcmpeqb	%xmm6, %xmm0
1764	pmovmskb %xmm0, %rdx
1765	movdqu	%xmm5, 16(%rdi)
1766	add	$16, %rcx
1767	sub	$16, %r8
1768	jbe	L(CopyFrom1To16BytesCase2OrCase3)
1769	test	%rdx, %rdx
1770#  ifndef USE_AS_STRCAT
1771	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
1772#  else
1773	jnz	L(CopyFrom1To16Bytes)
1774#  endif
1775
1776	pcmpeqb	%xmm7, %xmm0
1777	pmovmskb %xmm0, %rdx
1778	movdqu	%xmm6, 32(%rdi)
1779	lea	16(%rdi, %rcx), %rdi
1780	lea	16(%rsi, %rcx), %rsi
1781	bsf	%rdx, %rdx
1782	cmp	%r8, %rdx
1783	jb	L(CopyFrom1To16BytesExit)
1784	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
1785
1786	.p2align 4
1787L(ExitZero):
1788#  ifndef USE_AS_STRCAT
1789	mov	%rdi, %rax
1790#  endif
1791	ret
1792
1793# endif
1794
1795# ifndef USE_AS_STRCAT
1796END (STRCPY)
1797# else
1798END (STRCAT)
1799# endif
1800	.p2align 4
1801	.section .rodata
1802L(ExitTable):
1803	.int	JMPTBL(L(Exit1), L(ExitTable))
1804	.int	JMPTBL(L(Exit2), L(ExitTable))
1805	.int	JMPTBL(L(Exit3), L(ExitTable))
1806	.int	JMPTBL(L(Exit4), L(ExitTable))
1807	.int	JMPTBL(L(Exit5), L(ExitTable))
1808	.int	JMPTBL(L(Exit6), L(ExitTable))
1809	.int	JMPTBL(L(Exit7), L(ExitTable))
1810	.int	JMPTBL(L(Exit8), L(ExitTable))
1811	.int	JMPTBL(L(Exit9), L(ExitTable))
1812	.int	JMPTBL(L(Exit10), L(ExitTable))
1813	.int	JMPTBL(L(Exit11), L(ExitTable))
1814	.int	JMPTBL(L(Exit12), L(ExitTable))
1815	.int	JMPTBL(L(Exit13), L(ExitTable))
1816	.int	JMPTBL(L(Exit14), L(ExitTable))
1817	.int	JMPTBL(L(Exit15), L(ExitTable))
1818	.int	JMPTBL(L(Exit16), L(ExitTable))
1819	.int	JMPTBL(L(Exit17), L(ExitTable))
1820	.int	JMPTBL(L(Exit18), L(ExitTable))
1821	.int	JMPTBL(L(Exit19), L(ExitTable))
1822	.int	JMPTBL(L(Exit20), L(ExitTable))
1823	.int	JMPTBL(L(Exit21), L(ExitTable))
1824	.int	JMPTBL(L(Exit22), L(ExitTable))
1825	.int    JMPTBL(L(Exit23), L(ExitTable))
1826	.int	JMPTBL(L(Exit24), L(ExitTable))
1827	.int	JMPTBL(L(Exit25), L(ExitTable))
1828	.int	JMPTBL(L(Exit26), L(ExitTable))
1829	.int	JMPTBL(L(Exit27), L(ExitTable))
1830	.int	JMPTBL(L(Exit28), L(ExitTable))
1831	.int	JMPTBL(L(Exit29), L(ExitTable))
1832	.int	JMPTBL(L(Exit30), L(ExitTable))
1833	.int	JMPTBL(L(Exit31), L(ExitTable))
1834	.int	JMPTBL(L(Exit32), L(ExitTable))
1835# ifdef USE_AS_STRNCPY
1836L(ExitStrncpyTable):
1837	.int	JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
1838	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
1839	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
1840	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
1841	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
1842	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
1843	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
1844	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
1845	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
1846	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
1847	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
1848	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
1849	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
1850	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
1851	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
1852	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
1853	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
1854	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
1855	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
1856	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
1857	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
1858	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
1859	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
1860	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
1861	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
1862	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
1863	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
1864	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
1865	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
1866	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
1867	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
1868	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
1869	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
1870	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
1871#  ifndef USE_AS_STRCAT
1872	.p2align 4
1873L(FillTable):
1874	.int	JMPTBL(L(Fill0), L(FillTable))
1875	.int	JMPTBL(L(Fill1), L(FillTable))
1876	.int	JMPTBL(L(Fill2), L(FillTable))
1877	.int	JMPTBL(L(Fill3), L(FillTable))
1878	.int	JMPTBL(L(Fill4), L(FillTable))
1879	.int	JMPTBL(L(Fill5), L(FillTable))
1880	.int	JMPTBL(L(Fill6), L(FillTable))
1881	.int	JMPTBL(L(Fill7), L(FillTable))
1882	.int	JMPTBL(L(Fill8), L(FillTable))
1883	.int	JMPTBL(L(Fill9), L(FillTable))
1884	.int	JMPTBL(L(Fill10), L(FillTable))
1885	.int	JMPTBL(L(Fill11), L(FillTable))
1886	.int	JMPTBL(L(Fill12), L(FillTable))
1887	.int	JMPTBL(L(Fill13), L(FillTable))
1888	.int	JMPTBL(L(Fill14), L(FillTable))
1889	.int	JMPTBL(L(Fill15), L(FillTable))
1890	.int	JMPTBL(L(Fill16), L(FillTable))
1891#  endif
1892# endif
1893#endif
1894