1/* strcat with SSE2
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19
20#if IS_IN (libc)
21
22# include <sysdep.h>
23
24
25# define CFI_PUSH(REG)	\
26	cfi_adjust_cfa_offset (4);	\
27	cfi_rel_offset (REG, 0)
28
29# define CFI_POP(REG)	\
30	cfi_adjust_cfa_offset (-4);	\
31	cfi_restore (REG)
32
33# define PUSH(REG) pushl REG; CFI_PUSH (REG)
34# define POP(REG) popl REG; CFI_POP (REG)
35
36# ifdef PIC
37#  define JMPTBL(I, B) I - B
38
39/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
40	jump table with relative offsets.  INDEX is a register contains the
41	index into the jump table.   SCALE is the scale of INDEX. */
42
43#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
44	/* We first load PC into ECX.  */	\
45	SETUP_PIC_REG(cx);	\
46	/* Get the address of the jump table.  */	\
47	addl	$(TABLE - .), %ecx;	\
48	/* Get the entry and convert the relative offset to the	\
49	absolute address.  */	\
50	addl	(%ecx,INDEX,SCALE), %ecx;	\
51	/* We loaded the jump table and adjusted ECX. Go.  */	\
52	_CET_NOTRACK jmp *%ecx
53# else
54#  define JMPTBL(I, B) I
55
56/* Branch to an entry in a jump table.  TABLE is a jump table with
57	absolute offsets.  INDEX is a register contains the index into the
58	jump table.  SCALE is the scale of INDEX. */
59
60#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
61	_CET_NOTRACK jmp *TABLE(,INDEX,SCALE)
62# endif
63
64# ifndef STRCAT
65#  define STRCAT  __strcat_sse2
66# endif
67
68# define PARMS  4
69# define STR1  PARMS+4
70# define STR2  STR1+4
71
72# ifdef USE_AS_STRNCAT
73#  define LEN    STR2+8
74#  define STR3   STR1+4
75# else
76#  define STR3   STR1
77# endif
78
79# define USE_AS_STRCAT
80# ifdef USE_AS_STRNCAT
81#  define RETURN  POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi);
82# else
83#  define RETURN  POP(%esi); ret; CFI_PUSH(%esi);
84# endif
85
86.text
87ENTRY (STRCAT)
88	PUSH	(%esi)
89	mov	STR1(%esp), %eax
90	mov	STR2(%esp), %esi
91# ifdef USE_AS_STRNCAT
92	PUSH	(%ebx)
93	movl	LEN(%esp), %ebx
94	test	%ebx, %ebx
95	jz	L(ExitZero)
96# endif
97	cmpb	$0, (%esi)
98	mov	%esi, %ecx
99	mov	%eax, %edx
100	jz	L(ExitZero)
101
102	and	$63, %ecx
103	and	$63, %edx
104	cmp	$32, %ecx
105	ja	L(StrlenCore7_1)
106	cmp	$48, %edx
107	ja	L(alignment_prolog)
108
109	pxor	%xmm0, %xmm0
110	pxor	%xmm4, %xmm4
111	pxor	%xmm7, %xmm7
112	movdqu	(%eax), %xmm1
113	movdqu	(%esi), %xmm5
114	pcmpeqb	%xmm1, %xmm0
115	movdqu	16(%esi), %xmm6
116	pmovmskb %xmm0, %ecx
117	pcmpeqb	%xmm5, %xmm4
118	pcmpeqb	%xmm6, %xmm7
119	test	%ecx, %ecx
120	jnz	L(exit_less16_)
121	mov	%eax, %ecx
122	and	$-16, %eax
123	jmp	L(loop_prolog)
124
125L(alignment_prolog):
126	pxor	%xmm0, %xmm0
127	pxor	%xmm4, %xmm4
128	mov	%edx, %ecx
129	pxor	%xmm7, %xmm7
130	and	$15, %ecx
131	and	$-16, %eax
132	pcmpeqb	(%eax), %xmm0
133	movdqu	(%esi), %xmm5
134	movdqu	16(%esi), %xmm6
135	pmovmskb %xmm0, %edx
136	pcmpeqb	%xmm5, %xmm4
137	shr	%cl, %edx
138	pcmpeqb	%xmm6, %xmm7
139	test	%edx, %edx
140	jnz	L(exit_less16)
141	add	%eax, %ecx
142
143	pxor	%xmm0, %xmm0
144L(loop_prolog):
145	pxor	%xmm1, %xmm1
146	pxor	%xmm2, %xmm2
147	pxor	%xmm3, %xmm3
148	.p2align 4
149L(align16_loop):
150	pcmpeqb	16(%eax), %xmm0
151	pmovmskb %xmm0, %edx
152	test	%edx, %edx
153	jnz	L(exit16)
154
155	pcmpeqb	32(%eax), %xmm1
156	pmovmskb %xmm1, %edx
157	test	%edx, %edx
158	jnz	L(exit32)
159
160	pcmpeqb	48(%eax), %xmm2
161	pmovmskb %xmm2, %edx
162	test	%edx, %edx
163	jnz	L(exit48)
164
165	pcmpeqb	64(%eax), %xmm3
166	pmovmskb %xmm3, %edx
167	lea	64(%eax), %eax
168	test	%edx, %edx
169	jz	L(align16_loop)
170	bsf	%edx, %edx
171	add	%edx, %eax
172	jmp	L(StartStrcpyPart)
173
174	.p2align 4
175L(exit16):
176	bsf	%edx, %edx
177	lea	16(%eax, %edx), %eax
178	jmp	L(StartStrcpyPart)
179
180	.p2align 4
181L(exit32):
182	bsf	%edx, %edx
183	lea	32(%eax, %edx), %eax
184	jmp	L(StartStrcpyPart)
185
186	.p2align 4
187L(exit48):
188	bsf	%edx, %edx
189	lea	48(%eax, %edx), %eax
190	jmp	L(StartStrcpyPart)
191
192	.p2align 4
193L(exit_less16):
194	bsf	%edx, %edx
195	add	%ecx, %eax
196	add	%edx, %eax
197	jmp	L(StartStrcpyPart)
198
199	.p2align 4
200L(exit_less16_):
201	bsf	%ecx, %ecx
202	add	%ecx, %eax
203
204	.p2align 4
205L(StartStrcpyPart):
206	pmovmskb %xmm4, %edx
207# ifdef USE_AS_STRNCAT
208	cmp	$16, %ebx
209	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
210# endif
211	test	%edx, %edx
212	jnz	L(CopyFrom1To16BytesTail1)
213
214	movdqu	%xmm5, (%eax)
215	pmovmskb %xmm7, %edx
216# ifdef USE_AS_STRNCAT
217	cmp	$32, %ebx
218	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
219# endif
220	test	%edx, %edx
221	jnz	L(CopyFrom1To32Bytes1)
222
223	mov	%esi, %ecx
224	and	$-16, %esi
225	and	$15, %ecx
226	pxor	%xmm0, %xmm0
227# ifdef USE_AS_STRNCAT
228	add	%ecx, %ebx
229	sbb	%edx, %edx
230	or	%edx, %ebx
231# endif
232	sub	%ecx, %eax
233	jmp	L(Unalign16Both)
234
235L(StrlenCore7_1):
236	mov	%eax, %ecx
237	pxor	%xmm0, %xmm0
238	and	$15, %ecx
239	and	$-16, %eax
240	pcmpeqb	(%eax), %xmm0
241	pmovmskb %xmm0, %edx
242	shr	%cl, %edx
243	test	%edx, %edx
244	jnz	L(exit_less16_1)
245	add	%eax, %ecx
246
247	pxor	%xmm0, %xmm0
248	pxor	%xmm1, %xmm1
249	pxor	%xmm2, %xmm2
250	pxor	%xmm3, %xmm3
251
252	.p2align 4
253L(align16_loop_1):
254	pcmpeqb	16(%eax), %xmm0
255	pmovmskb %xmm0, %edx
256	test	%edx, %edx
257	jnz	L(exit16_1)
258
259	pcmpeqb	32(%eax), %xmm1
260	pmovmskb %xmm1, %edx
261	test	%edx, %edx
262	jnz	L(exit32_1)
263
264	pcmpeqb	48(%eax), %xmm2
265	pmovmskb %xmm2, %edx
266	test	%edx, %edx
267	jnz	L(exit48_1)
268
269	pcmpeqb	64(%eax), %xmm3
270	pmovmskb %xmm3, %edx
271	lea	64(%eax), %eax
272	test	%edx, %edx
273	jz	L(align16_loop_1)
274	bsf	%edx, %edx
275	add	%edx, %eax
276	jmp	L(StartStrcpyPart_1)
277
278	.p2align 4
279L(exit16_1):
280	bsf	%edx, %edx
281	lea	16(%eax, %edx), %eax
282	jmp	L(StartStrcpyPart_1)
283
284	.p2align 4
285L(exit32_1):
286	bsf	%edx, %edx
287	lea	32(%eax, %edx), %eax
288	jmp	L(StartStrcpyPart_1)
289
290	.p2align 4
291L(exit48_1):
292	bsf	%edx, %edx
293	lea	48(%eax, %edx), %eax
294	jmp	L(StartStrcpyPart_1)
295
296	.p2align 4
297L(exit_less16_1):
298	bsf	%edx, %edx
299	add	%ecx, %eax
300	add	%edx, %eax
301
302	.p2align 4
303L(StartStrcpyPart_1):
304	mov	%esi, %ecx
305	and	$15, %ecx
306	and	$-16, %esi
307	pxor	%xmm0, %xmm0
308	pxor	%xmm1, %xmm1
309
310# ifdef USE_AS_STRNCAT
311	cmp	$48, %ebx
312	ja      L(BigN)
313# endif
314	pcmpeqb	(%esi), %xmm1
315# ifdef USE_AS_STRNCAT
316	add	%ecx, %ebx
317# endif
318	pmovmskb %xmm1, %edx
319	shr	%cl, %edx
320# ifdef USE_AS_STRNCAT
321	cmp	$16, %ebx
322	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
323# endif
324	test	%edx, %edx
325	jnz	L(CopyFrom1To16BytesTail)
326
327	pcmpeqb	16(%esi), %xmm0
328	pmovmskb %xmm0, %edx
329# ifdef USE_AS_STRNCAT
330	cmp	$32, %ebx
331	jbe	L(CopyFrom1To32BytesCase2OrCase3)
332# endif
333	test	%edx, %edx
334	jnz	L(CopyFrom1To32Bytes)
335
336	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
337	movdqu	%xmm1, (%eax)
338	sub	%ecx, %eax
339
340	.p2align 4
341L(Unalign16Both):
342	mov	$16, %ecx
343	movdqa	(%esi, %ecx), %xmm1
344	movaps	16(%esi, %ecx), %xmm2
345	movdqu	%xmm1, (%eax, %ecx)
346	pcmpeqb	%xmm2, %xmm0
347	pmovmskb %xmm0, %edx
348	add	$16, %ecx
349# ifdef USE_AS_STRNCAT
350	sub	$48, %ebx
351	jbe	L(CopyFrom1To16BytesCase2OrCase3)
352# endif
353	test	%edx, %edx
354	jnz	L(CopyFrom1To16Bytes)
355L(Unalign16BothBigN):
356	movaps	16(%esi, %ecx), %xmm3
357	movdqu	%xmm2, (%eax, %ecx)
358	pcmpeqb	%xmm3, %xmm0
359	pmovmskb %xmm0, %edx
360	add	$16, %ecx
361# ifdef USE_AS_STRNCAT
362	sub	$16, %ebx
363	jbe	L(CopyFrom1To16BytesCase2OrCase3)
364# endif
365	test	%edx, %edx
366	jnz	L(CopyFrom1To16Bytes)
367
368	movaps	16(%esi, %ecx), %xmm4
369	movdqu	%xmm3, (%eax, %ecx)
370	pcmpeqb	%xmm4, %xmm0
371	pmovmskb %xmm0, %edx
372	add	$16, %ecx
373# ifdef USE_AS_STRNCAT
374	sub	$16, %ebx
375	jbe	L(CopyFrom1To16BytesCase2OrCase3)
376# endif
377	test	%edx, %edx
378	jnz	L(CopyFrom1To16Bytes)
379
380	movaps	16(%esi, %ecx), %xmm1
381	movdqu	%xmm4, (%eax, %ecx)
382	pcmpeqb	%xmm1, %xmm0
383	pmovmskb %xmm0, %edx
384	add	$16, %ecx
385# ifdef USE_AS_STRNCAT
386	sub	$16, %ebx
387	jbe	L(CopyFrom1To16BytesCase2OrCase3)
388# endif
389	test	%edx, %edx
390	jnz	L(CopyFrom1To16Bytes)
391
392	movaps	16(%esi, %ecx), %xmm2
393	movdqu	%xmm1, (%eax, %ecx)
394	pcmpeqb	%xmm2, %xmm0
395	pmovmskb %xmm0, %edx
396	add	$16, %ecx
397# ifdef USE_AS_STRNCAT
398	sub	$16, %ebx
399	jbe	L(CopyFrom1To16BytesCase2OrCase3)
400# endif
401	test	%edx, %edx
402	jnz	L(CopyFrom1To16Bytes)
403
404	movaps	16(%esi, %ecx), %xmm3
405	movdqu	%xmm2, (%eax, %ecx)
406	pcmpeqb	%xmm3, %xmm0
407	pmovmskb %xmm0, %edx
408	add	$16, %ecx
409# ifdef USE_AS_STRNCAT
410	sub	$16, %ebx
411	jbe	L(CopyFrom1To16BytesCase2OrCase3)
412# endif
413	test	%edx, %edx
414	jnz	L(CopyFrom1To16Bytes)
415
416	movdqu	%xmm3, (%eax, %ecx)
417	mov	%esi, %edx
418	lea	16(%esi, %ecx), %esi
419	and	$-0x40, %esi
420	sub	%esi, %edx
421	sub	%edx, %eax
422# ifdef USE_AS_STRNCAT
423	lea	128(%ebx, %edx), %ebx
424# endif
425	movaps	(%esi), %xmm2
426	movaps	%xmm2, %xmm4
427	movaps	16(%esi), %xmm5
428	movaps	32(%esi), %xmm3
429	movaps	%xmm3, %xmm6
430	movaps	48(%esi), %xmm7
431	pminub	%xmm5, %xmm2
432	pminub	%xmm7, %xmm3
433	pminub	%xmm2, %xmm3
434	pcmpeqb	%xmm0, %xmm3
435	pmovmskb %xmm3, %edx
436# ifdef USE_AS_STRNCAT
437	sub	$64, %ebx
438	jbe	L(UnalignedLeaveCase2OrCase3)
439# endif
440	test	%edx, %edx
441	jnz	L(Unaligned64Leave)
442
443	.p2align 4
444L(Unaligned64Loop_start):
445	add	$64, %eax
446	add	$64, %esi
447	movdqu	%xmm4, -64(%eax)
448	movaps	(%esi), %xmm2
449	movdqa	%xmm2, %xmm4
450	movdqu	%xmm5, -48(%eax)
451	movaps	16(%esi), %xmm5
452	pminub	%xmm5, %xmm2
453	movaps	32(%esi), %xmm3
454	movdqu	%xmm6, -32(%eax)
455	movaps	%xmm3, %xmm6
456	movdqu	%xmm7, -16(%eax)
457	movaps	48(%esi), %xmm7
458	pminub	%xmm7, %xmm3
459	pminub	%xmm2, %xmm3
460	pcmpeqb	%xmm0, %xmm3
461	pmovmskb %xmm3, %edx
462# ifdef USE_AS_STRNCAT
463	sub	$64, %ebx
464	jbe	L(UnalignedLeaveCase2OrCase3)
465# endif
466	test	%edx, %edx
467	jz	L(Unaligned64Loop_start)
468
469L(Unaligned64Leave):
470	pxor	%xmm1, %xmm1
471
472	pcmpeqb	%xmm4, %xmm0
473	pcmpeqb	%xmm5, %xmm1
474	pmovmskb %xmm0, %edx
475	pmovmskb %xmm1, %ecx
476	test	%edx, %edx
477	jnz	L(CopyFrom1To16BytesUnaligned_0)
478	test	%ecx, %ecx
479	jnz	L(CopyFrom1To16BytesUnaligned_16)
480
481	pcmpeqb	%xmm6, %xmm0
482	pcmpeqb	%xmm7, %xmm1
483	pmovmskb %xmm0, %edx
484	pmovmskb %xmm1, %ecx
485	test	%edx, %edx
486	jnz	L(CopyFrom1To16BytesUnaligned_32)
487
488	bsf	%ecx, %edx
489	movdqu	%xmm4, (%eax)
490	movdqu	%xmm5, 16(%eax)
491	movdqu	%xmm6, 32(%eax)
492	add	$48, %esi
493	add	$48, %eax
494	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
495
496# ifdef USE_AS_STRNCAT
497	.p2align 4
498L(BigN):
499	pcmpeqb	(%esi), %xmm1
500	pmovmskb %xmm1, %edx
501	shr	%cl, %edx
502	test	%edx, %edx
503	jnz	L(CopyFrom1To16BytesTail)
504
505	pcmpeqb	16(%esi), %xmm0
506	pmovmskb %xmm0, %edx
507	test	%edx, %edx
508	jnz	L(CopyFrom1To32Bytes)
509
510	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
511	movdqu	%xmm1, (%eax)
512	sub	%ecx, %eax
513	sub     $48, %ebx
514	add     %ecx, %ebx
515
516	mov	$16, %ecx
517	movdqa	(%esi, %ecx), %xmm1
518	movaps	16(%esi, %ecx), %xmm2
519	movdqu	%xmm1, (%eax, %ecx)
520	pcmpeqb	%xmm2, %xmm0
521	pmovmskb %xmm0, %edx
522	add	$16, %ecx
523	test	%edx, %edx
524	jnz	L(CopyFrom1To16Bytes)
525	jmp	L(Unalign16BothBigN)
526# endif
527
528/*------------end of main part-------------------------------*/
529
530/* Case1 */
531	.p2align 4
532L(CopyFrom1To16Bytes):
533	add	%ecx, %eax
534	add	%ecx, %esi
535	bsf	%edx, %edx
536	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
537
538	.p2align 4
539L(CopyFrom1To16BytesTail):
540	add	%ecx, %esi
541	bsf	%edx, %edx
542	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
543
544	.p2align 4
545L(CopyFrom1To32Bytes1):
546	add	$16, %esi
547	add	$16, %eax
548L(CopyFrom1To16BytesTail1):
549	bsf	%edx, %edx
550	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
551
552	.p2align 4
553L(CopyFrom1To32Bytes):
554	bsf	%edx, %edx
555	add	%ecx, %esi
556	add	$16, %edx
557	sub	%ecx, %edx
558	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
559
560	.p2align 4
561L(CopyFrom1To16BytesUnaligned_0):
562	bsf	%edx, %edx
563	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
564
565	.p2align 4
566L(CopyFrom1To16BytesUnaligned_16):
567	bsf	%ecx, %edx
568	movdqu	%xmm4, (%eax)
569	add	$16, %esi
570	add	$16, %eax
571	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
572
573	.p2align 4
574L(CopyFrom1To16BytesUnaligned_32):
575	bsf	%edx, %edx
576	movdqu	%xmm4, (%eax)
577	movdqu	%xmm5, 16(%eax)
578	add	$32, %esi
579	add	$32, %eax
580	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
581
582# ifdef USE_AS_STRNCAT
583
584	.p2align 4
585L(CopyFrom1To16BytesExit):
586	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
587
588/* Case2 */
589
590	.p2align 4
591L(CopyFrom1To16BytesCase2):
592	add	$16, %ebx
593	add	%ecx, %eax
594	add	%ecx, %esi
595	bsf	%edx, %edx
596	cmp	%ebx, %edx
597	jb	L(CopyFrom1To16BytesExit)
598	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
599
600	.p2align 4
601L(CopyFrom1To32BytesCase2):
602	sub	%ecx, %ebx
603	add	%ecx, %esi
604	bsf	%edx, %edx
605	add	$16, %edx
606	sub	%ecx, %edx
607	cmp	%ebx, %edx
608	jb	L(CopyFrom1To16BytesExit)
609	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
610
611L(CopyFrom1To16BytesTailCase2):
612	sub	%ecx, %ebx
613	add	%ecx, %esi
614	bsf	%edx, %edx
615	cmp	%ebx, %edx
616	jb	L(CopyFrom1To16BytesExit)
617	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
618
619L(CopyFrom1To16BytesTail1Case2):
620	bsf	%edx, %edx
621	cmp	%ebx, %edx
622	jb	L(CopyFrom1To16BytesExit)
623	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
624
625/* Case2 or Case3,  Case3 */
626
627	.p2align 4
628L(CopyFrom1To16BytesCase2OrCase3):
629	test	%edx, %edx
630	jnz	L(CopyFrom1To16BytesCase2)
631L(CopyFrom1To16BytesCase3):
632	add	$16, %ebx
633	add	%ecx, %eax
634	add	%ecx, %esi
635	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
636
637	.p2align 4
638L(CopyFrom1To32BytesCase2OrCase3):
639	test	%edx, %edx
640	jnz	L(CopyFrom1To32BytesCase2)
641	sub	%ecx, %ebx
642	add	%ecx, %esi
643	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
644
645	.p2align 4
646L(CopyFrom1To16BytesTailCase2OrCase3):
647	test	%edx, %edx
648	jnz	L(CopyFrom1To16BytesTailCase2)
649	sub	%ecx, %ebx
650	add	%ecx, %esi
651	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
652
653	.p2align 4
654L(CopyFrom1To32Bytes1Case2OrCase3):
655	add	$16, %eax
656	add	$16, %esi
657	sub	$16, %ebx
658L(CopyFrom1To16BytesTail1Case2OrCase3):
659	test	%edx, %edx
660	jnz	L(CopyFrom1To16BytesTail1Case2)
661	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
662
663# endif
664
665# ifdef USE_AS_STRNCAT
666	.p2align 4
667L(StrncatExit0):
668	movb	%bh, (%eax)
669	mov	STR3(%esp), %eax
670	RETURN
671# endif
672
673	.p2align 4
674# ifdef USE_AS_STRNCAT
675L(StrncatExit1):
676	movb	%bh, 1(%eax)
677# endif
678L(Exit1):
679# ifdef USE_AS_STRNCAT
680	movb	(%esi), %dh
681# endif
682	movb	%dh, (%eax)
683	mov	STR3(%esp), %eax
684	RETURN
685
686	.p2align 4
687# ifdef USE_AS_STRNCAT
688L(StrncatExit2):
689	movb	%bh, 2(%eax)
690# endif
691L(Exit2):
692	movw	(%esi), %dx
693	movw	%dx, (%eax)
694	mov	STR3(%esp), %eax
695	RETURN
696
697	.p2align 4
698# ifdef USE_AS_STRNCAT
699L(StrncatExit3):
700	movb	%bh, 3(%eax)
701# endif
702L(Exit3):
703	movw	(%esi), %cx
704	movw	%cx, (%eax)
705# ifdef USE_AS_STRNCAT
706	movb	2(%esi), %dh
707# endif
708	movb	%dh, 2(%eax)
709	mov	STR3(%esp), %eax
710	RETURN
711
712	.p2align 4
713# ifdef USE_AS_STRNCAT
714L(StrncatExit4):
715	movb	%bh, 4(%eax)
716# endif
717L(Exit4):
718	movl	(%esi), %edx
719	movl	%edx, (%eax)
720	mov	STR3(%esp), %eax
721	RETURN
722
723	.p2align 4
724# ifdef USE_AS_STRNCAT
725L(StrncatExit5):
726	movb	%bh, 5(%eax)
727# endif
728L(Exit5):
729	movl	(%esi), %ecx
730# ifdef USE_AS_STRNCAT
731	movb	4(%esi), %dh
732# endif
733	movb	%dh, 4(%eax)
734	movl	%ecx, (%eax)
735	mov	STR3(%esp), %eax
736	RETURN
737
738	.p2align 4
739# ifdef USE_AS_STRNCAT
740L(StrncatExit6):
741	movb	%bh, 6(%eax)
742# endif
743L(Exit6):
744	movl	(%esi), %ecx
745	movw	4(%esi), %dx
746	movl	%ecx, (%eax)
747	movw	%dx, 4(%eax)
748	mov	STR3(%esp), %eax
749	RETURN
750
751	.p2align 4
752# ifdef USE_AS_STRNCAT
753L(StrncatExit7):
754	movb	%bh, 7(%eax)
755# endif
756L(Exit7):
757	movl	(%esi), %ecx
758	movl	3(%esi), %edx
759	movl	%ecx, (%eax)
760	movl	%edx, 3(%eax)
761	mov	STR3(%esp), %eax
762	RETURN
763
764	.p2align 4
765# ifdef USE_AS_STRNCAT
766L(StrncatExit8):
767	movb	%bh, 8(%eax)
768# endif
769L(Exit8):
770	movlpd	(%esi), %xmm0
771	movlpd	%xmm0, (%eax)
772	mov	STR3(%esp), %eax
773	RETURN
774
775	.p2align 4
776# ifdef USE_AS_STRNCAT
777L(StrncatExit9):
778	movb	%bh, 9(%eax)
779# endif
780L(Exit9):
781	movlpd	(%esi), %xmm0
782# ifdef USE_AS_STRNCAT
783	movb	8(%esi), %dh
784# endif
785	movb	%dh, 8(%eax)
786	movlpd	%xmm0, (%eax)
787	mov	STR3(%esp), %eax
788	RETURN
789
790	.p2align 4
791# ifdef USE_AS_STRNCAT
792L(StrncatExit10):
793	movb	%bh, 10(%eax)
794# endif
795L(Exit10):
796	movlpd	(%esi), %xmm0
797	movw	8(%esi), %dx
798	movlpd	%xmm0, (%eax)
799	movw	%dx, 8(%eax)
800	mov	STR3(%esp), %eax
801	RETURN
802
803	.p2align 4
804# ifdef USE_AS_STRNCAT
805L(StrncatExit11):
806	movb	%bh, 11(%eax)
807# endif
808L(Exit11):
809	movlpd	(%esi), %xmm0
810	movl	7(%esi), %edx
811	movlpd	%xmm0, (%eax)
812	movl	%edx, 7(%eax)
813	mov	STR3(%esp), %eax
814	RETURN
815
816	.p2align 4
817# ifdef USE_AS_STRNCAT
818L(StrncatExit12):
819	movb	%bh, 12(%eax)
820# endif
821L(Exit12):
822	movlpd	(%esi), %xmm0
823	movl	8(%esi), %edx
824	movlpd	%xmm0, (%eax)
825	movl	%edx, 8(%eax)
826	mov	STR3(%esp), %eax
827	RETURN
828
829	.p2align 4
830# ifdef USE_AS_STRNCAT
831L(StrncatExit13):
832	movb	%bh, 13(%eax)
833# endif
834L(Exit13):
835	movlpd	(%esi), %xmm0
836	movlpd	5(%esi), %xmm1
837	movlpd	%xmm0, (%eax)
838	movlpd	%xmm1, 5(%eax)
839	mov	STR3(%esp), %eax
840	RETURN
841
842	.p2align 4
843# ifdef USE_AS_STRNCAT
844L(StrncatExit14):
845	movb	%bh, 14(%eax)
846# endif
847L(Exit14):
848	movlpd	(%esi), %xmm0
849	movlpd	6(%esi), %xmm1
850	movlpd	%xmm0, (%eax)
851	movlpd	%xmm1, 6(%eax)
852	mov	STR3(%esp), %eax
853	RETURN
854
855	.p2align 4
856# ifdef USE_AS_STRNCAT
857L(StrncatExit15):
858	movb	%bh, 15(%eax)
859# endif
860L(Exit15):
861	movlpd	(%esi), %xmm0
862	movlpd	7(%esi), %xmm1
863	movlpd	%xmm0, (%eax)
864	movlpd	%xmm1, 7(%eax)
865	mov	STR3(%esp), %eax
866	RETURN
867
868	.p2align 4
869# ifdef USE_AS_STRNCAT
870L(StrncatExit16):
871	movb	%bh, 16(%eax)
872# endif
873L(Exit16):
874	movdqu	(%esi), %xmm0
875	movdqu	%xmm0, (%eax)
876	mov	STR3(%esp), %eax
877	RETURN
878
879	.p2align 4
880# ifdef USE_AS_STRNCAT
881L(StrncatExit17):
882	movb	%bh, 17(%eax)
883# endif
884L(Exit17):
885	movdqu	(%esi), %xmm0
886# ifdef USE_AS_STRNCAT
887	movb	16(%esi), %dh
888# endif
889	movdqu	%xmm0, (%eax)
890	movb	%dh, 16(%eax)
891	mov	STR3(%esp), %eax
892	RETURN
893
894	.p2align 4
895# ifdef USE_AS_STRNCAT
896L(StrncatExit18):
897	movb	%bh, 18(%eax)
898# endif
899L(Exit18):
900	movdqu	(%esi), %xmm0
901	movw	16(%esi), %cx
902	movdqu	%xmm0, (%eax)
903	movw	%cx, 16(%eax)
904	mov	STR3(%esp), %eax
905	RETURN
906
907	.p2align 4
908# ifdef USE_AS_STRNCAT
909L(StrncatExit19):
910	movb	%bh, 19(%eax)
911# endif
912L(Exit19):
913	movdqu	(%esi), %xmm0
914	movl	15(%esi), %ecx
915	movdqu	%xmm0, (%eax)
916	movl	%ecx, 15(%eax)
917	mov	STR3(%esp), %eax
918	RETURN
919
920	.p2align 4
921# ifdef USE_AS_STRNCAT
922L(StrncatExit20):
923	movb	%bh, 20(%eax)
924# endif
925L(Exit20):
926	movdqu	(%esi), %xmm0
927	movl	16(%esi), %ecx
928	movdqu	%xmm0, (%eax)
929	movl	%ecx, 16(%eax)
930	mov	STR3(%esp), %eax
931	RETURN
932
933	.p2align 4
934# ifdef USE_AS_STRNCAT
935L(StrncatExit21):
936	movb	%bh, 21(%eax)
937# endif
938L(Exit21):
939	movdqu	(%esi), %xmm0
940	movl	16(%esi), %ecx
941# ifdef USE_AS_STRNCAT
942	movb	20(%esi), %dh
943# endif
944	movdqu	%xmm0, (%eax)
945	movl	%ecx, 16(%eax)
946	movb	%dh, 20(%eax)
947	mov	STR3(%esp), %eax
948	RETURN
949
950	.p2align 4
951# ifdef USE_AS_STRNCAT
952L(StrncatExit22):
953	movb	%bh, 22(%eax)
954# endif
955L(Exit22):
956	movdqu	(%esi), %xmm0
957	movlpd	14(%esi), %xmm3
958	movdqu	%xmm0, (%eax)
959	movlpd	%xmm3, 14(%eax)
960	mov	STR3(%esp), %eax
961	RETURN
962
963	.p2align 4
964# ifdef USE_AS_STRNCAT
965L(StrncatExit23):
966	movb	%bh, 23(%eax)
967# endif
968L(Exit23):
969	movdqu	(%esi), %xmm0
970	movlpd	15(%esi), %xmm3
971	movdqu	%xmm0, (%eax)
972	movlpd	%xmm3, 15(%eax)
973	mov	STR3(%esp), %eax
974	RETURN
975
976	.p2align 4
977# ifdef USE_AS_STRNCAT
978L(StrncatExit24):
979	movb	%bh, 24(%eax)
980# endif
981L(Exit24):
982	movdqu	(%esi), %xmm0
983	movlpd	16(%esi), %xmm2
984	movdqu	%xmm0, (%eax)
985	movlpd	%xmm2, 16(%eax)
986	mov	STR3(%esp), %eax
987	RETURN
988
989	.p2align 4
990# ifdef USE_AS_STRNCAT
991L(StrncatExit25):
992	movb	%bh, 25(%eax)
993# endif
994L(Exit25):
995	movdqu	(%esi), %xmm0
996	movlpd	16(%esi), %xmm2
997# ifdef USE_AS_STRNCAT
998	movb	24(%esi), %dh
999# endif
1000	movdqu	%xmm0, (%eax)
1001	movlpd	%xmm2, 16(%eax)
1002	movb	%dh, 24(%eax)
1003	mov	STR3(%esp), %eax
1004	RETURN
1005
1006	.p2align 4
1007# ifdef USE_AS_STRNCAT
1008L(StrncatExit26):
1009	movb	%bh, 26(%eax)
1010# endif
1011L(Exit26):
1012	movdqu	(%esi), %xmm0
1013	movlpd	16(%esi), %xmm2
1014	movw	24(%esi), %cx
1015	movdqu	%xmm0, (%eax)
1016	movlpd	%xmm2, 16(%eax)
1017	movw	%cx, 24(%eax)
1018	mov	STR3(%esp), %eax
1019	RETURN
1020
1021	.p2align 4
1022# ifdef USE_AS_STRNCAT
1023L(StrncatExit27):
1024	movb	%bh, 27(%eax)
1025# endif
1026L(Exit27):
1027	movdqu	(%esi), %xmm0
1028	movlpd	16(%esi), %xmm2
1029	movl	23(%esi), %ecx
1030	movdqu	%xmm0, (%eax)
1031	movlpd	%xmm2, 16(%eax)
1032	movl	%ecx, 23(%eax)
1033	mov	STR3(%esp), %eax
1034	RETURN
1035
1036	.p2align 4
1037# ifdef USE_AS_STRNCAT
1038L(StrncatExit28):
1039	movb	%bh, 28(%eax)
1040# endif
1041L(Exit28):
1042	movdqu	(%esi), %xmm0
1043	movlpd	16(%esi), %xmm2
1044	movl	24(%esi), %ecx
1045	movdqu	%xmm0, (%eax)
1046	movlpd	%xmm2, 16(%eax)
1047	movl	%ecx, 24(%eax)
1048	mov	STR3(%esp), %eax
1049	RETURN
1050
1051	.p2align 4
1052# ifdef USE_AS_STRNCAT
1053L(StrncatExit29):
1054	movb	%bh, 29(%eax)
1055# endif
1056L(Exit29):
1057	movdqu	(%esi), %xmm0
1058	movdqu	13(%esi), %xmm2
1059	movdqu	%xmm0, (%eax)
1060	movdqu	%xmm2, 13(%eax)
1061	mov	STR3(%esp), %eax
1062	RETURN
1063
1064	.p2align 4
1065# ifdef USE_AS_STRNCAT
1066L(StrncatExit30):
1067	movb	%bh, 30(%eax)
1068# endif
1069L(Exit30):
1070	movdqu	(%esi), %xmm0
1071	movdqu	14(%esi), %xmm2
1072	movdqu	%xmm0, (%eax)
1073	movdqu	%xmm2, 14(%eax)
1074	mov	STR3(%esp), %eax
1075	RETURN
1076
1077	.p2align 4
1078# ifdef USE_AS_STRNCAT
1079L(StrncatExit31):
1080	movb	%bh, 31(%eax)
1081# endif
1082L(Exit31):
1083	movdqu	(%esi), %xmm0
1084	movdqu	15(%esi), %xmm2
1085	movdqu	%xmm0, (%eax)
1086	movdqu	%xmm2, 15(%eax)
1087	mov	STR3(%esp), %eax
1088	RETURN
1089
1090	.p2align 4
1091# ifdef USE_AS_STRNCAT
1092L(StrncatExit32):
1093	movb	%bh, 32(%eax)
1094# endif
1095L(Exit32):
1096	movdqu	(%esi), %xmm0
1097	movdqu	16(%esi), %xmm2
1098	movdqu	%xmm0, (%eax)
1099	movdqu	%xmm2, 16(%eax)
1100	mov	STR3(%esp), %eax
1101	RETURN
1102
1103# ifdef USE_AS_STRNCAT
1104
1105	.p2align 4
1106L(UnalignedLeaveCase2OrCase3):
1107	test	%edx, %edx
1108	jnz	L(Unaligned64LeaveCase2)
1109L(Unaligned64LeaveCase3):
1110	lea	64(%ebx), %ecx
1111	and	$-16, %ecx
1112	add	$48, %ebx
1113	jl	L(CopyFrom1To16BytesCase3)
1114	movdqu	%xmm4, (%eax)
1115	sub	$16, %ebx
1116	jb	L(CopyFrom1To16BytesCase3)
1117	movdqu	%xmm5, 16(%eax)
1118	sub	$16, %ebx
1119	jb	L(CopyFrom1To16BytesCase3)
1120	movdqu	%xmm6, 32(%eax)
1121	sub	$16, %ebx
1122	jb	L(CopyFrom1To16BytesCase3)
1123	movdqu	%xmm7, 48(%eax)
1124	xor	%bh, %bh
1125	movb	%bh, 64(%eax)
1126	mov	STR3(%esp), %eax
1127	RETURN
1128
1129	.p2align 4
1130L(Unaligned64LeaveCase2):
1131	xor	%ecx, %ecx
1132	pcmpeqb	%xmm4, %xmm0
1133	pmovmskb %xmm0, %edx
1134	add	$48, %ebx
1135	jle	L(CopyFrom1To16BytesCase2OrCase3)
1136	test	%edx, %edx
1137	jnz	L(CopyFrom1To16Bytes)
1138
1139	pcmpeqb	%xmm5, %xmm0
1140	pmovmskb %xmm0, %edx
1141	movdqu	%xmm4, (%eax)
1142	add	$16, %ecx
1143	sub	$16, %ebx
1144	jbe	L(CopyFrom1To16BytesCase2OrCase3)
1145	test	%edx, %edx
1146	jnz	L(CopyFrom1To16Bytes)
1147
1148	pcmpeqb	%xmm6, %xmm0
1149	pmovmskb %xmm0, %edx
1150	movdqu	%xmm5, 16(%eax)
1151	add	$16, %ecx
1152	sub	$16, %ebx
1153	jbe	L(CopyFrom1To16BytesCase2OrCase3)
1154	test	%edx, %edx
1155	jnz	L(CopyFrom1To16Bytes)
1156
1157	pcmpeqb	%xmm7, %xmm0
1158	pmovmskb %xmm0, %edx
1159	movdqu	%xmm6, 32(%eax)
1160	lea	16(%eax, %ecx), %eax
1161	lea	16(%esi, %ecx), %esi
1162	bsf	%edx, %edx
1163	cmp	%ebx, %edx
1164	jb	L(CopyFrom1To16BytesExit)
1165	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4)
1166# endif
1167	.p2align 4
1168L(ExitZero):
1169	RETURN
1170
1171END (STRCAT)
1172
1173	.p2align 4
1174	.section .rodata
1175L(ExitTable):
1176	.int	JMPTBL(L(Exit1), L(ExitTable))
1177	.int	JMPTBL(L(Exit2), L(ExitTable))
1178	.int	JMPTBL(L(Exit3), L(ExitTable))
1179	.int	JMPTBL(L(Exit4), L(ExitTable))
1180	.int	JMPTBL(L(Exit5), L(ExitTable))
1181	.int	JMPTBL(L(Exit6), L(ExitTable))
1182	.int	JMPTBL(L(Exit7), L(ExitTable))
1183	.int	JMPTBL(L(Exit8), L(ExitTable))
1184	.int	JMPTBL(L(Exit9), L(ExitTable))
1185	.int	JMPTBL(L(Exit10), L(ExitTable))
1186	.int	JMPTBL(L(Exit11), L(ExitTable))
1187	.int	JMPTBL(L(Exit12), L(ExitTable))
1188	.int	JMPTBL(L(Exit13), L(ExitTable))
1189	.int	JMPTBL(L(Exit14), L(ExitTable))
1190	.int	JMPTBL(L(Exit15), L(ExitTable))
1191	.int	JMPTBL(L(Exit16), L(ExitTable))
1192	.int	JMPTBL(L(Exit17), L(ExitTable))
1193	.int	JMPTBL(L(Exit18), L(ExitTable))
1194	.int	JMPTBL(L(Exit19), L(ExitTable))
1195	.int	JMPTBL(L(Exit20), L(ExitTable))
1196	.int	JMPTBL(L(Exit21), L(ExitTable))
1197	.int	JMPTBL(L(Exit22), L(ExitTable))
1198	.int	JMPTBL(L(Exit23), L(ExitTable))
1199	.int	JMPTBL(L(Exit24), L(ExitTable))
1200	.int	JMPTBL(L(Exit25), L(ExitTable))
1201	.int	JMPTBL(L(Exit26), L(ExitTable))
1202	.int	JMPTBL(L(Exit27), L(ExitTable))
1203	.int	JMPTBL(L(Exit28), L(ExitTable))
1204	.int	JMPTBL(L(Exit29), L(ExitTable))
1205	.int	JMPTBL(L(Exit30), L(ExitTable))
1206	.int	JMPTBL(L(Exit31), L(ExitTable))
1207	.int	JMPTBL(L(Exit32), L(ExitTable))
1208# ifdef USE_AS_STRNCAT
1209L(ExitStrncatTable):
1210	.int	JMPTBL(L(StrncatExit0), L(ExitStrncatTable))
1211	.int	JMPTBL(L(StrncatExit1), L(ExitStrncatTable))
1212	.int	JMPTBL(L(StrncatExit2), L(ExitStrncatTable))
1213	.int	JMPTBL(L(StrncatExit3), L(ExitStrncatTable))
1214	.int	JMPTBL(L(StrncatExit4), L(ExitStrncatTable))
1215	.int	JMPTBL(L(StrncatExit5), L(ExitStrncatTable))
1216	.int	JMPTBL(L(StrncatExit6), L(ExitStrncatTable))
1217	.int	JMPTBL(L(StrncatExit7), L(ExitStrncatTable))
1218	.int	JMPTBL(L(StrncatExit8), L(ExitStrncatTable))
1219	.int	JMPTBL(L(StrncatExit9), L(ExitStrncatTable))
1220	.int	JMPTBL(L(StrncatExit10), L(ExitStrncatTable))
1221	.int	JMPTBL(L(StrncatExit11), L(ExitStrncatTable))
1222	.int	JMPTBL(L(StrncatExit12), L(ExitStrncatTable))
1223	.int	JMPTBL(L(StrncatExit13), L(ExitStrncatTable))
1224	.int	JMPTBL(L(StrncatExit14), L(ExitStrncatTable))
1225	.int	JMPTBL(L(StrncatExit15), L(ExitStrncatTable))
1226	.int	JMPTBL(L(StrncatExit16), L(ExitStrncatTable))
1227	.int	JMPTBL(L(StrncatExit17), L(ExitStrncatTable))
1228	.int	JMPTBL(L(StrncatExit18), L(ExitStrncatTable))
1229	.int	JMPTBL(L(StrncatExit19), L(ExitStrncatTable))
1230	.int	JMPTBL(L(StrncatExit20), L(ExitStrncatTable))
1231	.int	JMPTBL(L(StrncatExit21), L(ExitStrncatTable))
1232	.int	JMPTBL(L(StrncatExit22), L(ExitStrncatTable))
1233	.int	JMPTBL(L(StrncatExit23), L(ExitStrncatTable))
1234	.int	JMPTBL(L(StrncatExit24), L(ExitStrncatTable))
1235	.int	JMPTBL(L(StrncatExit25), L(ExitStrncatTable))
1236	.int	JMPTBL(L(StrncatExit26), L(ExitStrncatTable))
1237	.int	JMPTBL(L(StrncatExit27), L(ExitStrncatTable))
1238	.int	JMPTBL(L(StrncatExit28), L(ExitStrncatTable))
1239	.int	JMPTBL(L(StrncatExit29), L(ExitStrncatTable))
1240	.int	JMPTBL(L(StrncatExit30), L(ExitStrncatTable))
1241	.int	JMPTBL(L(StrncatExit31), L(ExitStrncatTable))
1242	.int	JMPTBL(L(StrncatExit32), L(ExitStrncatTable))
1243# endif
1244#endif
1245