1/* strcat with SSSE3
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19
20#if IS_IN (libc)
21
22# include <sysdep.h>
23
/* Unwind (CFI) bookkeeping that goes with a 4-byte register push: the
   CFA offset grows by 4 and REG is described as saved at offset 0 from
   the current stack pointer.  The cfi_* helpers are not defined in this
   file; they are expected to come from <sysdep.h>.  */
24# define CFI_PUSH(REG)	\
25	cfi_adjust_cfa_offset (4);	\
26	cfi_rel_offset (REG, 0)
27
/* Counterpart of CFI_PUSH for a 4-byte register pop: the CFA offset
   shrinks by 4 and REG is marked as restored.  */
28# define CFI_POP(REG)	\
29	cfi_adjust_cfa_offset (-4);	\
30	cfi_restore (REG)
31
/* pushl/popl of REG followed by the matching CFI annotation, so the
   unwind description stays in step with the stack pointer.  */
32# define PUSH(REG) pushl REG; CFI_PUSH (REG)
33# define POP(REG) popl REG; CFI_POP (REG)
34
/* Name under which the routine is emitted.  A file that includes this
   one may define STRCAT beforehand to choose a different symbol.  */
35# ifndef STRCAT
36#  define STRCAT  __strcat_ssse3
37# endif
38
/* Offsets of the stack arguments relative to %esp.  STR1 and STR2 are
   read after the single PUSH (%edi) in the prologue, so they evaluate
   to 8 and 12.  NOTE(review): this presumes the usual i386 layout
   (saved %edi at 0, return address at 4, then the arguments).  */
39# define PARMS  4
40# define STR1  PARMS+4
41# define STR2  STR1+4
42
/* strncat only: offset of the length argument.  It is read right after
   an additional PUSH (%ebx), hence 8 bytes past STR2 instead of 4.  */
43# ifdef USE_AS_STRNCAT
44#  define LEN STR2+8
45# endif
46
/* Not tested anywhere in this file; presumably consumed by the included
   strlen-sse2.S and/or strcpy-ssse3.S -- TODO confirm.  */
47# define USE_AS_STRCAT
48
49.text
/* Entry point.  %edi is saved and then loaded with the destination
   pointer (first argument); a copy is placed in %edx.  %edi keeps the
   destination start for the whole routine and is what every exit path
   moves into %eax as the return value.  */
50ENTRY (STRCAT)
51	PUSH	(%edi)
52	mov	STR1(%esp), %edi
53	mov	%edi, %edx
54
/* The body of strlen-sse2.S is included inline.  RETURN is defined so
   that the included code jumps to L(StartStrcpyPart) where it would
   otherwise return.  NOTE(review): the include is not visible here; it
   presumably takes the string pointer in %edx and leaves the length in
   %eax -- confirm against strlen-sse2.S.  */
55# define RETURN  jmp L(StartStrcpyPart)
56# include "strlen-sse2.S"
57
/* Append phase, reached through the RETURN macro used by the included
   strlen code.  Register roles from here on, as used in this file:
     %edi  destination start (returned to the caller)
     %edx  write position; formed as %edi + %eax, so %eax is presumably
	   the length of the destination string on arrival (TODO confirm)
     %ecx  read position in the source (second argument)
     %ebx  strncat only: the byte limit  */
58L(StartStrcpyPart):
59	mov	STR2(%esp), %ecx
60	lea	(%edi, %eax), %edx
61# ifdef USE_AS_STRNCAT
/* strncat: save %ebx and load the limit into it.  A limit of 0 returns
   immediately; limits 1..8 are handled entirely by
   L(StrncatExit8Bytes).  */
62	PUSH	(%ebx)
63	mov	LEN(%esp), %ebx
64	test	%ebx, %ebx
65	jz	L(StrncatExit0)
66	cmp	$8, %ebx
67	jbe	L(StrncatExit8Bytes)
68# endif
/* Check source bytes 0..8 one at a time.  A NUL at index k branches to
   L(Exit<k+1>), which copies k+1 bytes, terminator included.  */
69	cmpb	$0, (%ecx)
70	jz	L(Exit1)
71	cmpb	$0, 1(%ecx)
72	jz	L(Exit2)
73	cmpb	$0, 2(%ecx)
74	jz	L(Exit3)
75	cmpb	$0, 3(%ecx)
76	jz	L(Exit4)
77	cmpb	$0, 4(%ecx)
78	jz	L(Exit5)
79	cmpb	$0, 5(%ecx)
80	jz	L(Exit6)
81	cmpb	$0, 6(%ecx)
82	jz	L(Exit7)
83	cmpb	$0, 7(%ecx)
84	jz	L(Exit8)
85	cmpb	$0, 8(%ecx)
86	jz	L(Exit9)
87# ifdef USE_AS_STRNCAT
/* strncat: bytes 0..8 are non-NUL and the limit is at least 9 here.
   Limits below 16 are finished by L(StrncatExit15Bytes), which
   interleaves the remaining NUL checks with limit checks.  */
88	cmp	$16, %ebx
89	jb	L(StrncatExit15Bytes)
90# endif
/* Check source bytes 9..15 the same way.  */
91	cmpb	$0, 9(%ecx)
92	jz	L(Exit10)
93	cmpb	$0, 10(%ecx)
94	jz	L(Exit11)
95	cmpb	$0, 11(%ecx)
96	jz	L(Exit12)
97	cmpb	$0, 12(%ecx)
98	jz	L(Exit13)
99	cmpb	$0, 13(%ecx)
100	jz	L(Exit14)
101	cmpb	$0, 14(%ecx)
102	jz	L(Exit15)
103	cmpb	$0, 15(%ecx)
104	jz	L(Exit16)
105# ifdef USE_AS_STRNCAT
/* strncat: no NUL in the first 16 source bytes.  With a limit of
   exactly 16, L(StrncatExit16) copies them and stores the
   terminator.  */
106	cmp	$16, %ebx
107	je	L(StrncatExit16)
108
/* RETURN1 is the epilogue used by every exit path in this file.  The
   strncat form also restores %ebx.  The CFI_PUSH annotations after
   `ret' are CFI bookkeeping only: they re-declare the saved registers
   for the code that is assembled after the return.  */
109#  define RETURN1	\
110	POP	(%ebx);	\
111	POP	(%edi);	\
112	ret;	\
113	CFI_PUSH	(%ebx);	\
114	CFI_PUSH	(%edi)
/* Enables the `# ifdef USE_AS_STRNCPY' section further down in this
   file; presumably also selects the length-limited code in
   strcpy-ssse3.S (not visible here).  */
115#  define USE_AS_STRNCPY
116# else
117#  define RETURN1  POP (%edi); ret; CFI_PUSH (%edi)
118# endif
/* Included inline.  NOTE(review): presumably supplies the copy code for
   sources longer than 16 bytes and branches to the L(CopyFrom1To16Bytes*)
   labels defined below -- confirm against strcpy-ssse3.S.  */
119# include "strcpy-ssse3.S"
/* Copy the tail of the string, selected by a bit mask in %eax.
   %esi is added to both the write (%edx) and read (%ecx) pointers and
   then popped, so the code that jumps here must have pushed %esi; that
   push is not in this file (presumably in strcpy-ssse3.S).
   Each mask bit k set sends control to L(Exit<k+1>), i.e. the mask is
   treated as "byte k is the terminator".  NOTE(review): presumably a
   pmovmskb result from the include -- confirm.  */
120	.p2align 4
121L(CopyFrom1To16Bytes):
122	add	%esi, %edx
123	add	%esi, %ecx
124
125	POP	(%esi)
/* Nothing in bits 0..7: look at bits 8..15 in L(ExitHigh).  */
126	test	%al, %al
127	jz	L(ExitHigh)
128	test	$0x01, %al
129	jnz	L(Exit1)
130	test	$0x02, %al
131	jnz	L(Exit2)
132	test	$0x04, %al
133	jnz	L(Exit3)
134	test	$0x08, %al
135	jnz	L(Exit4)
136	test	$0x10, %al
137	jnz	L(Exit5)
138	test	$0x20, %al
139	jnz	L(Exit6)
140	test	$0x40, %al
141	jnz	L(Exit7)
/* %al is non-zero and bits 0..6 are clear, so bit 7 is set: copy
   8 bytes (the same work as L(Exit8)).  */
142	movlpd	(%ecx), %xmm0
143	movlpd	%xmm0, (%edx)
144	movl	%edi, %eax
145	RETURN1
146
/* Continuation of L(CopyFrom1To16Bytes) for a mask whose low byte is
   zero.  Bit k of %ah set sends control to L(Exit<9+k>).  */
147	.p2align 4
148L(ExitHigh):
149	test	$0x01, %ah
150	jnz	L(Exit9)
151	test	$0x02, %ah
152	jnz	L(Exit10)
153	test	$0x04, %ah
154	jnz	L(Exit11)
155	test	$0x08, %ah
156	jnz	L(Exit12)
157	test	$0x10, %ah
158	jnz	L(Exit13)
159	test	$0x20, %ah
160	jnz	L(Exit14)
161	test	$0x40, %ah
162	jnz	L(Exit15)
/* None of bits 8..14 matched: copy all 16 bytes (the same work as
   L(Exit16)).  */
163	movlpd	(%ecx), %xmm0
164	movlpd	8(%ecx), %xmm1
165	movlpd	%xmm0, (%edx)
166	movlpd	%xmm1, 8(%edx)
167	movl	%edi, %eax
168	RETURN1
169
/* Fixed-size tail copies.

   L(Exit<N>) copies exactly N bytes from (%ecx) to (%edx), loads the
   destination start (%edi) into %eax as the return value and leaves
   through RETURN1.  In the paths visible in this file it is reached
   when source byte N-1 is NUL, so the N bytes end with the terminator.

   L(StrncatExit<N>) first stores %bh at offset N of the write position
   and then falls into L(Exit<N>): N bytes plus a terminator.  %bh acts
   as a zero byte; every jump to these labels in this file follows a
   `cmp $N, %ebx; je', so %ebx equals N (at most 16) and bits 8..15 are
   clear.

   Sizes that are not 1, 2, 4 or 8 are built either from two adjacent
   moves or from two moves that overlap, the second one ending on the
   last byte.  */
170	.p2align 4
171L(StrncatExit1):
172	movb	%bh, 1(%edx)
173L(Exit1):
174	movb	(%ecx), %al
175	movb	%al, (%edx)
176	movl	%edi, %eax
177	RETURN1
178
179	.p2align 4
180L(StrncatExit2):
181	movb	%bh, 2(%edx)
182L(Exit2):
183	movw	(%ecx), %ax
184	movw	%ax, (%edx)
185	movl	%edi, %eax
186	RETURN1
187
188	.p2align 4
189L(StrncatExit3):
190	movb	%bh, 3(%edx)
191L(Exit3):
192	movw	(%ecx), %ax
193	movw	%ax, (%edx)
194	movb	2(%ecx), %al
195	movb	%al, 2(%edx)
196	movl	%edi, %eax
197	RETURN1
198
199	.p2align 4
200L(StrncatExit4):
201	movb	%bh, 4(%edx)
202L(Exit4):
203	movl	(%ecx), %eax
204	movl	%eax, (%edx)
205	movl	%edi, %eax
206	RETURN1
207
208	.p2align 4
209L(StrncatExit5):
210	movb	%bh, 5(%edx)
211L(Exit5):
212	movl	(%ecx), %eax
213	movl	%eax, (%edx)
214	movb	4(%ecx), %al
215	movb	%al, 4(%edx)
216	movl	%edi, %eax
217	RETURN1
218
219	.p2align 4
220L(StrncatExit6):
221	movb	%bh, 6(%edx)
222L(Exit6):
223	movl	(%ecx), %eax
224	movl	%eax, (%edx)
225	movw	4(%ecx), %ax
226	movw	%ax, 4(%edx)
227	movl	%edi, %eax
228	RETURN1
229
230	.p2align 4
231L(StrncatExit7):
232	movb	%bh, 7(%edx)
233L(Exit7):
/* Bytes 0..3, then bytes 3..6 (overlapping at byte 3).  */
234	movl	(%ecx), %eax
235	movl	%eax, (%edx)
236	movl	3(%ecx), %eax
237	movl	%eax, 3(%edx)
238	movl	%edi, %eax
239	RETURN1
240
241	.p2align 4
242L(StrncatExit8):
243	movb	%bh, 8(%edx)
244L(Exit8):
245	movlpd	(%ecx), %xmm0
246	movlpd	%xmm0, (%edx)
247	movl	%edi, %eax
248	RETURN1
249
250	.p2align 4
251L(StrncatExit9):
252	movb	%bh, 9(%edx)
253L(Exit9):
254	movlpd	(%ecx), %xmm0
255	movlpd	%xmm0, (%edx)
256	movb	8(%ecx), %al
257	movb	%al, 8(%edx)
258	movl	%edi, %eax
259	RETURN1
260
261	.p2align 4
262L(StrncatExit10):
263	movb	%bh, 10(%edx)
264L(Exit10):
265	movlpd	(%ecx), %xmm0
266	movlpd	%xmm0, (%edx)
267	movw	8(%ecx), %ax
268	movw	%ax, 8(%edx)
269	movl	%edi, %eax
270	RETURN1
271
272	.p2align 4
273L(StrncatExit11):
274	movb	%bh, 11(%edx)
275L(Exit11):
/* Bytes 0..7, then bytes 7..10 (overlapping at byte 7).  */
276	movlpd	(%ecx), %xmm0
277	movlpd	%xmm0, (%edx)
278	movl	7(%ecx), %eax
279	movl	%eax, 7(%edx)
280	movl	%edi, %eax
281	RETURN1
282
283	.p2align 4
284L(StrncatExit12):
285	movb	%bh, 12(%edx)
286L(Exit12):
287	movlpd	(%ecx), %xmm0
288	movlpd	%xmm0, (%edx)
289	movl	8(%ecx), %eax
290	movl	%eax, 8(%edx)
291	movl	%edi, %eax
292	RETURN1
293
294	.p2align 4
295L(StrncatExit13):
296	movb	%bh, 13(%edx)
297L(Exit13):
/* Bytes 0..7, then bytes 5..12 (overlapping).  */
298	movlpd	(%ecx), %xmm0
299	movlpd	%xmm0, (%edx)
300	movlpd	5(%ecx), %xmm0
301	movlpd	%xmm0, 5(%edx)
302	movl	%edi, %eax
303	RETURN1
304
305	.p2align 4
306L(StrncatExit14):
307	movb	%bh, 14(%edx)
308L(Exit14):
/* Bytes 0..7, then bytes 6..13 (overlapping).  */
309	movlpd	(%ecx), %xmm0
310	movlpd	%xmm0, (%edx)
311	movlpd	6(%ecx), %xmm0
312	movlpd	%xmm0, 6(%edx)
313	movl	%edi, %eax
314	RETURN1
315
316	.p2align 4
317L(StrncatExit15):
318	movb	%bh, 15(%edx)
319L(Exit15):
/* Bytes 0..7, then bytes 7..14 (overlapping at byte 7).  */
320	movlpd	(%ecx), %xmm0
321	movlpd	%xmm0, (%edx)
322	movlpd	7(%ecx), %xmm0
323	movlpd	%xmm0, 7(%edx)
324	movl	%edi, %eax
325	RETURN1
326
327	.p2align 4
328L(StrncatExit16):
329	movb	%bh, 16(%edx)
330L(Exit16):
331	movlpd	(%ecx), %xmm0
332	movlpd	8(%ecx), %xmm1
333	movlpd	%xmm0, (%edx)
334	movlpd	%xmm1, 8(%edx)
335	movl	%edi, %eax
336	RETURN1
337
/* Length-limited tails.  USE_AS_STRNCPY is defined earlier in this file
   only when USE_AS_STRNCAT is defined.  */
338# ifdef USE_AS_STRNCPY
339
/* CFI annotation only: the code below is entered with %esi still on
   the stack (it ends that state with POP (%esi)).  */
340	CFI_PUSH(%esi)
341
/* Case 2: the mask in %eax has a terminator bit set (this is the
   non-zero branch of L(CopyFrom1To16BytesCase2OrCase3)) and the limit
   may end first.  After the `add $16', %ebx is compared against 1..15
   as the count of bytes still allowed.  NOTE(review): the code that
   jumps here presumably subtracted 16 beforehand -- confirm in
   strcpy-ssse3.S.  */
342	.p2align 4
343L(CopyFrom1To16BytesCase2):
344	add	$16, %ebx
345	add	%esi, %ecx
/* %edx is needed as scratch, so the adjusted write pointer is parked
   in %esi for a moment.  */
346	lea	(%esi, %edx), %esi
/* %dh bit 7 = bit 15 of (%ebx - 9), i.e. set when the count is below 9
   (for the small counts handled here).  OR-ing in %al makes %dh
   non-zero when either the count is below 9 or a terminator bit lies
   in the low 8 bytes.  */
347	lea	-9(%ebx), %edx
348	and	$1<<7, %dh
349	or	%al, %dh
350	test	%dh, %dh
/* Put the write pointer back in %edx and restore %esi.  lea and pop do
   not modify flags, so the jz below still acts on the test above.  */
351	lea	(%esi), %edx
352	POP	(%esi)
353	jz	L(ExitHighCase2)
354
/* Low half.  For k = 1..7 in order: a terminator at byte k-1 goes to
   L(Exit<k>); otherwise a count of exactly k goes to
   L(StrncatExit<k>), which copies k bytes and adds the terminator.  */
355	test	$0x01, %al
356	jnz	L(Exit1)
357	cmp	$1, %ebx
358	je	L(StrncatExit1)
359	test	$0x02, %al
360	jnz	L(Exit2)
361	cmp	$2, %ebx
362	je	L(StrncatExit2)
363	test	$0x04, %al
364	jnz	L(Exit3)
365	cmp	$3, %ebx
366	je	L(StrncatExit3)
367	test	$0x08, %al
368	jnz	L(Exit4)
369	cmp	$4, %ebx
370	je	L(StrncatExit4)
371	test	$0x10, %al
372	jnz	L(Exit5)
373	cmp	$5, %ebx
374	je	L(StrncatExit5)
375	test	$0x20, %al
376	jnz	L(Exit6)
377	cmp	$6, %ebx
378	je	L(StrncatExit6)
379	test	$0x40, %al
380	jnz	L(Exit7)
381	cmp	$7, %ebx
382	je	L(StrncatExit7)
/* Copy 8 bytes, then place the terminator.  `cmpb $1' sets the carry
   flag only when the byte just written at offset 7 is zero, and
   `sbb $-1' computes %eax + 1 - carry.  The zero therefore goes to
   offset 7 if that byte is already NUL, otherwise to offset 8.  %cl is
   free to serve as the zero source because the read pointer is no
   longer needed.  */
383	movlpd	(%ecx), %xmm0
384	movlpd	%xmm0, (%edx)
385	lea	7(%edx), %eax
386	cmpb	$1, (%eax)
387	sbb	$-1, %eax
388	xor	%cl, %cl
389	movb	%cl, (%eax)
390	movl	%edi, %eax
391	RETURN1
392
/* High half of L(CopyFrom1To16BytesCase2): taken when the low mask byte
   is zero and the remaining count is at least 9.  For k = 9..15 in
   order: a terminator at byte k-1 (bit k-9 of %ah) goes to L(Exit<k>);
   otherwise a count of exactly k goes to L(StrncatExit<k>).  */
393	.p2align 4
394L(ExitHighCase2):
395	test	$0x01, %ah
396	jnz	L(Exit9)
397	cmp	$9, %ebx
398	je	L(StrncatExit9)
399	test	$0x02, %ah
400	jnz	L(Exit10)
401	cmp	$10, %ebx
402	je	L(StrncatExit10)
403	test	$0x04, %ah
404	jnz	L(Exit11)
405	cmp	$11, %ebx
406	je	L(StrncatExit11)
407	test	$0x8, %ah
408	jnz	L(Exit12)
409	cmp	$12, %ebx
410	je	L(StrncatExit12)
411	test	$0x10, %ah
412	jnz	L(Exit13)
413	cmp	$13, %ebx
414	je	L(StrncatExit13)
415	test	$0x20, %ah
416	jnz	L(Exit14)
417	cmp	$14, %ebx
418	je	L(StrncatExit14)
419	test	$0x40, %ah
420	jnz	L(Exit15)
421	cmp	$15, %ebx
422	je	L(StrncatExit15)
/* Copy all 16 bytes; no separate terminator is stored here.
   NOTE(review): with a non-zero 16-bit mask and bits 0..14 clear, the
   terminator is presumably byte 15 and is part of the 16 bytes copied
   -- confirm that every entry into Case2 guarantees a non-zero mask.  */
423	movlpd	(%ecx), %xmm0
424	movlpd	%xmm0, (%edx)
425	movlpd	8(%ecx), %xmm1
426	movlpd	%xmm1, 8(%edx)
427	movl	%edi, %eax
428	RETURN1
429
/* CFI annotation only: the code below is entered with %esi still on
   the stack.  */
430	CFI_PUSH(%esi)
431
/* Dispatch on the mask in %eax: non-zero goes to Case 2 (a terminator
   bit is set), zero falls through into Case 3 (only the limit ends the
   copy).  */
432L(CopyFrom1To16BytesCase2OrCase3):
433	test	%eax, %eax
434	jnz	L(CopyFrom1To16BytesCase2)
435
/* Case 3: no terminator bit in the mask.  Adjust the count and both
   pointers, restore %esi, then copy exactly %ebx bytes and add the
   terminator.  NOTE(review): `add $16' presumably undoes a subtraction
   made by the code that jumps here -- confirm in strcpy-ssse3.S.  */
436	.p2align 4
437L(CopyFrom1To16BytesCase3):
438	add	$16, %ebx
439	add	%esi, %edx
440	add	%esi, %ecx
441
442	POP	(%esi)
443
/* Counts above 8 are dispatched in L(ExitHighCase3).  */
444	cmp	$8, %ebx
445	ja	L(ExitHighCase3)
446	cmp	$1, %ebx
447	je	L(StrncatExit1)
448	cmp	$2, %ebx
449	je	L(StrncatExit2)
450	cmp	$3, %ebx
451	je	L(StrncatExit3)
452	cmp	$4, %ebx
453	je	L(StrncatExit4)
454	cmp	$5, %ebx
455	je	L(StrncatExit5)
456	cmp	$6, %ebx
457	je	L(StrncatExit6)
458	cmp	$7, %ebx
459	je	L(StrncatExit7)
/* Count matched none of 1..7 (expected: 8): copy 8 bytes and store
   %bh, used as the zero byte, at offset 8.  */
460	movlpd	(%ecx), %xmm0
461	movlpd	%xmm0, (%edx)
462	movb	%bh, 8(%edx)
463	movl	%edi, %eax
464	RETURN1
465
/* Continuation of L(CopyFrom1To16BytesCase3) for counts above 8.
   Counts 9..15 go to the matching L(StrncatExit<N>).  */
466	.p2align 4
467L(ExitHighCase3):
468	cmp	$9, %ebx
469	je	L(StrncatExit9)
470	cmp	$10, %ebx
471	je	L(StrncatExit10)
472	cmp	$11, %ebx
473	je	L(StrncatExit11)
474	cmp	$12, %ebx
475	je	L(StrncatExit12)
476	cmp	$13, %ebx
477	je	L(StrncatExit13)
478	cmp	$14, %ebx
479	je	L(StrncatExit14)
480	cmp	$15, %ebx
481	je	L(StrncatExit15)
/* Count matched none of 9..15 (expected: 16): copy 16 bytes and store
   %bh, used as the zero byte, at offset 16.  */
482	movlpd	(%ecx), %xmm0
483	movlpd	%xmm0, (%edx)
484	movlpd	8(%ecx), %xmm1
485	movlpd	%xmm1, 8(%edx)
486	movb	%bh, 16(%edx)
487	movl	%edi, %eax
488	RETURN1
489
/* strncat with a limit of 0 (jumped to from L(StartStrcpyPart) when
   %ebx tests zero): nothing is written; return the destination start
   held in %edi.  */
490	.p2align 4
491L(StrncatExit0):
492	movl	%edi, %eax
493	RETURN1
494
/* strncat with a limit below 16.  The only jump to this label in this
   file comes after source bytes 0..8 were found non-NUL and after
   limits of 8 or less were diverted, so %ebx is in 9..15 here.
   For k = 9..14 in order: a limit of exactly k goes to
   L(StrncatExit<k>) (copy k bytes, add terminator); otherwise a NUL at
   byte k goes to L(Exit<k+1>).  Byte 14 has no separate check; it is
   handled by the terminator placement at the end.  */
495	.p2align 4
496L(StrncatExit15Bytes):
497	cmp	$9, %ebx
498	je	L(StrncatExit9)
499	cmpb	$0, 9(%ecx)
500	jz	L(Exit10)
501	cmp	$10, %ebx
502	je	L(StrncatExit10)
503	cmpb	$0, 10(%ecx)
504	jz	L(Exit11)
505	cmp	$11, %ebx
506	je	L(StrncatExit11)
507	cmpb	$0, 11(%ecx)
508	jz	L(Exit12)
509	cmp	$12, %ebx
510	je	L(StrncatExit12)
511	cmpb	$0, 12(%ecx)
512	jz	L(Exit13)
513	cmp	$13, %ebx
514	je	L(StrncatExit13)
515	cmpb	$0, 13(%ecx)
516	jz	L(Exit14)
517	cmp	$14, %ebx
518	je	L(StrncatExit14)
/* Limit is 15 and bytes 0..13 are non-NUL: copy 15 bytes with two
   overlapping 8-byte moves.  `cmpb $1' sets the carry flag only when
   the byte just written at offset 14 is zero and `sbb $-1' computes
   %eax + 1 - carry, so the zero byte (%bh) lands at offset 14 if that
   byte is already NUL, otherwise at offset 15.  */
519	movlpd	(%ecx), %xmm0
520	movlpd	%xmm0, (%edx)
521	movlpd	7(%ecx), %xmm0
522	movlpd	%xmm0, 7(%edx)
523	lea	14(%edx), %eax
524	cmpb	$1, (%eax)
525	sbb	$-1, %eax
526	movb	%bh, (%eax)
527	movl	%edi, %eax
528	RETURN1
529
/* strncat with a limit of 1..8 (jumped to from L(StartStrcpyPart) after
   the zero-limit test, when %ebx is at most 8).
   For k = 0..6 in order: a NUL at source byte k goes to L(Exit<k+1>);
   otherwise a limit of exactly k+1 goes to L(StrncatExit<k+1>), which
   copies k+1 bytes and adds the terminator.  */
530	.p2align 4
531L(StrncatExit8Bytes):
532	cmpb	$0, (%ecx)
533	jz	L(Exit1)
534	cmp	$1, %ebx
535	je	L(StrncatExit1)
536	cmpb	$0, 1(%ecx)
537	jz	L(Exit2)
538	cmp	$2, %ebx
539	je	L(StrncatExit2)
540	cmpb	$0, 2(%ecx)
541	jz	L(Exit3)
542	cmp	$3, %ebx
543	je	L(StrncatExit3)
544	cmpb	$0, 3(%ecx)
545	jz	L(Exit4)
546	cmp	$4, %ebx
547	je	L(StrncatExit4)
548	cmpb	$0, 4(%ecx)
549	jz	L(Exit5)
550	cmp	$5, %ebx
551	je	L(StrncatExit5)
552	cmpb	$0, 5(%ecx)
553	jz	L(Exit6)
554	cmp	$6, %ebx
555	je	L(StrncatExit6)
556	cmpb	$0, 6(%ecx)
557	jz	L(Exit7)
558	cmp	$7, %ebx
559	je	L(StrncatExit7)
/* Limit is 8 and bytes 0..6 are non-NUL: copy 8 bytes.  `cmpb $1' sets
   the carry flag only when the byte just written at offset 7 is zero
   and `sbb $-1' computes %eax + 1 - carry, so the zero byte (%bh)
   lands at offset 7 if that byte is already NUL, otherwise at
   offset 8.  */
560	movlpd	(%ecx), %xmm0
561	movlpd	%xmm0, (%edx)
562	lea	7(%edx), %eax
563	cmpb	$1, (%eax)
564	sbb	$-1, %eax
565	movb	%bh, (%eax)
566	movl	%edi, %eax
567	RETURN1
568
/* Closes the `# ifdef USE_AS_STRNCPY' section.  */
569# endif
570END (STRCAT)
571#endif
572