1/* strcat with SSE2
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <isa-level.h>
20
21/* Build this file when MINIMUM_X86_ISA_LEVEL <= 2: there is no V2
22   implementation, so it is also needed for ISA V2 builds.  */
23#if ISA_SHOULD_BUILD (2)
24
25
26# include <sysdep.h>
27
/* Default symbol name for the entry point.  Anything that defines
   STRCAT before this point keeps its own name.  */
28# ifndef STRCAT
29#  define STRCAT  __strcat_sse2_unaligned
30# endif
31
/* Not referenced in this file; presumably consumed by
   strcpy-sse2-unaligned.S, which is included at the end -- confirm.  */
32# define USE_AS_STRCAT
33
34.text
/* STRCAT (dest = %rdi, src = %rsi; with USE_AS_STRNCAT the third
   argument arrives in %rdx).

   Everything up to L(StartStrcpyPart) is an inlined strlen of dest
   that leaves the length in %rax.  Registers set up here:
     %r9  = original dest pointer (moved to %rax as the result later)
     %r8  = copy of %rdx (only with USE_AS_STRNCAT)
     %rax = 0 on the L(exit_less16) path, otherwise the
	    16-byte-aligned scan pointer
     %edx = pmovmskb mask; bit i is set when byte i of the compared
	    16-byte block is NUL.  */
35ENTRY (STRCAT)
36	mov	%rdi, %r9
37# ifdef USE_AS_STRNCAT
38	mov	%rdx, %r8
39# endif
40
41/* Inline corresponding strlen file, temporary until new strcpy
42   implementation gets merged.  */
43
	/* %rax = 0 so that L(exit_less16) can add the bit index to it.  */
44	xor	%rax, %rax
	/* %ecx = dest & 63.  If that is above 48, a 16-byte load starting
	   at dest would run past the end of its 64-byte block, so the
	   aligned path at L(next) is used instead.  */
45	mov	%edi, %ecx
46	and	$0x3f, %ecx
47	pxor	%xmm0, %xmm0
48	cmp	$0x30, %ecx
49	ja	L(next)
	/* Unaligned check of the first 16 bytes of dest.  */
50	movdqu	(%rdi), %xmm1
51	pcmpeqb	%xmm1, %xmm0
52	pmovmskb %xmm0, %edx
53	test	%edx, %edx
54	jnz	L(exit_less16)
	/* No NUL there: round dest down to a 16-byte boundary and carry
	   on with the aligned block at 16(%rax).  Bytes between
	   16(%rax) and dest + 16 are simply examined a second time.  */
55	mov	%rdi, %rax
56	and	$-16, %rax
57	jmp	L(align16_start)
58L(next):
	/* Compare the aligned 16-byte block that contains dest, then
	   drop the mask bits that belong to bytes in front of dest.  */
59	mov	%rdi, %rax
60	and	$-16, %rax
61	pcmpeqb	(%rax), %xmm0
	/* %r10d = -1 << (dest & 15).  %rcx - %rax as a whole is not
	   dest & 15, but a 32-bit shift only uses the low 5 bits of
	   %cl, and those do equal dest & 15.  */
62	mov	$-1, %r10d
63	sub	%rax, %rcx
64	shl	%cl, %r10d
65	pmovmskb %xmm0, %edx
66	and	%r10d, %edx
	/* L(exit) computes (%rax - dest) + bit index; %rax - dest is
	   -(dest & 15) here, which cancels the bytes in front of dest.  */
67	jnz	L(exit)
68
/* Unrolled scan, one aligned 16-byte block per step.
   On entry %rax is 16-byte aligned and no NUL of the dest string has
   been found below 16(%rax).
   A pcmpeqb that finds no NUL leaves its destination register
   all-zero, so %xmm0-%xmm3 are reused below without being cleared
   again.  L(exitN) expects the block holding the NUL at N(%rax).  */
69L(align16_start):
70	pxor	%xmm0, %xmm0
71	pxor	%xmm1, %xmm1
72	pxor	%xmm2, %xmm2
73	pxor	%xmm3, %xmm3
74	pcmpeqb	16(%rax), %xmm0
75	pmovmskb %xmm0, %edx
76	test	%edx, %edx
77	jnz	L(exit16)
78
79	pcmpeqb	32(%rax), %xmm1
80	pmovmskb %xmm1, %edx
81	test	%edx, %edx
82	jnz	L(exit32)
83
84	pcmpeqb	48(%rax), %xmm2
85	pmovmskb %xmm2, %edx
86	test	%edx, %edx
87	jnz	L(exit48)
88
89	pcmpeqb	64(%rax), %xmm3
90	pmovmskb %xmm3, %edx
91	test	%edx, %edx
92	jnz	L(exit64)
93
	/* Next block is at 80(%rax); once %rax has been advanced by 64
	   the same block is at 16(%rax), which is what L(exit16)
	   expects.  The same pattern opens the next two groups.  */
94	pcmpeqb	80(%rax), %xmm0
95	add	$64, %rax
96	pmovmskb %xmm0, %edx
97	test	%edx, %edx
98	jnz	L(exit16)
99
100	pcmpeqb	32(%rax), %xmm1
101	pmovmskb %xmm1, %edx
102	test	%edx, %edx
103	jnz	L(exit32)
104
105	pcmpeqb	48(%rax), %xmm2
106	pmovmskb %xmm2, %edx
107	test	%edx, %edx
108	jnz	L(exit48)
109
110	pcmpeqb	64(%rax), %xmm3
111	pmovmskb %xmm3, %edx
112	test	%edx, %edx
113	jnz	L(exit64)
114
115	pcmpeqb	80(%rax), %xmm0
116	add	$64, %rax
117	pmovmskb %xmm0, %edx
118	test	%edx, %edx
119	jnz	L(exit16)
120
121	pcmpeqb	32(%rax), %xmm1
122	pmovmskb %xmm1, %edx
123	test	%edx, %edx
124	jnz	L(exit32)
125
126	pcmpeqb	48(%rax), %xmm2
127	pmovmskb %xmm2, %edx
128	test	%edx, %edx
129	jnz	L(exit48)
130
131	pcmpeqb	64(%rax), %xmm3
132	pmovmskb %xmm3, %edx
133	test	%edx, %edx
134	jnz	L(exit64)
135
136	pcmpeqb	80(%rax), %xmm0
137	add	$64, %rax
138	pmovmskb %xmm0, %edx
139	test	%edx, %edx
140	jnz	L(exit16)
141
142	pcmpeqb	32(%rax), %xmm1
143	pmovmskb %xmm1, %edx
144	test	%edx, %edx
145	jnz	L(exit32)
146
147	pcmpeqb	48(%rax), %xmm2
148	pmovmskb %xmm2, %edx
149	test	%edx, %edx
150	jnz	L(exit48)
151
152	pcmpeqb	64(%rax), %xmm3
153	pmovmskb %xmm3, %edx
154	test	%edx, %edx
155	jnz	L(exit64)
156
	/* Sixteen blocks (256 bytes) checked without finding a NUL.
	   From here, step 16 bytes at a time until %rax is 64-byte
	   aligned, then switch to the 64-bytes-per-iteration loop.
	   Every jump into the loop happens right after a compare that
	   found nothing, so %xmm0 is all-zero there; the loop relies
	   on that.  The loop starts reading at (%rax), so blocks that
	   were already checked may be scanned once more.  */
157	test	$0x3f, %rax
158	jz	L(align64_loop)
159
	/* Unlike the groups above, %rax is moved onto the compared
	   block itself, because L(exit) adds no block offset.  */
160	pcmpeqb	80(%rax), %xmm0
161	add	$80, %rax
162	pmovmskb %xmm0, %edx
163	test	%edx, %edx
164	jnz	L(exit)
165
166	test	$0x3f, %rax
167	jz	L(align64_loop)
168
169	pcmpeqb	16(%rax), %xmm1
170	add	$16, %rax
171	pmovmskb %xmm1, %edx
172	test	%edx, %edx
173	jnz	L(exit)
174
175	test	$0x3f, %rax
176	jz	L(align64_loop)
177
178	pcmpeqb	16(%rax), %xmm2
179	add	$16, %rax
180	pmovmskb %xmm2, %edx
181	test	%edx, %edx
182	jnz	L(exit)
183
184	test	$0x3f, %rax
185	jz	L(align64_loop)
186
	/* NOTE(review): %rax is always a multiple of 16 and was not a
	   multiple of 64 at the first alignment test above.  After
	   adding 80, 16 and 16 it looks like it must be 64-byte
	   aligned by the test just above, which would make the rest
	   of this block unreachable -- confirm before relying on it.  */
187	pcmpeqb	16(%rax), %xmm3
188	add	$16, %rax
189	pmovmskb %xmm3, %edx
190	test	%edx, %edx
191	jnz	L(exit)
192
	/* Step past the block just checked so the loop starts on the
	   following one.  */
193	add	$16, %rax
/* Main loop: examines 64 bytes starting at (%rax) per iteration.
   The movaps loads and pminub memory operands require %rax to be
   16-byte aligned; the jumps into this loop are taken only when
   %rax & 63 == 0.
   pminub is an unsigned byte minimum, so after folding the four
   blocks %xmm5 has a zero byte exactly when one of the blocks has a
   zero byte in that lane.  %xmm0 must be all-zero on entry and is
   not written inside the loop.  */
194	.p2align 4
195	L(align64_loop):
196	movaps	(%rax),	%xmm4
197	pminub	16(%rax),	%xmm4
198	movaps	32(%rax),	%xmm5
199	pminub	48(%rax),	%xmm5
	/* %rax already points past the chunk when the result is tested.  */
200	add	$64,	%rax
201	pminub	%xmm4,	%xmm5
202	pcmpeqb	%xmm0,	%xmm5
203	pmovmskb %xmm5,	%edx
204	test	%edx,	%edx
205	jz	L(align64_loop)
206
	/* The chunk that ended at %rax holds a NUL; find which of its
	   four blocks.  The first block is at -64(%rax).  Subtracting
	   80 rebases %rax so the four blocks sit at 16, 32, 48 and
	   64(%rax), matching the L(exitN) labels.  %xmm1-%xmm3 are
	   still all-zero from the earlier compares that found nothing.  */
207	pcmpeqb	-64(%rax), %xmm0
208	sub	$80,	%rax
209	pmovmskb %xmm0, %edx
210	test	%edx, %edx
211	jnz	L(exit16)
212
213	pcmpeqb	32(%rax), %xmm1
214	pmovmskb %xmm1, %edx
215	test	%edx, %edx
216	jnz	L(exit32)
217
218	pcmpeqb	48(%rax), %xmm2
219	pmovmskb %xmm2, %edx
220	test	%edx, %edx
221	jnz	L(exit48)
222
	/* The first three blocks have no NUL, so it must be in the
	   last one; no test is needed.  Same arithmetic as L(exit64).  */
223	pcmpeqb	64(%rax), %xmm3
224	pmovmskb %xmm3, %edx
225	sub	%rdi, %rax
226	bsf	%rdx, %rdx
227	add	%rdx, %rax
228	add	$64, %rax
229	jmp	L(StartStrcpyPart)
230
/* Exit paths of the length scan.  Each is reached by a jnz, so %edx
   is non-zero and bsf yields the index of the first NUL inside the
   matching 16-byte block.  All of them leave in %rax
     (block base - dest) + index of NUL in block + block offset,
   i.e. the offset of dest's terminating NUL from dest.  */
231	.p2align 4
	/* L(exit): the matching block is at (%rax).  */
232L(exit):
233	sub	%rdi, %rax
	/* L(exit_less16): entered from the unaligned first check with
	   %rax == 0 and the block starting at dest itself.  */
234L(exit_less16):
235	bsf	%rdx, %rdx
236	add	%rdx, %rax
237	jmp	L(StartStrcpyPart)
238
239	.p2align 4
	/* L(exitN): the matching block is at N(%rax).  */
240L(exit16):
241	sub	%rdi, %rax
242	bsf	%rdx, %rdx
243	add	%rdx, %rax
244	add	$16, %rax
245	jmp	L(StartStrcpyPart)
246
247	.p2align 4
248L(exit32):
249	sub	%rdi, %rax
250	bsf	%rdx, %rdx
251	add	%rdx, %rax
252	add	$32, %rax
253	jmp	L(StartStrcpyPart)
254
255	.p2align 4
256L(exit48):
257	sub	%rdi, %rax
258	bsf	%rdx, %rdx
259	add	%rdx, %rax
260	add	$48, %rax
261	jmp	L(StartStrcpyPart)
262
263	.p2align 4
	/* No jmp at the end: this path falls through the alignment
	   padding into L(StartStrcpyPart).  */
264L(exit64):
265	sub	%rdi, %rax
266	bsf	%rdx, %rdx
267	add	%rdx, %rax
268	add	$64, %rax
269
/* Hand-off to the copy code.  On entry %rax is the offset of dest's
   terminating NUL and %r9 is the original dest pointer.  */
270	.p2align 4
271L(StartStrcpyPart):
	/* %rdi = address of dest's terminating NUL; %rcx = src.  */
272	lea	(%r9, %rax), %rdi
273	mov	%rsi, %rcx
274	mov	%r9, %rax      /* save result */
275
	/* With USE_AS_STRNCAT, %r8 holds the third argument; when it
	   is zero, branch to L(ExitZero).  That label is not defined
	   in this file (presumably it comes from the include below).
	   USE_AS_STRNCPY presumably selects the length-limited copy in
	   the included file -- confirm there.  */
276# ifdef USE_AS_STRNCAT
277	test	%r8, %r8
278	jz	L(ExitZero)
279#  define USE_AS_STRNCPY
280# endif
281
	/* The rest of the function comes from this include.
	   NOTE(review): presumably it expects %rdi = write position,
	   %rsi / %rcx = source, %rax = return value and, for strncat,
	   %r8 = count -- confirm against that file.  */
282# include "strcpy-sse2-unaligned.S"
283#endif
284