1/* strcat with AVX2
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <isa-level.h>
20
21#if ISA_SHOULD_BUILD (3)
22
23
24# include <sysdep.h>
25
/* Default symbol name for this implementation.  A definition of STRCAT
   made before this point takes precedence.  */
26# ifndef STRCAT
27#  define STRCAT  __strcat_avx2
28# endif
29
/* Defined ahead of the "strcpy-avx2.S" include at the end of this file.
   NOTE(review): presumably selects the strcat flavour of that shared
   copy code -- confirm against strcpy-avx2.S.  */
30# define USE_AS_STRCAT
31
32/* Number of bytes in a vector register */
33# define VEC_SIZE	32
34
/* SECTION(p) appends ".avx" to its argument, so SECTION(.text) names
   the section .text.avx unless SECTION was already defined.  */
35# ifndef SECTION
36#  define SECTION(p)	p##.avx
37# endif
38
	/* Allocatable ("a"), executable ("x") program-data section.  */
39	.section SECTION(.text),"ax",@progbits
/* Entry point: find the offset of the first null byte in the string
   that starts at rdi, then continue at L(StartStrcpyPart).

   Register use established by the code in this file:
     rdi   first argument; start of the string being scanned
     rsi   second argument; untouched until L(StartStrcpyPart)
     r9    copy of the original rdi
     r8    copy of rdx (only when USE_AS_STRNCAT is defined)
     rax   0 on entry, later the VEC_SIZE-aligned scan pointer, and
           finally the offset of the null byte relative to rdi
     ymm6  all-zero vector every loaded vector is compared against
     edx   vpmovmskb result: bit i is set when byte i of the compared
           vector is zero

   The first VEC_SIZE bytes are loaded straight from rdi unless rdi sits
   more than VEC_SIZE * 3 bytes into its (VEC_SIZE * 4)-aligned block,
   i.e. unless such a load would extend past the end of that block.  In
   that case an aligned load is used instead and the mask bits that
   belong to bytes before rdi are cleared.  */
40ENTRY (STRCAT)
41	mov	%rdi, %r9
42# ifdef USE_AS_STRNCAT
43	mov	%rdx, %r8
44# endif
45
	/* rax = 0: the base used by L(exit_null_on_first_vector).  */
46	xor	%eax, %eax
	/* ecx = offset of rdi inside its (VEC_SIZE * 4)-byte block.  */
47	mov	%edi, %ecx
48	and	$((VEC_SIZE * 4) - 1), %ecx
	/* VEX-encoded write to xmm6 also clears the upper half, so the
	   whole of ymm6 is zero.  */
49	vpxor	%xmm6, %xmm6, %xmm6
50	cmp	$(VEC_SIZE * 3), %ecx
51	ja	L(fourth_vector_boundary)
	/* Unaligned load of the first VEC_SIZE bytes starting at rdi.  */
52	vpcmpeqb (%rdi), %ymm6, %ymm0
53	vpmovmskb %ymm0, %edx
54	test	%edx, %edx
55	jnz	L(exit_null_on_first_vector)
	/* No null yet: rax = rdi rounded down to a multiple of VEC_SIZE.  */
56	mov	%rdi, %rax
57	and	$-VEC_SIZE, %rax
58	jmp	L(align_vec_size_start)
59L(fourth_vector_boundary):
	/* Aligned load of the vector that contains rdi.  */
60	mov	%rdi, %rax
61	and	$-VEC_SIZE, %rax
62	vpcmpeqb	(%rax), %ymm6, %ymm0
	/* r10d = -1 << (rdi mod VEC_SIZE).  A 32-bit shift uses only the
	   low five bits of cl, and because rax is a multiple of VEC_SIZE
	   those bits of (rcx - rax) equal rdi mod VEC_SIZE.  */
63	mov	$-1, %r10d
64	sub	%rax, %rcx
65	shl	%cl, %r10d
66	vpmovmskb %ymm0, %edx
	/* Discard matches in the bytes that precede rdi.  */
67	and	%r10d, %edx
68	jnz	L(exit)
69
/* Unrolled aligned scan.  On entry rax is rdi rounded down to a multiple
   of VEC_SIZE, and every byte from rdi up to (but not including)
   rax + VEC_SIZE has already been checked.  The exit labels are named
   for the vector position relative to rax: "second" is the vector at
   rax + VEC_SIZE, "third" at rax + VEC_SIZE * 2, and so on up to
   "fifth" at rax + VEC_SIZE * 4.  */
70L(align_vec_size_start):
	/* Round 1: vectors at rax + VEC_SIZE .. rax + VEC_SIZE * 4.  */
71	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
72	vpmovmskb %ymm0, %edx
73	test	%edx, %edx
74	jnz	L(exit_null_on_second_vector)
75
76	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
77	vpmovmskb %ymm1, %edx
78	test	%edx, %edx
79	jnz	L(exit_null_on_third_vector)
80
81	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
82	vpmovmskb %ymm2, %edx
83	test	%edx, %edx
84	jnz	L(exit_null_on_fourth_vector)
85
86	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
87	vpmovmskb %ymm3, %edx
88	test	%edx, %edx
89	jnz	L(exit_null_on_fifth_vector)
90
	/* Round 2.  The vector compared here sits at VEC_SIZE * 5 from the
	   old rax; after rax advances by VEC_SIZE * 4 it is the "second"
	   vector relative to the new rax.  */
91	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
92	add	$(VEC_SIZE * 4), %rax
93	vpmovmskb %ymm0, %edx
94	test	%edx, %edx
95	jnz	L(exit_null_on_second_vector)
96
97	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
98	vpmovmskb %ymm1, %edx
99	test	%edx, %edx
100	jnz	L(exit_null_on_third_vector)
101
102	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
103	vpmovmskb %ymm2, %edx
104	test	%edx, %edx
105	jnz	L(exit_null_on_fourth_vector)
106
107	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
108	vpmovmskb %ymm3, %edx
109	test	%edx, %edx
110	jnz	L(exit_null_on_fifth_vector)
111
	/* Round 3: same pattern as round 2.  */
112	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
113	add	$(VEC_SIZE * 4), %rax
114	vpmovmskb %ymm0, %edx
115	test	%edx, %edx
116	jnz	L(exit_null_on_second_vector)
117
118	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
119	vpmovmskb %ymm1, %edx
120	test	%edx, %edx
121	jnz	L(exit_null_on_third_vector)
122
123	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
124	vpmovmskb %ymm2, %edx
125	test	%edx, %edx
126	jnz	L(exit_null_on_fourth_vector)
127
128	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
129	vpmovmskb %ymm3, %edx
130	test	%edx, %edx
131	jnz	L(exit_null_on_fifth_vector)
132
	/* Round 4: same pattern as round 2.  */
133	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
134	add	$(VEC_SIZE * 4), %rax
135	vpmovmskb %ymm0, %edx
136	test	%edx, %edx
137	jnz	L(exit_null_on_second_vector)
138
139	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
140	vpmovmskb %ymm1, %edx
141	test	%edx, %edx
142	jnz	L(exit_null_on_third_vector)
143
144	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
145	vpmovmskb %ymm2, %edx
146	test	%edx, %edx
147	jnz	L(exit_null_on_fourth_vector)
148
149	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
150	vpmovmskb %ymm3, %edx
151	test	%edx, %edx
152	jnz	L(exit_null_on_fifth_vector)
153
	/* Sixteen aligned vectors were clean.  If rax is already a
	   multiple of VEC_SIZE * 4, enter the four-vector loop at rax.  */
154	test	$((VEC_SIZE * 4) - 1), %rax
155	jz	L(align_four_vec_loop)
156
	/* Otherwise step one vector at a time.  From here on rax points at
	   the vector that was just compared, so a hit goes to L(exit),
	   which adds no extra vector offset.  After every step the
	   alignment test is repeated.  */
157	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
158	add	$(VEC_SIZE * 5), %rax
159	vpmovmskb %ymm0, %edx
160	test	%edx, %edx
161	jnz	L(exit)
162
163	test	$((VEC_SIZE * 4) - 1), %rax
164	jz	L(align_four_vec_loop)
165
166	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
167	add	$VEC_SIZE, %rax
168	vpmovmskb %ymm1, %edx
169	test	%edx, %edx
170	jnz	L(exit)
171
172	test	$((VEC_SIZE * 4) - 1), %rax
173	jz	L(align_four_vec_loop)
174
175	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
176	add	$VEC_SIZE, %rax
177	vpmovmskb %ymm2, %edx
178	test	%edx, %edx
179	jnz	L(exit)
180
181	test	$((VEC_SIZE * 4) - 1), %rax
182	jz	L(align_four_vec_loop)
183
	/* NOTE(review): rax is always a multiple of VEC_SIZE here, so at
	   most three single steps are needed before one of the tests above
	   succeeds.  This fourth step and the add that follows it look
	   unreachable -- confirm before changing anything.  */
184	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
185	add	$VEC_SIZE, %rax
186	vpmovmskb %ymm3, %edx
187	test	%edx, %edx
188	jnz	L(exit)
189
190	add	$VEC_SIZE, %rax
191
192	.p2align 4
/* Main loop: examine VEC_SIZE * 4 bytes per iteration.  The four vectors
   are folded together with an unsigned byte-wise minimum; a byte of the
   result is zero exactly when at least one of the four vectors has a
   zero byte at that position, so a single compare and mask covers all
   of them.  vmovaps with a memory operand needs a VEC_SIZE-aligned
   address; rax is a multiple of VEC_SIZE on every path that reaches
   this label.  */
193L(align_four_vec_loop):
194	vmovaps	(%rax),	%ymm4
195	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
196	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
197	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
	/* Advance before testing; the code after the loop compensates.  */
198	add	$(VEC_SIZE * 4),	%rax
199	vpminub	%ymm4,	%ymm5, %ymm5
200	vpcmpeqb %ymm5,	%ymm6, %ymm5
201	vpmovmskb %ymm5,	%edx
202	test	%edx,	%edx
203	jz	L(align_four_vec_loop)
204
	/* A null lies somewhere in the block just examined, which starts at
	   rax - VEC_SIZE * 4.  Compare its first vector, then move rax back
	   by VEC_SIZE * 5 so that vector sits at rax + VEC_SIZE and the
	   "second" .. "fifth" exit labels apply unchanged.  */
205	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
206	sub	$(VEC_SIZE * 5),	%rax
207	vpmovmskb %ymm0, %edx
208	test	%edx, %edx
209	jnz	L(exit_null_on_second_vector)
210
211	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
212	vpmovmskb %ymm1, %edx
213	test	%edx, %edx
214	jnz	L(exit_null_on_third_vector)
215
216	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
217	vpmovmskb %ymm2, %edx
218	test	%edx, %edx
219	jnz	L(exit_null_on_fourth_vector)
220
	/* The first three vectors were clean, so the null must be in the
	   last one and no test is needed.  The arithmetic below is the same
	   as in L(exit_null_on_fifth_vector):
	   rax = (rax - rdi) + index of lowest set bit + VEC_SIZE * 4.  */
221	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
222	vpmovmskb %ymm3, %edx
223	sub	%rdi, %rax
224	bsf	%rdx, %rdx
225	add	%rdx, %rax
226	add	$(VEC_SIZE * 4), %rax
227	jmp	L(StartStrcpyPart)
228
229	.p2align 4
/* Exit handlers.  Each one turns the scan position into the offset of
   the null byte relative to rdi and leaves it in rax.  edx is non-zero
   on every path that arrives here, so bsf yields the index of the
   lowest set bit, i.e. the position of the first zero byte inside the
   vector that produced the mask.  */

/* rax points at the vector that produced the mask.  */
230L(exit):
231	sub	%rdi, %rax
/* Reached directly only from the unaligned first-vector check, where
   rax is still 0, so the bit index alone is the offset.  Also the
   fall-through tail of L(exit).  */
232L(exit_null_on_first_vector):
233	bsf	%rdx, %rdx
234	add	%rdx, %rax
235	jmp	L(StartStrcpyPart)
236
237	.p2align 4
/* Null found in the vector at rax + VEC_SIZE.  */
238L(exit_null_on_second_vector):
239	sub	%rdi, %rax
240	bsf	%rdx, %rdx
241	add	%rdx, %rax
242	add	$VEC_SIZE, %rax
243	jmp	L(StartStrcpyPart)
244
245	.p2align 4
/* Null found in the vector at rax + VEC_SIZE * 2.  */
246L(exit_null_on_third_vector):
247	sub	%rdi, %rax
248	bsf	%rdx, %rdx
249	add	%rdx, %rax
250	add	$(VEC_SIZE * 2), %rax
251	jmp	L(StartStrcpyPart)
252
253	.p2align 4
/* Null found in the vector at rax + VEC_SIZE * 3.  */
254L(exit_null_on_fourth_vector):
255	sub	%rdi, %rax
256	bsf	%rdx, %rdx
257	add	%rdx, %rax
258	add	$(VEC_SIZE * 3), %rax
259	jmp	L(StartStrcpyPart)
260
261	.p2align 4
/* Null found in the vector at rax + VEC_SIZE * 4.  There is no jump at
   the end: execution falls through to L(StartStrcpyPart).  */
262L(exit_null_on_fifth_vector):
263	sub	%rdi, %rax
264	bsf	%rdx, %rdx
265	add	%rdx, %rax
266	add	$(VEC_SIZE * 4), %rax
267
268	.p2align 4
/* Hand-off to the copy code.  On arrival rax holds the offset of the
   null byte relative to the original first argument, which was saved
   in r9 at entry.  */
269L(StartStrcpyPart):
	/* rdi = address of the null byte found by the scan above.  */
270	lea	(%r9, %rax), %rdi
	/* rcx = second argument.  NOTE(review): presumably the register in
	   which the included copy code expects the source pointer --
	   confirm against strcpy-avx2.S.  */
271	mov	%rsi, %rcx
272	mov	%r9, %rax      /* save result */
273
274# ifdef USE_AS_STRNCAT
	/* r8 holds the third argument saved at entry; branch away when it
	   is zero.  L(ExitZero) is not defined in this file and is expected
	   to come from the include below.  */
275	test	%r8, %r8
276	jz	L(ExitZero)
277#  define USE_AS_STRNCPY
278# endif
279
/* The remainder of the routine is supplied by the included file.
   NOTE(review): the END matching ENTRY (STRCAT) is not present in this
   file and is presumably provided there as well -- confirm.  */
280# include "strcpy-avx2.S"
281#endif
282