/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR	__strchr_avx2
# endif

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_REG	esi
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_REG	sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096

	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (STRCHR, 5)
	/* Broadcast CHAR to YMM0.	*/
	vmovd	%esi, %xmm0
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	VPBROADCAST	%xmm0, %ymm0
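	/* ymm1 is the all-zero vector used to detect the null
	   terminator; the VEX-encoded write to xmm1 also zeroes the
	   upper half of ymm1.  */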
	vpxor	%xmm1, %xmm1, %xmm1

	/* Check if we cross page boundary with one vector load.  */
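	/* eax holds the offset of rdi within its page; if it is above
	   PAGE_SIZE - VEC_SIZE, an unaligned VEC_SIZE load from rdi
	   would read past the end of the page, which may be
	   unmapped.  */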
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
	   null byte.  */
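	/* ymm3 gets (data == CHAR) and ymm2 gets (data == 0); after
	   the vpor and vpmovmskb, bit i of eax is set iff byte i is
	   CHAR or the null terminator.  */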
	vmovdqu	(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax), %CHAR_REG
	/* NB: Use a branch instead of cmovcc here. The expectation is
	   that with strchr the user will branch on the result being
	   NULL. Since this branch will be 100% predictive of the user
	   branch, a branch miss here should save what would otherwise
	   be a branch miss in the user code. Otherwise, using a branch
	   1) saves code size and 2) is faster in highly predictable
	   environments.  */
	jne	L(zero)
# endif
	addq	%rdi, %rax
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif


	.p2align 4
L(first_vec_x1):
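	/* rdi has its low bits set to VEC_SIZE - 1 (see
	   L(aligned_more)) and the matching vector was loaded from
	   1(%rdi), so the result is rdi + 1 plus the index of the
	   first set bit in eax.  */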
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	incq	%rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.	 */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x2):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.	 */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(first_vec_x3):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.	 */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x4):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.	 */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN



	.p2align 4
L(aligned_more):
	/* Align rdi to VEC_SIZE - 1. This takes the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check.  */
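	/* After the orq, rdi + 1 is VEC_SIZE-aligned, which is why the
	   loads below use displacements of 1, VEC_SIZE + 1, etc.  */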
	orq	$(VEC_SIZE - 1), %rdi
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	vmovdqa	1(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)
	/* Align data to VEC_SIZE * 4 - 1.  */
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
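	/* rdi + 1 is now (VEC_SIZE * 4)-aligned, so each iteration of
	   the loop below can use four aligned loads and advance rdi by
	   VEC_SIZE * 4.  */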
	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	1(%rdi), %ymm6
	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm7

	/* The XOR leaves only CHARs matching esi as 0.	 */
	vpxor	%ymm6, %ymm0, %ymm2
	vpxor	%ymm7, %ymm0, %ymm3

	VPMINU	%ymm2, %ymm6, %ymm2
	VPMINU	%ymm3, %ymm7, %ymm3
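	/* A CHAR of ymm2/ymm3 is now zero iff the corresponding input
	   CHAR is the search CHAR (the XOR made it zero) or the null
	   terminator (the input itself is zero).  */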

	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm6
	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm7

	vpxor	%ymm6, %ymm0, %ymm4
	vpxor	%ymm7, %ymm0, %ymm5

	VPMINU	%ymm4, %ymm6, %ymm4
	VPMINU	%ymm5, %ymm7, %ymm5

	VPMINU	%ymm2, %ymm3, %ymm6
	VPMINU	%ymm4, %ymm5, %ymm7

	VPMINU	%ymm6, %ymm7, %ymm7
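	/* Illustrative sketch of the trick above (not part of the
	   build): for each input vector v,
	       min (v ^ broadcast (CHAR), v)
	   is zero in a given CHAR position iff v holds CHAR or the
	   null terminator there, so ymm7, the minimum across all four
	   vectors, has a zero CHAR iff either occurs somewhere in this
	   4 * VEC_SIZE block.  */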

	VPCMPEQ	%ymm7, %ymm1, %ymm7
	vpmovmskb %ymm7, %ecx
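	/* Subtracting -(VEC_SIZE * 4) is equivalent to adding
	   VEC_SIZE * 4, but the negative immediate fits in a signed
	   byte, saving code size.  */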
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)
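	/* CHAR or the null terminator was found somewhere in the last
	   4 * VEC_SIZE block; test each vector's zero mask in order to
	   locate it.  ecx keeps the combined mask for the VEC 4
	   fall-through case.  */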

	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)


	VPCMPEQ	%ymm3, %ymm1, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	%ymm4, %ymm1, %ymm4
	vpmovmskb %ymm4, %eax
	/* rcx has the combined result from all 4 VEC. It is only used
	   if none of the first 3 VEC contained a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
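	/* Bits 0-31 of rax are VEC 3's zero mask and bits 32-63 are
	   the combined mask from the loop. If the first set bit is in
	   the high half, VEC 1-3 contain neither CHAR nor the null
	   terminator, so the bit must come from VEC 4.  */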
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.	 */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(last_vec_x0):
	/* Use bsf to save code size.  */
	bsfl	%eax, %eax
	addq	$-(VEC_SIZE * 4 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.	 */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.	 */
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdi, %rax
	VZEROUPPER_RETURN

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl	%eax, %eax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.	 */
	.p2align 4,, 8
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align rdi to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm3
	VPCMPEQ	%ymm2, %ymm1, %ymm2
	vpor	%ymm3, %ymm2, %ymm3
	vpmovmskb %ymm3, %eax
	/* Remove the leading bytes. sarxl only uses bits [4:0] of the
	   count for a 32-bit shift, so there is no need to manually mod
	   edx.  */
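	/* edx mod VEC_SIZE is the number of bytes by which the aligned
	   load starts before the string; shifting the mask right by
	   that amount discards matches that precede the string.  */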
	sarxl	%edx, %eax, %eax
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	xorl	%ecx, %ecx
	/* Found CHAR or the null byte.	 */
	cmp	(%rdx, %rax), %CHAR_REG
	jne	L(zero_end)
# endif
	addq	%rdx, %rax
	VZEROUPPER_RETURN

END (STRCHR)
#endif
