/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_avx2
# endif

# ifdef USE_AS_WCSRCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE	32
# define PAGE_SIZE	4096

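/* Illustrative scalar sketch of the strategy used below (an editor's
   simplification, not the exact implementation; strrchr_sketch is not
   a real glibc function): remember the furthest CHAR match seen so
   far and resolve the final answer only once the null terminator is
   found.

       char *
       strrchr_sketch (const char *s, int c_in)
       {
         char c = (char) c_in;
         const char *last = 0;
         for (;;)
           {
             if (*s == c)
               last = s;          // remember furthest match so far
             if (*s == '\0')      // terminator ends the search
               return (char *) (c == '\0' ? s : last);
             ++s;
           }
       }

   The code below applies the same idea VEC_SIZE * 2 bytes at a time:
   it saves the CHAR-match masks of the most recent block that
   contained a match and only computes the exact position once a zero
   CHAR is found.  */
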
	.section SECTION(.text), "ax", @progbits
ENTRY(STRRCHR)
	vmovd	%esi, %xmm7
	movl	%edi, %eax
	/* Broadcast CHAR to YMM7.  */
	VPBROADCAST %xmm7, %ymm7
	vpxor	%xmm0, %xmm0, %xmm0

	/* Shift here instead of `andl` to save code size (saves a fetch
	   block).  */
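	/* The comparison below is equivalent to checking `(%edi &
	   (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE)`, i.e. whether the
	   first VEC_SIZE-byte load would cross a page boundary.  */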
	sall	$20, %eax
	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
	ja	L(cross_page)

L(page_cross_continue):
	vmovdqu	(%rdi), %ymm1
	/* Check end of string match.  */
	VPCMPEQ	%ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl	%ecx, %ecx
	jz	L(aligned_more)

	/* Only check match with search CHAR if needed.  */
	VPCMPEQ	%ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax
	/* Check if match before first zero.  */
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jz	L(ret0)
	bsrl	%eax, %eax
	addq	%rdi, %rax
	/* We are off by 3 for wcsrchr if search CHAR is non-zero
	   (`bsrl` reports the last byte of the 4-byte match). If search
	   CHAR is zero we are correct. Either way `andq -CHAR_SIZE,
	   %rax` gets the correct result.  */
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret0):
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	/* Returns for first VEC x1/x2 have a hard-coded backward search
	   path for earlier matches.  */
	.p2align 4,, 10
L(first_vec_x1):
	VPCMPEQ	%ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jnz	L(first_vec_x1_return)

	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMPEQ	%ymm1, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	testl	%eax, %eax
	jz	L(ret1)
	bsrl	%eax, %eax
	addq	%r8, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret1):
	VZEROUPPER_RETURN

	.p2align 4,, 10
L(first_vec_x0_x1_test):
	VPCMPEQ	%ymm2, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	/* Check ymm2 for search CHAR match. If no match then check ymm1
	   before returning.  */
	testl	%eax, %eax
	jz	L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl	%eax, %eax
	leaq	1(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4,, 10
L(first_vec_x2):
	VPCMPEQ	%ymm3, %ymm7, %ymm6
	vpmovmskb %ymm6, %eax
	blsmskl	%ecx, %ecx
	/* If no in-range search CHAR match in ymm3 then need to check
	   ymm1/ymm2 for an earlier match (we delay checking search
	   CHAR matches until needed).  */
	andl	%ecx, %eax
	jz	L(first_vec_x0_x1_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4
L(aligned_more):
	/* Save original pointer if match was in VEC 0.  */
	movq	%rdi, %r8

	/* Align src: after the `orq`, %rdi + 1 is the next
	   VEC_SIZE-aligned address after the original pointer, and the
	   bytes skipped were already covered by the first unaligned
	   VEC.  */
	orq	$(VEC_SIZE - 1), %rdi
	vmovdqu	1(%rdi), %ymm2
	VPCMPEQ	%ymm2, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x1)

	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
	VPCMPEQ	%ymm3, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x2)

	/* Save pointer again before realigning.  */
	movq	%rdi, %rsi
	addq	$(VEC_SIZE + 1), %rdi
	andq	$-(VEC_SIZE * 2), %rdi
	.p2align 4
L(first_aligned_loop):
	/* Do 2x VEC at a time. Any more and the cost of finding the
	   match outweighs the loop benefit.  */
	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5

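	/* %ymm6/%ymm10 hold the CHAR matches for each VEC. VPMIN of the
	   two VECs is zero exactly where either VEC has a zero CHAR, so
	   a single compare against %ymm0 detects end of string in both.
	   %ymm9 ORs everything together so one test decides whether to
	   stay in the loop.  */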
	VPCMPEQ	%ymm4, %ymm7, %ymm6
	VPMIN	%ymm4, %ymm5, %ymm8
	VPCMPEQ	%ymm5, %ymm7, %ymm10
	vpor	%ymm6, %ymm10, %ymm5
	VPCMPEQ	%ymm8, %ymm0, %ymm8
	vpor	%ymm5, %ymm8, %ymm9

	vpmovmskb %ymm9, %eax
	addq	$(VEC_SIZE * 2), %rdi
	/* No zero or search CHAR.  */
	testl	%eax, %eax
	jz	L(first_aligned_loop)

	/* If no zero CHAR then go to second loop (this allows us to
	   throw away all prior work).  */
	vpmovmskb %ymm8, %ecx
	testl	%ecx, %ecx
	jz	L(second_aligned_loop_prep)

	/* Search char could be zero so we need to get the true match.
	 */
	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(first_aligned_loop_return)

	.p2align 4,, 4
L(first_vec_x1_or_x2):
	VPCMPEQ	%ymm3, %ymm7, %ymm3
	VPCMPEQ	%ymm2, %ymm7, %ymm2
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	/* Use add for macro-fusion.  */
	addq	%rax, %rdx
	jz	L(first_vec_x0_test)
	/* NB: We could move this shift to before the branch and save a
	   bit of code size / performance on the fall through. The
	   branch leads to the null case which generally seems hotter
	   than char in first 3x VEC.  */
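	/* Merge the two masks so %ymm3's (the later VEC) occupies bits
	   32..63 and %ymm2's bits 0..31; a single `bsrq` then finds the
	   last CHAR match across both VECs relative to %rsi + 1.  */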
	salq	$32, %rax
	addq	%rdx, %rax
	bsrq	%rax, %rax
	leaq	1(%rsi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(first_aligned_loop_return):
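	/* Build 64-bit masks covering the two VECs just scanned:
	   zero-CHAR positions in %rcx and search-CHAR matches in %rax,
	   with the second VEC in the upper 32 bits. `blsmskq` then
	   limits the CHAR matches to those at or before the first
	   zero.  */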
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq	$32, %rcx
	orq	%rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	blsmskq	%rcx, %rcx
	andq	%rcx, %rax
	jz	L(first_vec_x1_or_x2)

	bsrq	%rax, %rax
	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	VZEROUPPER_RETURN

	/* Search char cannot be zero.  */
	.p2align 4
L(second_aligned_loop_set_furthest_match):
	/* Save VECs and pointer from the most recent match.  */
L(second_aligned_loop_prep):
	movq	%rdi, %rsi
	vmovdqu	%ymm6, %ymm2
	vmovdqu	%ymm10, %ymm3

	.p2align 4
L(second_aligned_loop):
	/* Search 2x VEC at a time.  */
	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5

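	/* Same combined zero/CHAR detection as in
	   L(first_aligned_loop), with %ymm1 holding the zero-CHAR
	   compare of the VPMIN result.  */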
	VPCMPEQ	%ymm4, %ymm7, %ymm6
	VPMIN	%ymm4, %ymm5, %ymm1
	VPCMPEQ	%ymm5, %ymm7, %ymm10
	vpor	%ymm6, %ymm10, %ymm5
	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpor	%ymm5, %ymm1, %ymm9

	vpmovmskb %ymm9, %eax
	addq	$(VEC_SIZE * 2), %rdi
	testl	%eax, %eax
	jz	L(second_aligned_loop)
	vpmovmskb %ymm1, %ecx
	testl	%ecx, %ecx
	jz	L(second_aligned_loop_set_furthest_match)
	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(return_new_match)

	/* This is the hot path. We know CHAR is in bounds and that
	   ymm3/ymm2 hold the latest match.  */
	.p2align 4,, 4
L(return_old_match):
	vpmovmskb %ymm3, %eax
	vpmovmskb %ymm2, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	bsrq	%rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
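	/* %rsi points just past the two saved VECs, so VEC_SIZE * -2
	   rebases to their start; `bsrq` reported the last matching
	   byte, so subtracting CHAR_SIZE - 1 lands on the first byte of
	   the character.  */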
	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
	VZEROUPPER_RETURN

	/* Last iteration also potentially has a match.  */
	.p2align 4,, 8
L(return_new_match):
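	/* Same mask construction as in L(first_aligned_loop_return):
	   zero-CHAR positions in %rcx, search-CHAR matches in %rax,
	   with the second VEC in the upper 32 bits.  */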
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %edx
	salq	$32, %rcx
	orq	%rdx, %rcx

	vpmovmskb %ymm10, %eax
	vpmovmskb %ymm6, %edx
	salq	$32, %rax
	orq	%rdx, %rax
	blsmskq	%rcx, %rcx
	andq	%rcx, %rax
	jz	L(return_old_match)
	bsrq	%rax, %rax
	/* Search char cannot be zero so safe to just use lea for
	   wcsrchr.  */
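	/* As above: %rdi was advanced past the two VECs just scanned,
	   so rebase by VEC_SIZE * -2 and adjust to the first byte of
	   the matching character.  */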
	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(cross_page):
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rsi
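	/* Load from the VEC_SIZE-aligned address containing %rdi so the
	   load itself cannot cross the page boundary; matches before
	   %rdi are shifted out below.  */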
	vmovdqu	(%rsi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm6
	vpmovmskb %ymm6, %ecx
	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
	shrxl	%edi, %ecx, %ecx
	testl	%ecx, %ecx
	jz	L(page_cross_continue)
	VPCMPEQ	%ymm1, %ymm7, %ymm1
	vpmovmskb %ymm1, %eax

	/* Shift out search CHAR matches that are before the beginning
	   of src (rdi).  */
	shrxl	%edi, %eax, %eax
	blsmskl	%ecx, %ecx
	/* Check if any search CHAR match in range.  */
	andl	%ecx, %eax
	jz	L(ret2)
	bsrl	%eax, %eax
	addq	%rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret2):
	VZEROUPPER_RETURN
END(STRRCHR)
#endif