/* strrchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* ISA level >= 2 because there are no {wcs|str}rchr-sse4
   implementations.  */
#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR __strrchr_sse2
# endif

# ifdef USE_AS_WCSRCHR
#  define PCMPEQ	pcmpeqd
#  define CHAR_SIZE	4
#  define PMINU	pminud
# else
#  define PCMPEQ	pcmpeqb
#  define CHAR_SIZE	1
#  define PMINU	pminub
# endif

# define PAGE_SIZE	4096
# define VEC_SIZE	16
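
/* Illustrative scalar sketch of the idea, not part of the build.
   bytes_equal_mask () is a hypothetical helper standing in for
   PCMPEQ + pmovmskb; the point is the mask trick used throughout:
   `m ^ (m - 1)` keeps every bit at or below the lowest set bit.

       uint32_t zmask = bytes_equal_mask (vec, 0);     // null positions
       uint32_t cmask = bytes_equal_mask (vec, CHAR);  // match positions
       if (zmask != 0)
	 {
	   cmask &= zmask ^ (zmask - 1);  // drop matches past first null
	   return cmask ? base + 31 - __builtin_clz (cmask) : last;
	 }
       if (cmask != 0)
	 last = base + 31 - __builtin_clz (cmask);      // like bsrl

   The assembly below runs this two vectors at a time and only walks
   back through saved state once the null terminator is found.  */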

	.text
ENTRY(STRRCHR)
	movd	%esi, %xmm0
	movq	%rdi, %rax
	andl	$(PAGE_SIZE - 1), %eax
# ifndef USE_AS_WCSRCHR
	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
# endif
	pshufd	$0, %xmm0, %xmm0
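	/* xmm0 now holds the search CHAR broadcast to every lane: for
	   strrchr, punpcklbw/punpcklwd widen the byte to a dword
	   before pshufd $0 splats it; for wcsrchr the character is
	   already a dword, so pshufd alone suffices.  */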
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page)
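	/* %eax is the page offset of the string; if it is above
	   PAGE_SIZE - VEC_SIZE, the unaligned VEC_SIZE-byte load
	   below could touch the next (possibly unmapped) page, so
	   L(cross_page) handles that case with an aligned load.  */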

L(cross_page_continue):
	movups	(%rdi), %xmm1
	pxor	%xmm2, %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %ecx
	testl	%ecx, %ecx
	jz	L(aligned_more)

	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
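	/* %ecx has a bit set at each null byte.  (ecx - 1) ^ ecx
	   yields a mask of all bits at or below the first set bit,
	   i.e. positions at or before the null terminator; anding it
	   into the CHAR-match mask in %eax discards any match past
	   the end of the string.  */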
	leal	-1(%rcx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(ret0)
	bsrl	%eax, %eax
	addq	%rdi, %rax
	/* We are off by 3 for wcsrchr if the search CHAR is non-zero.
	   If the search CHAR is zero we are correct.  Either way
	   `andq $-CHAR_SIZE, %rax` gets the correct result.  */
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret0):
	ret

	/* The return paths for the first vec x1/x2 hard-code a
	   backward search for earlier matches.  */
	.p2align 4
L(first_vec_x0_test):
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	testl	%eax, %eax
	jz	L(ret0)
	bsrl	%eax, %eax
	addq	%r8, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1):
	PCMPEQ	%xmm0, %xmm2
	pmovmskb %xmm2, %eax
	leal	-1(%rcx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(first_vec_x0_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1_test):
	PCMPEQ	%xmm0, %xmm2
	pmovmskb %xmm2, %eax
	testl	%eax, %eax
	jz	L(first_vec_x0_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	PCMPEQ	%xmm0, %xmm3
	pmovmskb %xmm3, %eax
	leal	-1(%rcx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(first_vec_x1_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(aligned_more):
	/* Save original pointer if match was in VEC 0.  */
	movq	%rdi, %r8
	andq	$-VEC_SIZE, %rdi
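	/* %rdi is now aligned down to VEC_SIZE so the movaps loads
	   below are aligned and cannot cross a page; %r8 preserves
	   the original pointer so L(first_vec_x0_test) can compute a
	   return address within VEC 0.  */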

	movaps	VEC_SIZE(%rdi), %xmm2
	pxor	%xmm3, %xmm3
	PCMPEQ	%xmm2, %xmm3
	pmovmskb %xmm3, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x1)

	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
	pxor	%xmm4, %xmm4
	PCMPEQ	%xmm3, %xmm4
	pmovmskb %xmm4, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x2)

	addq	$VEC_SIZE, %rdi
	/* Save pointer again before realigning.  */
	movq	%rdi, %rsi
	andq	$-(VEC_SIZE * 2), %rdi
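	/* Align down to 2 * VEC_SIZE for the unrolled loop.  %rsi
	   keeps the pre-alignment pointer (the address xmm2 was
	   loaded from) so L(first_loop_old_match) can translate the
	   xmm2/xmm3 match masks back into an address.  */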
	.p2align 4
L(first_loop):
	/* Do 2x VEC at a time.  */
	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
	   detecting zero.  Note: if this is found to be a bottleneck,
	   it may be worth adding an SSE4.1 wcsrchr implementation.  */
# ifdef USE_AS_WCSRCHR
	movaps	%xmm5, %xmm6
	pxor	%xmm8, %xmm8

	PCMPEQ	%xmm8, %xmm5
	PCMPEQ	%xmm4, %xmm8
	por	%xmm5, %xmm8
# else
	movaps	%xmm5, %xmm6
	PMINU	%xmm4, %xmm5
# endif

	movaps	%xmm4, %xmm9
	PCMPEQ	%xmm0, %xmm4
	PCMPEQ	%xmm0, %xmm6
	movaps	%xmm6, %xmm7
	por	%xmm4, %xmm6
# ifndef USE_AS_WCSRCHR
	pxor	%xmm8, %xmm8
	PCMPEQ	%xmm5, %xmm8
# endif
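	/* At this point %xmm8 has 0xff in every byte lane where
	   either xmm4 or xmm5 held a null (via PMINU for strrchr, or
	   the two PCMPEQ + por above for wcsrchr), so one pmovmskb
	   yields a combined null mask for both vectors.  */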
	pmovmskb %xmm8, %ecx
	pmovmskb %xmm6, %eax

	addq	$(VEC_SIZE * 2), %rdi
	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
	   macro-fuse with `jz`.  */
	addl	%ecx, %eax
	jz	L(first_loop)

	/* Check if there is a zero match.  */
	testl	%ecx, %ecx
	jz	L(second_loop_match)

	/* Check if there was a match in the last iteration.  */
	subl	%ecx, %eax
	jnz	L(new_match)

L(first_loop_old_match):
	PCMPEQ	%xmm0, %xmm2
	PCMPEQ	%xmm0, %xmm3
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	addl	%eax, %ecx
	jz	L(first_vec_x0_test)
	/* NB: We could move this shift to before the branch and save
	   a bit of code size / performance on the fall-through.  But
	   the branch leads to the null case, which generally seems
	   hotter than a char match in the first 3x VEC.  */
	sall	$16, %eax
	orl	%ecx, %eax

	bsrl	%eax, %eax
	addq	%rsi, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(new_match):
	pxor	%xmm6, %xmm6
	PCMPEQ	%xmm9, %xmm6
	pmovmskb %xmm6, %eax
	sall	$16, %ecx
	orl	%eax, %ecx
	/* We can't reuse either of the old comparisons: since we mask
	   off the zeros after the first zero (instead of using the
	   full comparison), we can't guarantee there is no
	   interference between a match after the end of the string
	   and a valid match.  */
	pmovmskb %xmm4, %eax
	pmovmskb %xmm7, %edx
	sall	$16, %edx
	orl	%edx, %eax
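	/* %ecx now holds nulls from the first vector in its low 16
	   bits and the combined null mask in its high 16; %eax holds
	   CHAR matches (first vector low, second vector high).  The
	   (ecx - 1) ^ ecx trick below again keeps only matches at or
	   before the first null.  */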

	leal	-1(%ecx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(first_loop_old_match)
	bsrl	%eax, %eax
	addq	%rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	ret

	/* Save the minimum state needed to find the most recent
	   match; we can throw out all previous work.  */
	.p2align 4
L(second_loop_match):
	movq	%rdi, %rsi
	movaps	%xmm4, %xmm2
	movaps	%xmm7, %xmm3

	.p2align 4
L(second_loop):
	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
	   detecting zero.  Note: if this is found to be a bottleneck,
	   it may be worth adding an SSE4.1 wcsrchr implementation.  */
# ifdef USE_AS_WCSRCHR
	movaps	%xmm5, %xmm6
	pxor	%xmm8, %xmm8

	PCMPEQ	%xmm8, %xmm5
	PCMPEQ	%xmm4, %xmm8
	por	%xmm5, %xmm8
# else
	movaps	%xmm5, %xmm6
	PMINU	%xmm4, %xmm5
# endif

	movaps	%xmm4, %xmm9
	PCMPEQ	%xmm0, %xmm4
	PCMPEQ	%xmm0, %xmm6
	movaps	%xmm6, %xmm7
	por	%xmm4, %xmm6
# ifndef USE_AS_WCSRCHR
	pxor	%xmm8, %xmm8
	PCMPEQ	%xmm5, %xmm8
# endif
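	/* As in L(first_loop), %xmm8 now flags the byte lanes where
	   either vector held a null.  */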

	pmovmskb %xmm8, %ecx
	pmovmskb %xmm6, %eax

	addq	$(VEC_SIZE * 2), %rdi
	/* Either null term or new occurrence of CHAR.  */
	addl	%ecx, %eax
	jz	L(second_loop)

	/* No null term, so it must be a new occurrence of CHAR.  */
	testl	%ecx, %ecx
	jz	L(second_loop_match)


	subl	%ecx, %eax
	jnz	L(second_loop_new_match)

L(second_loop_old_match):
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm3, %eax
	sall	$16, %eax
	orl	%ecx, %eax
	bsrl	%eax, %eax
	addq	%rsi, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4
L(second_loop_new_match):
	pxor	%xmm6, %xmm6
	PCMPEQ	%xmm9, %xmm6
	pmovmskb %xmm6, %eax
	sall	$16, %ecx
	orl	%eax, %ecx
	/* We can't reuse either of the old comparisons: since we mask
	   off the zeros after the first zero (instead of using the
	   full comparison), we can't guarantee there is no
	   interference between a match after the end of the string
	   and a valid match.  */
	pmovmskb %xmm4, %eax
	pmovmskb %xmm7, %edx
	sall	$16, %edx
	orl	%edx, %eax

	leal	-1(%ecx), %edx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(second_loop_old_match)
	bsrl	%eax, %eax
	addq	%rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
	ret

	.p2align 4,, 4
L(cross_page):
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rsi
	movaps	(%rsi), %xmm1
	pxor	%xmm2, %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %edx
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	sarl	%cl, %edx
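	/* The aligned load may have picked up bytes that precede the
	   start of the string; shifting the null mask (and the match
	   mask below) right by the misalignment in %cl discards
	   them.  */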
	jz	L(cross_page_continue)
	PCMPEQ	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	sarl	%cl, %eax
	leal	-1(%rdx), %ecx
	xorl	%edx, %ecx
	andl	%ecx, %eax
	jz	L(ret1)
	bsrl	%eax, %eax
	addq	%rdi, %rax
# ifdef USE_AS_WCSRCHR
	andq	$-CHAR_SIZE, %rax
# endif
L(ret1):
	ret
END(STRRCHR)
#endif