/* Optimized memrchr with sse2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

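/* memrchr (S, C, N) returns a pointer to the last occurrence of the
   byte C in the first N bytes of S, or NULL if C does not occur.
   A plain C sketch of the semantics implemented below:

	void *
	memrchr (const void *s, int c, size_t n)
	{
	  const unsigned char *p = s;
	  while (n--)
	    if (p[n] == (unsigned char) c)
	      return (void *) (p + n);
	  return NULL;
	}

   This SSE2 version scans backwards 16 bytes at a time with pcmpeqb
   and uses bsr on the resulting byte mask to locate the last match
   within a chunk.  */
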
#if IS_IN (libc)

# include <sysdep.h>

# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

# define PARMS  4
# define STR1  PARMS
# define STR2  STR1+4
# define LEN   STR2+4
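/* In the argument names above, STR1 is the buffer S, STR2 holds the
   byte C, and LEN is the count N.  */
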
# define MEMCHR __memrchr_sse2_bsf

	.text
ENTRY (MEMCHR)
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1
	mov	LEN(%esp), %edx

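/* Subtract the chunk size from N; if N <= 16, take the
   short-length path.  */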
	sub	$16, %edx
	jbe	L(length_less16)

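/* Broadcast the byte C to all 16 lanes of %xmm1: two punpcklbw steps
   replicate it through the low dword and pshufd copies that dword
   across the register.  Meanwhile point %ecx at the last 16 bytes of
   the buffer and check them first.  */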
	punpcklbw %xmm1, %xmm1
	add	%edx, %ecx
	punpcklbw %xmm1, %xmm1

	movdqu	(%ecx), %xmm0
	pshufd	$0, %xmm1, %xmm1
	pcmpeqb	%xmm1, %xmm0

/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches0)

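/* Step %ecx back 64 bytes, then round it up to a 16-byte boundary
   (the last 16 bytes were already checked above); %edx is adjusted
   by the same amount so the remaining-byte count stays consistent.  */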
	sub	$64, %ecx
	mov	%ecx, %eax
	and	$15, %eax
	jz	L(loop_prolog)

	add	$16, %ecx
	add	$16, %edx
	sub	%eax, %ecx
	sub	%eax, %edx

	.p2align 4
/* Loop prologue, starting on an aligned position: check up to 64
   bytes per pass, four 16-byte chunks from high to low addresses.  */
L(loop_prolog):
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches0)

	mov	%ecx, %eax
	and	$63, %eax
	test	%eax, %eax
	jz	L(align64_loop)

	add	$64, %ecx
	add	$64, %edx
	sub	%eax, %ecx
	sub	%eax, %edx

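/* Main loop: %ecx is now 64-byte aligned.  Compare 64 bytes per
   iteration and merge the four compare results with pmaxub, so a
   single pmovmskb tests the whole block for a match.  */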
	.p2align 4
L(align64_loop):
	sub	$64, %ecx
	sub	$64, %edx
	jbe	L(exit_loop)

	movdqa	(%ecx), %xmm0
	movdqa	16(%ecx), %xmm2
	movdqa	32(%ecx), %xmm3
	movdqa	48(%ecx), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb %xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

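/* A match lies somewhere in these 64 bytes; find the highest 16-byte
   chunk containing one and report the last match within it.  */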
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm2

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%ecx), %xmm1

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	pmovmskb %xmm1, %eax
	bsr	%eax, %eax

	add	%ecx, %eax
	ret

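/* Fewer than 64 valid bytes remain in the current 64-byte window
   (%edx went non-positive on the last step); restore the biased
   count and check only the chunks that are still in range.  */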
	.p2align 4
L(exit_loop):
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%ecx), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%ecx), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

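/* At most 32 bytes remain, so only the top two 16-byte chunks can
   contain bytes that are in range.  */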
	.p2align 4
L(exit_loop_32):
	movdqa	48(%ecx), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	32(%ecx), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

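/* Match handlers: bsr picks the highest set bit of the byte mask,
   i.e. the last matching byte within the 16-byte chunk.  */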
	.p2align 4
L(matches0):
	bsr	%eax, %eax
	add	%ecx, %eax
	ret

	.p2align 4
L(matches16):
	bsr	%eax, %eax
	lea	16(%eax, %ecx), %eax
	ret

	.p2align 4
L(matches32):
	bsr	%eax, %eax
	lea	32(%eax, %ecx), %eax
	ret

	.p2align 4
L(matches48):
	bsr	%eax, %eax
	lea	48(%eax, %ecx), %eax
	ret

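/* The _1 variants are reached when the chunk may extend below the
   start of the buffer; matches that fall before the first byte in
   range are rejected.  */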
	.p2align 4
L(matches0_1):
	bsr	%eax, %eax
	sub	$64, %edx
	add	%eax, %edx
	jl	L(return_null)
	add	%ecx, %eax
	ret

	.p2align 4
L(matches16_1):
	bsr	%eax, %eax
	sub	$48, %edx
	add	%eax, %edx
	jl	L(return_null)
	lea	16(%ecx, %eax), %eax
	ret

	.p2align 4
L(matches32_1):
	bsr	%eax, %eax
	sub	$32, %edx
	add	%eax, %edx
	jl	L(return_null)
	lea	32(%ecx, %eax), %eax
	ret

	.p2align 4
L(matches48_1):
	bsr	%eax, %eax
	sub	$16, %edx
	add	%eax, %edx
	jl	L(return_null)
	lea	48(%ecx, %eax), %eax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

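/* N <= 16 and the buffer is 16-byte aligned: one compare, masked
   with (1 << N) - 1 to ignore bytes past the end of the buffer.  */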
	.p2align 4
L(length_less16_offset0):
	mov	%dl, %cl
	pcmpeqb	(%eax), %xmm1

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx
	mov	%edx, %ecx

	pmovmskb %xmm1, %edx

	and	%ecx, %edx
	test	%edx, %edx
	jz	L(return_null)

	bsr	%edx, %ecx
	add	%ecx, %eax
	ret

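/* N <= 16, general case: broadcast C, return NULL for N == 0, and
   dispatch on the buffer's offset within its 16-byte chunk.  */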
	.p2align 4
L(length_less16):
	punpcklbw %xmm1, %xmm1
	mov	%ecx, %eax
	punpcklbw %xmm1, %xmm1
	add	$16, %edx
	jz	L(return_null)

	pshufd	$0, %xmm1, %xmm1
	and	$15, %ecx
	jz	L(length_less16_offset0)

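/* Unaligned case: %cl is the offset within the 16-byte chunk and
   %dl is N.  If offset + N > 16, the region spills into the next
   chunk and both chunks must be checked.  */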
	PUSH	(%edi)
	mov	%cl, %dh
	add	%dl, %dh
	and	$-16, %eax

	sub	$16, %dh
	ja	L(length_less16_part2)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	sar	%cl, %edi
	add	%ecx, %eax
	mov	%dl, %cl

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %edi
	test	%edi, %edi
	jz	L(ret_null)

	bsr	%edi, %edi
	add	%edi, %eax
	POP	(%edi)
	ret

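/* %edi is still on the stack on the paths that reach the code below;
   re-establish the CFI annotations accordingly.  */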
	CFI_PUSH     (%edi)

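/* The region straddles a 16-byte boundary.  Check the high chunk
   first, masking the compare down to the %dh bytes that belong to
   the region; otherwise fall back to the low chunk, shifting out
   the bytes below the starting offset.  */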
	.p2align 4
L(length_less16_part2):
	movdqa	16(%eax), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %edi

	mov	%cl, %ch

	mov	%dh, %cl
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %edi

	test	%edi, %edi
	jnz	L(length_less16_part2_return)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edi

	mov	%ch, %cl
	sar	%cl, %edi
	test	%edi, %edi
	jz	L(ret_null)

	bsr	%edi, %edi
	add	%edi, %eax
	xor	%ch, %ch
	add	%ecx, %eax
	POP	(%edi)
	ret

	CFI_PUSH     (%edi)

	.p2align 4
L(length_less16_part2_return):
	bsr	%edi, %edi
	lea	16(%eax, %edi), %eax
	POP	(%edi)
	ret

	CFI_PUSH     (%edi)

	.p2align 4
L(ret_null):
	xor	%eax, %eax
	POP	(%edi)
	ret

END (MEMCHR)
#endif