/* Optimized memchr with sse2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

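/* Byte offsets, relative to %esp on entry, of the incoming arguments:
   the buffer pointer, the byte to search for and (for memchr, but not
   rawmemchr) the maximum length.  */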
# define PARMS  4
# define STR1  PARMS
# define STR2  STR1+4

# ifndef USE_AS_RAWMEMCHR
#  define LEN   STR2+4
#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
# endif

# ifndef MEMCHR
#  define MEMCHR __memchr_sse2_bsf
# endif

	.text
ENTRY (MEMCHR)

	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

# ifndef USE_AS_RAWMEMCHR
	mov	LEN(%esp), %edx
	test	%edx, %edx
	jz	L(return_null_1)
# endif
	mov	%ecx, %eax

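/* Broadcast the search byte to all 16 bytes of xmm1.  */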
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

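/* If the first unaligned 16-byte load would cross a 64-byte cache line
   (and possibly a page), use the L(crosscache) path instead.  */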
	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%eax), %xmm0
	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	je	L(unaligned_no_match_1)
/* Check which byte is a match.  */
	bsf	%ecx, %ecx

# ifndef USE_AS_RAWMEMCHR
	sub	%ecx, %edx
	jbe	L(return_null_1)
# endif
	add	%ecx, %eax
	ret

	.p2align 4
L(unaligned_no_match_1):
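/* No match in the first 16 bytes: advance to the next 16-byte aligned
   address.  For memchr, also adjust the remaining length for the bytes
   that will be examined twice.  */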
# ifndef USE_AS_RAWMEMCHR
	sub	$16, %edx
	jbe	L(return_null_1)
	PUSH	(%edi)
	lea	16(%eax), %edi
	and	$15, %eax
	and	$-16, %edi
	add	%eax, %edx
# else
	lea	16(%eax), %edx
	and	$-16, %edx
# endif
	jmp	L(loop_prolog)

	.p2align 4
L(return_null_1):
	xor	%eax, %eax
	ret

# ifndef USE_AS_RAWMEMCHR
	CFI_POP	(%edi)
# endif

	.p2align 4
L(crosscache):
/* Handle unaligned string.  */

# ifndef USE_AS_RAWMEMCHR
	PUSH	(%edi)
	mov	%eax, %edi
	and	$15, %ecx
	and	$-16, %edi
	movdqa	(%edi), %xmm0
# else
	mov	%eax, %edx
	and	$15, %ecx
	and	$-16, %edx
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
/* Check which byte is a match.  */
	bsf	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	sub	%eax, %edx
	jbe	L(return_null)
	add	%edi, %eax
	add	%ecx, %eax
	RETURN
# else
	add	%edx, %eax
	add	%ecx, %eax
	ret
# endif

	.p2align 4
L(unaligned_no_match):
# ifndef USE_AS_RAWMEMCHR
        /* Calculate the last acceptable address and check for possible
           addition overflow by using saturated math:
           edx = ecx + edx
           edx |= -(edx < ecx)  */
	add	%ecx, %edx
	sbb	%eax, %eax
	or	%eax, %edx
	sub	$16, %edx
	jbe	L(return_null)
	add	$16, %edi
# else
	add	$16, %edx
# endif

	.p2align 4
/* Loop start on aligned string.  */
L(loop_prolog):
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
# else
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm4
# else
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm4

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

# ifndef USE_AS_RAWMEMCHR
	test	$0x3f, %edi
# else
	test	$0x3f, %edx
# endif
	jz	L(align64_loop)

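/* The pointer is not yet 64-byte aligned: check another 64 bytes, 16 at
   a time, before entering the aligned loop.  */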
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
# else
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm3
# else
	movdqa	48(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif
	test	%eax, %eax
	jnz	L(matches0)

# ifndef USE_AS_RAWMEMCHR
	mov	%edi, %ecx
	and	$-64, %edi
	and	$63, %ecx
	add	%ecx, %edx
# else
	and	$-64, %edx
# endif

	.p2align 4
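/* Main loop: process 64 bytes per iteration from a 64-byte aligned
   address.  pcmpeqb sets matching bytes to 0xff, and the pmaxub chain
   folds the four compare results into xmm4, so a single pmovmskb tells
   whether any of the 64 bytes matched.  */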
L(align64_loop):
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
	movdqa	16(%edi), %xmm2
	movdqa	32(%edi), %xmm3
	movdqa	48(%edi), %xmm4
# else
	movdqa	(%edx), %xmm0
	movdqa	16(%edx), %xmm2
	movdqa	32(%edx), %xmm3
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif

	test	%eax, %eax
	jz	L(align64_loop)

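/* A match was found somewhere in the previous 64 bytes: back up and
   locate the 16-byte block that contains it.  */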
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edi
# else
	sub	$64, %edx
# endif

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif

	pcmpeqb	%xmm1, %xmm3

# ifndef USE_AS_RAWMEMCHR
	pcmpeqb	48(%edi), %xmm1
# else
	pcmpeqb	48(%edx), %xmm1
# endif
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	lea	48(%edi, %eax), %eax
	RETURN
# else
	lea	48(%edx, %eax), %eax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(exit_loop):
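/* Fewer than 64 bytes of the buffer remain.  Restore the remaining
   length and check the tail 16 bytes at a time, making sure a match is
   only reported if it falls within the requested length.  */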
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%edi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%edi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	48(%edi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(exit_loop_32):
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	16(%edi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	RETURN
# endif
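/* Return the match position.  %edi (memchr) or %edx (rawmemchr) points
   at the start of the current 64-byte chunk, except for L(matches0),
   where it has already been advanced past the block containing the
   match.  */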
	.p2align 4
L(matches0):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	-16(%eax, %edi), %eax
	RETURN
# else
	lea	-16(%eax, %edx), %eax
	ret
# endif

	.p2align 4
L(matches):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	add	%edi, %eax
	RETURN
# else
	add	%edx, %eax
	ret
# endif

	.p2align 4
L(matches16):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	16(%eax, %edi), %eax
	RETURN
# else
	lea	16(%eax, %edx), %eax
	ret
# endif

	.p2align 4
L(matches32):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	32(%eax, %edi), %eax
	RETURN
# else
	lea	32(%eax, %edx), %eax
	ret
# endif

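/* The *_1 variants are reached from the tail code and must also verify
   that the match lies within the requested length.  */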
# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(matches_1):
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	add	%edi, %eax
	RETURN

	.p2align 4
L(matches16_1):
	sub	$16, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	16(%edi, %eax), %eax
	RETURN

	.p2align 4
L(matches32_1):
	sub	$32, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	32(%edi, %eax), %eax
	RETURN

	.p2align 4
L(matches48_1):
	sub	$48, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	48(%edi, %eax), %eax
	RETURN
# endif
	.p2align 4
L(return_null):
	xor	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	RETURN
# else
	ret
# endif

END (MEMCHR)
#endif