1/* Optimized memchr with sse2 without bsf
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
/* Unwind-information bookkeeping that accompanies a 4-byte push of REG:
   the CFA offset grows by 4 and REG is recorded as saved at relative
   offset 0.  The cfi_* names are not defined in this file; presumably
   they are wrappers from <sysdep.h> around the assembler's CFI
   directives -- confirm there.  */
23# define CFI_PUSH(REG)	\
24	cfi_adjust_cfa_offset (4);	\
25	cfi_rel_offset (REG, 0)
26
/* Inverse of CFI_PUSH: the CFA offset shrinks by 4 and REG is marked as
   restored.  */
27# define CFI_POP(REG)	\
28	cfi_adjust_cfa_offset (-4);	\
29	cfi_restore (REG)
30
/* A real pushl/popl paired with the matching unwind bookkeeping.  */
31# define PUSH(REG) pushl REG; CFI_PUSH (REG)
32# define POP(REG) popl REG; CFI_POP (REG)
33
/* Prologue/epilogue and stack-argument offsets.
   Without USE_AS_RAWMEMCHR the routine saves %edi on entry, so the
   first argument sits 8 bytes above %esp (return address + saved %edi).
   RETURN pops %edi and returns; the CFI_PUSH placed after the ret emits
   no instruction, it only re-applies the "%edi is pushed" unwind state
   for whatever code follows the ret in the file.
   With USE_AS_RAWMEMCHR nothing is saved, so the first argument is 4
   bytes above %esp and no RETURN macro is defined (that build uses a
   plain ret).  */
34# ifndef USE_AS_RAWMEMCHR
35#  define ENTRANCE PUSH(%edi);
36#  define PARMS  8
37#  define RETURN  POP(%edi); ret; CFI_PUSH(%edi);
38# else
39#  define ENTRANCE
40#  define PARMS  4
41# endif
42
/* STR1 is the offset of the first stack argument (loaded as the buffer
   pointer) and STR2 the offset of the second (loaded as the byte to
   search for).  */
43# define STR1  PARMS
44# define STR2  STR1+4
45
/* LEN is the offset of the third stack argument (the byte count); it
   exists only in the length-bounded build.  */
46# ifndef USE_AS_RAWMEMCHR
47#  define LEN   STR2+4
48# endif
49
/* Symbol name of the routine.  A file that includes this one may define
   MEMCHR beforehand to emit the same code under another name; otherwise
   it is __memchr_sse2.  */
50# ifndef MEMCHR
51#  define MEMCHR __memchr_sse2
52# endif
53
/* NOTE(review): atom_text_section is not defined in this file; it
   presumably selects the text section this routine is placed in --
   confirm in <sysdep.h>.  */
54	atom_text_section
/* MEMCHR entry: search the buffer given by the first stack argument for
   the low byte of the second stack argument.
   Built without USE_AS_RAWMEMCHR the search is bounded by the third
   argument and %eax is zero when nothing is found; built with it there
   is no length and the scan continues until a match is found.
   Register roles, bounded build:   %edi = current pointer,
   %edx = bytes remaining, %ecx = scratch offset, %eax = match mask.
   Register roles, unbounded build: %edx = current pointer,
   %ecx = scratch offset, %eax = match mask.
   %xmm1 holds the search byte replicated into all 16 byte lanes.  */
55ENTRY (MEMCHR)
56	ENTRANCE
57	mov	STR1(%esp), %ecx
58	movd	STR2(%esp), %xmm1
59# ifndef USE_AS_RAWMEMCHR
60	mov	LEN(%esp), %edx
	/* A zero length can never contain a match.  */
61	test	%edx, %edx
62	jz	L(return_null)
63# endif
64
	/* The two punpcklbw steps copy the low byte of %xmm1 into its low
	   four bytes; the pshufd below then replicates that dword into all
	   four dwords, giving 16 copies of the search byte.  */
65	punpcklbw %xmm1, %xmm1
66# ifndef USE_AS_RAWMEMCHR
67	mov	%ecx, %edi
68# else
69	mov	%ecx, %edx
70# endif
71	punpcklbw %xmm1, %xmm1
72
	/* %ecx = pointer mod 64.  The unaligned 16-byte probe below is used
	   only when that offset is at most 48, i.e. when the load stays
	   inside one 64-byte block; otherwise branch to the aligned probe.  */
73	and	$63, %ecx
74	pshufd	$0, %xmm1, %xmm1
75	cmp	$48, %ecx
76	ja	L(crosscache)
77
78# ifndef USE_AS_RAWMEMCHR
79	movdqu	(%edi), %xmm0
80# else
81	movdqu	(%edx), %xmm0
82# endif
	/* %eax gets one bit per byte of the first 16 that equals the search
	   byte; the flags set by this test are consumed by the jnz inside
	   either preprocessor branch below.  */
83	pcmpeqb	%xmm1, %xmm0
84	pmovmskb %xmm0, %eax
85	test	%eax, %eax
86# ifndef USE_AS_RAWMEMCHR
87	jnz	L(match_case2_prolog)
88
	/* No match in the first 16 bytes.  Give up if the length was at
	   most 16.  Otherwise round %edi up to the next 16-byte boundary
	   and add (pointer mod 16) back to %edx, so that %edx counts the
	   bytes remaining from the new, aligned %edi.  */
89	sub	$16, %edx
90	jbe	L(return_null)
91	lea	16(%edi), %edi
92	and	$15, %ecx
93	and	$-16, %edi
94	add	%ecx, %edx
95# else
96	jnz	L(match_case1_prolog)
	/* No match: advance %edx to the next 16-byte boundary.  */
97	lea	16(%edx), %edx
98	and	$-16, %edx
99# endif
100	jmp	L(loop_prolog)
101
/* First probe when the pointer is more than 48 bytes into its 64-byte
   block: load the enclosing aligned 16 bytes instead of doing an
   unaligned load, and shift the match mask right by (pointer mod 16) so
   that bytes located before the pointer are ignored.  Falls through
   into L(loop_prolog) when there is no match.  */
102	.p2align 4
103L(crosscache):
	/* %ecx = pointer mod 16, used below as the shift count in %cl.  */
104	and	$15, %ecx
105# ifndef USE_AS_RAWMEMCHR
106	and	$-16, %edi
107	movdqa	(%edi), %xmm0
108# else
109	and	$-16, %edx
110	movdqa	(%edx), %xmm0
111# endif
112	pcmpeqb	%xmm1, %xmm0
113	pmovmskb %xmm0, %eax
	/* After the shift, bit 0 of %eax corresponds to the byte at the
	   original pointer.  */
114	sar	%cl, %eax
115	test	%eax, %eax
116
117# ifndef USE_AS_RAWMEMCHR
	/* On a match, L(match_case2_prolog1) adds %ecx back to the
	   aligned %edi, which restores the original pointer.  */
118	jnz	L(match_case2_prolog1)
119        /* "ecx" is less than 16.  Calculate "edx + ecx - 16" by using
120	   "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to avoid
121	   possible addition overflow.  */
122	neg	%ecx
123	add	$16, %ecx
124	sub	%ecx, %edx
	/* Nothing left beyond the bytes already examined.  */
125	jbe	L(return_null)
126	lea	16(%edi), %edi
127# else
128	jnz	L(match_case1_prolog1)
129	lea	16(%edx), %edx
130# endif
131
/* Unrolled prologue of the main loop.  The pointer is 16-byte aligned
   here (all loads are movdqa).  Two groups of four 16-byte chunks are
   examined one chunk at a time; %ecx tracks the offset (0, 16, 32, 48)
   of the chunk under test inside the current group so that
   L(match_case1) can add it to the pointer.  In the bounded build each
   group is entered only after "sub $64, %edx" shows that more than 64
   bytes remain; otherwise control goes to L(exit_loop), which adds the
   64 back.  At the end the pointer is rounded down to a 64-byte
   boundary and execution falls through into L(align64_loop).  */
132	.p2align 4
133L(loop_prolog):
134# ifndef USE_AS_RAWMEMCHR
135	sub	$64, %edx
136	jbe	L(exit_loop)
137	movdqa	(%edi), %xmm0
138# else
139	movdqa	(%edx), %xmm0
140# endif
	/* First group, chunk at offset 0.  */
141	pcmpeqb	%xmm1, %xmm0
142	xor	%ecx, %ecx
143	pmovmskb %xmm0, %eax
144	test	%eax, %eax
145	jnz	L(match_case1)
146
147# ifndef USE_AS_RAWMEMCHR
148	movdqa	16(%edi), %xmm2
149# else
150	movdqa	16(%edx), %xmm2
151# endif
	/* First group, chunk at offset 16.  */
152	pcmpeqb	%xmm1, %xmm2
153	lea	16(%ecx), %ecx
154	pmovmskb %xmm2, %eax
155	test	%eax, %eax
156	jnz	L(match_case1)
157
158# ifndef USE_AS_RAWMEMCHR
159	movdqa	32(%edi), %xmm3
160# else
161	movdqa	32(%edx), %xmm3
162# endif
	/* First group, chunk at offset 32.  */
163	pcmpeqb	%xmm1, %xmm3
164	lea	16(%ecx), %ecx
165	pmovmskb %xmm3, %eax
166	test	%eax, %eax
167	jnz	L(match_case1)
168
169# ifndef USE_AS_RAWMEMCHR
170	movdqa	48(%edi), %xmm4
171# else
172	movdqa	48(%edx), %xmm4
173# endif
	/* First group, chunk at offset 48.  */
174	pcmpeqb	%xmm1, %xmm4
175	lea	16(%ecx), %ecx
176	pmovmskb %xmm4, %eax
177	test	%eax, %eax
178	jnz	L(match_case1)
179
	/* Advance to the second 64-byte group and repeat the same four
	   checks.  */
180# ifndef USE_AS_RAWMEMCHR
181	lea	64(%edi), %edi
182	sub	$64, %edx
183	jbe	L(exit_loop)
184
185	movdqa	(%edi), %xmm0
186# else
187	lea	64(%edx), %edx
188	movdqa	(%edx), %xmm0
189# endif
190	pcmpeqb	%xmm1, %xmm0
191	xor	%ecx, %ecx
192	pmovmskb %xmm0, %eax
193	test	%eax, %eax
194	jnz	L(match_case1)
195
196# ifndef USE_AS_RAWMEMCHR
197	movdqa	16(%edi), %xmm2
198# else
199	movdqa	16(%edx), %xmm2
200# endif
201	pcmpeqb	%xmm1, %xmm2
202	lea	16(%ecx), %ecx
203	pmovmskb %xmm2, %eax
204	test	%eax, %eax
205	jnz	L(match_case1)
206
207# ifndef USE_AS_RAWMEMCHR
208	movdqa	32(%edi), %xmm3
209# else
210	movdqa	32(%edx), %xmm3
211# endif
212	pcmpeqb	%xmm1, %xmm3
213	lea	16(%ecx), %ecx
214	pmovmskb %xmm3, %eax
215	test	%eax, %eax
216	jnz	L(match_case1)
217
218# ifndef USE_AS_RAWMEMCHR
219	movdqa	48(%edi), %xmm4
220# else
221	movdqa	48(%edx), %xmm4
222# endif
223	pcmpeqb	%xmm1, %xmm4
224	lea	16(%ecx), %ecx
225	pmovmskb %xmm4, %eax
226	test	%eax, %eax
227	jnz	L(match_case1)
228
	/* Step past the second group, then round the pointer down to a
	   64-byte boundary.  In the bounded build the bytes dropped by the
	   rounding (pointer mod 64) are added back to %edx, because they
	   will be scanned again by the aligned loop.  */
229# ifndef USE_AS_RAWMEMCHR
230	lea	64(%edi), %edi
231	mov	%edi, %ecx
232	and	$-64, %edi
233	and	$63, %ecx
234	add	%ecx, %edx
235# else
236	lea	64(%edx), %edx
237	and	$-64, %edx
238# endif
239
/* Main loop: examine 64 bytes per iteration from a 64-byte-aligned
   pointer.  The four compare results are merged with pmaxub (each
   result byte is 0x00 or 0xFF, so the byte-wise maximum acts as an OR)
   and a single mask test decides whether to continue.  Once a group
   contains a match, the code after the loop identifies which 16-byte
   chunk it is in, leaves that chunk's mask in %eax and its offset in
   %ecx, and reaches L(match_case1).  */
240	.p2align 4
241L(align64_loop):
242
243# ifndef USE_AS_RAWMEMCHR
	/* Leave for the bounded tail when at most 64 bytes remain.  */
244	sub	$64, %edx
245	jbe	L(exit_loop)
246	movdqa	(%edi), %xmm0
247	movdqa	16(%edi), %xmm2
248	movdqa	32(%edi), %xmm3
249	movdqa	48(%edi), %xmm4
250# else
251	movdqa	(%edx), %xmm0
252	movdqa	16(%edx), %xmm2
253	movdqa	32(%edx), %xmm3
254	movdqa	48(%edx), %xmm4
255# endif
256	pcmpeqb	%xmm1, %xmm0
257	pcmpeqb	%xmm1, %xmm2
258	pcmpeqb	%xmm1, %xmm3
259	pcmpeqb	%xmm1, %xmm4
260
	/* Merge into %xmm4.  %xmm0 and %xmm2 are only sources here, so
	   their individual results survive; %xmm3 and %xmm4 are
	   overwritten.  */
261	pmaxub	%xmm0, %xmm3
262	pmaxub	%xmm2, %xmm4
263	pmaxub	%xmm3, %xmm4
	/* The pointer is advanced before the test; the sub after the loop
	   undoes this when a match was found.  */
264# ifndef USE_AS_RAWMEMCHR
265	add	$64, %edi
266# else
267	add	$64, %edx
268# endif
269	pmovmskb %xmm4, %eax
270
271	test	%eax, %eax
272	jz	L(align64_loop)
273
	/* A match lies somewhere in the group just examined: step the
	   pointer back to the start of that group.  */
274# ifndef USE_AS_RAWMEMCHR
275	sub	$64, %edi
276# else
277	sub	$64, %edx
278# endif
279
	/* Chunks 0 and 1: their compare results are still intact.  */
280	pmovmskb %xmm0, %eax
281	xor	%ecx, %ecx
282	test	%eax, %eax
283	jnz	L(match_case1)
284
285	pmovmskb %xmm2, %eax
286	lea	16(%ecx), %ecx
287	test	%eax, %eax
288	jnz	L(match_case1)
289
	/* Chunk 2: %xmm3 was overwritten by the merge, so reload and
	   compare again.  */
290# ifndef USE_AS_RAWMEMCHR
291	movdqa	32(%edi), %xmm3
292# else
293	movdqa	32(%edx), %xmm3
294# endif
295	pcmpeqb	%xmm1, %xmm3
296	pmovmskb %xmm3, %eax
297	lea	16(%ecx), %ecx
298	test	%eax, %eax
299	jnz	L(match_case1)
300
	/* Chunk 3: by elimination the match is here.  The compare is done
	   straight into %xmm1, destroying the replicated search byte, and
	   execution falls through into L(match_case1) without a test.  */
301# ifndef USE_AS_RAWMEMCHR
302	pcmpeqb	48(%edi), %xmm1
303# else
304	pcmpeqb	48(%edx), %xmm1
305# endif
306	pmovmskb %xmm1, %eax
307	lea	16(%ecx), %ecx
308
/* Match decode without a length check.  On entry %eax holds a non-zero
   16-bit match mask for one 16-byte chunk and %ecx the chunk's offset
   from the pointer register.  The offset is added to the pointer and
   the lowest set mask bit is located with a cascade of tests (byte,
   then nibble, then individual bits) rather than a bit-scan
   instruction; the result is pointer + bit index.
   None of these exits checks the remaining length.  In the bounded
   build this path is reached only from the 64-byte scanning code, which
   examines a group only after "sub $64, %edx" showed that more than 64
   bytes remained.
   In the USE_AS_RAWMEMCHR build two extra entry points exist:
   L(match_case1_prolog1) still adds %ecx to %edx, while
   L(match_case1_prolog) expects %edx to be final already.  */
309	.p2align 4
310L(match_case1):
311# ifndef USE_AS_RAWMEMCHR
312	add	%ecx, %edi
313# else
314L(match_case1_prolog1):
315	add	%ecx, %edx
316L(match_case1_prolog):
317# endif
	/* Low mask byte zero: the first match is in bytes 8..15.  */
318	test	%al, %al
319	jz	L(match_case1_high)
	/* Low nibble zero: the first match is in bytes 4..7.  */
320	mov	%al, %cl
321	and	$15, %cl
322	jz	L(match_case1_8)
323	test	$0x01, %al
324	jnz	L(ExitCase1_1)
325	test	$0x02, %al
326	jnz	L(ExitCase1_2)
327	test	$0x04, %al
328	jnz	L(ExitCase1_3)
	/* Bits 0..2 are clear but the nibble is not zero, so bit 3 is set.  */
329# ifndef USE_AS_RAWMEMCHR
330	lea	3(%edi), %eax
331	RETURN
332# else
333	lea	3(%edx), %eax
334	ret
335# endif
336
337	.p2align 4
338L(match_case1_8):
	/* Bits 0..3 are clear and %al is non-zero: test bits 4..6, and
	   bit 7 is the remaining possibility.  */
339	test	$0x10, %al
340	jnz	L(ExitCase1_5)
341	test	$0x20, %al
342	jnz	L(ExitCase1_6)
343	test	$0x40, %al
344	jnz	L(ExitCase1_7)
345# ifndef USE_AS_RAWMEMCHR
346	lea	7(%edi), %eax
347	RETURN
348# else
349	lea	7(%edx), %eax
350	ret
351# endif
352
353	.p2align 4
354L(match_case1_high):
	/* Same cascade applied to %ah, which covers bytes 8..15.  */
355	mov	%ah, %ch
356	and	$15, %ch
357	jz	L(match_case1_high_8)
358	test	$0x01, %ah
359	jnz	L(ExitCase1_9)
360	test	$0x02, %ah
361	jnz	L(ExitCase1_10)
362	test	$0x04, %ah
363	jnz	L(ExitCase1_11)
364# ifndef USE_AS_RAWMEMCHR
365	lea	11(%edi), %eax
366	RETURN
367# else
368	lea	11(%edx), %eax
369	ret
370# endif
371
372	.p2align 4
373L(match_case1_high_8):
	/* Bytes 12..15: test bits 4..6 of %ah; byte 15 is what is left.  */
374	test	$0x10, %ah
375	jnz	L(ExitCase1_13)
376	test	$0x20, %ah
377	jnz	L(ExitCase1_14)
378	test	$0x40, %ah
379	jnz	L(ExitCase1_15)
380# ifndef USE_AS_RAWMEMCHR
381	lea	15(%edi), %eax
382	RETURN
383# else
384	lea	15(%edx), %eax
385	ret
386# endif
387
/* Bounded tail (length-bounded build only).  Reached when a
   "sub $64, %edx" left at most 64 bytes to examine; the add below
   undoes that subtraction so %edx is again the number of bytes
   remaining from %edi.  Up to four 16-byte chunks are checked.  A chunk
   containing a match is handed to L(match_case2), which verifies that
   the match lies inside the remaining length; after a chunk without a
   match the search ends with NULL if the remaining length does not
   extend past that chunk.  Loads are 16-byte aligned (movdqa), so a
   whole chunk is read even when fewer than 16 of its bytes belong to
   the buffer.  */
388# ifndef USE_AS_RAWMEMCHR
389	.p2align 4
390L(exit_loop):
391	add	$64, %edx
392
	/* Chunk at offset 0; %ecx carries the chunk offset for
	   L(match_case2).  */
393	movdqa	(%edi), %xmm0
394	pcmpeqb	%xmm1, %xmm0
395	xor	%ecx, %ecx
396	pmovmskb %xmm0, %eax
397	test	%eax, %eax
398	jnz	L(match_case2)
399	cmp	$16, %edx
400	jbe	L(return_null)
401
	/* Chunk at offset 16.  */
402	movdqa	16(%edi), %xmm2
403	pcmpeqb	%xmm1, %xmm2
404	lea	16(%ecx), %ecx
405	pmovmskb %xmm2, %eax
406	test	%eax, %eax
407	jnz	L(match_case2)
408	cmp	$32, %edx
409	jbe	L(return_null)
410
	/* Chunk at offset 32.  */
411	movdqa	32(%edi), %xmm3
412	pcmpeqb	%xmm1, %xmm3
413	lea	16(%ecx), %ecx
414	pmovmskb %xmm3, %eax
415	test	%eax, %eax
416	jnz	L(match_case2)
417	cmp	$48, %edx
418	jbe	L(return_null)
419
	/* Chunk at offset 48.  The compare overwrites %xmm1; the search
	   byte is not needed after this point.  */
420	pcmpeqb	48(%edi), %xmm1
421	lea	16(%ecx), %ecx
422	pmovmskb %xmm1, %eax
423	test	%eax, %eax
424	jnz	L(match_case2)
425
	/* No match within the remaining bytes: return NULL.  */
426	xor	%eax, %eax
427	RETURN
428# endif
429
/* Return stubs for the L(match_case1) decode.  L(ExitCase1_N) returns
   the pointer register plus N - 1, i.e. the address of the byte whose
   mask bit N - 1 was found set.  The pointer register is %edi in the
   length-bounded build (which also pops the saved %edi through RETURN)
   and %edx in the USE_AS_RAWMEMCHR build.  The cases for N = 4, 8, 12
   and 16 have no stub; they are returned inline by the decode code.  */
430	.p2align 4
431L(ExitCase1_1):
432# ifndef USE_AS_RAWMEMCHR
433	mov	%edi, %eax
434	RETURN
435# else
436	mov	%edx, %eax
437	ret
438# endif
439
440	.p2align 4
441L(ExitCase1_2):
442# ifndef USE_AS_RAWMEMCHR
443	lea	1(%edi), %eax
444	RETURN
445# else
446	lea	1(%edx), %eax
447	ret
448# endif
449
450	.p2align 4
451L(ExitCase1_3):
452# ifndef USE_AS_RAWMEMCHR
453	lea	2(%edi), %eax
454	RETURN
455# else
456	lea	2(%edx), %eax
457	ret
458# endif
459
460	.p2align 4
461L(ExitCase1_5):
462# ifndef USE_AS_RAWMEMCHR
463	lea	4(%edi), %eax
464	RETURN
465# else
466	lea	4(%edx), %eax
467	ret
468# endif
469
470	.p2align 4
471L(ExitCase1_6):
472# ifndef USE_AS_RAWMEMCHR
473	lea	5(%edi), %eax
474	RETURN
475# else
476	lea	5(%edx), %eax
477	ret
478# endif
479
480	.p2align 4
481L(ExitCase1_7):
482# ifndef USE_AS_RAWMEMCHR
483	lea	6(%edi), %eax
484	RETURN
485# else
486	lea	6(%edx), %eax
487	ret
488# endif
489
490	.p2align 4
491L(ExitCase1_9):
492# ifndef USE_AS_RAWMEMCHR
493	lea	8(%edi), %eax
494	RETURN
495# else
496	lea	8(%edx), %eax
497	ret
498# endif
499
500	.p2align 4
501L(ExitCase1_10):
502# ifndef USE_AS_RAWMEMCHR
503	lea	9(%edi), %eax
504	RETURN
505# else
506	lea	9(%edx), %eax
507	ret
508# endif
509
510	.p2align 4
511L(ExitCase1_11):
512# ifndef USE_AS_RAWMEMCHR
513	lea	10(%edi), %eax
514	RETURN
515# else
516	lea	10(%edx), %eax
517	ret
518# endif
519
520	.p2align 4
521L(ExitCase1_13):
522# ifndef USE_AS_RAWMEMCHR
523	lea	12(%edi), %eax
524	RETURN
525# else
526	lea	12(%edx), %eax
527	ret
528# endif
529
530	.p2align 4
531L(ExitCase1_14):
532# ifndef USE_AS_RAWMEMCHR
533	lea	13(%edi), %eax
534	RETURN
535# else
536	lea	13(%edx), %eax
537	ret
538# endif
539
540	.p2align 4
541L(ExitCase1_15):
542# ifndef USE_AS_RAWMEMCHR
543	lea	14(%edi), %eax
544	RETURN
545# else
546	lea	14(%edx), %eax
547	ret
548# endif
549
/* Match decode with a length check (length-bounded build only).  The
   lowest set bit of the 16-bit mask in %eax is located with the same
   test cascade as L(match_case1), but every exit first verifies that
   the matching byte lies inside the remaining length: for bit index i
   it executes "sub $(i + 1), %edx" and returns NULL on borrow, i.e.
   when fewer than i + 1 bytes remain.
   Three entry points, differing in how much adjustment is still due:
     L(match_case2)          %ecx = chunk offset; it is subtracted from
                             %edx and added to %edi.
     L(match_case2_prolog1)  only %edi still needs %ecx added.
     L(match_case2_prolog)   %edi and %edx are already final.  */
550# ifndef USE_AS_RAWMEMCHR
551	.p2align 4
552L(match_case2):
553	sub	%ecx, %edx
554L(match_case2_prolog1):
555	add	%ecx, %edi
556L(match_case2_prolog):
	/* Low mask byte zero: the first match is in bytes 8..15.  */
557	test	%al, %al
558	jz	L(match_case2_high)
	/* Low nibble zero: the first match is in bytes 4..7.  */
559	mov	%al, %cl
560	and	$15, %cl
561	jz	L(match_case2_8)
562	test	$0x01, %al
563	jnz	L(ExitCase2_1)
564	test	$0x02, %al
565	jnz	L(ExitCase2_2)
566	test	$0x04, %al
567	jnz	L(ExitCase2_3)
	/* Bit 3 is the remaining possibility: byte index 3 needs at least
	   4 bytes remaining.  */
568	sub	$4, %edx
569	jb	L(return_null)
570	lea	3(%edi), %eax
571	RETURN
572
573	.p2align 4
574L(match_case2_8):
575	test	$0x10, %al
576	jnz	L(ExitCase2_5)
577	test	$0x20, %al
578	jnz	L(ExitCase2_6)
579	test	$0x40, %al
580	jnz	L(ExitCase2_7)
	/* Bit 7: byte index 7 needs at least 8 bytes remaining.  */
581	sub	$8, %edx
582	jb	L(return_null)
583	lea	7(%edi), %eax
584	RETURN
585
586	.p2align 4
587L(match_case2_high):
	/* Same cascade applied to %ah, which covers bytes 8..15.  */
588	mov	%ah, %ch
589	and	$15, %ch
590	jz	L(match_case2_high_8)
591	test	$0x01, %ah
592	jnz	L(ExitCase2_9)
593	test	$0x02, %ah
594	jnz	L(ExitCase2_10)
595	test	$0x04, %ah
596	jnz	L(ExitCase2_11)
	/* Bit 11: byte index 11 needs at least 12 bytes remaining.  */
597	sub	$12, %edx
598	jb	L(return_null)
599	lea	11(%edi), %eax
600	RETURN
601
602	.p2align 4
603L(match_case2_high_8):
604	test	$0x10, %ah
605	jnz	L(ExitCase2_13)
606	test	$0x20, %ah
607	jnz	L(ExitCase2_14)
608	test	$0x40, %ah
609	jnz	L(ExitCase2_15)
	/* Bit 15: byte index 15 needs at least 16 bytes remaining.  */
610	sub	$16, %edx
611	jb	L(return_null)
612	lea	15(%edi), %eax
613	RETURN
614
	/* Return stubs: L(ExitCase2_N) returns %edi + (N - 1) provided at
	   least N bytes remain, otherwise NULL.  L(ExitCase2_1) performs
	   no check: the paths leading here have already established that
	   at least one byte remains.  */
615	.p2align 4
616L(ExitCase2_1):
617	mov	%edi, %eax
618	RETURN
619
620	.p2align 4
621L(ExitCase2_2):
622	sub	$2, %edx
623	jb	L(return_null)
624	lea	1(%edi), %eax
625	RETURN
626
627	.p2align 4
628L(ExitCase2_3):
629	sub	$3, %edx
630	jb	L(return_null)
631	lea	2(%edi), %eax
632	RETURN
633
634	.p2align 4
635L(ExitCase2_5):
636	sub	$5, %edx
637	jb	L(return_null)
638	lea	4(%edi), %eax
639	RETURN
640
641	.p2align 4
642L(ExitCase2_6):
643	sub	$6, %edx
644	jb	L(return_null)
645	lea	5(%edi), %eax
646	RETURN
647
648	.p2align 4
649L(ExitCase2_7):
650	sub	$7, %edx
651	jb	L(return_null)
652	lea	6(%edi), %eax
653	RETURN
654
655	.p2align 4
656L(ExitCase2_9):
657	sub	$9, %edx
658	jb	L(return_null)
659	lea	8(%edi), %eax
660	RETURN
661
662	.p2align 4
663L(ExitCase2_10):
664	sub	$10, %edx
665	jb	L(return_null)
666	lea	9(%edi), %eax
667	RETURN
668
669	.p2align 4
670L(ExitCase2_11):
671	sub	$11, %edx
672	jb	L(return_null)
673	lea	10(%edi), %eax
674	RETURN
675
676	.p2align 4
677L(ExitCase2_13):
678	sub	$13, %edx
679	jb	L(return_null)
680	lea	12(%edi), %eax
681	RETURN
682
683	.p2align 4
684L(ExitCase2_14):
685	sub	$14, %edx
686	jb	L(return_null)
687	lea	13(%edi), %eax
688	RETURN
689
690	.p2align 4
691L(ExitCase2_15):
692	sub	$15, %edx
693	jb	L(return_null)
694	lea	14(%edi), %eax
695	RETURN
696# endif
697
/* Common "not found" exit: return 0 (NULL) in %eax.  The length-bounded
   build restores the saved %edi through RETURN; the USE_AS_RAWMEMCHR
   build pushed nothing and uses a plain ret.  No branch to this label
   is assembled in the USE_AS_RAWMEMCHR build.  */
698	.p2align 4
699L(return_null):
700	xor	%eax, %eax
701# ifndef USE_AS_RAWMEMCHR
702	RETURN
703# else
704	ret
705# endif
706
707END (MEMCHR)
708#endif
709