1/* Optimized memrchr with sse2 without bsf
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
/* Unwind-info bookkeeping for a 4-byte push of REG: grow the CFA offset
   by 4 and record REG with cfi_rel_offset at offset 0.  The cfi_*
   helpers are not defined in this file (presumably they come from
   <sysdep.h>).  */
22# define CFI_PUSH(REG)	\
23	cfi_adjust_cfa_offset (4);	\
24	cfi_rel_offset (REG, 0)
25
/* Inverse of CFI_PUSH: shrink the CFA offset by 4 and mark REG as
   restored.  */
26# define CFI_POP(REG)	\
27	cfi_adjust_cfa_offset (-4);	\
28	cfi_restore (REG)
29
/* pushl/popl paired with the matching CFI annotation, so the unwind
   description follows every change of the stack made through them.  */
30# define PUSH(REG) pushl REG; CFI_PUSH (REG)
31# define POP(REG) popl REG; CFI_POP (REG)
32
/* Offsets from %esp of the three stack arguments as they are read at
   function entry.  The first 4 bytes are skipped (presumably the return
   address).  STR1 is the buffer pointer, STR2 the byte to look for and
   LEN the byte count.  */
33# define PARMS  4
34# define STR1  PARMS
35# define STR2  STR1+4
36# define LEN   STR2+4
37
38	atom_text_section
/* __memrchr_sse2: scan a buffer backwards for a byte value using 16-byte
   SSE2 compares.

   Stack arguments: STR1 = buffer start, STR2 = byte to find (only its
   low byte is used), LEN = number of bytes.
   Result in %eax: address of the highest-addressed matching byte, or 0
   if no byte in the buffer matches.
   Uses %eax, %ecx, %edx and %xmm0-%xmm4; %edi is pushed and popped on
   the one path that needs it.  */
39ENTRY (__memrchr_sse2)
	/* %ecx = buffer start, low lane of %xmm1 = byte, %edx = length.  */
40	mov	STR1(%esp), %ecx
41	movd	STR2(%esp), %xmm1
42	mov	LEN(%esp), %edx
43
	/* Lengths of 16 or less take the short path.  Otherwise
	   %edx = length - 16.  */
44	sub	$16, %edx
45	jbe	L(length_less16)
46
	/* Broadcast the byte to all 16 lanes of %xmm1 (completed by the
	   pshufd below) and point %ecx at the last 16 bytes of the
	   buffer.  */
47	punpcklbw %xmm1, %xmm1
48	add	%edx, %ecx
49	punpcklbw %xmm1, %xmm1
50
	/* Unaligned compare of the last 16 bytes.  */
51	movdqu	(%ecx), %xmm0
52	pshufd	$0, %xmm1, %xmm1
53	pcmpeqb	%xmm1, %xmm0
54
	/* %eax gets one bit per matching byte.  A non-zero mask is
	   resolved by exit_dispatch with %ecx as the chunk address.  */
55	pmovmskb %xmm0, %eax
56	test	%eax, %eax
57	jnz	L(exit_dispatch)
58
	/* Move %ecx down to the base of the next 64-byte window.  If it
	   is already 16-byte aligned, enter the loop with
	   %edx = length - 16, i.e. the distance from the buffer start to
	   the top of that window.  */
59	sub	$64, %ecx
60	mov	%ecx, %eax
61	and	$15, %eax
62	jz	L(loop_prolog)
63
	/* Otherwise round the window base up to a 16-byte boundary.  The
	   window then overlaps part of the chunk just checked; %edx is
	   raised by the same amount (16 - misalignment) so that it still
	   is the distance from the buffer start to the window top.  */
64	lea	16(%ecx), %ecx
65	lea	16(%edx), %edx
66	sub	%eax, %edx
67	and	$-16, %ecx
68
69	.p2align 4
70/* Loop start on aligned string: two unrolled passes over 64-byte
   windows whose base %ecx is 16-byte aligned, scanning from high
   addresses to low.  On entry %edx is the distance from the buffer
   start to the top of the window (%ecx + 64).  */
71L(loop_prolog):
	/* After the subtraction %edx = %ecx - buffer start.  Zero or a
	   borrow means this is the last window and it may reach below
	   the buffer start, so exit_loop finishes with bounds checks.  */
72	sub	$64, %edx
73	jbe	L(exit_loop)
74
	/* The whole window is inside the buffer.  Check its four 16-byte
	   chunks from the highest down; the first non-zero mask wins.  */
75	movdqa	48(%ecx), %xmm0
76	pcmpeqb	%xmm1, %xmm0
77	pmovmskb %xmm0, %eax
78	test	%eax, %eax
79	jnz	L(matches48)
80
81	movdqa	32(%ecx), %xmm2
82	pcmpeqb	%xmm1, %xmm2
83	pmovmskb %xmm2, %eax
84	test	%eax, %eax
85	jnz	L(matches32)
86
87	movdqa	16(%ecx), %xmm3
88	pcmpeqb	%xmm1, %xmm3
89	pmovmskb %xmm3, %eax
90	test	%eax, %eax
91	jnz	L(matches16)
92
	/* Lowest chunk: %ecx already is the chunk address.  */
93	movdqa	(%ecx), %xmm4
94	pcmpeqb	%xmm1, %xmm4
95	pmovmskb %xmm4, %eax
96	test	%eax, %eax
97	jnz	L(exit_dispatch)
98
	/* Second window, 64 bytes lower; same test for the last window.  */
99	sub	$64, %ecx
100	sub	$64, %edx
101	jbe	L(exit_loop)
102
103	movdqa	48(%ecx), %xmm0
104	pcmpeqb	%xmm1, %xmm0
105	pmovmskb %xmm0, %eax
106	test	%eax, %eax
107	jnz	L(matches48)
108
109	movdqa	32(%ecx), %xmm2
110	pcmpeqb	%xmm1, %xmm2
111	pmovmskb %xmm2, %eax
112	test	%eax, %eax
113	jnz	L(matches32)
114
115	movdqa	16(%ecx), %xmm3
116	pcmpeqb	%xmm1, %xmm3
117	pmovmskb %xmm3, %eax
118	test	%eax, %eax
119	jnz	L(matches16)
120
121	movdqa	(%ecx), %xmm3
122	pcmpeqb	%xmm1, %xmm3
123	pmovmskb %xmm3, %eax
124	test	%eax, %eax
125	jnz	L(exit_dispatch)
126
	/* No match in either window.  If %ecx is not yet 64-byte aligned,
	   round it up to a 64-byte boundary and raise %edx by the same
	   amount (64 - misalignment), so the main loop runs on 64-byte
	   aligned windows; part of the window just checked is then
	   scanned again.  */
127	mov	%ecx, %eax
128	and	$63, %eax
129	test	%eax, %eax
130	jz	L(align64_loop)
131
132	lea	64(%ecx), %ecx
133	lea	64(%edx), %edx
134	and	$-64, %ecx
135	sub	%eax, %edx
136
137	.p2align 4
/* Main loop: one 64-byte-aligned window per iteration, moving down.
   On entry to each iteration %edx = %ecx - buffer start.  */
138L(align64_loop):
	/* Step to the next lower window.  Once %edx reaches zero or
	   borrows, this is the last window and it may reach below the
	   buffer start; exit_loop finishes with bounds checks.  */
139	sub	$64, %ecx
140	sub	$64, %edx
141	jbe	L(exit_loop)
142
143	movdqa	(%ecx), %xmm0
144	movdqa	16(%ecx), %xmm2
145	movdqa	32(%ecx), %xmm3
146	movdqa	48(%ecx), %xmm4
147
148	pcmpeqb	%xmm1, %xmm0
149	pcmpeqb	%xmm1, %xmm2
150	pcmpeqb	%xmm1, %xmm3
151	pcmpeqb	%xmm1, %xmm4
152
	/* Each compare result byte is 0x00 or 0xFF, so the unsigned
	   maximum merges the four results into one "any match" mask.
	   This overwrites %xmm0 and %xmm2 but leaves %xmm3 and %xmm4
	   intact.  */
153	pmaxub	%xmm3, %xmm0
154	pmaxub	%xmm4, %xmm2
155	pmaxub	%xmm0, %xmm2
156	pmovmskb %xmm2, %eax
157
158	test	%eax, %eax
159	jz	L(align64_loop)
160
	/* There is a match somewhere in the window.  Find the highest
	   chunk containing one, starting with the preserved results for
	   offsets 48 and 32.  */
161	pmovmskb %xmm4, %eax
162	test	%eax, %eax
163	jnz	L(matches48)
164
165	pmovmskb %xmm3, %eax
166	test	%eax, %eax
167	jnz	L(matches32)
168
	/* The results for offsets 16 and 0 were overwritten above, so
	   compare those chunks again.  The compare for offset 0 destroys
	   the broadcast byte in %xmm1; every path from here returns.  */
169	movdqa	16(%ecx), %xmm2
170
171	pcmpeqb	%xmm1, %xmm2
172	pcmpeqb	(%ecx), %xmm1
173
174	pmovmskb %xmm2, %eax
175	test	%eax, %eax
176	jnz	L(matches16)
177
	/* The match is in the chunk at %ecx.  Inline copy of
	   exit_dispatch: find the highest set bit of the mask in %ax and
	   return the address of that byte (%dl is used as scratch).  */
178	pmovmskb %xmm1, %eax
179	test	%ah, %ah
180	jnz	L(exit_dispatch_high)
181	mov	%al, %dl
182	and	$15 << 4, %dl
183	jnz	L(exit_dispatch_8)
184	test	$0x08, %al
185	jnz	L(exit_4)
186	test	$0x04, %al
187	jnz	L(exit_3)
188	test	$0x02, %al
189	jnz	L(exit_2)
190	mov	%ecx, %eax
191	ret
192
193	.p2align 4
/* Last window; it may reach below the buffer start.  Entered with
   %ecx = window base and %edx = %ecx - buffer start (zero or
   negative).  The aligned loads below may read bytes in front of the
   buffer; matches found there are rejected by the *_1 handlers.
   NOTE(review): reading those bytes is presumably safe because an
   aligned 16-byte load stays within one page -- confirm.  */
194L(exit_loop):
	/* %edx = number of bytes of this window, counted down from its
	   top, that belong to the buffer (at most 64).  With 32 or fewer
	   only the two upper chunks matter.  */
195	add	$64, %edx
196	cmp	$32, %edx
197	jbe	L(exit_loop_32)
198
	/* More than 32 bytes: the chunks at offsets 48 and 32 lie fully
	   inside the buffer, so no bounds check is needed for them.  */
199	movdqa	48(%ecx), %xmm0
200	pcmpeqb	%xmm1, %xmm0
201	pmovmskb %xmm0, %eax
202	test	%eax, %eax
203	jnz	L(matches48)
204
205	movdqa	32(%ecx), %xmm2
206	pcmpeqb	%xmm1, %xmm2
207	pmovmskb %xmm2, %eax
208	test	%eax, %eax
209	jnz	L(matches32)
210
	/* The chunk at offset 16 may begin below the buffer start, so a
	   match goes through the bounds-checked handler.  */
211	movdqa	16(%ecx), %xmm3
212	pcmpeqb	%xmm1, %xmm3
213	pmovmskb %xmm3, %eax
214	test	%eax, %eax
215	jnz	L(matches16_1)
	/* With 48 or fewer bytes the chunk at offset 0 is entirely below
	   the buffer start: nothing left to search.  */
216	cmp	$48, %edx
217	jbe	L(return_null)
218
	/* Lowest chunk, bounds-checked.  %xmm1 is overwritten; it is not
	   needed afterwards.  */
219	pcmpeqb	(%ecx), %xmm1
220	pmovmskb %xmm1, %eax
221	test	%eax, %eax
222	jnz	L(matches0_1)
223	xor	%eax, %eax
224	ret
225
226	.p2align 4
/* Last window when at most 32 of its bytes (counted from the top)
   belong to the buffer.  %ecx = window base, %edx = that byte count.
   Both chunks examined here may begin below the buffer start, so
   matches go through the bounds-checked handlers.  */
227L(exit_loop_32):
228	movdqa	48(%ecx), %xmm0
229	pcmpeqb	%xmm1, %xmm0
230	pmovmskb %xmm0, %eax
231	test	%eax, %eax
232	jnz	L(matches48_1)
	/* With 16 or fewer bytes the chunk at offset 32 is entirely below
	   the buffer start.  */
233	cmp	$16, %edx
234	jbe	L(return_null)
235
	/* %xmm1 is overwritten by the compare; it is not needed
	   afterwards.  */
236	pcmpeqb	32(%ecx), %xmm1
237	pmovmskb %xmm1, %eax
238	test	%eax, %eax
239	jnz	L(matches32_1)
240	xor	%eax, %eax
241	ret
242
243	.p2align 4
/* Match in the chunk at offset 16 of the window at %ecx, with the chunk
   fully inside the buffer.  %eax = non-zero match mask.  Advances %ecx
   to the chunk and then runs an inline copy of exit_dispatch.  */
244L(matches16):
245	lea	16(%ecx), %ecx
	/* Find the highest set mask bit: bits 8-15 first, then bits 4-7
	   (%dl is scratch), then bits 3, 2 and 1; otherwise bit 0.  */
246	test	%ah, %ah
247	jnz	L(exit_dispatch_high)
248	mov	%al, %dl
249	and	$15 << 4, %dl
250	jnz	L(exit_dispatch_8)
251	test	$0x08, %al
252	jnz	L(exit_4)
253	test	$0x04, %al
254	jnz	L(exit_3)
255	test	$0x02, %al
256	jnz	L(exit_2)
257	mov	%ecx, %eax
258	ret
259
260	.p2align 4
/* Match in the chunk at offset 32 of the window at %ecx, with the chunk
   fully inside the buffer.  %eax = non-zero match mask.  Advances %ecx
   to the chunk and then runs an inline copy of exit_dispatch.  */
261L(matches32):
262	lea	32(%ecx), %ecx
	/* Find the highest set mask bit: bits 8-15 first, then bits 4-7
	   (%dl is scratch), then bits 3, 2 and 1; otherwise bit 0.  */
263	test	%ah, %ah
264	jnz	L(exit_dispatch_high)
265	mov	%al, %dl
266	and	$15 << 4, %dl
267	jnz	L(exit_dispatch_8)
268	test	$0x08, %al
269	jnz	L(exit_4)
270	test	$0x04, %al
271	jnz	L(exit_3)
272	test	$0x02, %al
273	jnz	L(exit_2)
274	mov	%ecx, %eax
275	ret
276
277	.p2align 4
/* Match in the chunk at offset 48 of the window at %ecx, with the chunk
   fully inside the buffer: advance %ecx to the chunk and fall through
   into exit_dispatch.  */
278L(matches48):
279	lea	48(%ecx), %ecx
280
281	.p2align 4
/* Return the address of the highest matching byte of a 16-byte chunk.
   In:  %ecx = chunk address, %eax = match mask (bit i set when byte i
        matched); callers only come here with a non-zero mask.
   Out: %eax = %ecx + index of the highest set bit.
   No bounds check is done; %dl/%dh are used as scratch.  */
282L(exit_dispatch):
	/* Any of bits 8-15 set?  */
283	test	%ah, %ah
284	jnz	L(exit_dispatch_high)
	/* Any of bits 4-7 set?  */
285	mov	%al, %dl
286	and	$15 << 4, %dl
287	jnz	L(exit_dispatch_8)
	/* Bits 3, 2, 1 in turn; otherwise the match is byte 0.  */
288	test	$0x08, %al
289	jnz	L(exit_4)
290	test	$0x04, %al
291	jnz	L(exit_3)
292	test	$0x02, %al
293	jnz	L(exit_2)
294	mov	%ecx, %eax
295	ret
296
297	.p2align 4
/* Reached when bits 8-15 of the mask are clear and at least one of bits
   4-7 is set: test bits 7, 6 and 5 in turn; otherwise the match is
   byte 4.  */
298L(exit_dispatch_8):
299	test	$0x80, %al
300	jnz	L(exit_8)
301	test	$0x40, %al
302	jnz	L(exit_7)
303	test	$0x20, %al
304	jnz	L(exit_6)
305	lea	4(%ecx), %eax
306	ret
307
308	.p2align 4
/* Reached when at least one of mask bits 8-15 (held in %ah) is set.
   Bits 12-15 are handled by exit_dispatch_high_8 (%dh is scratch);
   otherwise test bits 11, 10 and 9 in turn, and failing those the
   match is byte 8.  */
309L(exit_dispatch_high):
310	mov	%ah, %dh
311	and	$15 << 4, %dh
312	jnz	L(exit_dispatch_high_8)
313	test	$0x08, %ah
314	jnz	L(exit_12)
315	test	$0x04, %ah
316	jnz	L(exit_11)
317	test	$0x02, %ah
318	jnz	L(exit_10)
319	lea	8(%ecx), %eax
320	ret
321
322	.p2align 4
/* Reached when at least one of mask bits 12-15 is set: test bits 15,
   14 and 13 in turn; otherwise the match is byte 12.  */
323L(exit_dispatch_high_8):
324	test	$0x80, %ah
325	jnz	L(exit_16)
326	test	$0x40, %ah
327	jnz	L(exit_15)
328	test	$0x20, %ah
329	jnz	L(exit_14)
330	lea	12(%ecx), %eax
331	ret
332
333	.p2align 4
/* Return stubs without bounds check.  exit_N returns the address of
   the Nth byte of the chunk at %ecx, i.e. %ecx + N - 1.  Bytes 1, 5, 9
   and 13 have no stub; the dispatchers return them inline.  */
334L(exit_2):
335	lea	1(%ecx), %eax
336	ret
337
338	.p2align 4
339L(exit_3):
340	lea	2(%ecx), %eax
341	ret
342
343	.p2align 4
344L(exit_4):
345	lea	3(%ecx), %eax
346	ret
347
348	.p2align 4
349L(exit_6):
350	lea	5(%ecx), %eax
351	ret
352
353	.p2align 4
354L(exit_7):
355	lea	6(%ecx), %eax
356	ret
357
358	.p2align 4
359L(exit_8):
360	lea	7(%ecx), %eax
361	ret
362
363	.p2align 4
364L(exit_10):
365	lea	9(%ecx), %eax
366	ret
367
368	.p2align 4
369L(exit_11):
370	lea	10(%ecx), %eax
371	ret
372
373	.p2align 4
374L(exit_12):
375	lea	11(%ecx), %eax
376	ret
377
378	.p2align 4
379L(exit_14):
380	lea	13(%ecx), %eax
381	ret
382
383	.p2align 4
384L(exit_15):
385	lea	14(%ecx), %eax
386	ret
387
388	.p2align 4
389L(exit_16):
390	lea	15(%ecx), %eax
391	ret
392
393	.p2align 4
/* Bounds-checked match in the lowest chunk (offset 0) of the last
   window.  In: %ecx = window base, %eax = non-zero match mask,
   %edx = number of window bytes, counted from the top, that belong to
   the buffer.  */
394L(matches0_1):
	/* %edx = chunk address - buffer start (signed).  Byte k of the
	   chunk is inside the buffer exactly when %edx + k >= 0.  */
395	lea	-64(%edx), %edx
396
	/* Inline copy of exit_dispatch_1: find the highest set mask bit.
	   %ah serves as scratch because %edx is live here.  */
397	test	%ah, %ah
398	jnz	L(exit_dispatch_1_high)
399	mov	%al, %ah
400	and	$15 << 4, %ah
401	jnz	L(exit_dispatch_1_8)
402	test	$0x08, %al
403	jnz	L(exit_1_4)
404	test	$0x04, %al
405	jnz	L(exit_1_3)
406	test	$0x02, %al
407	jnz	L(exit_1_2)
	/* Match at byte 0: the add only sets the flags for the sign
	   test.  */
408	add	$0, %edx
409	jl	L(return_null)
410	mov	%ecx, %eax
411	ret
412
413	.p2align 4
/* Bounds-checked match in the chunk at offset 16 of the last window.
   In: %ecx = window base, %eax = non-zero match mask, %edx = number of
   window bytes, counted from the top, that belong to the buffer.  */
414L(matches16_1):
	/* %edx = chunk address - buffer start (signed); %ecx = chunk
	   address.  Byte k of the chunk is inside the buffer exactly
	   when %edx + k >= 0.  */
415	lea	-48(%edx), %edx
416	lea	16(%ecx), %ecx
417
	/* Inline copy of exit_dispatch_1: find the highest set mask bit.
	   %ah serves as scratch because %edx is live here.  */
418	test	%ah, %ah
419	jnz	L(exit_dispatch_1_high)
420	mov	%al, %ah
421	and	$15 << 4, %ah
422	jnz	L(exit_dispatch_1_8)
423	test	$0x08, %al
424	jnz	L(exit_1_4)
425	test	$0x04, %al
426	jnz	L(exit_1_3)
427	test	$0x02, %al
428	jnz	L(exit_1_2)
	/* Match at byte 0: the add only sets the flags for the sign
	   test.  */
429	add	$0, %edx
430	jl	L(return_null)
431	mov	%ecx, %eax
432	ret
433
434	.p2align 4
/* Bounds-checked match in the chunk at offset 32 of the last window.
   In: %ecx = window base, %eax = non-zero match mask, %edx = number of
   window bytes, counted from the top, that belong to the buffer.  */
435L(matches32_1):
	/* %edx = chunk address - buffer start (signed); %ecx = chunk
	   address.  Byte k of the chunk is inside the buffer exactly
	   when %edx + k >= 0.  */
436	lea	-32(%edx), %edx
437	lea	32(%ecx), %ecx
438
	/* Inline copy of exit_dispatch_1: find the highest set mask bit.
	   %ah serves as scratch because %edx is live here.  */
439	test	%ah, %ah
440	jnz	L(exit_dispatch_1_high)
441	mov	%al, %ah
442	and	$15 << 4, %ah
443	jnz	L(exit_dispatch_1_8)
444	test	$0x08, %al
445	jnz	L(exit_1_4)
446	test	$0x04, %al
447	jnz	L(exit_1_3)
448	test	$0x02, %al
449	jnz	L(exit_1_2)
	/* Match at byte 0: the add only sets the flags for the sign
	   test.  */
450	add	$0, %edx
451	jl	L(return_null)
452	mov	%ecx, %eax
453	ret
454
455	.p2align 4
/* Bounds-checked match in the chunk at offset 48 of the last window.
   In: %ecx = window base, %eax = non-zero match mask, %edx = number of
   window bytes, counted from the top, that belong to the buffer.
   Sets %edx = chunk address - buffer start and %ecx = chunk address,
   then falls through into exit_dispatch_1.  */
456L(matches48_1):
457	lea	-16(%edx), %edx
458	lea	48(%ecx), %ecx
459
460	.p2align 4
/* Bounds-checked counterpart of exit_dispatch.
   In:  %ecx = chunk address, %eax = non-zero match mask,
        %edx = chunk address - buffer start (signed).
   Out: %eax = address of the highest matching byte, or 0 if that byte
        lies below the buffer start.
   %ah serves as scratch because %edx is live here.  */
461L(exit_dispatch_1):
462	test	%ah, %ah
463	jnz	L(exit_dispatch_1_high)
464	mov	%al, %ah
465	and	$15 << 4, %ah
466	jnz	L(exit_dispatch_1_8)
467	test	$0x08, %al
468	jnz	L(exit_1_4)
469	test	$0x04, %al
470	jnz	L(exit_1_3)
471	test	$0x02, %al
472	jnz	L(exit_1_2)
	/* Match at byte 0: the add only sets the flags for the sign
	   test.  */
473	add	$0, %edx
474	jl	L(return_null)
475	mov	%ecx, %eax
476	ret
477
478	.p2align 4
/* Bounds-checked dispatch for mask bits 4-7 (bits 8-15 are clear and
   at least one of bits 4-7 is set): test bits 7, 6 and 5 in turn;
   otherwise the match is byte 4, returned only if %edx + 4 is not
   negative, i.e. the byte is not below the buffer start.  */
479L(exit_dispatch_1_8):
480	test	$0x80, %al
481	jnz	L(exit_1_8)
482	test	$0x40, %al
483	jnz	L(exit_1_7)
484	test	$0x20, %al
485	jnz	L(exit_1_6)
486	add	$4, %edx
487	jl	L(return_null)
488	lea	4(%ecx), %eax
489	ret
490
491	.p2align 4
/* Bounds-checked dispatch for mask bits 8-15 (held in %ah, at least one
   set).  Bits 12-15 are handled by exit_dispatch_1_high_8; %al is free
   to use as scratch because the low mask bits no longer matter.
   Otherwise test bits 11, 10 and 9 in turn; failing those the match is
   byte 8, returned only if %edx + 8 is not negative.  */
492L(exit_dispatch_1_high):
493	mov	%ah, %al
494	and	$15 << 4, %al
495	jnz	L(exit_dispatch_1_high_8)
496	test	$0x08, %ah
497	jnz	L(exit_1_12)
498	test	$0x04, %ah
499	jnz	L(exit_1_11)
500	test	$0x02, %ah
501	jnz	L(exit_1_10)
502	add	$8, %edx
503	jl	L(return_null)
504	lea	8(%ecx), %eax
505	ret
506
507	.p2align 4
/* Bounds-checked dispatch for mask bits 12-15 (at least one set): test
   bits 15, 14 and 13 in turn; otherwise the match is byte 12, returned
   only if %edx + 12 is not negative.  */
508L(exit_dispatch_1_high_8):
509	test	$0x80, %ah
510	jnz	L(exit_1_16)
511	test	$0x40, %ah
512	jnz	L(exit_1_15)
513	test	$0x20, %ah
514	jnz	L(exit_1_14)
515	add	$12, %edx
516	jl	L(return_null)
517	lea	12(%ecx), %eax
518	ret
519
520	.p2align 4
/* Bounds-checked return stubs.  On entry %ecx = chunk address and
   %edx = chunk address - buffer start (signed).  exit_1_N adds the byte
   index N - 1 to %edx; a negative sum means the matching byte lies
   below the buffer start and 0 is returned, otherwise the result is
   %ecx + N - 1.  Bytes 1, 5, 9 and 13 have no stub; the dispatchers
   handle them inline.  */
521L(exit_1_2):
522	add	$1, %edx
523	jl	L(return_null)
524	lea	1(%ecx), %eax
525	ret
526
527	.p2align 4
528L(exit_1_3):
529	add	$2, %edx
530	jl	L(return_null)
531	lea	2(%ecx), %eax
532	ret
533
534	.p2align 4
535L(exit_1_4):
536	add	$3, %edx
537	jl	L(return_null)
538	lea	3(%ecx), %eax
539	ret
540
541	.p2align 4
542L(exit_1_6):
543	add	$5, %edx
544	jl	L(return_null)
545	lea	5(%ecx), %eax
546	ret
547
548	.p2align 4
549L(exit_1_7):
550	add	$6, %edx
551	jl	L(return_null)
552	lea	6(%ecx), %eax
553	ret
554
555	.p2align 4
556L(exit_1_8):
557	add	$7, %edx
558	jl	L(return_null)
559	lea	7(%ecx), %eax
560	ret
561
562	.p2align 4
563L(exit_1_10):
564	add	$9, %edx
565	jl	L(return_null)
566	lea	9(%ecx), %eax
567	ret
568
569	.p2align 4
570L(exit_1_11):
571	add	$10, %edx
572	jl	L(return_null)
573	lea	10(%ecx), %eax
574	ret
575
576	.p2align 4
577L(exit_1_12):
578	add	$11, %edx
579	jl	L(return_null)
580	lea	11(%ecx), %eax
581	ret
582
583	.p2align 4
584L(exit_1_14):
585	add	$13, %edx
586	jl	L(return_null)
587	lea	13(%ecx), %eax
588	ret
589
590	.p2align 4
591L(exit_1_15):
592	add	$14, %edx
593	jl	L(return_null)
594	lea	14(%ecx), %eax
595	ret
596
597	.p2align 4
598L(exit_1_16):
599	add	$15, %edx
600	jl	L(return_null)
601	lea	15(%ecx), %eax
602	ret
603
604	.p2align 4
/* No match: return 0.  Only for paths that have not pushed %edi (the
   paths that did use ret_null instead).  */
605L(return_null):
606	xor	%eax, %eax
607	ret
608
609	.p2align 4
/* Short buffer (length 1..16) starting on a 16-byte boundary.
   In: %eax = buffer start, %edx = length, %xmm1 = broadcast byte,
   %ecx = 0 (the alignment test that led here left it zero).  */
610L(length_less16_offset0):
	/* %cl = length, used as the shift count below.  The compare
	   covers a full 16 bytes and may include bytes past the end of
	   the buffer; they are masked off afterwards.  */
611	mov	%dl, %cl
612	pcmpeqb	(%eax), %xmm1
613
	/* %edx = (1 << length) - 1: one bit per byte that really belongs
	   to the buffer.  */
614	mov	$1, %edx
615	sal	%cl, %edx
616	sub	$1, %edx
617
	/* %ecx = chunk address, %eax = match mask limited to the buffer;
	   exit_dispatch returns the highest match.  */
618	mov	%eax, %ecx
619	pmovmskb %xmm1, %eax
620
621	and	%edx, %eax
622	test	%eax, %eax
623	jnz	L(exit_dispatch)
624
625	xor	%eax, %eax
626	ret
627
628	.p2align 4
/* Short path for lengths of 16 or less.  Entered from the function
   entry with %ecx = buffer start, %edx = length - 16 and the byte to
   find still only in the low lane of %xmm1.  */
629L(length_less16):
	/* Restore %edx = length while broadcasting the byte to all lanes
	   of %xmm1 (completed by the pshufd below).  A length of zero
	   returns 0 at once.  */
630	punpcklbw %xmm1, %xmm1
631	add	$16, %edx
632	je	L(return_null)
633	punpcklbw %xmm1, %xmm1
634
	/* %eax = buffer start; %ecx = its offset within a 16-byte line.
	   Offset zero has its own, simpler handler.  */
635	mov	%ecx, %eax
636	pshufd	$0, %xmm1, %xmm1
637
638	and	$15, %ecx
639	jz	L(length_less16_offset0)
640
	/* %edi is used as scratch from here on; every exit below pops
	   it again.  */
641	PUSH	(%edi)
642
	/* %dh = offset + length, %eax = start rounded down to 16 bytes.  */
643	mov	%cl, %dh
644	add	%dl, %dh
645	and	$-16, %eax
646
	/* If offset + length exceeds 16 the buffer continues into the
	   next aligned chunk; %dh is then the number of buffer bytes in
	   that chunk.  */
647	sub	$16, %dh
648	ja	L(length_less16_part2)
649
	/* The buffer lies within one aligned chunk.  Compare it and
	   shift the mask right by the offset, so that bit 0 corresponds
	   to the first buffer byte.  */
650	pcmpeqb	(%eax), %xmm1
651	pmovmskb %xmm1, %edi
652
653	sar	%cl, %edi
	/* %eax = buffer start again; %cl = length for the next shift.  */
654	add	%ecx, %eax
655	mov	%dl, %cl
656
	/* %edx = (1 << length) - 1; drop matches past the buffer end.  */
657	mov	$1, %edx
658	sal	%cl, %edx
659	sub	$1, %edx
660
661	and	%edx, %edi
662	test	%edi, %edi
663	jz	L(ret_null)
664
	/* bsr yields the index of the highest match; add it to the
	   buffer start.  */
665	bsr	%edi, %edi
666	add	%edi, %eax
667	POP	(%edi)
668	ret
669
	/* The POP before the preceding ret also emitted CFI_POP, but the
	   code below is reached by a jump while %edi is still on the
	   stack; declare that state again for the unwind info.  */
670	CFI_PUSH     (%edi)
671
672	.p2align 4
/* Short buffer that spans two aligned 16-byte chunks.
   In: %eax = buffer start rounded down to 16 bytes, %ecx = offset of
   the buffer start within that chunk (1..15), %dh = number of buffer
   bytes in the second chunk, %xmm1 = broadcast byte, %edi pushed.  */
673L(length_less16_part2):
	/* The higher chunk is searched first.  */
674	movdqa	16(%eax), %xmm2
675	pcmpeqb	%xmm1, %xmm2
676	pmovmskb %xmm2, %edi
677
	/* Keep the offset in %ch while %cl serves as the shift count.  */
678	mov	%cl, %ch
679
	/* %edx = (1 << %dh) - 1: mask for the bytes of the second chunk
	   that belong to the buffer.  */
680	mov	%dh, %cl
681	mov	$1, %edx
682	sal	%cl, %edx
683	sub	$1, %edx
684
685	and	%edx, %edi
686
687	test	%edi, %edi
688	jnz	L(length_less16_part2_return)
689
	/* Nothing in the second chunk: compare the first one and shift
	   out the bytes that lie below the buffer start.  */
690	pcmpeqb	(%eax), %xmm1
691	pmovmskb %xmm1, %edi
692
693	mov	%ch, %cl
694	sar	%cl, %edi
695	test	%edi, %edi
696	jz	L(ret_null)
697
	/* Result = aligned base + index of the highest match + offset.
	   %ch is cleared first so that %ecx holds just the offset.  */
698	bsr	%edi, %edi
699	add	%edi, %eax
700	xor	%ch, %ch
701	add	%ecx, %eax
702	POP	(%edi)
703	ret
704
	/* Reached by a jump with %edi still pushed; declare that state
	   again after the CFI_POP emitted by the POP above.  */
705	CFI_PUSH     (%edi)
706
707	.p2align 4
/* Match in the second chunk of a short buffer.  %eax = address of the
   first aligned chunk, %edi = non-zero match mask of the second
   chunk.  Returns %eax + 16 + index of the highest match.  */
708L(length_less16_part2_return):
709	bsr	%edi, %edi
710	lea	16(%eax, %edi), %eax
711	POP	(%edi)
712	ret
713
	/* Reached by a jump with %edi still pushed; declare that state
	   again after the CFI_POP emitted by the POP above.  */
714	CFI_PUSH     (%edi)
715
716	.p2align 4
/* No match on a path that pushed %edi: return 0 after restoring it.  */
717L(ret_null):
718	xor	%eax, %eax
719	POP	(%edi)
720	ret
721
722END (__memrchr_sse2)
723#endif
724