/* strrchr SSE2 without bsf and bsr
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

/* Unwind-info bookkeeping that accompanies a 4-byte push of REG:
   the CFA moves 4 bytes further from %esp and REG is recorded as
   saved at 0(%esp).  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind-info bookkeeping that accompanies a 4-byte pop of REG.  */
# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push/pop a register and keep the unwind info in step.  */
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* Offset of the first stack argument from %esp once ENTRANCE has
   pushed %edi: 4 bytes of return address + 4 bytes of saved %edi.  */
# define PARMS  8
# define ENTRANCE PUSH(%edi);
/* Restore %edi and return.  The trailing CFI_PUSH re-describes %edi as
   saved for whatever code textually follows the ret.  */
# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);

/* Stack offsets of the two arguments (string pointer, character).  */
# define STR1  PARMS
# define STR2  STR1+4

/* __strrchr_sse2

   Reads a string pointer from STR1(%esp) and a character from
   STR2(%esp).  Returns in %eax the address of the last byte equal to
   the character that lies at or before the first NUL byte of the
   string, or 0 if there is no such byte.

   The string is examined 16 bytes at a time.  For every block two
   16-bit masks are built with pcmpeqb/pmovmskb: bit i of the "match"
   mask is set when byte i equals the character, bit i of the "NUL"
   mask is set when byte i is zero.  Mask bits are located with chains
   of test/branch instructions instead of bsf/bsr.

   Register roles:
     %edi   scan pointer.  It is advanced by 16 right after a block is
	    loaded, so while masks are being examined it points 16
	    bytes past the start of the block they describe.
     %xmm1  the character replicated into all 16 bytes.
     %xmm2  zero reference for the NUL comparison.  It receives the
	    comparison result, which is all zero as long as no NUL has
	    been seen, so it can be reused on the next block.
     %eax   match mask of the current block.
     %ecx   NUL mask of the current block (inside L(loop): NUL mask
	    OR match mask, used only as an "anything found" flag).
     %ebx   match mask of the most recent block that contained a match
	    but no NUL; 0 if there has been none.
     %esi   value %edi had for the block remembered in %ebx.
     %edx   scratch.

   %edi is saved by ENTRANCE and restored by RETURN.  %esi and %ebx
   are pushed (in that order) only on the paths that enter L(loop) and
   are popped again before L(match_exit) / RETURN.

   Note on immediates of the form "$1 << N - 1": in GNU as "<<" binds
   tighter than "-", so the value is (1 << N) - 1, a mask of the low N
   bits.  */

	atom_text_section
ENTRY (__strrchr_sse2)

	ENTRANCE
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

	pxor	%xmm2, %xmm2
	mov	%ecx, %edi
	/* Replicate the low byte of %xmm1 into all 16 byte lanes
	   (byte -> word -> dword via punpcklbw, dword -> all via
	   pshufd below).  */
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1
	/* ECX has OFFSET. */
	and	$63, %ecx
	/* With an offset above 48 inside a 64-byte line, an unaligned
	   16-byte load would run past the end of that line; use the
	   aligned-load path instead.  */
	cmp	$48, %ecx
	pshufd	$0, %xmm1, %xmm1
	ja	L(crosscache)

/* First 16 bytes, loaded unaligned directly from the string start.  */
	movdqu	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm1, %xmm0
	/* Find where NULL is.  */
	pmovmskb %xmm2, %ecx
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match1)

	/* No match in the first block; a NUL in it means not found.  */
	test	%ecx, %ecx
	jnz	L(return_null)

	/* Continue with aligned loads.  */
	and	$-16, %edi

	PUSH	(%esi)
	PUSH	(%ebx)

	/* No match remembered so far.  */
	xor	%ebx, %ebx
	jmp	L(loop)

	/* The code below is reached only from paths on which %esi and
	   %ebx have not been pushed.  */
	CFI_POP	(%esi)
	CFI_POP	(%ebx)

	.p2align 4
L(unaligned_match1):
	/* Match in the first (unaligned) block.  If the block also
	   holds a NUL the answer is decided here.  */
	test	%ecx, %ecx
	jnz	L(prolog_find_zero_1)

	PUSH	(%esi)
	PUSH	(%ebx)

	/* Remember this block: mask in %ebx, its %edi in %esi.  */
	mov	%eax, %ebx
	mov	%edi, %esi
	and	$-16, %edi
	jmp	L(loop)

	/* The code below is reached only from paths on which %esi and
	   %ebx have not been pushed.  */
	CFI_POP	(%esi)
	CFI_POP	(%ebx)

	.p2align 4
L(crosscache):
/* Handle unaligned string.  */
	/* %ecx = offset of the string start inside its aligned 16-byte
	   block; %edi = start of that block.  */
	and	$15, %ecx
	and	$-16, %edi
	pxor	%xmm3, %xmm3
	movdqa	(%edi), %xmm0
	/* %xmm3 is used for the NUL comparison here so that %xmm2
	   stays zero for L(loop).  */
	pcmpeqb	%xmm0, %xmm3
	pcmpeqb	%xmm1, %xmm0
	/* Find where NULL is.  */
	pmovmskb %xmm3, %edx
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	shr	%cl, %edx
	shr	%cl, %eax
	add	$16, %edi

	test	%eax, %eax
	jnz	L(unaligned_match)

	test	%edx, %edx
	jnz	L(return_null)

	PUSH	(%esi)
	PUSH	(%ebx)

	/* No match remembered so far.  */
	xor	%ebx, %ebx
	jmp	L(loop)

	/* The code below is reached only from paths on which %esi and
	   %ebx have not been pushed.  */
	CFI_POP	(%esi)
	CFI_POP	(%ebx)

	.p2align 4
L(unaligned_match):
	/* Match in the first aligned block (masks already shifted by
	   the start offset).  */
	test	%edx, %edx
	jnz	L(prolog_find_zero)

	PUSH	(%esi)
	PUSH	(%ebx)

	mov	%eax, %ebx
	/* The mask was shifted right by %ecx, so bias the remembered
	   pointer by %ecx to compensate.  */
	lea	(%edi, %ecx), %esi

/* Loop start on aligned string.  */
	.p2align 4
L(loop):
	/* Four unrolled copies of: load an aligned block, build both
	   masks, leave the loop if either is non-zero.  */
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jz	L(loop)

L(matches):
	/* Either a match or a NUL (or both) is in the current block.  */
	test	%eax, %eax
	jnz	L(match)
L(return_value):
	/* The current block contributes no match at or before the NUL:
	   fall back to the remembered block, if there is one.  */
	test	%ebx, %ebx
	jz	L(return_null_1)
	mov	%ebx, %eax
	mov	%esi, %edi

	POP	(%ebx)
	POP	(%esi)

	jmp	L(match_exit)

	/* The code below is reached with %esi and %ebx still pushed.
	   %esi was pushed first, so it is described first.  */
	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(return_null_1):
	POP	(%ebx)
	POP	(%esi)

	xor	%eax, %eax
	RETURN

	/* The code below is reached with %esi and %ebx still pushed.  */
	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(match):
	/* %ecx was OR-ed with the match mask in the loop; fetch the
	   pure NUL mask again.  */
	pmovmskb %xmm2, %ecx
	test	%ecx, %ecx
	jnz	L(find_zero)
	/* Match but no NUL: remember this block and keep scanning.  */
	mov	%eax, %ebx
	mov	%edi, %esi
	jmp	L(loop)

	.p2align 4
L(find_zero):
	/* The current block has both a match and a NUL.  Locate the
	   lowest set bit of the NUL mask and keep only the match bits
	   at or below it.  This label handles NUL bits 0..3.  */
	test	%cl, %cl
	jz	L(find_zero_high)
	mov	%cl, %dl
	and	$15, %dl
	jz	L(find_zero_8)
	test	$0x01, %cl
	jnz	L(FindZeroExit1)
	test	$0x02, %cl
	jnz	L(FindZeroExit2)
	test	$0x04, %cl
	jnz	L(FindZeroExit3)
	/* First NUL is byte 3: keep match bits 0..3.  */
	and	$1 << 4 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	/* The code below is reached with %esi and %ebx still pushed.  */
	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(find_zero_8):
	/* First NUL is among bytes 4..7.  */
	test	$0x10, %cl
	jnz	L(FindZeroExit5)
	test	$0x20, %cl
	jnz	L(FindZeroExit6)
	test	$0x40, %cl
	jnz	L(FindZeroExit7)
	/* First NUL is byte 7: keep match bits 0..7.  */
	and	$1 << 8 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	/* The code below is reached with %esi and %ebx still pushed.  */
	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(find_zero_high):
	/* First NUL is among bytes 8..15 (mask bits now in %ch).  */
	mov	%ch, %dh
	and	$15, %dh
	jz	L(find_zero_high_8)
	test	$0x01, %ch
	jnz	L(FindZeroExit9)
	test	$0x02, %ch
	jnz	L(FindZeroExit10)
	test	$0x04, %ch
	jnz	L(FindZeroExit11)
	/* First NUL is byte 11: keep match bits 0..11.  */
	and	$1 << 12 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	/* The code below is reached with %esi and %ebx still pushed.  */
	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(find_zero_high_8):
	/* First NUL is among bytes 12..15.  */
	test	$0x10, %ch
	jnz	L(FindZeroExit13)
	test	$0x20, %ch
	jnz	L(FindZeroExit14)
	test	$0x40, %ch
	jnz	L(FindZeroExit15)
	/* First NUL is byte 15: keep match bits 0..15.  */
	and	$1 << 16 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	/* The code below is reached with %esi and %ebx still pushed.  */
	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

/* L(FindZeroExitN): the first NUL is byte N-1 of the current block.
   Keep the low N match bits.  If none remain, use the remembered block
   (L(return_value)); otherwise drop the saved registers and resolve
   the highest remaining bit in L(match_exit).  */
	.p2align 4
L(FindZeroExit1):
	and	$1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit2):
	and	$1 << 2 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit3):
	and	$1 << 3 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit5):
	and	$1 << 5 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit6):
	and	$1 << 6 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit7):
	and	$1 << 7 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit9):
	and	$1 << 9 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit10):
	and	$1 << 10 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit11):
	and	$1 << 11 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit13):
	and	$1 << 13 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit14):
	and	$1 << 14 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	jmp	L(match_exit)

	CFI_PUSH	(%esi)
	CFI_PUSH	(%ebx)

	.p2align 4
L(FindZeroExit15):
	and	$1 << 15 - 1, %eax
	jz	L(return_value)

	POP	(%ebx)
	POP	(%esi)
	/* Falls through into L(match_exit) with only %edi saved.  */

/* L(match_exit): %eax holds a non-zero 16-bit match mask and %edi
   points 16 bytes past the start of the block it describes.  Find the
   highest set bit k with test/branch chains and return
   %edi - 16 + k.  Only %edi is still saved on the stack here.  */
	.p2align 4
L(match_exit):
	test	%ah, %ah
	jnz	L(match_exit_high)
	mov	%al, %dl
	and	$15 << 4, %dl
	jnz	L(match_exit_8)
	test	$0x08, %al
	jnz	L(Exit4)
	test	$0x04, %al
	jnz	L(Exit3)
	test	$0x02, %al
	jnz	L(Exit2)
	/* Only bit 0 is set.  */
	lea	-16(%edi), %eax
	RETURN

	.p2align 4
L(match_exit_8):
	/* Highest set bit is among bits 4..7.  */
	test	$0x80, %al
	jnz	L(Exit8)
	test	$0x40, %al
	jnz	L(Exit7)
	test	$0x20, %al
	jnz	L(Exit6)
	/* Highest set bit is bit 4.  */
	lea	-12(%edi), %eax
	RETURN

	.p2align 4
L(match_exit_high):
	/* Highest set bit is among bits 8..15 (now in %ah).  */
	mov	%ah, %dh
	and	$15 << 4, %dh
	jnz	L(match_exit_high_8)
	test	$0x08, %ah
	jnz	L(Exit12)
	test	$0x04, %ah
	jnz	L(Exit11)
	test	$0x02, %ah
	jnz	L(Exit10)
	/* Highest set bit is bit 8.  */
	lea	-8(%edi), %eax
	RETURN

	.p2align 4
L(match_exit_high_8):
	/* Highest set bit is among bits 12..15.  */
	test	$0x80, %ah
	jnz	L(Exit16)
	test	$0x40, %ah
	jnz	L(Exit15)
	test	$0x20, %ah
	jnz	L(Exit14)
	/* Highest set bit is bit 12.  */
	lea	-4(%edi), %eax
	RETURN

/* L(ExitN): the match is byte N-1 of the block; return
   %edi - 16 + (N - 1).  */
	.p2align 4
L(Exit2):
	lea	-15(%edi), %eax
	RETURN

	.p2align 4
L(Exit3):
	lea	-14(%edi), %eax
	RETURN

	.p2align 4
L(Exit4):
	lea	-13(%edi), %eax
	RETURN

	.p2align 4
L(Exit6):
	lea	-11(%edi), %eax
	RETURN

	.p2align 4
L(Exit7):
	lea	-10(%edi), %eax
	RETURN

	.p2align 4
L(Exit8):
	lea	-9(%edi), %eax
	RETURN

	.p2align 4
L(Exit10):
	lea	-7(%edi), %eax
	RETURN

	.p2align 4
L(Exit11):
	lea	-6(%edi), %eax
	RETURN

	.p2align 4
L(Exit12):
	lea	-5(%edi), %eax
	RETURN

	.p2align 4
L(Exit14):
	lea	-3(%edi), %eax
	RETURN

	.p2align 4
L(Exit15):
	lea	-2(%edi), %eax
	RETURN

	.p2align 4
L(Exit16):
	lea	-1(%edi), %eax
	RETURN

/* Return NULL.  */
	.p2align 4
L(return_null):
	xor	%eax, %eax
	RETURN

/* Prolog variants of the find-zero code: the very first block holds
   both a match and a NUL.  %esi/%ebx have not been pushed and there is
   no remembered block, so an empty masked result returns 0 directly.  */
	.p2align 4
L(prolog_find_zero):
	/* Entered from L(crosscache): the masks were shifted right by
	   %ecx, so bias %edi by the same amount, then move the NUL
	   mask from %edx into %ecx.  */
	add	%ecx, %edi
	mov	%edx, %ecx
L(prolog_find_zero_1):
	/* %ecx = NUL mask, %eax = match mask.  Handles NUL bits 0..3.  */
	test	%cl, %cl
	jz	L(prolog_find_zero_high)
	mov	%cl, %dl
	and	$15, %dl
	jz	L(prolog_find_zero_8)
	test	$0x01, %cl
	jnz	L(PrologFindZeroExit1)
	test	$0x02, %cl
	jnz	L(PrologFindZeroExit2)
	test	$0x04, %cl
	jnz	L(PrologFindZeroExit3)
	/* First NUL is byte 3: keep match bits 0..3.  */
	and	$1 << 4 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(prolog_find_zero_8):
	/* First NUL is among bytes 4..7.  */
	test	$0x10, %cl
	jnz	L(PrologFindZeroExit5)
	test	$0x20, %cl
	jnz	L(PrologFindZeroExit6)
	test	$0x40, %cl
	jnz	L(PrologFindZeroExit7)
	/* First NUL is byte 7: keep match bits 0..7.  */
	and	$1 << 8 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(prolog_find_zero_high):
	/* First NUL is among bytes 8..15 (mask bits now in %ch).  */
	mov	%ch, %dh
	and	$15, %dh
	jz	L(prolog_find_zero_high_8)
	test	$0x01, %ch
	jnz	L(PrologFindZeroExit9)
	test	$0x02, %ch
	jnz	L(PrologFindZeroExit10)
	test	$0x04, %ch
	jnz	L(PrologFindZeroExit11)
	/* First NUL is byte 11: keep match bits 0..11.  */
	and	$1 << 12 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(prolog_find_zero_high_8):
	/* First NUL is among bytes 12..15.  */
	test	$0x10, %ch
	jnz	L(PrologFindZeroExit13)
	test	$0x20, %ch
	jnz	L(PrologFindZeroExit14)
	test	$0x40, %ch
	jnz	L(PrologFindZeroExit15)
	/* First NUL is byte 15: keep match bits 0..15.  */
	and	$1 << 16 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

/* L(PrologFindZeroExitN): the first NUL is byte N-1 of the first
   block.  Keep the low N match bits; resolve the highest one in
   L(match_exit), or return 0 if none remain.  */
	.p2align 4
L(PrologFindZeroExit1):
	and	$1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit2):
	and	$1 << 2 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit3):
	and	$1 << 3 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit5):
	and	$1 << 5 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit6):
	and	$1 << 6 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit7):
	and	$1 << 7 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit9):
	and	$1 << 9 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit10):
	and	$1 << 10 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit11):
	and	$1 << 11 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit13):
	and	$1 << 13 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit14):
	and	$1 << 14 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(PrologFindZeroExit15):
	and	$1 << 15 - 1, %eax
	jnz	L(match_exit)
	xor	%eax, %eax
	RETURN

END (__strrchr_sse2)
#endif
708