1/* strrchr with SSE2 with bsf and bsr
2   Copyright (C) 2011-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
/* CFI bookkeeping for a 4-byte push: the CFA moves by one stack slot
   and REG is recorded as saved at the new top of stack, so unwinding
   stays accurate across the hand-written pushes below.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Inverse bookkeeping for the matching pop.  */
# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push/pop wrappers that keep the CFI unwind information in sync.  */
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* Stack offsets of the incoming arguments on entry (the return
   address sits at 0(%esp); first argument at 4(%esp)).  */
# define PARMS  4
# define STR1  PARMS
# define STR2  STR1+4
37
38	.text
/* char *__strrchr_sse2_bsf (const char *s, int c)

   Returns in %eax a pointer to the last occurrence of the low byte of
   C in S, or NULL if it does not occur.  If C is '\0' the pointer to
   the terminating NUL is returned (the bit-mask trick below keeps the
   bit at the NUL position itself).

   Register roles once the main loop is reached:
     %edi   16-byte aligned scan pointer, already advanced past the
	    chunk just examined;
     %xmm1  C broadcast to all 16 bytes;
     %xmm2  all-zero on every loop entry, so pcmpeqb against it finds
	    NUL bytes;
     %ebx   pcmpeqb match mask of the last chunk that contained C
	    (0 if none seen yet);
     %esi   value of the scan pointer recorded when %ebx was set
	    (i.e. 16 past the base of that chunk).  */
ENTRY (__strrchr_sse2_bsf)

	mov	STR1(%esp), %ecx	/* ECX = S.  */
	movd	STR2(%esp), %xmm1	/* XMM1 = C in the low byte.  */

	PUSH	(%edi)
	pxor	%xmm2, %xmm2		/* Zero for NUL detection.  */
	mov	%ecx, %edi
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1
	/* ECX has OFFSET. */
	and	$63, %ecx
	cmp	$48, %ecx
	pshufd	$0, %xmm1, %xmm1	/* Broadcast C to all 16 bytes.  */
	/* If the first 16-byte load would cross a 64-byte boundary,
	   take the aligned path (also avoids reading past the string
	   into an unmapped page).  */
	ja	L(crosscache)

/* Unaligned load is safe: handle the first 16 bytes directly.  */
	movdqu	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm1, %xmm0
	/* Find where NULL is.  */
	pmovmskb %xmm2, %edx
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax

	test	%eax, %eax
	jnz	L(unaligned_match1)

	test	%edx, %edx
	jnz	L(return_null)

	/* No match, no NUL: align EDI up to the next 16-byte chunk.
	   EDX == 0 here implies XMM2 is all-zero again, as the loop
	   requires.  */
	and	$-16, %edi
	add	$16, %edi

	PUSH	(%esi)
	PUSH	(%ebx)

	xor	%ebx, %ebx		/* No match remembered yet.  */
	jmp	L(loop)

	CFI_POP	(%esi)
	CFI_POP	(%ebx)

	.p2align 4
L(unaligned_return_value1):
	/* First chunk holds both a match and a NUL: keep only match
	   bits at or below the first NUL, then pick the highest.  */
	bsf	%edx, %ecx		/* ECX = index of the NUL byte.  */
	mov	$2, %edx
	shl	%cl, %edx
	sub	$1, %edx		/* EDX = mask of bits 0..ECX.  */
	and	%edx, %eax
	jz	L(return_null)		/* All matches were past the NUL.  */
	bsr	%eax, %eax		/* Highest surviving match bit.  */
	add	%edi, %eax
	POP	(%edi)
	ret
	CFI_PUSH	(%edi)

	.p2align 4
L(unaligned_match1):
	test	%edx, %edx
	jnz	L(unaligned_return_value1)

	/* Match but no NUL in the first chunk: remember it in EBX/ESI
	   and fall into the aligned loop.  */
	PUSH	(%esi)
	PUSH	(%ebx)

	mov	%eax, %ebx
	lea	16(%edi), %esi
	and	$-16, %edi
	add	$16, %edi
	jmp	L(loop)

	CFI_POP	(%esi)
	CFI_POP	(%ebx)

	.p2align 4
L(crosscache):
/* Handle a string whose first load would cross a 64-byte boundary.  */
	and	$15, %ecx		/* ECX = misalignment within 16.  */
	and	$-16, %edi		/* Align the pointer down.  */
	pxor	%xmm3, %xmm3		/* Scratch zero; keeps XMM2 zero
					   for the loop.  */
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm3
	pcmpeqb	%xmm1, %xmm0
	/* Find where NULL is.  */
	pmovmskb %xmm3, %edx
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes so bit 0 is the first real byte.  */
	shr	%cl, %edx
	shr	%cl, %eax

	test	%eax, %eax
	jnz	L(unaligned_match)

	test	%edx, %edx
	jnz	L(return_null)

	add	$16, %edi

	PUSH	(%esi)
	PUSH	(%ebx)

	xor	%ebx, %ebx
	jmp	L(loop)

	CFI_POP	(%esi)
	CFI_POP	(%ebx)

	.p2align 4
L(unaligned_return_value):
	add	%ecx, %edi		/* Undo the alignment: EDI = S.  */
	bsf	%edx, %ecx		/* ECX = index of the NUL byte.  */
	mov	$2, %edx
	shl	%cl, %edx
	sub	$1, %edx		/* Mask of bits up to the NUL.  */
	and	%edx, %eax
	jz	L(return_null)
	bsr	%eax, %eax
	add	%edi, %eax
	POP	(%edi)
	ret
	CFI_PUSH	(%edi)

	.p2align 4
L(unaligned_match):
	test	%edx, %edx
	jnz	L(unaligned_return_value)

	/* Match but no NUL: remember the (shifted) mask and the start
	   address it is relative to, then enter the loop.  */
	PUSH	(%esi)
	PUSH	(%ebx)

	mov	%eax, %ebx
	add	$16, %edi
	lea	(%edi, %ecx), %esi

/* Main loop over aligned 16-byte chunks, unrolled 4x.  On entry XMM2
   is all-zero (a nonzero XMM2 forces an exit through L(matches)), so
   pcmpeqb against it detects NUL bytes.  */
	.p2align 4
L(loop):
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jnz	L(matches)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	add	$16, %edi
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %ecx
	pmovmskb %xmm0, %eax
	or	%eax, %ecx
	jz	L(loop)

L(matches):
	/* EDI points 16 past the chunk; EAX = match mask; XMM2 holds
	   the NUL comparison result for that chunk.  */
	test	%eax, %eax
	jnz	L(match)
L(return_value):
	/* NUL reached with no usable match in the final chunk: return
	   the last match remembered in EBX/ESI, if any.  */
	test	%ebx, %ebx
	jz	L(return_null_1)
	bsr	%ebx, %eax
	add	%esi, %eax

	POP	(%ebx)
	POP	(%esi)

	sub	$16, %eax		/* ESI was recorded 16 past the
					   chunk base.  */
	POP	(%edi)
	ret

	CFI_PUSH	(%edi)
	CFI_PUSH	(%ebx)
	CFI_PUSH	(%esi)

	.p2align 4
L(match):
	pmovmskb %xmm2, %ecx		/* NUL mask of this chunk.  */
	test	%ecx, %ecx
	jnz	L(return_value_1)
	mov	%eax, %ebx		/* No NUL: remember this chunk's
					   matches and keep scanning.  */
	mov	%edi, %esi
	jmp	L(loop)

	.p2align 4
L(return_value_1):
	/* Chunk holds both a match and a NUL: keep only match bits at
	   or below the first NUL.  */
	bsf	%ecx, %ecx
	mov	$2, %edx
	shl	%cl, %edx
	sub	$1, %edx
	and	%edx, %eax
	jz	L(return_value)		/* All matches were past the NUL;
					   fall back to EBX/ESI.  */

	POP	(%ebx)
	POP	(%esi)

	bsr	%eax, %eax
	add	%edi, %eax
	sub	$16, %eax		/* EDI was advanced past the chunk.  */
	POP	(%edi)
	ret

	CFI_PUSH	(%edi)
/* Return NULL.  */
	.p2align 4
L(return_null):
	xor	%eax, %eax
	POP	(%edi)
	ret

	CFI_PUSH	(%edi)
	CFI_PUSH	(%ebx)
	CFI_PUSH	(%esi)
/* Return NULL.  */
	.p2align 4
L(return_null_1):
	POP	(%ebx)
	POP	(%esi)
	POP	(%edi)
	xor	%eax, %eax
	ret

END (__strrchr_sse2_bsf)
281#endif
282