/* strchr SSE2 without bsf
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
/* Unwind bookkeeping for a 4-byte push of REG: the CFA moves 4 bytes
   further from %esp and REG is now saved at offset 0 from %esp.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching 4-byte pop of REG.  */
# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push/pop a 32-bit register and keep the unwind information in step.  */
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)

/* Offset from %esp of the first stack argument once ENTRANCE has run:
   4 bytes of return address plus the 4 bytes pushed for %edi.  */
# define PARMS  8
/* Prologue: %edi is the only register saved before the body runs.  */
# define ENTRANCE PUSH(%edi)
/* Epilogue: restore %edi and return.  The CFI_PUSH after the ret
   re-declares the saved %edi for whatever is assembled next, because
   the code that follows a RETURN is reached by jumps taken while %edi
   is still on the stack.  */
# define RETURN  POP(%edi); ret; CFI_PUSH(%edi);

/* Stack offsets of the two arguments: the string pointer first, the
   character to look for in the following 4-byte slot.  */
# define STR1  PARMS
# define STR2  STR1+4
40
/* char *__strchr_sse2 (const char *s, int c)

   Both arguments are read from the stack (STR1, STR2) and the result
   is returned in %eax.  S is scanned sixteen bytes at a time with
   aligned SSE2 loads.  The result is the address of the first byte
   equal to the low byte of C, or 0 when the terminating NUL comes
   first.  At every byte position the match bit is examined before the
   NUL bit, so for C == 0 the address of the terminator is returned.

   Register roles:
     %edi   address of the 16-byte block being examined (saved by
	    ENTRANCE, restored by RETURN)
     %ecx   S & 15 during the unaligned prologue; %cl/%ch are scratch
	    afterwards
     %xmm1  low byte of C replicated into all 16 bytes
     %xmm2  all-zero before each compare, NUL-byte mask after it
     %xmm0  current block, then the byte mask of matches
     %eax   bit i set <=> byte i of the block equals C
     %edx   bit i set <=> byte i of the block is NUL

   No bsf is used; the lowest set bit is located by the explicit test
   chains in the match_* blocks below.  */
	atom_text_section
ENTRY (__strchr_sse2)

	ENTRANCE
	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

	pxor	%xmm2, %xmm2
	mov	%ecx, %edi
	/* Two byte interleaves replicate the low byte of C through the
	   low dword of XMM1; the pshufd below copies that dword into
	   the other three.  */
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1
	/* ECX = offset of S inside its 16-byte block.  */
	and	$15, %ecx
	/* pshufd does not write EFLAGS, so the je still tests the
	   result of the AND: offset 0 means S is already aligned.  */
	pshufd	$0, %xmm1, %xmm1
	je	L(loop)

/* Unaligned start: load the whole aligned block that contains S.  The
   load also covers the ECX bytes in front of S; their mask bits are
   shifted out below so they can neither match nor terminate.  */
	and	$-16, %edi
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm1, %xmm0
	/* EDX = NUL mask, EAX = match mask for the block.  */
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	/* Drop the bits of the bytes in front of S.  Both masks fit in
	   16 bits, so the arithmetic shift brings in zeros.  After the
	   shift bit 0 corresponds to S itself.  */
	sarl	%cl, %edx
	sarl	%cl, %eax
	test	%eax, %eax
	jz	L(unaligned_no_match)
	/* There is a match.  Point EDI at S so the shifted bit numbers
	   are offsets from EDI, then dispatch on whether a NUL is in
	   the block as well.  */
	add	%ecx, %edi
	test	%edx, %edx
	jz	L(match_case1)
	jmp	L(match_case2)

	.p2align 4
L(unaligned_no_match):
	/* No match in the first block: a NUL in it ends the search.  */
	test	%edx, %edx
	jne	L(return_null)

	/* XMM2 may hold 0xff bytes for NULs located in front of S, so
	   it has to be cleared again before the loop reuses it.  EDI
	   is still the aligned block address here.  */
	pxor	%xmm2, %xmm2
	add	$16, %edi

	.p2align 4
/* Main loop over aligned blocks, unrolled four times.  The loop only
   continues when EDX is zero, i.e. when the compare left XMM2 all
   zero, so XMM2 needs no clearing between the steps.  */
L(loop):
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)
	test	%edx, %edx
	jnz	L(return_null)
	add	$16, %edi

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)
	test	%edx, %edx
	jnz	L(return_null)
	add	$16, %edi

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)
	test	%edx, %edx
	jnz	L(return_null)
	add	$16, %edi

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)
	test	%edx, %edx
	jnz	L(return_null)
	add	$16, %edi
	jmp	L(loop)

L(matches):
	/* EAX != 0.  Without a NUL in the block the lowest match bit
	   wins unconditionally; otherwise fall through to case 2.  */
	test	%edx, %edx
	jz	L(match_case1)

	.p2align 4
/* Case 2: the block holds both a match and a NUL.  Walk the bits from
   the low end; whichever mask has the lower set bit decides.  */
L(match_case2):
	test	%al, %al
	jz	L(match_high_case2)

	/* A match exists in bytes 0-7.  Is it in bytes 0-3?  */
	mov	%al, %cl
	and	$15, %cl
	jnz	L(match_case2_4)

	/* First match is in bytes 4-7; a NUL in bytes 0-3 precedes it.  */
	mov	%dl, %ch
	and	$15, %ch
	jnz	L(return_null)

	test	$0x10, %al
	jnz	L(Exit5)
	test	$0x10, %dl
	jnz	L(return_null)
	test	$0x20, %al
	jnz	L(Exit6)
	test	$0x20, %dl
	jnz	L(return_null)
	test	$0x40, %al
	jnz	L(Exit7)
	test	$0x40, %dl
	jnz	L(return_null)
	/* AL is nonzero with bits 0-6 clear, so byte 7 is the match.  */
	lea	7(%edi), %eax
	RETURN

	.p2align 4
/* Case 2, first match somewhere in bytes 0-3.  */
L(match_case2_4):
	test	$0x01, %al
	jnz	L(Exit1)
	test	$0x01, %dl
	jnz	L(return_null)
	test	$0x02, %al
	jnz	L(Exit2)
	test	$0x02, %dl
	jnz	L(return_null)
	test	$0x04, %al
	jnz	L(Exit3)
	test	$0x04, %dl
	jnz	L(return_null)
	/* Low nibble of AL is nonzero with bits 0-2 clear: byte 3.  */
	lea	3(%edi), %eax
	RETURN

	.p2align 4
/* Case 2, no match in bytes 0-7.  */
L(match_high_case2):
	/* Any NUL in bytes 0-7 precedes the match.  */
	test	%dl, %dl
	jnz	L(return_null)

	/* Is the first match in bytes 8-11?  */
	mov	%ah, %cl
	and	$15, %cl
	jnz	L(match_case2_12)

	/* First match is in bytes 12-15; a NUL in bytes 8-11 precedes it.  */
	mov	%dh, %ch
	and	$15, %ch
	jnz	L(return_null)

	test	$0x10, %ah
	jnz	L(Exit13)
	test	$0x10, %dh
	jnz	L(return_null)
	test	$0x20, %ah
	jnz	L(Exit14)
	test	$0x20, %dh
	jnz	L(return_null)
	test	$0x40, %ah
	jnz	L(Exit15)
	test	$0x40, %dh
	jnz	L(return_null)
	/* EAX is nonzero and only bit 15 is left: byte 15.  */
	lea	15(%edi), %eax
	RETURN

	.p2align 4
/* Case 2, first match somewhere in bytes 8-11.  */
L(match_case2_12):
	test	$0x01, %ah
	jnz	L(Exit9)
	test	$0x01, %dh
	jnz	L(return_null)
	test	$0x02, %ah
	jnz	L(Exit10)
	test	$0x02, %dh
	jnz	L(return_null)
	test	$0x04, %ah
	jnz	L(Exit11)
	test	$0x04, %dh
	jnz	L(return_null)
	/* Low nibble of AH is nonzero with bits 0-2 clear: byte 11.  */
	lea	11(%edi), %eax
	RETURN

	.p2align 4
/* Case 1: a match and no NUL in the block.  Return the lowest set bit
   of EAX as an offset from EDI.  */
L(match_case1):
	test	%al, %al
	jz	L(match_high_case1)

	test	$0x01, %al
	jnz	L(Exit1)
	test	$0x02, %al
	jnz	L(Exit2)
	test	$0x04, %al
	jnz	L(Exit3)
	test	$0x08, %al
	jnz	L(Exit4)
	test	$0x10, %al
	jnz	L(Exit5)
	test	$0x20, %al
	jnz	L(Exit6)
	test	$0x40, %al
	jnz	L(Exit7)
	/* AL is nonzero with bits 0-6 clear: byte 7.  */
	lea	7(%edi), %eax
	RETURN

	.p2align 4
/* Case 1, match is in bytes 8-15.  */
L(match_high_case1):
	test	$0x01, %ah
	jnz	L(Exit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	test	$0x04, %ah
	jnz	L(Exit11)
	test	$0x08, %ah
	jnz	L(Exit12)
	test	$0x10, %ah
	jnz	L(Exit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	test	$0x40, %ah
	jnz	L(Exit15)
	/* EAX is nonzero and only bit 15 is left: byte 15.  */
	lea	15(%edi), %eax
	RETURN

/* Exit stubs.  ExitN returns the address of the Nth byte counted from
   one, i.e. EDI + N - 1.  Bytes 8 and 16 have no stub; they are
   handled inline at the end of the test chains above.  */
	.p2align 4
L(Exit1):
	lea	(%edi), %eax
	RETURN

	.p2align 4
L(Exit2):
	lea	1(%edi), %eax
	RETURN

	.p2align 4
L(Exit3):
	lea	2(%edi), %eax
	RETURN

	.p2align 4
L(Exit4):
	lea	3(%edi), %eax
	RETURN

	.p2align 4
L(Exit5):
	lea	4(%edi), %eax
	RETURN

	.p2align 4
L(Exit6):
	lea	5(%edi), %eax
	RETURN

	.p2align 4
L(Exit7):
	lea	6(%edi), %eax
	RETURN

	.p2align 4
L(Exit9):
	lea	8(%edi), %eax
	RETURN

	.p2align 4
L(Exit10):
	lea	9(%edi), %eax
	RETURN

	.p2align 4
L(Exit11):
	lea	10(%edi), %eax
	RETURN

	.p2align 4
L(Exit12):
	lea	11(%edi), %eax
	RETURN

	.p2align 4
L(Exit13):
	lea	12(%edi), %eax
	RETURN

	.p2align 4
L(Exit14):
	lea	13(%edi), %eax
	RETURN

	.p2align 4
L(Exit15):
	lea	14(%edi), %eax
	RETURN

/* The terminator was reached before any match: return 0.  */
	.p2align 4
L(return_null):
	xor	%eax, %eax
	RETURN

END (__strchr_sse2)
347#endif
348