/* wcschr with SSE2, without using bsf instructions
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)
# include <sysdep.h>

/* Unwind bookkeeping for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0 from the stack pointer.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching 4-byte pop of REG.  */
# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push/pop REG while keeping the unwind information in step.  */
# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Offsets of the two stack arguments relative to %esp at function entry
   (the first argument sits 4 bytes above %esp, the second 4 bytes after
   it).  */
# define PARMS	4
# define STR1	PARMS
# define STR2	STR1+4

/* __wcschr_sse2 -- wcschr using SSE2 compares and byte masks (no bsf).

   In:    STR1(%esp) = s, pointer to the wide string
	  STR2(%esp) = c, the 32-bit wide character to look for
   Out:   %eax = address of the first element equal to c, or 0 when the
	  zero terminator is reached first.  When c is 0 the address of
	  the terminator itself is returned, because the "match" test for
	  a lane is made before the "null" test for that lane.
   Uses:  %eax, %ecx, %edx, %xmm0-%xmm2, flags.  %edi is used only on the
	  L(cross_cache) path and is saved/restored around that use.

   Register roles once set up:
	  %ecx  = address of the 16-byte block being examined
	  %xmm1 = c replicated into all four 32-bit lanes
	  %xmm2 = all zero before each block; after pcmpeqd it flags the
		  lanes that hold the terminator
	  %eax  = pmovmskb mask of lanes equal to c    (4 bits per lane)
	  %edx  = pmovmskb mask of lanes equal to zero (4 bits per lane)

   The string is examined 16 bytes (four elements) at a time, so bytes
   past the terminator that lie in the same 16-byte block are read.
   NOTE(review): the 4-bits-per-lane decoding on the L(cross_cache) path
   assumes s is 4-byte aligned -- confirm against callers.  */
	atom_text_section
ENTRY (__wcschr_sse2)

	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

	mov	%ecx, %eax
	/* Two self-unpacks turn [c,0,0,0] into [c,c,0,0] and then
	   [c,c,c,c]; %xmm2 is cleared in between.  */
	punpckldq %xmm1, %xmm1
	pxor	%xmm2, %xmm2
	punpckldq %xmm1, %xmm1

	/* If s lies more than 48 bytes into its 64-byte block, an
	   unaligned 16-byte load from s would run past the end of that
	   block; use the aligned path instead.  */
	and	$63, %eax
	cmp	$48, %eax
	ja	L(cross_cache)

	/* First 16 bytes straight from s (possibly unaligned).  */
	movdqu	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)
	/* Nothing found: round down to a 16-byte boundary.  L(loop) adds 16
	   first, so the next block may overlap bytes already checked.  */
	and	$-16, %ecx
	jmp	L(loop)

	.p2align 4
L(cross_cache):
	PUSH	(%edi)
	mov	%ecx, %edi
	mov	%eax, %ecx
	and	$-16, %edi		/* %edi = s rounded down to 16.  */
	and	$15, %ecx		/* %ecx = offset of s in that block.  */
	movdqa	(%edi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax

	/* Drop the mask bits that belong to bytes in front of s, so that
	   bit 0 of each mask now corresponds to s itself.  */
	sarl	%cl, %edx
	sarl	%cl, %eax
	test	%eax, %eax
	jz	L(unaligned_no_match)

	add	%edi, %ecx		/* %ecx = s again.  */
	POP	(%edi)

	/* Same decoding as L(matches)/L(match_case2), relative to s.  */
	test	%edx, %edx
	jz	L(match_case1)
	test	%al, %al
	jz	L(match_high_case2)
	test	$15, %al
	jnz	L(match_case2_4)
	test	$15, %dl
	jnz	L(return_null)
	lea	4(%ecx), %eax
	ret

	/* The code below is entered from the jz above, where %edi is still
	   on the stack; re-establish the unwind state that the POP before
	   the ret had undone.  */
	CFI_PUSH (%edi)

	.p2align 4
L(unaligned_no_match):
	mov	%edi, %ecx		/* Continue from the aligned block.  */
	POP	(%edi)

	/* No match at or after s in this block; a terminator here ends
	   the search.  */
	test	%edx, %edx
	jnz	L(return_null)

	/* %xmm2 may still flag zero lanes located in front of s (their
	   mask bits were shifted out above); clear it for the loop.  */
	pxor	%xmm2, %xmm2

/* Loop start on aligned string.  The body is unrolled four times; each
   step advances %ecx by 16 and leaves as soon as either mask is
   non-zero.  %xmm2 stays all zero as long as no terminator is seen.  */
	.p2align 4
L(loop):
	add	$16, %ecx
	movdqa	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)
	add	$16, %ecx

	movdqa	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)
	add	$16, %ecx

	movdqa	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)
	add	$16, %ecx

	movdqa	(%ecx), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jz	L(loop)

	.p2align 4
L(matches):
	/* %edx was merged with %eax by the `or'; reload the null mask.  */
	pmovmskb %xmm2, %edx
	test	%eax, %eax
	jz	L(return_null)		/* Terminator only.  */
	test	%edx, %edx
	jz	L(match_case1)		/* Match only.  */

/* Case 2: the block holds both a match and a terminator; the match
   counts only if no terminator sits in an earlier lane.  */
	.p2align 4
L(match_case2):
	test	%al, %al
	jz	L(match_high_case2)	/* No match in lanes 0-1.  */
	test	$15, %al
	jnz	L(match_case2_4)	/* Match in lane 0.  */
	test	$15, %dl
	jnz	L(return_null)		/* Terminator in lane 0.  */
	lea	4(%ecx), %eax		/* Match in lane 1.  */
	ret

	.p2align 4
L(match_case2_4):
	mov	%ecx, %eax
	ret

	.p2align 4
L(match_high_case2):
	test	%dl, %dl
	jnz	L(return_null)		/* Terminator in lane 0 or 1.  */
	test	$15, %ah
	jnz	L(match_case2_12)	/* Match in lane 2.  */
	test	$15, %dh
	jnz	L(return_null)		/* Terminator in lane 2.  */
	lea	12(%ecx), %eax		/* Match in lane 3.  */
	ret

	.p2align 4
L(match_case2_12):
	lea	8(%ecx), %eax
	ret

/* Case 1: the block holds a match and no terminator; return the lowest
   matching lane.  */
	.p2align 4
L(match_case1):
	test	%al, %al
	jz	L(match_high_case1)	/* No match in lanes 0-1.  */

	test	$0x01, %al
	jnz	L(exit0)		/* Match in lane 0.  */
	lea	4(%ecx), %eax		/* Match in lane 1.  */
	ret

	.p2align 4
L(match_high_case1):
	test	$0x01, %ah
	jnz	L(exit3)		/* Match in lane 2.  */
	lea	12(%ecx), %eax		/* Match in lane 3.  */
	ret

	.p2align 4
L(exit0):
	mov	%ecx, %eax
	ret

	.p2align 4
L(exit3):
	lea	8(%ecx), %eax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

END (__wcschr_sse2)
#endif