/* memchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation
   so we need this to build for ISA V2 builds. */
#if ISA_SHOULD_BUILD (2)

# ifndef MEMCHR
#  define MEMCHR	__memchr_sse2
# endif
# ifdef USE_AS_WMEMCHR
#  define PCMPEQ		pcmpeqd
#  define CHAR_PER_VEC	4
# else
#  define PCMPEQ		pcmpeqb
#  define CHAR_PER_VEC	16
# endif
/* Fast SSE2 version using pmaxub and a 64-byte loop.  */

	.text
ENTRY(MEMCHR)
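	/* Arguments (SysV x86-64 ABI): %rdi = buffer, %esi = character to
	   find, %rdx = length in characters (bytes for memchr, 4-byte
	   wide characters for wmemchr).  Returns a pointer to the first
	   match in %rax, or NULL if none is found within the length.  */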
	movd	%esi, %xmm1
	mov	%edi, %ecx

# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
# ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
# else
	punpcklbw %xmm1, %xmm1
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)
	punpcklbw %xmm1, %xmm1
# endif

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

	cmp	$48, %ecx
	ja	L(crosscache)

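	/* Offset within the 64-byte cache line is at most 48, so the
	   16-byte unaligned load below cannot cross a cache line (and
	   therefore cannot cross a page boundary).  */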
	movdqu	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	and	$15, %ecx
	and	$-16, %rdi
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	add	%rcx, %rdx
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
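	/* The first 16-byte load would cross a cache line.  Align %rdi
	   down to 16 bytes, do an aligned compare, and shift the leading
	   bytes (which precede the real start) out of the match mask.  */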
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
	   possible addition overflow.  */
	neg	%rcx
	add	$16, %rcx
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
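	/* Check four 16-byte vectors one at a time.  %rdi is 16-byte
	   aligned here; after these 64 bytes, either enter the 64-byte
	   aligned main loop directly or check one more 64-byte block and
	   round %rdi down to a 64-byte boundary first, adjusting the
	   remaining length accordingly.  */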
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	test	$0x3f, %rdi
	jz	L(align64_loop)

	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
# ifdef USE_AS_WMEMCHR
	shr	$2, %ecx
# endif
	add	%rcx, %rdx

	.p2align 4
L(align64_loop):
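	/* Main loop: process 64 bytes per iteration from a 64-byte
	   aligned %rdi.  The four compare results are folded with pmaxub
	   so a single pmovmskb/test detects a match in any of the
	   vectors (for wmemchr, PCMPEQ writes all-ones dwords, so the
	   byte-wise maximum is nonzero exactly when some dword
	   matched).  */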
	sub	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi

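	/* Rewind %rdi to the 64-byte block that contained the match and
	   locate the matching vector.  %xmm0 and %xmm2 still hold their
	   compare results; the last two vectors were clobbered by the
	   pmaxub folding and are compared again below.  */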
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
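	/* Fewer than (CHAR_PER_VEC * 4) characters remain.  If no more
	   than two vectors are left, branch to L(exit_loop_32).
	   Otherwise the first two vectors lie fully within the buffer;
	   matches in the last two must be checked against the remaining
	   length.  */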
	add	$(CHAR_PER_VEC * 2), %edx
	jle	L(exit_loop_32)

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	sub	$CHAR_PER_VEC, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
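	/* At most (CHAR_PER_VEC * 2) characters remain.  Check up to two
	   more vectors, bounds-checking every match against the
	   remaining length.  */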
	add	$(CHAR_PER_VEC * 2), %edx
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	sub	$CHAR_PER_VEC, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(matches0):
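	/* %rdi has already been advanced by 64 here; the match is in the
	   vector loaded from offset 48 of the previous position, i.e. at
	   %rdi - 16.  */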
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
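	/* The L(*_1) variants are bounds-checked: the match offset (in
	   characters) is subtracted from the remaining length in %rdx and
	   NULL is returned if the match lies past the end of the
	   buffer.  */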
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsf	%eax, %eax
# ifdef USE_AS_WMEMCHR
	mov	%eax, %esi
	shr	$2, %esi
	sub	%rsi, %rdx
# else
	sub	%rax, %rdx
# endif
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret
END(MEMCHR)
#endif