/* rawmemchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

/* Build this file whenever MINIMUM_X86_ISA_LEVEL <= 2.  There is no
   dedicated V2 implementation, so ISA V2 builds need this one too.  */
#if ISA_SHOULD_BUILD (2)

# ifndef RAWMEMCHR
#  define RAWMEMCHR	__rawmemchr_sse2
# endif

/* RAWMEMCHR: return the address of the first byte equal to a given
   value, scanning upwards from a start address with no length limit.

   Syntax: AT&T (source, destination).

   In:    %rdi = address to start scanning from.
	  %rsi = value to look for; only its low byte is used.
   Out:   %rax = address of the first matching byte.
   Uses:  %rax, %rcx, %rdi (advanced while scanning), %xmm0-%xmm4,
	  flags.  No stack is used and nothing is called.

   Every exit is a `ret' reached through a match; no bound is ever
   tested.  The caller must therefore guarantee that a matching byte
   exists at or after %rdi.

   Outline:
     1. Broadcast the byte into all 16 lanes of %xmm1.
     2. Check the first, possibly unaligned, 16 bytes.  When an
	unaligned load would run past the current 64-byte block, use
	an aligned load and discard the bytes before the pointer.
     3. L(loop_prolog): check 16-byte aligned chunks one at a time
	(four, or eight when that still does not reach a 64-byte
	boundary), then round %rdi down to a 64-byte boundary.
     4. L(align64_loop): check 64 bytes per iteration.
     5. L(matches*): turn the match mask into the result address.

   ENTRY, END and L are macros supplied by the included headers; they
   are not defined in this file.  */

	.text
ENTRY (RAWMEMCHR)
	/* %xmm1 = the byte in the low bits of %rsi.  %rcx keeps a copy of
	   the start pointer for the alignment tests below.  */
	movd	%rsi, %xmm1
	mov	%rdi, %rcx

	/* Two self-interleaves replicate the low byte across the low
	   dword; the pshufd below then copies that dword to all four
	   dword lanes.  */
	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1

	/* %rcx = offset of the start pointer within its 64-byte block.  */
	and	$63, %rcx
	pshufd	$0, %xmm1, %xmm1

	/* With an offset above 48 a 16-byte load from (%rdi) would extend
	   past the end of the 64-byte block; handle that case with an
	   aligned load instead.  NOTE(review): presumably this keeps the
	   first load from touching the next cache line/page - confirm.  */
	cmp	$48, %rcx
	ja	L(crosscache)

	/* First 16 bytes, unaligned load.  */
	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax

	jnz	L(matches)
	/* No match: continue at the next 16-byte boundary above %rdi.
	   Every byte skipped was covered by the load above.  */
	add	$16, %rdi
	and	$-16, %rdi
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* %rcx = offset of the start pointer within its 16-byte chunk;
	   %rdi = start of that chunk.  */
	and	$15, %rcx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	pcmpeqb	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes.  */
	/* Shifting right by the offset drops the mask bits of the bytes
	   that lie before the start pointer.  The mask occupies only the
	   low 16 bits, so the arithmetic shift brings in zeros.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
/* Check which byte is a match.  */
	bsf	%eax, %eax

	/* Result = chunk base + offset of start pointer + bit index.  */
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* Step to the first 16-byte chunk that has not been checked.  */
	add	$16, %rdi

	.p2align 4
L(loop_prolog):
	/* %rdi is 16-byte aligned from here on.  Check four chunks one
	   at a time, at offsets 0, 16, 32 and 48.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* %rdi is advanced by 64 before the branch, so L(matches0)
	   addresses this fourth chunk as -16(%rdi).  */
	movdqa	48(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	add	$64, %rdi
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Already on a 64-byte boundary?  Then enter the main loop.  */
	test	$0x3f, %rdi
	jz	L(align64_loop)

	/* Otherwise check four more chunks the same way.  */
	movdqa	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	/* As above: advance first, L(matches0) compensates with -16.  */
	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	/* Round down to a 64-byte boundary.  This steps back over 16, 32
	   or 48 bytes, all of which were checked without a match by the
	   four chunks just above, so re-reading them is harmless.  */
	and	$-64, %rdi

	.p2align 4
L(align64_loop):
	/* Main loop: %rdi is 64-byte aligned; 64 bytes per iteration.  */
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Compare results are 0x00 or 0xff per byte, so the unsigned
	   maximum acts as a bytewise OR: %xmm4 ends up non-zero in every
	   lane where any of the four chunks matched.  This overwrites
	   %xmm3 and %xmm4; %xmm0 and %xmm2 keep their own results.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	/* A match lies in the 64 bytes just examined; point %rdi back at
	   their start and find out which chunk holds it.  */
	sub	$64, %rdi

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* The results for chunks 2 and 3 were destroyed by pmaxub, so
	   compare those chunks again.  */
	movdqa	32(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3

	/* Chunk 3 is compared straight into %xmm1.  That destroys the
	   broadcast pattern, which is no longer needed because every
	   path from here returns.  */
	pcmpeqb	48(%rdi), %xmm1
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* Chunks 0-2 had no match, so it is in chunk 3 and the mask is
	   known to be non-zero; no test is needed before bsf.  */
	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	/* Exit stubs.  On entry %eax holds a non-zero match mask.  bsf
	   gives the index of the lowest set bit, i.e. of the first
	   matching byte within the chunk; the chunk's displacement from
	   %rdi is then added.  */

	.p2align 4
L(matches0):
	/* Match in the 16 bytes immediately below %rdi.  */
	bsf	%eax, %eax
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	/* Match in the 16 bytes at 0(%rdi).  */
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	/* Match in the 16 bytes at 16(%rdi).  */
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	/* Match in the 16 bytes at 32(%rdi).  */
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

END (RAWMEMCHR)
#endif