1/* strstr with unaligned loads
2   Copyright (C) 2009-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20#include "../strchr-isa-default-impl.h"
21
/* char *__strstr_sse2_unaligned (const char *haystack, const char *needle)

   ABI:  SysV AMD64.  In: %rdi = haystack, %rsi = needle.
   Out:  %rax = pointer to first occurrence of needle in haystack,
	 or NULL if not found (haystack itself for an empty needle).

   Strategy: broadcast the first two needle bytes into %xmm1/%xmm2 and
   scan the haystack with SSE2 for positions where both bytes match at
   consecutive offsets (and, in the same masks, for the terminating
   NUL).  Each candidate position is then verified byte-by-byte against
   the rest of the needle.  After the first 64 bytes, a work budget in
   %r11 bounds the total verification effort; if too many long partial
   matches accumulate, control switches to __strstr_generic so the
   worst case stays linear rather than quadratic.  */
ENTRY(__strstr_sse2_unaligned)
	movzbl	(%rsi), %eax		/* eax = needle[0].  */
	testb	%al, %al
	je	L(empty)		/* Empty needle: return haystack.  */
	movzbl	1(%rsi), %edx		/* edx = needle[1].  */
	testb	%dl, %dl
	je	L(strchr)		/* 1-byte needle: tail-call strchr.  */
	movd	%eax, %xmm1
	movd	%edx, %xmm2
	movq	%rdi, %rax
	andl	$4095, %eax		/* Page offset of haystack.  */
	punpcklbw	%xmm1, %xmm1
	cmpq	$4031, %rax		/* Within 64+1 bytes of a page end?  */
	punpcklbw	%xmm2, %xmm2
	punpcklwd	%xmm1, %xmm1
	punpcklwd	%xmm2, %xmm2
	pshufd	$0, %xmm1, %xmm1	/* xmm1 = needle[0] in all 16 lanes.  */
	pshufd	$0, %xmm2, %xmm2	/* xmm2 = needle[1] in all 16 lanes.  */
	ja	L(cross_page)		/* Unaligned reads might fault.  */
	/* Check the first 32 bytes.  For each 16-byte chunk build a mask
	   whose bit i is set when hay[i] == needle[0] && hay[i+1] ==
	   needle[1] (a candidate), or hay[i] == 0 (end of string).
	   pminub of two pcmpeqb masks is 0xff only where both compares
	   matched.  */
	movdqu	(%rdi), %xmm3
	pxor	%xmm5, %xmm5		/* xmm5 = 0 for NUL detection.  */
	movdqu	1(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3		/* hay[i] == needle[0]?  */
	pcmpeqb	%xmm2, %xmm4		/* hay[i+1] == needle[1]?  */
	movdqu	16(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6		/* NUL among bytes 0..15?  */
	pminub	%xmm4, %xmm3		/* Both byte compares matched.  */
	movdqa	%xmm3, %xmm4
	movdqu	17(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5		/* NUL among bytes 16..31?  */
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4		/* Candidates | NUL, bytes 0..15.  */
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0		/* Candidates | NUL, bytes 16..31.  */
	pmovmskb	%xmm4, %r8d
	pmovmskb	%xmm0, %eax
	salq	$16, %rax
	orq	%rax, %r8		/* r8 = 32-bit candidate/NUL mask.  */
	je	L(next_32_bytes)	/* Nothing in the first 32 bytes.  */
L(next_pair_index):
	bsf	%r8, %rax		/* Lowest set bit = first candidate.  */
	addq	%rdi, %rax		/* rax = candidate position.  */
	cmpb	$0, (%rax)		/* Was this a NUL bit, not a pair?  */
	je	L(zero1)		/* End of haystack: not found.  */
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found1)		/* 2-byte needle: candidate is a match.  */
	cmpb	2(%rax), %dl		/* Verify needle[2] before looping.  */
	jne	L(next_pair)
	xorl	%edx, %edx		/* rdx = verification loop counter.  */
	jmp	L(pair_loop_start)

	.p2align 4
L(strchr):
	/* Single-character needle: delegate to the default strchr
	   implementation with the character in %esi.  */
	movzbl	%al, %esi
	jmp	DEFAULT_STRCHR

	.p2align 4
L(pair_loop):
	/* Compare needle[2+rdx] (in %cl) against hay[2+rdx] until the
	   needle's NUL terminator is reached or a byte differs.  */
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair)
L(pair_loop_start):
	movzbl	3(%rsi,%rdx), %ecx	/* Next needle byte to verify.  */
	testb	%cl, %cl
	jne	L(pair_loop)
L(found1):
	ret				/* rax already holds the match start.  */
L(zero1):
	xorl	%eax, %eax		/* Not found: return NULL.  */
	ret

	.p2align 4
L(next_pair):
	/* Clear the lowest mask bit and retry; when the mask empties,
	   fall through into the second 32-byte check.  */
	leaq	-1(%r8), %rax
	andq	%rax, %r8
	jne	L(next_pair_index)

	.p2align 4
L(next_32_bytes):
	/* Same candidate/NUL scan as above for haystack bytes 32..63;
	   the resulting bits are placed at positions 32..63 of %r8.  */
	movdqu	32(%rdi), %xmm3
	pxor	%xmm5, %xmm5
	movdqu	33(%rdi), %xmm4
	movdqa	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm2, %xmm4
	movdqu	48(%rdi), %xmm0
	pcmpeqb	%xmm5, %xmm6
	pminub	%xmm4, %xmm3
	movdqa	%xmm3, %xmm4
	movdqu	49(%rdi), %xmm3
	pcmpeqb	%xmm0, %xmm5
	pcmpeqb	%xmm2, %xmm3
	por	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm0
	pminub	%xmm3, %xmm0
	por	%xmm5, %xmm0
	pmovmskb	%xmm4, %eax
	salq	$32, %rax
	pmovmskb	%xmm0, %r8d
	salq	$48, %r8
	orq	%rax, %r8		/* r8 = mask for bytes 32..63.  */
	je	L(loop_header)		/* Nothing here either: main loop.  */
L(next_pair2_index):
	bsfq	%r8, %rax
	addq	%rdi, %rax		/* rax = candidate position.  */
	cmpb	$0, (%rax)
	je	L(zero2)		/* NUL bit: end of haystack.  */
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found2)		/* 2-byte needle: done.  */
	cmpb	2(%rax), %dl
	jne	L(next_pair2)
	xorl	%edx, %edx
	jmp	L(pair_loop2_start)

	.p2align 4
L(pair_loop2):
	/* Byte-by-byte verification, same shape as L(pair_loop).  */
	addq	$1, %rdx
	cmpb	2(%rax,%rdx), %cl
	jne	L(next_pair2)
L(pair_loop2_start):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop2)
L(found2):
	ret
	L(zero2):
	xorl	%eax, %eax		/* Not found: return NULL.  */
	ret
L(empty):
	mov %rdi, %rax			/* Empty needle: return haystack.  */
	ret

	.p2align 4
L(next_pair2):
	leaq	-1(%r8), %rax
	andq	%rax, %r8		/* Clear lowest bit; fall through to
					   the main loop when exhausted.  */
	jne	L(next_pair2_index)
L(loop_header):
	movq	$-512, %r11		/* Verification-work budget; see
					   L(next_pair3).  */
	movq	%rdi, %r9		/* Save original haystack pointer.  */

	pxor	%xmm7, %xmm7		/* xmm7 = 0 for NUL detection.  */
	andq	$-64, %rdi		/* Round down to the 64-byte line.  */

	.p2align 4
L(loop):
	/* Scan one 64-byte line per iteration (rdi advances by 64
	   mid-block).  For each 16-byte chunk, (hay ^ needle[0]) |
	   (hay+1 ^ needle[1]) has a zero byte exactly at pair
	   candidates, and pminub over the raw chunks has a zero byte at
	   the terminating NUL.  All of it is min-folded into xmm0 so one
	   pcmpeqb/pmovmskb decides whether this line needs a closer
	   look.  Note the -1 offsets (63/79/31/47 vs 64/80/32/48): a
	   candidate bit corresponds to a match STARTING one byte before
	   the bit's position.  */
	movdqa	64(%rdi), %xmm3
	movdqu	63(%rdi), %xmm6
	movdqa	%xmm3, %xmm0
	pxor	%xmm2, %xmm3
	pxor	%xmm1, %xmm6
	movdqa	80(%rdi), %xmm10
	por	%xmm3, %xmm6		/* Zero byte => pair start at 63+i.  */
	pminub	%xmm10, %xmm0
	movdqu	79(%rdi), %xmm3
	pxor	%xmm2, %xmm10
	pxor	%xmm1, %xmm3
	movdqa	96(%rdi), %xmm9
	por	%xmm10, %xmm3		/* Zero byte => pair start at 79+i.  */
	pminub	%xmm9, %xmm0
	pxor	%xmm2, %xmm9
	movdqa	112(%rdi), %xmm8
	addq	$64, %rdi		/* Advance; offsets below are relative
					   to the NEW rdi.  */
	pminub	%xmm6, %xmm3
	movdqu	31(%rdi), %xmm4
	pminub	%xmm8, %xmm0		/* xmm0 = min of the 4 raw chunks
					   (NUL detector).  */
	pxor	%xmm2, %xmm8
	pxor	%xmm1, %xmm4
	por	%xmm9, %xmm4		/* Zero byte => pair start at 31+i.  */
	pminub	%xmm4, %xmm3
	movdqu	47(%rdi), %xmm5
	pxor	%xmm1, %xmm5
	por	%xmm8, %xmm5		/* Zero byte => pair start at 47+i.  */
	pminub	%xmm5, %xmm3
	pminub	%xmm3, %xmm0		/* Fold candidates into NUL detector.  */
	pcmpeqb	%xmm7, %xmm0
	pmovmskb	%xmm0, %eax
	testl	%eax, %eax
	je	L(loop)			/* Nothing in this line: keep going.  */
	/* Some chunk holds a candidate or NUL: rebuild per-chunk masks
	   into one 64-bit mask in %r8.  Bit b set means a pair candidate
	   starting at rdi+b-1, or a NUL at rdi+b.  */
	pminub (%rdi), %xmm6		/* Fold NUL info into chunk 0 mask.  */
	pminub 32(%rdi),%xmm4
	pminub 48(%rdi),%xmm5
	pcmpeqb %xmm7, %xmm6
	pcmpeqb %xmm7, %xmm5
	pmovmskb	%xmm6, %edx	/* Bits 0..15.  */
	movdqa	16(%rdi), %xmm8
	pcmpeqb %xmm7, %xmm4
	movdqu  15(%rdi), %xmm0
	pmovmskb	%xmm5, %r8d
	movdqa  %xmm8, %xmm3
	pmovmskb	%xmm4, %ecx
	pcmpeqb %xmm1,%xmm0		/* hay[15+i] == needle[0]?  */
	pcmpeqb %xmm2,%xmm3		/* hay[16+i] == needle[1]?  */
	salq	$32, %rcx		/* Bits 32..47.  */
	pcmpeqb %xmm7,%xmm8		/* NUL among bytes 16..31?  */
	salq	$48, %r8		/* Bits 48..63.  */
	pminub  %xmm0,%xmm3
	orq	%rcx, %rdx
	por	%xmm3,%xmm8		/* Candidates | NUL for bytes 16..31.  */
	orq	%rdx, %r8
	pmovmskb	%xmm8, %eax
	salq	$16, %rax		/* Bits 16..31.  */
	orq	%rax, %r8
	je	L(loop)			/* Defensive: empty mask resumes loop.  */
L(next_pair_index3):
	bsfq	%r8, %rcx
	addq	%rdi, %rcx		/* rcx = SECOND byte of the pair.  */
	cmpb	$0, (%rcx)
	je	L(zero)			/* NUL bit: end of haystack.  */
	xorl	%eax, %eax		/* rax counts verified needle bytes.  */
	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(success3)		/* 2-byte needle: match found.  */
	cmpb	1(%rcx), %dl		/* needle[2] vs third match byte.  */
	jne	L(next_pair3)
	jmp	L(pair_loop_start3)

	.p2align 4
L(pair_loop3):
	/* Verify needle[2+rax] against hay; match starts at rcx-1.  */
	addq	$1, %rax
	cmpb	1(%rcx,%rax), %dl
	jne	L(next_pair3)
L(pair_loop_start3):
	movzbl	3(%rsi,%rax), %edx
	testb	%dl, %dl
	jne	L(pair_loop3)
L(success3):
	lea	-1(%rcx), %rax		/* Match starts one byte before the
					   pair bit (see -1 load offsets).  */
	ret

	.p2align 4
L(next_pair3):
	/* Charge the bytes compared on this failed candidate to the
	   budget; once cumulative verification work exceeds bytes
	   scanned plus 512, switch to the generic strstr so the worst
	   case remains linear.  */
	addq	%rax, %r11
	movq	%rdi,  %rax
	subq	%r9, %rax		/* rax = bytes scanned so far.  */
	cmpq	%r11, %rax
	jl	L(switch_strstr)
	leaq	-1(%r8), %rax
	andq	%rax, %r8		/* Clear lowest mask bit.  */
	jne	L(next_pair_index3)
	jmp	L(loop)

	.p2align 4
L(switch_strstr):
	movq	%rdi, %rdi		/* NOTE(review): no-op move — looks like
					   a leftover; confirm whether the
					   intended argument is the current
					   position (%rdi) or the saved
					   original haystack (%r9).  */
	jmp	__strstr_generic	/* Tail-call the generic fallback.  */

	.p2align 4
L(cross_page):
	/* Haystack starts within 64 bytes of a page boundary: use only
	   64-byte-aligned loads of the line containing rdi, then shift
	   the mask so bits before rdi are discarded.  Bit b of the raw
	   mask means a pair candidate starting at rax+b-1 or a NUL at
	   rax+b (same -1 convention as the main loop).  */
	movq	%rdi, %rax
	pxor	%xmm0, %xmm0
	andq	$-64, %rax		/* rax = start of the 64-byte line.  */
	movdqa	(%rax), %xmm3
	movdqu	-1(%rax), %xmm4		/* Safe: stays within the same page.  */
	movdqa	%xmm3, %xmm8
	movdqa	16(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4		/* hay[-1+i] == needle[0]?  */
	pcmpeqb	%xmm0, %xmm8		/* NUL among bytes 0..15?  */
	pcmpeqb	%xmm2, %xmm3		/* hay[i] == needle[1]?  */
	movdqa	%xmm5, %xmm7
	pminub	%xmm4, %xmm3
	movdqu	15(%rax), %xmm4
	pcmpeqb	%xmm0, %xmm7
	por	%xmm3, %xmm8		/* Candidates | NUL, bits 0..15.  */
	movdqa	%xmm5, %xmm3
	movdqa	32(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	movdqa	%xmm5, %xmm6
	pmovmskb	%xmm8, %ecx
	pminub	%xmm4, %xmm3
	movdqu	31(%rax), %xmm4
	por	%xmm3, %xmm7		/* Bits 16..31.  */
	movdqa	%xmm5, %xmm3
	pcmpeqb	%xmm0, %xmm6
	movdqa	48(%rax), %xmm5
	pcmpeqb	%xmm1, %xmm4
	pmovmskb	%xmm7, %r8d
	pcmpeqb	%xmm2, %xmm3
	pcmpeqb	%xmm5, %xmm0		/* NUL among bytes 48..63?  */
	pminub	%xmm4, %xmm3
	movdqu	47(%rax), %xmm4
	por	%xmm3, %xmm6		/* Bits 32..47.  */
	movdqa	%xmm5, %xmm3
	salq	$16, %r8
	pcmpeqb	%xmm1, %xmm4
	pcmpeqb	%xmm2, %xmm3
	pmovmskb	%xmm6, %r10d
	pminub	%xmm4, %xmm3
	por	%xmm3, %xmm0		/* Bits 48..63.  */
	salq	$32, %r10
	orq	%r10, %r8
	orq	%rcx, %r8
	movl	%edi, %ecx
	pmovmskb	%xmm0, %edx
	subl	%eax, %ecx		/* cl = rdi - line start.  */
	salq	$48, %rdx
	orq	%rdx, %r8		/* Full 64-bit mask for the line.  */
	shrq	%cl, %r8		/* Drop bits before rdi; bit j now
					   refers to position rdi+j.  */
	je	L(loop_header)		/* Nothing past rdi in this line.  */
L(next_pair_index4):
	bsfq	%r8, %rax
	addq	%rdi, %rax		/* rax = SECOND byte of the pair.  */
	cmpb	$0, (%rax)
	je	L(zero)			/* NUL bit: end of haystack.  */

	cmpq	%rax,%rdi		/* Bit 0 would imply a match starting
					   at rdi-1, before the haystack —
					   skip that candidate.  */
	je	L(next_pair4)

	movzbl	2(%rsi), %edx
	testb	%dl, %dl
	je	L(found3)		/* 2-byte needle: match found.  */
	cmpb	1(%rax), %dl		/* needle[2] vs third match byte.  */
	jne	L(next_pair4)
	xorl	%edx, %edx
	jmp	L(pair_loop_start4)

	.p2align 4
L(pair_loop4):
	/* Byte-by-byte verification; match starts at rax-1.  */
	addq	$1, %rdx
	cmpb	1(%rax,%rdx), %cl
	jne	L(next_pair4)
L(pair_loop_start4):
	movzbl	3(%rsi,%rdx), %ecx
	testb	%cl, %cl
	jne	L(pair_loop4)
L(found3):
	subq $1, %rax			/* Match starts one byte before the
					   pair bit.  */
	ret

	.p2align 4
L(next_pair4):
	leaq	-1(%r8), %rax
	andq	%rax, %r8		/* Clear lowest bit; when the line is
					   exhausted, enter the main loop.  */
	jne	L(next_pair_index4)
	jmp	L(loop_header)

	.p2align 4
L(found):
	/* NOTE(review): no jump in this file targets L(found); appears
	   unused here.  `rep ret` is the classic AMD branch-predictor-
	   friendly two-byte return.  */
	rep
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax		/* Not found: return NULL.  */
	ret


END(__strstr_sse2_unaligned)
376