1/* wcschr optimized with SSE2.
2   Copyright (C) 2017-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19
20#include <isa-level.h>
21
/* ISA level >= 2 because there are no wcschr-sse4 implementations.  */
23#if ISA_SHOULD_BUILD (2)
24
25# ifndef WCSCHR
26#  define WCSCHR __wcschr_sse2
27# endif
28
29# include <sysdep.h>
30
31	.text
/* wchar_t *wcschr (const wchar_t *s, wchar_t c)

   Return the address of the first 4-byte character equal to C in the
   zero-terminated wide string S, or 0 if the terminator comes first.
   When C is itself zero the address of the terminator is returned.

   In:	     %rdi = s, %esi = c.
   Out:	     %rax = address of the match, or 0.
   Clobbers: %rcx, %rdx, %rdi, %xmm0, %xmm1, %xmm2, flags.

   Register roles:
     %xmm1  c replicated into all four dword lanes.
     %xmm2  all-zero before every compare; after the compare it flags
	    the lanes that hold the terminator.  It only stays non-zero
	    on paths that leave the scan (or is cleared explicitly).
     %xmm0  the 16 bytes being examined, then the lanes equal to c.
     %eax   byte mask (4 bits per lane) of lanes equal to c.
     %edx   byte mask of terminator lanes; at the branch sites it is
	    OR-ed with %eax so one jump covers both conditions.
     %rdi   scan pointer; already advanced by 16 when L(matches) is
	    reached.  */
ENTRY (WCSCHR)

	/* Only the low 32 bits of the second argument are used.  */
	movd	%esi, %xmm1
	pxor	%xmm2, %xmm2
	mov	%edi, %ecx
	/* Broadcast dword 0 of %xmm1 to all four lanes.  */
	pshufd	$0, %xmm1, %xmm1

	/* A 16-byte load that starts more than 48 bytes into a 64-byte
	   block would run past the end of that block.  In that case use
	   an aligned load instead (L(cross_cache)).  */
	and	$63, %ecx
	cmp	$48, %ecx
	ja	L(cross_cache)

	/* First 16 bytes, loaded unaligned directly from s.  */
	movdqu	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)

	/* Round the advanced pointer down to a 16-byte boundary.  This
	   may re-read characters already examined above; they contained
	   neither c nor the terminator, so the overlap is harmless.  */
	and	$-16, %rdi

	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)

	jmp	L(loop)

L(cross_cache):
	/* %ecx = offset of s inside its 16-byte block, %rdi = that block.  */
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax

	/* Discard the mask bits of the bytes that precede s, so that
	   bit 0 of each mask corresponds to s itself.  */
	shr	%cl, %edx
	shr	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)

	/* c was found: it only counts if it is not behind the terminator.
	   Equal positions (c == 0) are a match.  */
	bsf	%eax, %eax
	test	%edx, %edx
	je	L(unaligned_match)
	bsf	%edx, %edx
	cmp	%edx, %eax
	ja	L(return_null)

L(unaligned_match):
	/* result = aligned block + offset of s + byte index of the match.  */
	add	%rdi, %rax
	add	%rcx, %rax
	ret

	.p2align 4
L(unaligned_no_match):
	/* No c in the first block; a terminator there ends the search.  */
	test	%edx, %edx
	jne	L(return_null)
	/* Lanes in front of s may have compared equal to zero and were
	   only shifted out of %edx, not out of %xmm2: clear it again.  */
	pxor	%xmm2, %xmm2

	add	$16, %rdi

	.p2align 4
/* Main loop over the aligned string, four 16-byte blocks per pass.  */
L(loop):
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)

	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)

	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)

	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %edx
	pmovmskb %xmm0, %eax
	or	%eax, %edx
	jnz	L(matches)
	jmp	L(loop)

	.p2align 4
L(matches):
	/* %edx was merged with %eax by the OR at the branch site; fetch
	   the pure terminator mask again from %xmm2.  */
	pmovmskb %xmm2, %edx
	/* No lane equals c, so the branch was taken for the terminator.  */
	test	%eax, %eax
	jz	L(return_null)
	bsf	%eax, %eax
	test	%edx, %edx
	je	L(match)
	/* Both present: c must not lie behind the terminator.  */
	bsf	%edx, %ecx
	cmp	%ecx, %eax
	ja	L(return_null)
L(match):
	/* %rdi was already advanced past the block that matched.  */
	sub	$16, %rdi
	add	%rdi, %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax
	ret

END (WCSCHR)
162#endif
163