/* strlen optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* ISA level >= 2 for both strlen and wcslen.  wcslen uses `pminud`,
   which is SSE4.1.  strlen doesn't have an ISA level == 2
   implementation of its own, so the SSE2 implementation must be built
   at ISA level == 2.  */
#if ISA_SHOULD_BUILD (2)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_sse2
# endif

# ifdef AS_WCSLEN
#  define PMINU		pminud
#  define PCMPEQ	pcmpeqd
#  define SHIFT_RETURN	shrq $2, %rax
# else
#  define PMINU		pminub
#  define PCMPEQ	pcmpeqb
#  define SHIFT_RETURN
# endif
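
/* For wcslen, pmovmskb still produces byte-granular masks, so a
   matching 4-byte wchar_t sets four consecutive mask bits and bsf
   yields a byte index; SHIFT_RETURN (shrq $2, %rax) converts that
   byte index into a count of wchar_t elements.  */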

# ifndef SECTION
#  define SECTION(p)	p
# endif

/* Long lived registers in strlen (s), strnlen (s, n) are:

	%xmm3 - zero
	%rdi  - s
	%r10  - (s + n) & (~(64 - 1))
	%r11  - s + n
*/
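
/* A rough C sketch of the strategy (illustrative only; the function
   and helper names here are hypothetical and the element size is one
   byte, i.e. the strlen case):

	size_t len (const char *s)
	{
	  const char *p = (const char *) ((uintptr_t) s & -64);
	  uint64_t mask = zero_mask_64 (p);    // bit i set iff p[i] == 0
	  mask >>= s - p;                      // drop bytes before s
	  if (mask != 0)
	    return __builtin_ctzll (mask);
	  for (p += 64; ; p += 64)
	    if ((mask = zero_mask_64 (p)) != 0)
	      return (p - s) + __builtin_ctzll (mask);
	}

   zero_mask_64 stands for the FIND_ZERO macro below; aligned 64-byte
   reads never cross a page, and the real code additionally has a fast
   unaligned first probe and, for strnlen, the n limit.  */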


	.section SECTION(.text),"ax",@progbits
ENTRY(STRLEN)

/* Test 64 bytes from %rax for zero.  Save result as bitmask in %rdx.  */
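/* The four 16-bit pmovmskb results are packed into one 64-bit mask:
   bytes 0-15 land in bits 0-15 (%rsi), bytes 16-31 in bits 16-31
   (%rdx shifted left by 16), and bytes 32-47 and 48-63 in bits 32-63
   (%r8/%rcx combined, then shifted left by 32), so bit i of %rdx
   corresponds to byte i of the 64-byte block.  */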
# define FIND_ZERO	\
	PCMPEQ	(%rax), %xmm0;	\
	PCMPEQ	16(%rax), %xmm1;	\
	PCMPEQ	32(%rax), %xmm2;	\
	PCMPEQ	48(%rax), %xmm3;	\
	pmovmskb	%xmm0, %esi;	\
	pmovmskb	%xmm1, %edx;	\
	pmovmskb	%xmm2, %r8d;	\
	pmovmskb	%xmm3, %ecx;	\
	salq	$16, %rdx;	\
	salq	$16, %rcx;	\
	orq	%rsi, %rdx;	\
	orq	%r8, %rcx;	\
	salq	$32, %rcx;	\
	orq	%rcx, %rdx;

# ifdef AS_STRNLEN
/* Do not read anything when n == 0.  */
	test	%RSI_LP, %RSI_LP
	jne	L(n_nonzero)
	xor	%rax, %rax
	ret
L(n_nonzero):
#  ifdef AS_WCSLEN
/* Check for overflow from maxlen * sizeof(wchar_t).  If it would
   overflow, the only way this program avoids undefined behavior is if
   there is a null terminator in valid memory, in which case wcslen
   suffices.  */
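/* maxlen * 4 overflows 64 bits exactly when maxlen >= 2^62, i.e. when
   bit 62 or 63 of maxlen is set, which is what the arithmetic shift
   by 62 detects.  */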
	mov	%RSI_LP, %R10_LP
	sar	$62, %R10_LP
	jnz	OVERFLOW_STRLEN
	sal	$2, %RSI_LP
#  endif

/* Initialize long lived registers.  */
	add	%RDI_LP, %RSI_LP
	mov	%RSI_LP, %R10_LP
	and	$-64, %R10_LP
	mov	%RSI_LP, %R11_LP
# endif

	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	movq	%rdi, %rax
	movq	%rdi, %rcx
	andq	$4095, %rcx
/* Offsets 4032-4047 will be aligned down to 4032 and thus the read
   fits within the page.  */
	cmpq	$4047, %rcx
/* We cannot unify this branching as it would be ~6 cycles slower.  */
	ja	L(cross_page)
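/* The cross-page path aligns down to 64 before FIND_ZERO, so its
   64-byte read cannot cross a page; here a 16-byte-aligned base is
   enough because a page offset <= 4047 keeps base + 63 <= 4095.  */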

# ifdef AS_STRNLEN
/* Test if end is among first 64 bytes.  */
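/* %rsi becomes the distance from the aligned base in %rax to s + n;
   if it is below 64, the limit falls inside the bytes just tested by
   FIND_ZERO, so branch to L(strnlen_ret), which caps the bitmask at
   the limit.  */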
#  define STRNLEN_PROLOG	\
	mov	%r11, %rsi;	\
	subq	%rax, %rsi;	\
	andq	$-64, %rax;	\
	testq	$-64, %rsi;	\
	je	L(strnlen_ret)
# else
#  define STRNLEN_PROLOG  andq $-64, %rax;
# endif

/* Ignore bits in mask that come before start of string.  */
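/* Since %rax is %rdi rounded down to an alignment boundary, the xor
   just recovers the cleared low bits, i.e. %rcx = %rdi - %rax; sarq
   uses only the low 6 bits of %cl, shifting out the mask bits that
   belong to bytes before the start of the string.  */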
# define PROLOG(lab)	\
	movq	%rdi, %rcx;	\
	xorq	%rax, %rcx;	\
	STRNLEN_PROLOG;	\
	sarq	%cl, %rdx;	\
	test	%rdx, %rdx;	\
	je	L(lab);	\
	bsfq	%rdx, %rax;	\
	SHIFT_RETURN;		\
	ret

# ifdef AS_STRNLEN
	andq	$-16, %rax
	FIND_ZERO
# else
	/* Test first 16 bytes unaligned.  */
	movdqu	(%rax), %xmm4
	PCMPEQ	%xmm0, %xmm4
	pmovmskb	%xmm4, %edx
	test	%edx, %edx
	je	L(next48_bytes)
	bsf	%edx, %eax /* If %eax is zeroed, 16-bit bsf can be used.  */
	SHIFT_RETURN
	ret

L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes.  */
	andq	$-16, %rax
	PCMPEQ	16(%rax), %xmm1
	PCMPEQ	32(%rax), %xmm2
	PCMPEQ	48(%rax), %xmm3
	pmovmskb	%xmm1, %edx
	pmovmskb	%xmm2, %r8d
	pmovmskb	%xmm3, %ecx
	salq	$16, %rdx
	salq	$16, %rcx
	orq	%r8, %rcx
	salq	$32, %rcx
	orq	%rcx, %rdx
# endif

	/* When no zero byte is found, xmm1-3 are zero so we do not have to
	   zero them.  */
	PROLOG(loop)

	.p2align 4
L(cross_page):
	andq	$-64, %rax
	FIND_ZERO
	PROLOG(loop_init)

# ifdef AS_STRNLEN
/* We must do this check to correctly handle strnlen (s, -1).  */
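/* %rsi holds the offset of s + n within this block (0..63).  bts
   plants a sentinel bit at that offset so that, after the bytes
   before s are shifted out, bsf can never report a position past the
   limit.  */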
L(strnlen_ret):
	bts	%rsi, %rdx
	sarq	%cl, %rdx
	test	%rdx, %rdx
	je	L(loop_init)
	bsfq	%rdx, %rax
	SHIFT_RETURN
	ret
# endif
	.p2align 4
L(loop_init):
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
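
/* Both main loop variants test 64 bytes per iteration: PMINU folds
   four 16-byte blocks into one, an element of the result is zero iff
   at least one block has a zero element at that position, and PCMPEQ
   against the zero in %xmm3 turns that into a testable mask.  */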
# ifdef AS_STRNLEN
	.p2align 4
L(loop):

	addq	$64, %rax
	cmpq	%rax, %r10
	je	L(exit_end)
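/* %r10 = (s + n) & -64, so the loop stops at the start of the block
   containing the limit; L(exit_end) finishes that block without
   reading past a potential page boundary.  */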

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit)
	jmp	L(loop)

	.p2align 4
L(exit_end):
	cmp	%rax, %r11
	je	L(first) /* Do not read when end is at page boundary.  */
	pxor	%xmm0, %xmm0
	FIND_ZERO

L(first):
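/* Plant a sentinel bit at offset (s + n) mod 64 via bts so that the
   length computed below never exceeds n, whether or not a zero
   element was found in this final block.  */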
	bts	%r11, %rdx
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

	.p2align 4
L(exit):
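/* %xmm0 was the PMINU accumulator in the loop, so restore it to zero
   before rerunning the comparisons.  */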
	pxor	%xmm0, %xmm0
	FIND_ZERO

	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

# else

	/* Main loop.  Unrolled twice to improve L2 cache performance on
	   Core 2.  */
	.p2align 4
L(loop):

	movdqa	64(%rax), %xmm0
	PMINU	80(%rax), %xmm0
	PMINU	96(%rax), %xmm0
	PMINU	112(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit64)

	subq	$-128, %rax
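/* subq $-128 rather than addq $128: -128 fits in a sign-extended
   8-bit immediate while +128 does not, so this encoding is shorter.  */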

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit0)
	jmp	L(loop)

	.p2align 4
L(exit64):
	addq	$64, %rax
L(exit0):
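/* Rezero %xmm0, which served as the PMINU accumulator in the loop.  */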
	pxor	%xmm0, %xmm0
	FIND_ZERO

	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

# endif

END(STRLEN)
#endif