/* Placeholder function, not used by any processor at the moment.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* UNUSED. Exists purely as reference implementation.  */
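
/* As orientation only: the disabled block below sketches the byte
   (strlen) variant of the algorithm in C with AVX-512 intrinsics.
   It handles a possible page cross on the first load and then scans
   aligned vectors.  The block is never built (#if 0), it omits the
   wcslen/strnlen handling and the 4 x VEC_SIZE unrolled loop of the
   real code, and the name strlen_sketch is a hypothetical stand-in;
   compiling it separately would require AVX512BW.  */
#if 0
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static size_t
strlen_sketch (const char *s)
{
  const __m512i zero = _mm512_setzero_si512 ();
  uint64_t m;

  if (((uintptr_t) s & (4096 - 1)) > 4096 - 64)
    {
      /* An unaligned 64-byte load at S would cross into the next
	 page: load the last 64 bytes of the current page instead and
	 shift out the bytes that precede S.  */
      const char *page_end = (const char *) (((uintptr_t) s | (4096 - 1)) + 1);
      m = _mm512_cmpeq_epi8_mask (_mm512_load_si512 ((const void *) (page_end - 64)),
				  zero);
      m >>= (uintptr_t) s & 63;
    }
  else
    m = _mm512_cmpeq_epi8_mask (_mm512_loadu_si512 ((const void *) s), zero);

  if (m != 0)
    return (size_t) __builtin_ctzll (m);

  /* No null before the next 64-byte boundary: continue with aligned
     loads, which can never cross a page.  */
  const char *p = (const char *) (((uintptr_t) s & ~(uintptr_t) 63) + 64);
  for (;;)
    {
      m = _mm512_cmpeq_epi8_mask (_mm512_load_si512 ((const void *) p), zero);
      if (m != 0)
	return (size_t) (p - s) + __builtin_ctzll (m);
      p += 64;
    }
}
#endif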

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifdef USE_AS_WCSLEN
#  define VPCMP		vpcmpd
#  define VPTESTN	vptestnmd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMP		vpcmpb
#  define VPTESTN	vptestnmb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# define XMM0		xmm16
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

# if VEC_SIZE == 64
#  define KMOV		kmovq
#  define KORTEST	kortestq
#  define RAX		rax
#  define RCX		rcx
#  define RDX		rdx
#  define SHR		shrq
#  define TEXTSUFFIX	evex512
#  define VMM0		zmm16
#  define VMM1		zmm17
#  define VMM2		zmm18
#  define VMM3		zmm19
#  define VMM4		zmm20
#  define VMOVA		vmovdqa64
# elif VEC_SIZE == 32
/* Currently Unused.  */
#  define KMOV		kmovd
#  define KORTEST	kortestd
#  define RAX		eax
#  define RCX		ecx
#  define RDX		edx
#  define SHR		shrl
#  define TEXTSUFFIX	evex256
#  define VMM0		ymm16
#  define VMM1		ymm17
#  define VMM2		ymm18
#  define VMM3		ymm19
#  define VMM4		ymm20
#  define VMOVA		vmovdqa32
# endif

	.section .text.TEXTSUFFIX, "ax", @progbits
/* Aligning the entry point to a 64-byte boundary provides better
   performance for strings that fit in a single vector.  */
ENTRY_P2ALIGN (STRLEN, 6)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(ret_max)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
# endif

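	/* Zeroing XMM0 clears the whole VMM0 register, which serves as
	   the null reference for every compare below.  If an unaligned
	   VEC_SIZE load at the start of the string could cross a page
	   boundary, take the L(page_cross) path instead.  */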
	movl	%edi, %eax
	vpxorq	%XMM0, %XMM0, %XMM0
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(page_cross)

	/* Compare each [w]char against null; the corresponding mask
	   bit is set on a match.  */
	VPCMP	$0, (%rdi), %VMM0, %k0
	KMOV	%k0, %RAX
	test	%RAX, %RAX
	jz	L(align_more)

	bsf	%RAX, %RAX
# ifdef USE_AS_STRNLEN
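	/* Never report more than the maxlen argument; the same clamp
	   is repeated on every return path below.  */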
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

	/* The maximum length was reached before a null terminator was
	   found.  */
# ifdef USE_AS_STRNLEN
	.p2align 4,,3
L(ret_max):
	movq	%rsi, %rax
	ret
# endif

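	/* No null [w]char found so far.  Round the pointer up to the
	   next VEC_SIZE boundary so that all further loads are aligned
	   and can never cross a page.  */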
L(align_more):
	leaq	VEC_SIZE(%rdi), %rax
	/* Align rax to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rax
# ifdef USE_AS_STRNLEN
	movq	%rax, %rdx
	subq	%rdi, %rdx
#  ifdef USE_AS_WCSLEN
	SHR	$2, %RDX
#  endif
	/* At this point rdx contains the number of [w]chars already
	   compared.  */
	subq	%rsi, %rdx
	jae	L(ret_max)
	negq	%rdx
	/* At this point rdx contains the number of [w]chars still to
	   be checked.  From here on rdx is decremented with each
	   compare.  */
# endif

	/* Check four single vectors before entering the 4 x VEC_SIZE
	   aligned loop.  */
	VPCMP	$0, (%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x1)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x2)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x3)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
# endif

	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
	KMOV	%k0, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x4)

# ifdef USE_AS_STRNLEN
	subq	$CHAR_PER_VEC, %rdx
	jbe	L(ret_max)
	/* Save pointer before 4 x VEC_SIZE alignment.  */
	movq	%rax, %rcx
# endif

	/* Align address to VEC_SIZE * 4 for the loop.  */
	andq	$-(VEC_SIZE * 4), %rax

# ifdef USE_AS_STRNLEN
	subq	%rax, %rcx
#  ifdef USE_AS_WCSLEN
	SHR	$2, %RCX
#  endif
	/* rcx contains the number of [w]chars that will be recompared
	   because of the alignment fix-up; rdx must be incremented by
	   rcx to compensate.  */
	addq	%rcx, %rdx
	/* Jump past the decrement so rdx is not adjusted on the first
	   iteration of the 4 x VEC_SIZE aligned loop.  */
	jmp	L(loop_entry)
# endif

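	/* Main loop: each iteration scans 4 x VEC_SIZE bytes from a
	   4 x VEC_SIZE aligned address.  */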
	.p2align 4,,11
L(loop):
# ifdef USE_AS_STRNLEN
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(ret_max)
L(loop_entry):
# endif
	/* Combining two source vectors with VPMINU and testing the
	   result with VPTESTN performs better than the alternative
	   instruction sequences.  */
	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4

	VPTESTN	%VMM2, %VMM2, %k0
	VPTESTN	%VMM4, %VMM4, %k1

	subq	$-(VEC_SIZE * 4), %rax
	KORTEST	%k0, %k1
	jz	L(loop)

	VPTESTN	%VMM1, %VMM1, %k2
	KMOV	%k2, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x1)

	KMOV	%k0, %RCX
	/* At this point, if k0 is nonzero, the null [w]char must be in
	   the second vector.  */
	test	%RCX, %RCX
	jnz	L(ret_vec_x2)

	VPTESTN	%VMM3, %VMM3, %k3
	KMOV	%k3, %RCX
	test	%RCX, %RCX
	jnz	L(ret_vec_x3)
	/* At this point the null [w]char must be in the fourth vector,
	   so there is no need to check it.  */
	KMOV	%k1, %RCX

	/* The fourth, third, and second vector terminations are nearly
	   identical; they are laid out this way to avoid branching and
	   to reuse code from the pre-loop exit paths.  */
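	/* On entry rax - rdi is the byte offset of the first of the
	   four vectors just checked and rcx holds the [w]char index of
	   the null within the matching vector.  For wcslen the byte
	   offset is converted to characters before rcx is added.  */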
L(ret_vec_x4):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 3), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(ret_vec_x3):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	subq	$-(VEC_SIZE * 2), %rax
	shrq	$2, %rax
	addq	%rcx, %rax
# else
	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
# endif
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

L(ret_vec_x2):
	subq	$-VEC_SIZE, %rax
L(ret_vec_x1):
	bsf	%RCX, %RCX
	subq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	addq	%rcx, %rax
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

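	/* The first VEC_SIZE load would cross a page boundary.
	   Instead, compare the last VEC_SIZE [w]chars of the current
	   page, which is always safe, and discard the mask bits that
	   belong to [w]chars before the start of the string.  */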
L(page_cross):
	movl	%eax, %ecx
# ifdef USE_AS_WCSLEN
	andl	$(VEC_SIZE - 1), %ecx
	sarl	$2, %ecx
# endif
	/* ecx contains the number of [w]chars to be skipped as a
	   result of address alignment.  */
	/* rax = rdi rounded down to the page boundary, so the load
	   below reads the last vector of the page.  */
	xorq	%rdi, %rax
	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
	KMOV	%k0, %RAX
	/* Shift out the characters that precede the start of the
	   string (alignment adjustment).  */
	SHR	%cl, %RAX
	jz	L(align_more)

	bsf	%RAX, %RAX
# ifdef USE_AS_STRNLEN
	cmpq	%rsi, %rax
	cmovnb	%rsi, %rax
# endif
	ret

END (STRLEN)
#endif