/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
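/* ISA level 4 corresponds to EVEX/AVX-512 capable targets (x86-64-v4);
   the guard above keeps this implementation out of builds whose
   configured ISA range excludes it.  */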

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_evex
# endif

# define VMOVA		vmovdqa64

# ifdef USE_AS_WCSLEN
#  define VPCMP		vpcmpd
#  define VPMINU	vpminud
#  define SHIFT_REG ecx
#  define CHAR_SIZE	4
# else
#  define VPCMP		vpcmpb
#  define VPMINU	vpminub
#  define SHIFT_REG edx
#  define CHAR_SIZE	1
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
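
/* Informal overview: the code below checks one unaligned vector at the
   start, then single vectors up to a 4 * VEC_SIZE alignment boundary,
   then loops over four vectors per iteration.  Roughly, as hedged
   C-like pseudocode -- null_mask () and tzcnt () are hypothetical
   stand-ins for VPCMP/kmov and the tzcnt instruction, and the strnlen
   limit and page-cross handling are omitted:

	size_t length (const CHAR *s)
	{
	  for (size_t i = 0; ; i += CHAR_PER_VEC)
	    {
	      // One bit per CHAR that is null in the vector at s + i.
	      unsigned int mask = null_mask (s + i);
	      if (mask != 0)
		return i + tzcnt (mask);
	    }
	}

   The real code amortizes the loop overhead by testing four vectors
   per iteration.  */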

	.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check for zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(zero)
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
	/* Clear high bits from edi.  Only keep the bits relevant to the
	   page cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross a page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
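	/* An offset above PAGE_SIZE - VEC_SIZE means an unaligned
	   VEC_SIZE load from rdi could touch the next, possibly
	   unmapped, page; that case takes the cold path at the end of
	   the file.  */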

	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
	   null CHAR.  */
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
# ifdef USE_AS_STRNLEN
	/* If length <= CHAR_PER_VEC, handle it specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
	ret
# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set the bit for max len so that tzcnt will return the min of
	   max len and the position of the first match.  */
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
	ret
# endif

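	/* L(first_vec_x{1,2,3,4}): the first null CHAR was found in the
	   Nth vector after the initial check.  In every case the result
	   is (rdi - rdx) / CHAR_SIZE + CHAR_PER_VEC * N plus the bit
	   index from tzcnt; the strnlen variant reaches the same value
	   through the rcx bias computed at L(cross_page_continue).  */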
	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32-bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct return value.  */
	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	CHAR_PER_VEC(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32-bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct return value.  */
	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32-bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct return value.  */
	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32-bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, computed earlier, to get the correct return value.  */
	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
# else
	subl	%edx, %edi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %edi
#  endif
	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
	ret

	.p2align 5
L(aligned_more):
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-(VEC_SIZE), %rdi
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE bytes, one VEC_SIZE at a time,
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + CHAR_SIZE because it simplifies the logic in
	   L(last_4x_vec_or_less).  */
	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
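	/* rcx is now the number of CHARs from the original pointer
	   through the end of the 4th vector checked below, plus one.
	   Subtracting it from the length below tells us whether the
	   limit can be reached within these four vectors.  */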
	/* Load first VEC regardless.  */
	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
# ifdef USE_AS_STRNLEN
	/* Adjust length; if near the end, handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	addq	$VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
	/* Check if within the last CHAR_PER_VEC * 4 chars.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# endif
	/* Align data to VEC_SIZE * 4.  */
	andq	$-(VEC_SIZE * 4), %rdi

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Load first VEC regardless.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by micro-fusing VPMINU with the load.
	   Since the matches in YMM2/YMM4 can only be returned if there
	   were no matches in YMM1/YMM3 respectively, there is no issue
	   with overlap.  */
	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
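	/* A lane of YMM2 (resp. YMM4) is zero iff that lane is zero in
	   either of its two source vectors, so each compare against
	   zero below covers two vectors at once.  */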

	VPCMP	$0, %YMM2, %YMMZERO, %k0
	VPCMP	$0, %YMM4, %YMMZERO, %k1
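	/* subq of -(VEC_SIZE * 4) rather than addq of VEC_SIZE * 4: the
	   negative immediate fits in a sign-extended byte, giving a
	   smaller encoding.  */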
	subq	$-(VEC_SIZE * 4), %rdi
	kortestd	%k0, %k1
	jz	L(loop_4x_vec)

	/* Check if end was in first half.  */
	kmovd	%k0, %eax
	subq	%rdx, %rdi
# ifdef USE_AS_WCSLEN
	shrq	$2, %rdi
# endif
	testl	%eax, %eax
	jz	L(second_vec_return)

	VPCMP	$0, %YMM1, %YMMZERO, %k2
	kmovd	%k2, %edx
	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
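	/* For wcslen each mask is only CHAR_PER_VEC (8) bits wide, so
	   32-bit shift/or suffice; for strlen each mask is 32 bits and
	   the combined value needs 64-bit operations.  */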
# ifdef USE_AS_WCSLEN
	sall	$CHAR_PER_VEC, %eax
	orl	%edx, %eax
	tzcntl	%eax, %eax
# else
	salq	$CHAR_PER_VEC, %rax
	orq	%rdx, %rax
	tzcntq	%rax, %rax
# endif
	addq	%rdi, %rax
	ret


# ifdef USE_AS_STRNLEN

L(last_4x_vec_or_less_load):
	/* Depending on the entry point, adjust rdi / prepare the first
	   VEC in YMM1.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, %YMM1, %YMMZERO, %k0
	addq	$(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
	kmovd	%k0, %eax
	/* Check if the remaining length (mod CHAR_PER_VEC * 4) is at
	   least CHAR_PER_VEC * 2.  This works even if esi is off by
	   exactly CHAR_PER_VEC * 4.  */
	testl	$(CHAR_PER_VEC * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by CHAR_PER_VEC * 4 in either direction,
	   depending on where this was called from.  This fixes that.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	/* Check the end of data.  */
	subl	$CHAR_PER_VEC, %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret
L(max):
	movq	%r8, %rax
	ret
# endif

	/* Placed here in strnlen so that the jcc to
	   L(last_4x_vec_or_less) in the 4x VEC loop can use a 2-byte
	   encoding.  */
	.p2align 4
L(second_vec_return):
	VPCMP	$0, %YMM3, %YMMZERO, %k0
	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
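	/* kunpck concatenates the two masks: YMM3 matches end up in the
	   low bits and YMM4 matches in the high bits, so tzcnt finds
	   the first null across both vectors.  */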
# ifdef USE_AS_WCSLEN
	kunpckbw	%k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# else
	kunpckdq	%k0, %k1, %k0
	kmovq	%k0, %rax
	tzcntq	%rax, %rax
# endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret


# ifdef USE_AS_STRNLEN
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(CHAR_PER_VEC * 4 - 1), %esi
	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	/* Check the end of data.  */
	subl	$(CHAR_PER_VEC * 3), %esi
	jb	L(max)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)

	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x1):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x2):
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(CHAR_PER_VEC * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarq	$2, %rdi
#  endif
	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
	ret
L(max_end):
	movq	%r8, %rax
	ret
# endif

	/* Cold case: the first load would cross a page boundary.  */
	.p2align 4
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi
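	/* An aligned VEC_SIZE load can never cross a page, so this load
	   is safe even though it reads bytes before the start of the
	   string.  */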
	VPCMP	$0, (%rdi), %YMMZERO, %k0
	kmovd	%k0, %eax
	/* Remove the leading bytes.  */
# ifdef USE_AS_WCSLEN
	/* NB: Divide shift count by 4 since each bit in K0 represents 4
	   bytes.  */
	movl	%edx, %ecx
	shrl	$2, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
# endif
	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
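	/* sarx (BMI2) takes the shift count from an arbitrary register;
	   for a 32-bit operand only the low 5 bits of the count are
	   used, which here is exactly the CHAR offset within the
	   vector.  */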
	sarxl	%SHIFT_REG, %eax, %eax
	testl	%eax, %eax
# ifndef USE_AS_STRNLEN
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
	ret
# else
	jnz	L(cross_page_less_vec)
#  ifndef USE_AS_WCSLEN
	movl	%edx, %ecx
	andl	$(CHAR_PER_VEC - 1), %ecx
#  endif
	movl	$CHAR_PER_VEC, %eax
	subl	%ecx, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	ja	L(cross_page_continue)
	movl	%esi, %eax
	ret
L(cross_page_less_vec):
	tzcntl	%eax, %eax
	/* Select min of length and position of first null.  */
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
	ret
# endif

END (STRLEN)
#endif