/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

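/* VEC_SIZE is the AVX2 vector width in bytes. CHAR_PER_VEC is the
   number of characters in one vector: 32 for the byte variants and
   8 for the wchar_t variants.  */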
# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RSI_LP, %RSI_LP
#  else
	test	%RSI_LP, %RSI_LP
#  endif
	jz	L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
	mov	%RSI_LP, %R8_LP
# endif
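	/* Save the original pointer in rdx for computing the final
	   length, and zero ymm0, the vector the string is compared
	   against to find the NUL terminator.  */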
	movl	%edi, %eax
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	/* Clear high bits from edi. Only keeping bits relevant to page
	   cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
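	/* If the offset within the page is above PAGE_SIZE - VEC_SIZE,
	   an unaligned 32-byte load from rdi would extend into the
	   next page and could fault even though the string itself ends
	   before the page boundary.  */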

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length <= CHAR_PER_VEC handle specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If no NUL was found continue to aligned_more. Otherwise
	   return the bit position of the first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match.  */
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	btsq	%rsi, %rax
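	/* e.g. for strnlen (s, 5) in the byte variant: if the NUL is
	   at offset 7, bits 5 (set by bts) and 7 are set and tzcnt
	   returns the max length 5; if the NUL is at offset 2, tzcnt
	   returns 2.  */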
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

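	/* first_vec_x{1,2,3,4}: a NUL was found in the first, second,
	   third or fourth vector past the alignment point. rdx still
	   holds the original pointer, so the length is the distance
	   from rdx to the matching character.  */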
	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the ecx value computed earlier at cross_page_continue to
	   get the correct return value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the ecx value computed earlier at cross_page_continue to
	   get the correct return value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the ecx value computed earlier at cross_page_continue to
	   get the correct return value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use the ecx value computed earlier at cross_page_continue to
	   get the correct return value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1. This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
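	/* rdi now points at the last byte of the vector containing the
	   string start, so every following load uses a displacement of
	   the form N * VEC_SIZE + 1.  */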
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length. If near end handle specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust length.  */
	addq	%rcx, %rsi
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap.  */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
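	/* A NUL character is zero, and the unsigned minimum of two
	   vectors is zero in a lane iff at least one input lane is
	   zero, so a single compare of min (ymm2, ymm4) against the
	   zero vector detects a NUL anywhere in the four vectors.  */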

	VPMINU	%ymm2, %ymm4, %ymm5
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb %ymm5, %ecx

	subq	$-(VEC_SIZE * 4), %rdi
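	/* subq of -(VEC_SIZE * 4) rather than addq of VEC_SIZE * 4:
	   -128 fits in a sign-extended 8-bit immediate while +128 does
	   not, so the sub encoding is 3 bytes shorter.  */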
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	/* rcx has the combined result from all 4 VEC. It is only used
	   if none of the first 3 VEC contained a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
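	/* rax now has vec3's matches in bits [31:0] and the combined
	   mask in bits [63:32]. vec1/vec2 are known to be empty, so
	   tzcntq yields the byte offset of the first NUL relative to
	   the start of vec3.  */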
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN


# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
	 */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	vpmovmskb %ymm1, %eax
	/* Jump if remaining length > VEC_SIZE * 2, i.e. bit 6 of esi
	   is set. Testing a single bit works even though esi may be
	   off by exactly VEC_SIZE * 4, as that offset only affects
	   higher bits.  */
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by VEC_SIZE * 4 in either direction
	   depending on where this was called from. Masking with
	   VEC_SIZE * 4 - 1 fixes that.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):

	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

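	/* The NUL terminator lies beyond the maximum length, so return
	   the max length saved in r8 at entry.  */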
L(max):
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2 but using 64 bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
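	/* This load starts at the beginning of the aligned vector that
	   contains the first byte of the string, so it cannot cross
	   into the next page.  */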
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes. sarxl only uses bits [4:0] of the
	   count (it is masked mod 32), so there is no need to manually
	   mod edx.  */
	sarxl	%edx, %eax, %eax
# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count.  */
	shrl	$2, %ecx
#  endif
	/* Check length.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide length by 4 to get wchar_t count.  */
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count.  */
	sall	$2, %esi
#  endif
	cmpq	%rax, %rsi
	cmovb	%esi, %eax
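	/* Return min (position of first NUL, max length): eax is
	   replaced by esi only when the max length is smaller.  */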
#  ifdef USE_AS_WCSLEN
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif