/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>
#include <sysdep.h>

#if ISA_SHOULD_BUILD (4)

# ifndef MEMCHR
#  define MEMCHR	__memchr_evex
# endif

# ifdef USE_AS_WMEMCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPMINU	vpminud
#  define VPCMP	vpcmpd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPMINU	vpminub
#  define VPCMP	vpcmpb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_SIZE	1
# endif

	/* In the 4x loop the RTM and non-RTM versions have data pointers
	   that are off by VEC_SIZE * 4, with the RTM version being
	   VEC_SIZE * 4 greater.  This is represented by BASE_OFFSET.
	   As well, because the RTM version uses vpcmp, which stores a bit
	   per element compared, while the non-RTM version uses vpcmpeq,
	   which stores a bit per byte compared, RET_SCALE of CHAR_SIZE is
	   only relevant for the RTM version.  */
# ifdef USE_IN_RTM
#  define VZEROUPPER
#  define BASE_OFFSET	(VEC_SIZE * 4)
#  define RET_SCALE	CHAR_SIZE
# else
#  define VZEROUPPER	vzeroupper
#  define BASE_OFFSET	0
#  define RET_SCALE	1
# endif
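	/* Concretely, with VEC_SIZE defined as 32 below: BASE_OFFSET is 128
	   for the RTM version and 0 otherwise, and RET_SCALE only differs
	   from 1 in the RTM wmemchr build, where it is 4.  */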

	/* In the return from the 4x loop the memchr and rawmemchr versions
	   have data pointers that are off by VEC_SIZE * 4, with the memchr
	   version being VEC_SIZE * 4 greater.  */
# ifdef USE_AS_RAWMEMCHR
#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
#  define RAW_PTR_REG	rcx
#  define ALGN_PTR_REG	rdi
# else
#  define RET_OFFSET	BASE_OFFSET
#  define RAW_PTR_REG	rdi
#  define ALGN_PTR_REG	rcx
# endif

# define XMMZERO	xmm23
# define YMMZERO	ymm23
# define XMMMATCH	xmm16
# define YMMMATCH	ymm16
# define YMM1		ymm17
# define YMM2		ymm18
# define YMM3		ymm19
# define YMM4		ymm20
# define YMM5		ymm21
# define YMM6		ymm22

# ifndef SECTION
#  define SECTION(p)	p##.evex
# endif

# define VEC_SIZE 32
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE 4096

	.section SECTION(.text),"ax",@progbits
ENTRY_P2ALIGN (MEMCHR, 6)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	L(zero)

#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
# endif
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH
	/* Check if we may cross page boundary with one vector load.  */
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
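	/* The branch above is taken only when fewer than VEC_SIZE bytes
	   remain before the end of the current page, i.e. when an
	   unaligned VEC_SIZE load from rdi could touch the next page.  */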

	/* Check the first VEC_SIZE bytes.  */
	VPCMP	$0, (%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* If length < CHAR_PER_VEC handle special.  */
	cmpq	$CHAR_PER_VEC, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret

# ifndef USE_AS_RAWMEMCHR
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Check if the first match was within the length.  NB: tzcnt has a
	   false data-dependency on its destination.  eax already had a
	   data-dependency on esi so this should have no effect here.  */
	tzcntl	%eax, %esi
#  ifdef USE_AS_WMEMCHR
	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
#  else
	addq	%rsi, %rdi
#  endif
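	/* Return NULL unless the match is within the length: rax is zeroed
	   and only overwritten with the match address when the length
	   (edx) is strictly greater than the match index (esi).  */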
	xorl	%eax, %eax
	cmpl	%esi, %edx
	cmovg	%rdi, %rax
	ret
# endif

	.p2align 4
L(cross_page_boundary):
	/* Save the pointer before aligning, as its original value is
	   necessary for computing the return address if a byte is found,
	   or for adjusting the length if it is not and this is memchr.  */
	movq	%rdi, %rcx
	/* Align data to VEC_SIZE.  ALGN_PTR_REG is rcx for memchr and rdi
	   for rawmemchr.  */
	andq	$-VEC_SIZE, %ALGN_PTR_REG
	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
	kmovd	%k0, %r8d
# ifdef USE_AS_WMEMCHR
	/* NB: Divide the shift count by 4 since each bit in K0 represents
	   4 bytes.  */
	sarl	$2, %eax
# endif
# ifndef USE_AS_RAWMEMCHR
	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
	subl	%eax, %esi
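	/* esi is now the number of chars from the original rdi to the end
	   of the page; on this path that is less than CHAR_PER_VEC.  */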
# endif
# ifdef USE_AS_WMEMCHR
	andl	$(CHAR_PER_VEC - 1), %eax
# endif
	/* Remove the leading bytes.  */
	sarxl	%eax, %r8d, %eax
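	/* E.g. for memchr (CHAR_SIZE == 1), if rdi was 5 bytes past the
	   VEC_SIZE alignment boundary, the low 5 bits of r8d correspond to
	   bytes before the original pointer and are shifted out here.  */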
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rsi, %rdx
	jbe	L(first_vec_x0)
# endif
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
# ifdef USE_AS_WMEMCHR
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
# else
	addq	%RAW_PTR_REG, %rax
# endif
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 5
L(aligned_more):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */

# ifndef USE_AS_RAWMEMCHR
	/* Align data to VEC_SIZE.  */
L(cross_page_continue):
	xorl	%ecx, %ecx
	subl	%edi, %ecx
	andq	$-VEC_SIZE, %rdi
	/* esi is for adjusting length to see if near the end.  */
	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
#  ifdef USE_AS_WMEMCHR
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %esi
#  endif
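	/* At this point esi is the count of chars from the original rdi up
	   to the end of the 4th vector past the aligned pointer (aligned
	   rdi + VEC_SIZE * 5); the length is checked against it right
	   after the unconditional first-vector compare below.  */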
# else
	andq	$-VEC_SIZE, %rdi
L(cross_page_continue):
# endif
	/* Load first VEC regardless.  */
	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Adjust length.  If near end handle specially.  */
	subq	%rsi, %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)


# ifndef USE_AS_RAWMEMCHR
	/* Check if at last CHAR_PER_VEC * 4 length.  */
	subq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec_or_less_cmpeq)
	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi

	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
	 */
#  ifdef USE_AS_WMEMCHR
	movl	%edi, %ecx
	andq	$-(4 * VEC_SIZE), %rdi
	subl	%edi, %ecx
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
	addq	%rcx, %rdx
#  else
	addq	%rdi, %rdx
	andq	$-(4 * VEC_SIZE), %rdi
	subq	%rdi, %rdx
#  endif
# else
	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
	andq	$-(4 * VEC_SIZE), %rdi
# endif
# ifdef USE_IN_RTM
	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
# else
	/* Copy YMMMATCH to ymm0 so we can use vpcmpeq, which is not
	   encodable with EVEX registers (ymm16-ymm31).  */
	vmovdqa64 %YMMMATCH, %ymm0
# endif

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
	/* Two versions of the loop.  One that does not require
	   vzeroupper by not using ymm0-ymm15 and another that does require
	   vzeroupper because it uses ymm0-ymm15.  The reason ymm0-ymm15
	   are used at all is that there is no EVEX encoding of vpcmpeq
	   that produces a vector result, and with vpcmpeq this loop can be
	   performed more efficiently.  The non-vzeroupper version is safe
	   for RTM while the vzeroupper version should be preferred if RTM
	   is not supported.  */
# ifdef USE_IN_RTM
	/* It would be possible to save some instructions using 4x VPCMP
	   but the bottleneck on port 5 makes it not worth it.  */
	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
	/* xor will set bytes that match esi to zero.  */
	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
	VPCMP	$0, %YMM3, %YMMZERO, %k2
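	/* k2 now has a bit set for every position with a match in VEC1,
	   VEC2 or VEC3: the zero-mask clears positions that matched in
	   VEC1 (k1 is the not-equal mask), and the min is zero wherever
	   VEC2 or VEC3 matched.  */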
# else
	/* Since vptern can only take 3x vectors it is fastest to do 1 vec
	   separately with EVEX vpcmp.  */
#  ifdef USE_AS_WMEMCHR
	/* vptern can only accept masks for epi32/epi64, so the only case
	   where an instruction can be saved by using a not-equals mask on
	   vptern is wmemchr.  */
	VPCMP	$4, (%rdi), %YMMMATCH, %k1
#  else
	VPCMP	$0, (%rdi), %YMMMATCH, %k1
#  endif
	/* Compare 3x with vpcmpeq and OR them all together with vptern.
	 */
	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
#  ifdef USE_AS_WMEMCHR
	/* This takes the NOT of the OR between ymm2, ymm3, ymm4 as well
	   as combining the result from VEC0 via the zero mask.  */
	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
	vpmovmskb %ymm4, %ecx
#  else
	/* 254 is the mask for ORing ymm2, ymm3, ymm4 into ymm4.  */
	vpternlogd $254, %ymm2, %ymm3, %ymm4
	vpmovmskb %ymm4, %ecx
	kmovd	%k1, %eax
#  endif
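	/* The vpternlogd immediate encodes a 3-input truth table: bit i of
	   the immediate is the result for input combination i.  254 (0xFE)
	   is 1 for every combination except all-zero, i.e. A | B | C,
	   while 1 (0x01) is 1 only for the all-zero combination, i.e.
	   ~(A | B | C).  */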
# endif

# ifdef USE_AS_RAWMEMCHR
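	/* NB: -(VEC_SIZE * 4) is -128, which fits in a sign-extended 8-bit
	   immediate while +128 does not, so `subq $-(VEC_SIZE * 4)'
	   encodes smaller than `addq $(VEC_SIZE * 4)'.  */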
	subq	$-(VEC_SIZE * 4), %rdi
# endif
# ifdef USE_IN_RTM
	kortestd %k2, %k3
# else
#  ifdef USE_AS_WMEMCHR
	/* ecx contains the NOT of the matches.  All 1s means no matches.
	   incl will overflow and set the zero flag if that is the case.  */
	incl	%ecx
#  else
	/* Check if either VEC1 (eax) or VEC2-VEC4 (ecx) is non-zero.
	   Adding eax to rcx is not an issue because if eax is non-zero it
	   will be used for returning the match.  If it is zero the add
	   does nothing.  */
	addq	%rax, %rcx
#  endif
# endif
# ifdef USE_AS_RAWMEMCHR
	jz	L(loop_4x_vec)
# else
	jnz	L(loop_4x_vec_end)

	subq	$-(VEC_SIZE * 4), %rdi

	subq	$(CHAR_PER_VEC * 4), %rdx
	ja	L(loop_4x_vec)

	/* Fall through into the case of fewer than 4 remaining vectors of
	   length.  */
	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
	kmovd	%k0, %eax
	VZEROUPPER

L(last_4x_vec_or_less):
	/* Check if first VEC contained match.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length > CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jg	L(last_4x_vec)

L(last_2x_vec):
	/* If remaining length < CHAR_PER_VEC.  */
	addl	$CHAR_PER_VEC, %edx
	jle	L(zero_end)

	/* Check VEC2 and compare any match with remaining length.  */
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end):
	ret

L(set_zero_end):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x1_check):
	/* eax must be non-zero.  Use bsfl to save code size.  */
	bsfl	%eax, %eax
	/* Adjust length.  */
	subl	$-(CHAR_PER_VEC * 4), %edx
	/* Check if match within remaining length.  */
	cmpl	%eax, %edx
	jbe	L(set_zero_end)
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(loop_4x_vec_end):
# endif
	/* rawmemchr will fall through into this if a match was found in
	   the loop.  */

# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
	/* k1 has the NOT of the matches with VEC1.  */
	kmovd	%k1, %eax
#  ifdef USE_AS_WMEMCHR
	subl	$((1 << CHAR_PER_VEC) - 1), %eax
#  else
	incl	%eax
#  endif
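	/* Either way eax is now zero iff there was no match in VEC1, and
	   otherwise its lowest set bit is at the index of the first match,
	   so the tzcnt in L(last_vec_x1_return) still finds it.  */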
# else
	/* eax already has matches for VEC1.  */
	testl	%eax, %eax
# endif
	jnz	L(last_vec_x1_return)

# ifdef USE_IN_RTM
	VPCMP	$0, %YMM2, %YMMZERO, %k0
	kmovd	%k0, %eax
# else
	vpmovmskb %ymm2, %eax
# endif
	testl	%eax, %eax
	jnz	L(last_vec_x2_return)

# ifdef USE_IN_RTM
	kmovd	%k2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_return)

	kmovd	%k3, %eax
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
# else
	vpmovmskb %ymm3, %eax
	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
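	/* NB: VEC_SIZE (32) is also the number of bits in a vpmovmskb mask
	   of a YMM register, so the mask in ecx lands in the upper 32 bits
	   of rcx, above the VEC3 mask.  */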
	salq	$VEC_SIZE, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
	VZEROUPPER
# endif
	ret

	.p2align 4,, 10
L(last_vec_x1_return):
	tzcntl	%eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
	   USE_IN_RTM are both defined; otherwise RET_SCALE = 1).  */
	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
	VZEROUPPER
	ret

# ifdef USE_IN_RTM
	.p2align 4
L(last_vec_x3_return):
	tzcntl	%eax, %eax
	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
	.p2align 4,, 5
L(last_4x_vec_or_less_cmpeq):
	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	subq	$-(VEC_SIZE * 4), %rdi
	/* Check first VEC regardless.  */
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)

	/* If remaining length <= CHAR_PER_VEC * 2.  */
	addl	$(CHAR_PER_VEC * 2), %edx
	jle	L(last_2x_vec)

	.p2align 4
L(last_4x_vec):
	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)


	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Create mask for possible matches within remaining length.  */
#  ifdef USE_AS_WMEMCHR
	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
	bzhil	%edx, %ecx, %ecx
#  else
	movq	$-1, %rcx
	bzhiq	%rdx, %rcx, %rcx
#  endif
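	/* bzhi clears all bits of the source at position edx/rdx and
	   above, so only match bits within the remaining length survive
	   the and below.  The upper half of the mask is reused for the
	   last VEC after the shift further down.  */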
	/* Test the matches in the data against the length mask.  */
	andl	%ecx, %eax
	jnz	L(last_vec_x3)

	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after the
	   remaining length was found to be > CHAR_PER_VEC * 2).  */
	subl	$CHAR_PER_VEC, %edx
	jbe	L(zero_end2)


	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
	kmovd	%k0, %eax
	/* Shift remaining length mask for last VEC.  */
#  ifdef USE_AS_WMEMCHR
	shrl	$CHAR_PER_VEC, %ecx
#  else
	shrq	$CHAR_PER_VEC, %rcx
#  endif
	andl	%ecx, %eax
	jz	L(zero_end2)
	bsfl	%eax, %eax
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
	ret

L(last_vec_x2):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# endif
	/* 7 bytes from next cache line.  */
END (MEMCHR)
#endif