/* memrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>
# include "evex256-vecs.h"
# if VEC_SIZE != 32
#  error "VEC_SIZE != 32 unimplemented"
# endif

# ifndef MEMRCHR
#  define MEMRCHR			__memrchr_evex
# endif

# define PAGE_SIZE			4096
# define VECMATCH			VEC(0)

	.section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RDX_LP, %RDX_LP
# else
	test	%RDX_LP, %RDX_LP
# endif
	jz	L(zero_0)

	/* Get end pointer. Minus one for two reasons: 1) it is necessary for a
	   correct page cross check and 2) it sets up the end pointer so that
	   the lzcnt result can be subtracted from it directly.  */
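	/* Throughout, match masks are arranged so that bit 31 corresponds to
	   the byte rax points at (possibly after an adjustment of rax), which
	   lets lzcnt + `subq %rcx, %rax` turn a mask directly into the match
	   address.  */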
	leaq	-1(%rdi, %rdx), %rax
	vpbroadcastb %esi, %VECMATCH

	/* Check if we can load 1x VEC without crossing a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %eax
	jz	L(page_cross)

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx

	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
	cmpq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)
L(ret_vec_x0_test):

	/* If ecx is zero (no matches) lzcnt will set it to 32 (VEC_SIZE),
	   which guarantees edx (len) is no greater than it.  */
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* Fits in aligning bytes of first cache line.  */
L(zero_0):
	xorl	%eax, %eax
	ret

	.p2align 4,, 9
L(ret_vec_x0_dec):
	decq	%rax
L(ret_vec_x0):
	lzcntl	%ecx, %ecx
	subq	%rcx, %rax
	ret

	.p2align 4,, 10
L(more_1x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Align rax (pointer to string).  */
	andq	$-VEC_SIZE, %rax

	/* Recompute length after aligning.  */
	movq	%rax, %rdx

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	%rdi, %rdx

	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):

	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
	decq	%rax
	cmpl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0)

	/* Don't use rax for pointer here because EVEX has better encoding with
	   offset % VEC_SIZE == 0.  */
	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
	kmovd	%k0, %ecx
	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
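	/* ecx covers bytes rax - 63 .. rax - 32 (rax was decremented above),
	   and bits 32..63 of rcx are clear, so lzcntq returns 32 + lzcntl(ecx)
	   and the same subtraction from rax used for the first vector still
	   yields the match address.  */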
	lzcntq	%rcx, %rcx
	cmpl	%ecx, %edx
	jle	L(zero_0)
	subq	%rcx, %rax
	ret

	/* This is an inexpensive place to put this block with regard to code
	   size / target alignment / ICache NLP.  It keeps the jump to the page
	   cross case down to a 2-byte encoding, which in turn is necessary for
	   the hot path (len <= VEC_SIZE) to fit in the first cache line.  */
L(page_cross):
	movq	%rax, %rsi
	andq	$-VEC_SIZE, %rsi
	vpcmpb	$0, (%rsi), %VECMATCH, %k0
	kmovd	%k0, %r8d
	/* Shift out negative alignment (because we are starting from endptr and
	   working backwards).  */
	movl	%eax, %ecx
	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
	notl	%ecx
	shlxl	%ecx, %r8d, %ecx
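	/* shlx only uses the low 5 bits of the count, so this shifts left by
	   (-endptr) % 32.  Example: if endptr % 32 == 5 then eax % 32 == 4,
	   ~eax % 32 == 27, and shifting r8d left by 27 discards the bits for
	   bytes at or beyond endptr while moving the bit for byte endptr - 1
	   into bit 31, ready for the lzcnt below.  */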
	cmpq	%rdi, %rsi
	ja	L(more_1x_vec)
	lzcntl	%ecx, %ecx
	cmpl	%ecx, %edx
	jle	L(zero_1)
	subq	%rcx, %rax
	ret

	/* Continue creating zero-return labels that fit in the alignment bytes,
	   get a 2-byte jump encoding, and share a cache line with their branch.  */
L(zero_1):
	xorl	%eax, %eax
	ret

	.p2align 4,, 8
L(ret_vec_x1):
	/* bsr gives the bit index of the last match; the -(VEC_SIZE * 2)
	   displacement accounts for this vector's offset from rax.  */
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_2x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	cmpl	$(VEC_SIZE * -1), %edx
	jle	L(ret_vec_x2_test)
L(last_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	/* Need no matter what.  */
	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 3 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2_test):
	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	ja	L(zero_1)
	ret

	.p2align 4,, 8
L(ret_vec_x2):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(ret_vec_x3):
	bsrl	%ecx, %ecx
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_4x_vec):
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	/* Check if near end before re-aligning (otherwise might do an
	   unnecessary loop iteration).  */
	addq	$-(VEC_SIZE * 4), %rax
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	decq	%rax
	andq	$-(VEC_SIZE * 4), %rax
	movq	%rdi, %rdx
	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
	   lengths that overflow can be valid and break the comparison.  */
	andq	$-(VEC_SIZE * 4), %rdx

	.p2align 4
L(loop_4x_vec):
	/* Store 1 where not-equal and 0 where equal in k1 (used to mask later
	   on).  */
	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1

	/* VEC(2/3) will have a zero byte where we found CHAR.  */
	vpxorq	(VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
	vpxorq	(VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4

	/* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero bit where
	   CHAR is found and VEC(2/3) have a zero byte where CHAR is found).  */
	vpminub	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
	vptestnmb %VEC(3), %VEC(3), %k2
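	/* A set bit in k2 now marks a byte position at which CHAR was found in
	   any of the three higher vectors: the maskz turns the highest vector's
	   matches into zero bytes as well.  */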

	/* Any 1s and we found CHAR.  */
	kortestd %k2, %k4
	jnz	L(loop_end)

	addq	$-(VEC_SIZE * 4), %rax
	cmpq	%rdx, %rax
	jne	L(loop_4x_vec)

	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
	subq	$-(VEC_SIZE * 4), %rdx
	movq	%rdx, %rax
	subl	%edi, %edx
L(last_4x_vec):

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_dec)

	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	/* Used no matter what.  */
	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
	kmovd	%k0, %ecx

	cmpl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)

	lzcntl	%ecx, %ecx
	subq	$(VEC_SIZE * 2 + 1), %rax
	subq	%rcx, %rax
	cmpq	%rax, %rdi
	jbe	L(ret_1)
	xorl	%eax, %eax
L(ret_1):
	ret

	.p2align 4,, 6
L(loop_end):
	kmovd	%k1, %ecx
	notl	%ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x0_end)

	vptestnmb %VEC(2), %VEC(2), %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1_end)

	kmovd	%k2, %ecx
	kmovd	%k4, %esi
	/* Combine the last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in
	   VEC3) it won't affect the result in esi (VEC4). If ecx is non-zero
	   then CHAR is in VEC3 and bsrq will use that position.  */
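	/* bsrq then yields the byte offset from rax directly: 32 + j for a
	   match at byte j of the vector loaded from rax + VEC_SIZE, or i for a
	   match at byte i of the vector loaded from rax, so the addq below
	   produces the match address.  */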
	salq	$32, %rcx
	orq	%rsi, %rcx
	bsrq	%rcx, %rcx
	addq	%rcx, %rax
	ret
	.p2align 4,, 4
L(ret_vec_x0_end):
	addq	$(VEC_SIZE), %rax
L(ret_vec_x1_end):
	bsrl	%ecx, %ecx
	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
	ret

END(MEMRCHR)
#endif