/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include <sysdep.h>

# ifndef STRRCHR
#  define STRRCHR	__strrchr_evex
# endif

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64

# ifdef USE_AS_WCSRCHR
#  define SHIFT_REG	esi

#  define kunpck	kunpckbw
#  define kmov_2x	kmovd
#  define maskz_2x	ecx
#  define maskm_2x	eax
#  define CHAR_SIZE	4
#  define VPMIN	vpminud
#  define VPTESTN	vptestnmd
#  define VPBROADCAST	vpbroadcastd
#  define VPCMP	vpcmpd
# else
#  define SHIFT_REG	edi

#  define kunpck	kunpckdq
#  define kmov_2x	kmovq
#  define maskz_2x	rcx
#  define maskm_2x	rax

#  define CHAR_SIZE	1
#  define VPMIN	vpminub
#  define VPTESTN	vptestnmb
#  define VPBROADCAST	vpbroadcastb
#  define VPCMP	vpcmpb
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMMMATCH	ymm17
# define YMMSAVE	ymm18

# define YMM1	ymm19
# define YMM2	ymm20
# define YMM3	ymm21
# define YMM4	ymm22
# define YMM5	ymm23
# define YMM6	ymm24
# define YMM7	ymm25
# define YMM8	ymm26


# define VEC_SIZE	32
# define PAGE_SIZE	4096
	.section .text.evex, "ax", @progbits
ENTRY(STRRCHR)
	movl	%edi, %eax
	/* Broadcast CHAR to YMMMATCH.  */
	VPBROADCAST %esi, %YMMMATCH

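	/* If loading VEC_SIZE bytes from rdi would cross a page
	   boundary, take the cross-page path: the unaligned load below
	   could otherwise fault on an unmapped following page.  */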
	andl	$(PAGE_SIZE - 1), %eax
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	jg	L(cross_page_boundary)

L(page_cross_continue):
	VMOVU	(%rdi), %YMM1
	/* k0 has a 1 for each zero CHAR in YMM1.  */
	VPTESTN	%YMM1, %YMM1, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jz	L(aligned_more)
	/* fallthrough: zero CHAR in first VEC.  */

	/* K1 has a 1 for each search CHAR match in YMM1.  */
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k1, %eax
	/* Build mask up until first zero CHAR (used to mask off
	   potential search CHAR matches past the end of the string).
	 */
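	/* blsmsk sets every bit up to and including the lowest set bit
	   of ecx, i.e. all positions at or before the first zero CHAR.  */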
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jz	L(ret0)
	/* Get last match (the `andl` removed any out-of-bounds
	   matches).  */
	bsrl	%eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
L(ret0):
	ret

	/* The returns for the first vec x1/x2/x3 hard-code a backward
	   search path for earlier matches.  */
	.p2align 4,, 6
L(first_vec_x1):
	VPCMP	$0, %YMMMATCH, %YMM2, %k1
	kmovd	%k1, %eax
	blsmskl	%ecx, %ecx
	/* eax is non-zero if there is a search CHAR match in range.  */
	andl	%ecx, %eax
	jnz	L(first_vec_x1_return)

	/* fall through: no match in YMM2, so check for earlier matches
	   (in YMM1).  */
	.p2align 4,, 4
L(first_vec_x0_test):
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k1, %eax
	testl	%eax, %eax
	jz	L(ret1)
	bsrl	%eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq	(%rsi, %rax, CHAR_SIZE), %rax
# else
	addq	%rsi, %rax
# endif
L(ret1):
	ret

	.p2align 4,, 10
L(first_vec_x1_or_x2):
	VPCMP	$0, %YMM3, %YMMMATCH, %k3
	VPCMP	$0, %YMM2, %YMMMATCH, %k2
	/* K2 and K3 have a 1 for any search CHAR match.  Test whether
	   there is a match in either of them.  Otherwise check YMM1.  */
	kortestd %k2, %k3
	jz	L(first_vec_x0_test)

	/* Guaranteed that YMM2 and YMM3 are within range so merge the
	   two bitmasks then get last result.  */
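	/* The combined mask holds YMM2's matches in its low half and
	   YMM3's in its high half, so the bit index is a CHAR offset
	   from VEC_SIZE(%r8).  */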
	kunpck	%k2, %k3, %k3
	kmovq	%k3, %rax
	bsrq	%rax, %rax
	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x3):
	VPCMP	$0, %YMMMATCH, %YMM4, %k1
	kmovd	%k1, %eax
	blsmskl	%ecx, %ecx
	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
	andl	%ecx, %eax
	jz	L(first_vec_x1_or_x2)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 6
L(first_vec_x0_x1_test):
	VPCMP	$0, %YMMMATCH, %YMM2, %k1
	kmovd	%k1, %eax
	/* Check YMM2 for last match first. If no match try YMM1.  */
	testl	%eax, %eax
	jz	L(first_vec_x0_test)
	.p2align 4,, 4
L(first_vec_x1_return):
	bsrl	%eax, %eax
	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 10
L(first_vec_x2):
	VPCMP	$0, %YMMMATCH, %YMM3, %k1
	kmovd	%k1, %eax
	blsmskl	%ecx, %ecx
	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
	 */
	andl	%ecx, %eax
	jz	L(first_vec_x0_x1_test)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret


	.p2align 4
L(aligned_more):
	/* Need to keep original pointer in case YMM1 has last match.  */
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rdi
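	/* rdi is now aligned down to VEC_SIZE so all further loads are
	   aligned.  The load at VEC_SIZE(%rdi) may overlap the first
	   (unaligned) VEC but never starts before src.  */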
	VMOVU	VEC_SIZE(%rdi), %YMM2
	VPTESTN	%YMM2, %YMM2, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x1)

	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
	VPTESTN	%YMM3, %YMM3, %k0
	kmovd	%k0, %ecx
	testl	%ecx, %ecx
	jnz	L(first_vec_x2)

	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
	VPTESTN	%YMM4, %YMM4, %k0
	kmovd	%k0, %ecx
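	/* Save the VEC_SIZE-aligned pointer in r8.  The return paths
	   for matches in YMM2/YMM3/YMM4 compute their result relative
	   to it once rdi has been realigned for the loop below.  */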
	movq	%rdi, %r8
	testl	%ecx, %ecx
	jnz	L(first_vec_x3)

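	/* Align rdi to 2 * VEC_SIZE so the main loop can process two
	   VECs per iteration with aligned loads.  */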
	andq	$-(VEC_SIZE * 2), %rdi
	.p2align 4
L(first_aligned_loop):
	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
	   they don't store a match.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6

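	/* k2 flags search CHAR matches in YMM5.  XORing YMM6 with
	   YMMMATCH turns its matches into zero CHARs, and the VPMINs
	   fold in any zero CHAR from YMM5/YMM6, so the VPTESTN below
	   catches a zero CHAR in either VEC or a match in YMM6; the
	   kortest with k2 then also covers matches in YMM5.  */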
	VPCMP	$0, %YMM5, %YMMMATCH, %k2
	vpxord	%YMM6, %YMMMATCH, %YMM7

	VPMIN	%YMM5, %YMM6, %YMM8
	VPMIN	%YMM8, %YMM7, %YMM7

	VPTESTN	%YMM7, %YMM7, %k1
	subq	$(VEC_SIZE * -2), %rdi
	kortestd %k1, %k2
	jz	L(first_aligned_loop)

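	/* The loop exited because of a zero CHAR or a search CHAR
	   match.  k1 below holds the zero CHARs in YMM5/YMM6: if there
	   are none, the match lies strictly inside the string, so
	   record it as the furthest match and keep scanning.  */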
	VPCMP	$0, %YMM6, %YMMMATCH, %k3
	VPTESTN	%YMM8, %YMM8, %k1
	ktestd	%k1, %k1
	jz	L(second_aligned_loop_prep)

	kortestd %k2, %k3
	jnz	L(return_first_aligned_loop)

	.p2align 4,, 6
L(first_vec_x1_or_x2_or_x3):
	VPCMP	$0, %YMM4, %YMMMATCH, %k4
	kmovd	%k4, %eax
	testl	%eax, %eax
	jz	L(first_vec_x1_or_x2)
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4,, 8
L(return_first_aligned_loop):
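	/* Build a 2x-wide mask of zero-CHAR positions and one of search
	   CHAR matches for YMM5/YMM6, keep only matches that precede
	   the first zero CHAR and return the last of them.  If there is
	   none, fall back to the first 4x VEC checks.  */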
	VPTESTN	%YMM5, %YMM5, %k0
	kunpck	%k0, %k1, %k0
	kmov_2x	%k0, %maskz_2x

	blsmsk	%maskz_2x, %maskz_2x
	kunpck	%k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x
	and	%maskz_2x, %maskm_2x
	jz	L(first_vec_x1_or_x2_or_x3)

	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

	.p2align 4
	/* We can throw away the work done for the first 4x checks here
	   as we have a later match.  This is the 'fast' path, so to
	   speak.  */
L(second_aligned_loop_prep):
L(second_aligned_loop_set_furthest_match):
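	/* Remember the current position (rsi) and the combined match
	   mask (k4) as the furthest match seen so far; return_old_match
	   uses them if no later match is found.  */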
	movq	%rdi, %rsi
	kunpck	%k2, %k3, %k4

	.p2align 4
L(second_aligned_loop):
	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2

	VPCMP	$0, %YMM1, %YMMMATCH, %k2
	vpxord	%YMM2, %YMMMATCH, %YMM3

	VPMIN	%YMM1, %YMM2, %YMM4
	VPMIN	%YMM3, %YMM4, %YMM3

	VPTESTN	%YMM3, %YMM3, %k1
	subq	$(VEC_SIZE * -2), %rdi
	kortestd %k1, %k2
	jz	L(second_aligned_loop)

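	/* As in the first loop: if there is no zero CHAR in YMM1/YMM2
	   the exit was due to a match only, so update the furthest
	   match and keep looping.  */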
	VPCMP	$0, %YMM2, %YMMMATCH, %k3
	VPTESTN	%YMM4, %YMM4, %k1
	ktestd	%k1, %k1
	jz	L(second_aligned_loop_set_furthest_match)

	kortestd %k2, %k3
	/* Branch here because there is a significant advantage in terms
	   of the output dependency chain in using edx.  */
	jnz	L(return_new_match)
L(return_old_match):
	kmovq	%k4, %rax
	bsrq	%rax, %rax
	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
	ret

L(return_new_match):
	VPTESTN	%YMM1, %YMM1, %k0
	kunpck	%k0, %k1, %k0
	kmov_2x	%k0, %maskz_2x

	blsmsk	%maskz_2x, %maskz_2x
	kunpck	%k2, %k3, %k3
	kmov_2x	%k3, %maskm_2x
	and	%maskz_2x, %maskm_2x
	jz	L(return_old_match)

	bsr	%maskm_2x, %maskm_2x
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

L(cross_page_boundary):
	/* eax contains all the page offset bits of src (rdi).  `xor
	   rdi, rax` gives a pointer with all page offset bits cleared,
	   so offsetting it by (PAGE_SIZE - VEC_SIZE) gets the last
	   aligned VEC before the page cross (guaranteed to be safe to
	   read).  Doing this as opposed to `movq %rdi, %rax; andq
	   $-VEC_SIZE, %rax` saves a bit of code size.  */
	xorq	%rdi, %rax
	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
	VPTESTN	%YMM1, %YMM1, %k0
	kmovd	%k0, %ecx

	/* Shift out zero CHAR matches that are before the beginning of
	   src (rdi).  */
# ifdef USE_AS_WCSRCHR
	movl	%edi, %esi
	andl	$(VEC_SIZE - 1), %esi
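	/* Divide the byte offset by CHAR_SIZE: the mask has one bit per
	   wchar_t.  */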
	shrl	$2, %esi
# endif
	shrxl	%SHIFT_REG, %ecx, %ecx

	testl	%ecx, %ecx
	jz	L(page_cross_continue)

	/* Found zero CHAR so need to test for search CHAR.  */
	VPCMP	$0, %YMMMATCH, %YMM1, %k1
	kmovd	%k1, %eax
	/* Shift out search CHAR matches that are before the beginning
	   of src (rdi).  */
	shrxl	%SHIFT_REG, %eax, %eax

	/* Check if any search CHAR match in range.  */
	blsmskl	%ecx, %ecx
	andl	%ecx, %eax
	jz	L(ret3)
	bsrl	%eax, %eax
# ifdef USE_AS_WCSRCHR
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
L(ret3):
	ret

END(STRRCHR)
#endif