/* memrchr optimized with SSE2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no separate V2
   implementation, so this SSE2 version must also be built for ISA V2
   builds.  */
#if ISA_SHOULD_BUILD (2)

# ifndef MEMRCHR
#  define MEMRCHR __memrchr_sse2
# endif

# include <sysdep.h>
# define VEC_SIZE			16
# define PAGE_SIZE			4096

	.text
ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
	/* Clear upper bits.  */
	mov	%RDX_LP, %RDX_LP
# endif
	movd	%esi, %xmm0

	/* Get end pointer.  */
	leaq	(%rdx, %rdi), %rcx

	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
	pshufd	$0, %xmm0, %xmm0
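	/* xmm0 now holds CHAR broadcast to all 16 bytes: the low byte of esi is
	   duplicated to a word, then to a dword, then splatted across the
	   vector (the SSE2 equivalent of _mm_set1_epi8).  */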

	/* Check if we can load 1x VEC without crossing a page.  */
	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
	jz	L(page_cross)

	/* NB: This load happens regardless of whether rdx (len) is zero.  Since
	   it doesn't cross a page and the standard guarantees any pointer has
	   at least one valid byte, this load must be safe.  For the entire
	   history of the x86 memrchr implementation this has been possible, so
	   no code "should" be relying on a zero-length check before this load.
	   The zero-length check is moved to the page cross case because it
	   1) is pretty cold and 2) including it here pushes the hot case
	   (len <= VEC_SIZE) onto 2 cache lines.  */
	movups	-(VEC_SIZE)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
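	/* Bit i of eax is now set iff byte i of the VEC ending at the end
	   pointer equals CHAR.  Illustrative only (not part of the build): with
	   SSE2 intrinsics this compare-and-scan pattern corresponds roughly to
	       __m128i v = _mm_loadu_si128 ((__m128i const *) (end - 16));
	       unsigned mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, vchar));
	       if (mask != 0)
	         match = end - 16 + (31 - __builtin_clz (mask));
	   where end, vchar, and match are placeholder names, vchar being the
	   broadcast CHAR, and 31 - clz picks the highest set bit as bsr does
	   below.  */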

	subq	$VEC_SIZE, %rdx
	ja	L(more_1x_vec)
L(ret_vec_x0_test):
	/* Zero-flag set if eax (src) is zero.  Destination unchanged if src is
	   zero.  */
	bsrl	%eax, %eax
	jz	L(ret_0)
	/* Check if the CHAR match is in bounds.  Need to truly zero `eax` here
	   if out of bounds.  */
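	/* On each path into this block, eax is the byte index of the last match
	   within the VEC just scanned and edx is that VEC's end offset from the
	   buffer start minus VEC_SIZE (non-positive here).  So edx + eax is the
	   match's offset from the buffer start, negative exactly when the match
	   precedes the buffer.  */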
	addl	%edx, %eax
	jl	L(zero_0)
	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
	   ptr.  */
	addq	%rdi, %rax
L(ret_0):
	ret

	.p2align 4,, 5
L(ret_vec_x0):
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
	ret

	.p2align 4,, 2
L(zero_0):
	xorl	%eax, %eax
	ret


	.p2align 4,, 8
L(more_1x_vec):
	testl	%eax, %eax
	jnz	L(ret_vec_x0)

	/* Align rcx (pointer to string).  */
	decq	%rcx
	andq	$-VEC_SIZE, %rcx
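	/* rcx is now rounded down to the 16-byte boundary strictly below the
	   end pointer (the decq handles an already-aligned end pointer, whose
	   trailing VEC was just checked), so the movaps loads below are
	   aligned.  */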

	movq	%rcx, %rdx
	/* NB: We could consistently save 1 byte in this pattern with `movaps
	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`.  The reason against it is
	   that it adds more frontend uops (even if the moves can be eliminated)
	   and, some percentage of the time, actual backend uops.  */
	movaps	-(VEC_SIZE)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	subq	%rdi, %rdx
	pmovmskb %xmm1, %eax
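	/* rdx now holds the number of bytes between the buffer start and the
	   aligned pointer in rcx, i.e. how much is left to search below rcx.  */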

	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
L(last_2x_vec):
	subl	$VEC_SIZE, %edx
	jbe	L(ret_vec_x0_test)

	testl	%eax, %eax
	jnz	L(ret_vec_x0)

	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subl	$VEC_SIZE, %edx
	bsrl	%eax, %eax
	jz	L(ret_1)
	addl	%edx, %eax
	jl	L(zero_0)
	addq	%rdi, %rax
L(ret_1):
	ret

	/* Don't align.  Otherwise we lose the 2-byte encoding of the jump to
	   L(page_cross), which causes the hot path (length <= VEC_SIZE) to span
	   multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
L(page_cross):
	/* Zero length check.  */
	testq	%rdx, %rdx
	jz	L(zero_0)

	leaq	-1(%rcx), %r8
	andq	$-(VEC_SIZE), %r8
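	/* r8 is the 16-byte aligned VEC containing the last valid byte.  An
	   aligned 16-byte load never crosses a page boundary, so this read is
	   safe even though some of the bytes it covers may lie outside the
	   buffer.  */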

	movaps	(%r8), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %esi
	/* Shift out negative alignment (because we are starting from endptr and
	   working backwards).  */
	negl	%ecx
	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
	   explicitly.  */
	andl	$(VEC_SIZE - 1), %ecx
	shl	%cl, %esi
	movzwl	%si, %eax
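	/* eax now matches what a (hypothetical) 16-byte load ending exactly at
	   the end pointer would have produced: mask bits for bytes at or above
	   the end pointer are shifted out and the byte just below the end
	   pointer maps to bit 15.  E.g. if the end pointer is 5 bytes past the
	   boundary in r8, only bits 0-4 are valid; cl is 11, the shift moves
	   them to bits 11-15, and the movzwl drops anything shifted above bit
	   15.  */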
	leaq	(%rdi, %rdx), %rcx
	cmpq	%rdi, %r8
	ja	L(more_1x_vec)
	subl	$VEC_SIZE, %edx
	bsrl	%eax, %eax
	jz	L(ret_2)
	addl	%edx, %eax
	jl	L(zero_1)
	addq	%rdi, %rax
L(ret_2):
	ret

	/* Fits in the aligning bytes.  */
L(zero_1):
	xorl	%eax, %eax
	ret

	.p2align 4,, 5
L(ret_vec_x1):
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_2x_vec):
	testl	%eax, %eax
	jnz	L(ret_vec_x0)

	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax
	testl	%eax, %eax
	jnz	L(ret_vec_x1)


	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subq	$(VEC_SIZE * 4), %rdx
	ja	L(more_4x_vec)

	addl	$(VEC_SIZE), %edx
	jle	L(ret_vec_x2_test)

L(last_vec):
	testl	%eax, %eax
	jnz	L(ret_vec_x2)

	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subl	$(VEC_SIZE), %edx
	bsrl	%eax, %eax
	jz	L(ret_3)
	addl	%edx, %eax
	jl	L(zero_2)
	addq	%rdi, %rax
L(ret_3):
	ret

	.p2align 4,, 6
L(ret_vec_x2_test):
	bsrl	%eax, %eax
	jz	L(zero_2)
	addl	%edx, %eax
	jl	L(zero_2)
	addq	%rdi, %rax
	ret

L(zero_2):
	xorl	%eax, %eax
	ret


	.p2align 4,, 5
L(ret_vec_x2):
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
	ret

	.p2align 4,, 5
L(ret_vec_x3):
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

	.p2align 4,, 8
L(more_4x_vec):
	testl	%eax, %eax
	jnz	L(ret_vec_x2)

	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	testl	%eax, %eax
	jnz	L(ret_vec_x3)

	addq	$-(VEC_SIZE * 4), %rcx
	cmpq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec)

	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end,
	   keeping the code from spilling onto the next cache line.  */
	addq	$(VEC_SIZE * 4 - 1), %rcx
	andq	$-(VEC_SIZE * 4), %rcx
	leaq	(VEC_SIZE * 4)(%rdi), %rdx
	andq	$-(VEC_SIZE * 4), %rdx
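	/* rcx is rounded down to the 4x VEC_SIZE boundary strictly below the
	   aligned end pointer and rdx up to the first 4x VEC_SIZE boundary
	   strictly above the buffer start; the loop below walks backwards over
	   the aligned blocks in between and L(last_4x_vec) handles what is
	   left.  */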

	.p2align 4,, 11
L(loop_4x_vec):
	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
	pcmpeqb	%xmm0, %xmm1
	pcmpeqb	%xmm0, %xmm2
	pcmpeqb	%xmm0, %xmm3
	pcmpeqb	%xmm0, %xmm4

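	/* Fold the four compare results together so a single pmovmskb/test can
	   tell whether CHAR occurs anywhere in this 4x VEC block; L(loop_end)
	   re-examines the individual vectors to find which one matched.  */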
	por	%xmm1, %xmm2
	por	%xmm3, %xmm4
	por	%xmm2, %xmm4

	pmovmskb %xmm4, %esi
	testl	%esi, %esi
	jnz	L(loop_end)

	addq	$-(VEC_SIZE * 4), %rcx
	cmpq	%rdx, %rcx
	jne	L(loop_4x_vec)

	subl	%edi, %edx

	/* Ends up being 1-byte nop.  */
	.p2align 4,, 2
L(last_4x_vec):
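	/* On both entries to this block edx == rcx - rdi, i.e. the number of
	   bytes still to be searched below rcx (at most 4x VEC_SIZE).  */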
	movaps	-(VEC_SIZE)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	testl	%eax, %eax
	jnz	L(ret_vec_x0)


	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	testl	%eax, %eax
	jnz	L(ret_vec_end)

	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %eax

	subl	$(VEC_SIZE * 3), %edx
	ja	L(last_vec)
	bsrl	%eax, %eax
	jz	L(ret_4)
	addl	%edx, %eax
	jl	L(zero_3)
	addq	%rdi, %rax
L(ret_4):
	ret

	/* Ends up being 1-byte nop.  */
	.p2align 4,, 3
L(loop_end):
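	/* The individual masks are rechecked from the highest VEC down.  VEC1's
	   mask is shifted into bits 16-31 so that L(ret_vec_end), which offsets
	   by -2x VEC_SIZE from rcx, yields the correct address for a match in
	   either VEC1 (bits 16-31) or VEC2 (bits 0-15).  */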
	pmovmskb %xmm1, %eax
	sall	$16, %eax
	jnz	L(ret_vec_end)

	pmovmskb %xmm2, %eax
	testl	%eax, %eax
	jnz	L(ret_vec_end)

	pmovmskb %xmm3, %eax
	/* Combine the last 2 VEC matches.  If eax (VEC3) is zero (no CHAR in
	   VEC3) it won't affect the result in esi (VEC4).  If eax is non-zero
	   there is a CHAR in VEC3 and, since its mask ends up in the high bits,
	   bsrl will use that position.  */
	sall	$16, %eax
	orl	%esi, %eax
	bsrl	%eax, %eax
	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
	ret

L(ret_vec_end):
	bsrl	%eax, %eax
	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
	ret
	/* Used in L(last_4x_vec).  In the same cache line.  This just fills
	   spare aligning bytes.  */
L(zero_3):
	xorl	%eax, %eax
	ret
	/* 2 bytes from the next cache line.  */
END(MEMRCHR)
#endif