/* Optimized memrchr implementation for PowerPC64/POWER8.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void *[r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5])  */

#ifndef MEMRCHR
# define MEMRCHR __memrchr
#endif
	.machine  power8
ENTRY_TOCLESS (MEMRCHR)
	CALL_MCOUNT 3
	add	r7, r3, r5      /* Calculate the last acceptable address.  */
	neg	r0, r7
	addi	r7, r7, -1
	mr	r10, r3
	clrrdi	r6, r7, 7
	li	r9, 3<<5
	dcbt	r9, r6, 8       /* Stream hint, decreasing addresses.  */

	/* Replicate BYTE to doubleword.  */
	insrdi	r4, r4, 8, 48
	insrdi	r4, r4, 16, 32
	insrdi	r4, r4, 32, 0
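	/* E.g. BYTE == 0x2f now gives r4 == 0x2f2f2f2f2f2f2f2f.  */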
	li	r6, -8
	li	r9, -1
	rlwinm	r0, r0, 3, 26, 28 /* Calculate padding.  */
	clrrdi	r8, r7, 3
	srd	r9, r9, r0
	cmpldi	r5, 32
	clrrdi	r0, r10, 3
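	/* r8 = doubleword containing the last byte of the range,
	   r9 = mask selecting only the bytes of that doubleword that lie
	   inside the range, r0 = doubleword containing the first byte,
	   r6 = -8 (offset to the previous doubleword).  */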
	ble	L(small_range)

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8      /* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4     /* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0      /* If r3 == 0, no BYTEs have been found.  */
	bne	cr7, L(done)

	/* Are we now aligned to a quadword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */
	andi.	r12, r8, 15
	beq	cr0, L(align_qw)

	/* Handle DWORD2 of pair.  */
#ifdef __LITTLE_ENDIAN__
	ldx	r12, r8, r6
#else
	ldbrx	r12, r8, r6
#endif
	addi	r8, r8, -8
	cmpb	r3, r12, r4
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	.align	4
	/* At this point, r8 is 16B aligned.  */
L(align_qw):
	sub	r5, r8, r0
	vspltisb	v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	li	r0, 0
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	mtvrd	v1, r4
	vspltb	v1, v1, 7
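	/* v0 = zeroes, v1 = BYTE splatted into all 16 bytes, and v10 =
	   {0, 8, 16, ..., 120}: the vbpermq selector that picks bit 0 of
	   each byte, so a vcmpequb result packs into one bit per byte.
	   r5 = bytes left to search, counted from the start of the
	   doubleword containing s.  */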
	cmpldi	r5, 64
	ble	L(tail64)
	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary will necessarily slow down
	   performance for strings around 64 bytes in length, due to the extra
	   comparisons required to check alignment for the vectorized loop.
	   This is a tradeoff we are willing to take in order to speed up the
	   calculation for larger strings.  */
	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	/* To enter the 64B loop, r8 needs to be 64-byte aligned.
	   Check 16B chunks until it is.  */
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16

	andi.	r11, r8, 63
	beq	cr0, L(preloop_64B)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	addi	r5, r5, -16
	/* At this point it should be 64B aligned.
	   Prepare for the 64B loop.  */
L(preloop_64B):
	cmpldi	r5, 64		/* Check if r5 <= 64.  */
	ble	L(tail64)
	srdi	r9, r5, 6	/* Number of loop iterations.  */
	mtctr	r9		/* Setup the counter.  */
	li	r11, 16		/* Load required offsets.  */
	li	r9, 32
	li	r7, 48

	/* Handle r5 > 64.  Loop over the bytes in strides of 64B.  */
	.align 4
L(loop):
	addi	r8, r8, -64	/* Adjust address for the next iteration.  */
	lvx	v2, 0, r8	/* Load 4 quadwords.  */
	lvx	v3, r8, r11
	lvx	v4, r8, r9
	lvx	v5, r8, r7
	vcmpequb	v6, v1, v2
	vcmpequb	v7, v1, v3
	vcmpequb	v8, v1, v4
	vcmpequb	v9, v1, v5
	vor	v11, v6, v7
	vor	v12, v8, v9
	vor	v11, v11, v12	/* Compare and merge into one VR for speed.  */
	vcmpequb.	v11, v0, v11
	bnl	cr6, L(found)
	bdnz	L(loop)
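	/* All full 64B blocks have been scanned; keep only the remaining
	   byte count (r5 mod 64) for the tail.  */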
	clrldi	r5, r5, 58

	/* Handle the remainder of the 64B loop, or sizes of at most 64B.  */
	.align	4
L(tail64):
	cmpldi	r5, 0
	beq	L(null)
	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
	cmpldi	cr6, r5, 16
	ble	cr6, L(null)
	addi	r5, r5, -16

	addi	r8, r8, -16
	lvx	v4, 0, r8
	vcmpequb	v6, v1, v4
	vcmpequb.	v11, v0, v6
	bnl	cr6, L(found_16B)
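	/* All of the remaining (at most 64) bytes have been checked
	   without a match; return NULL.  */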
	li	r3, 0
	blr

	/* Found a match in 64B loop.  */
	.align	4
L(found):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	vbpermq	v7, v7, v10
	vbpermq	v8, v8, v10
	vbpermq	v9, v9, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v7, v7, v7, 2
	vsldoi	v8, v8, v8, 4
	vsldoi	v9, v9, v9, 6
#else
	vsldoi	v6, v6, v6, 6
	vsldoi	v7, v7, v7, 4
	vsldoi	v8, v8, v8, 2
#endif
	/* Merge the results and move to a GPR.  */
	vor	v11, v6, v7
	vor	v4, v9, v8
	vor	v4, v11, v4
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	cntlzd	r6, r5	/* Count leading zeros before the match.  */
#else
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6
#endif
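	/* r6 = number of bytes between the highest-addressed match and the
	   end of the 64B block.  */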
	addi	r8, r8, 63
	sub	r3, r8, r6	/* Compute final address.  */
	cmpld	cr7, r3, r10
	bgelr	cr7
	li	r3, 0
	blr

	/* Found a match in last 16 bytes.  */
	.align	4
L(found_16B):
	cmpld	r8, r10		/* Are we on the last QW?  */
	bge	L(last)
	/* Now discard bytes before starting address.  */
	sub	r9, r10, r8
	mtvrd	v9, r9
	vspltisb	v8, 3
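	/* r9 = number of bytes in this QW that lie before s; it is shifted
	   left by 3 bits (v8) so that vslo/vsro below shift v6 by that many
	   bytes, dropping those matches.  */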
	/* Mask unwanted bytes.  */
#ifdef __LITTLE_ENDIAN__
	lvsr	v7, 0, r10
	vperm   v6, v0, v6, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vslo	v6, v6, v9
#else
	lvsl	v7, 0, r10
	vperm   v6, v6, v0, v7
	vsldoi	v9, v0, v9, 8
	vsl	v9, v9, v8
	vsro	v6, v6, v9
#endif
L(last):
	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v6, v6, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v6, v6, v6, 6
	mfvrd	r7, v6
	cntlzd	r6, r7	/* Count leading zeros before the match.  */
#else
	mfvrd	r7, v6
	addi	r6, r7, -1
	andc	r6, r6, r7
	popcntd	r6, r6
#endif
	addi	r8, r8, 15
	sub	r3, r8, r6	/* Compute final address.  */
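	/* r3 = r8 + 15 - r6: address of the highest matching byte in this
	   QW, or an address below s if every match was discarded above.  */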
	cmpld	r3, r10
	bgelr
	li	r3, 0
	blr

	/* r3 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as BYTE in the original
	   doubleword from the string.  Use that to calculate the pointer.
	   We need to make sure the match is at or after the start of the
	   range.  */
L(done):
	cntlzd	r9, r3	      /* Count leading zeros before the match.  */
	cmpld	r8, r0         /* Are we on the last word?  */
	srdi	r6, r9, 3	      /* Convert leading zeros to bytes.  */
	addi	r0, r6, -7
	sub	r3, r8, r0
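	/* r3 = r8 + 7 - r6: address of the highest matching byte in this
	   doubleword.  */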
	cmpld	cr7, r3, r10
	bnelr
	bgelr	cr7
	li	r3, 0
	blr

	.align	4
L(null):
	li	r3, 0
	blr

/* Deals with size <= 32.  */
	.align	4
L(small_range):
	cmpldi	r5, 0
	beq	L(null)

#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8      /* Load reversed doubleword from memory.  */
#endif
	cmpb	r3, r12, r4     /* Check for BYTE in DWORD1.  */
	and	r3, r3, r9
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)

	/* Are we done already?  */
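	/* The doubleword containing s is the last one to check.  If the
	   compare below says we just handled it, return NULL (r3 is
	   already 0 from the cmpb/and above).  */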
	cmpld	r8, r0
	addi	r8, r8, -8
	beqlr

	.align	5
L(loop_small):
#ifdef __LITTLE_ENDIAN__
	ldx	r12, 0, r8
#else
	ldbrx	r12, 0, r8
#endif
	cmpb	r3, r12, r4
	cmpld	r8, r0
	cmpldi	cr7, r3, 0
	bne	cr7, L(done)
	addi	r8, r8, -8
	bne	L(loop_small)
	blr

END (MEMRCHR)
weak_alias (__memrchr, memrchr)
libc_hidden_builtin_def (memrchr)