/* Optimized strrchr implementation for PowerPC64/POWER8 using cmpb insn.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* char *[r3] strrchr (char *s [r3], int c [r4])  */

#ifdef __LITTLE_ENDIAN__
/* Find the match position from v6 and place result in r6.  */
# define CALCULATE_MATCH() \
	vbpermq	v6, v6, v10; \
	vsldoi	v6, v6, v6, 6; \
	mfvrd	r7, v6; \
	cntlzd	r6, r7; \
	subfic	r6, r6, 15;
/*
 * Find the first null position to mask bytes after null.
 * (reg): vcmpequb result: v2 for 1st qw v3 for 2nd qw.
 * Result placed at v2.
 */
# define FIND_NULL_POS(reg) \
	vspltisb	v11, -1; \
	vadduqm	v11, reg, v11; \
	vandc	v11, v11, reg; \
	vpopcntd	v2, v11; \
	vspltb	v11, v2, 15; \
	vcmpequb.	v11, v11, v9; \
	blt	cr6, 1f; \
	vsldoi	v9, v0, v9, 1; \
	vslo	v2, v2, v9; \
1: \
	vsumsws	v2, v2, v0;
#else
/* Big-endian variant: count trailing zeros of the vbpermq mask to get
   the last (rightmost) match position into r6.  */
# define CALCULATE_MATCH() \
	vbpermq	v6, v6, v10; \
	mfvrd	r7, v6; \
	addi	r6, r7, -1; \
	andc	r6, r6, r7; \
	popcntd	r6, r6; \
	subfic	r6, r6, 15;
# define FIND_NULL_POS(reg) \
	vclzd	v2, reg; \
	vspltb	v11, v2, 7; \
	vcmpequb.	v11, v11, v9; \
	blt	cr6, 1f; \
	vsldoi	v9, v0, v9, 1; \
	vsro	v2, v2, v9; \
1: \
	vsumsws	v2, v2, v0;
#endif	/* !__LITTLE_ENDIAN__  */

#ifndef STRRCHR
# define STRRCHR strrchr
#endif
.machine  power8
ENTRY_TOCLESS (STRRCHR)
	CALL_MCOUNT 2
	dcbt	0,r3
	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
	cmpdi	cr7,r4,0
	ld	r12,0(r8)     /* Load doubleword from memory.  */
	li	r9,0	      /* Used to store last occurrence.  */
	li	r0,0	      /* Doubleword with null chars to use
				 with cmpb.  */

	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */

	beq	cr7,L(null_match)

	/* Replicate byte to doubleword.  */
	insrdi	r4,r4,8,48
	insrdi	r4,r4,16,32
	insrdi	r4,r4,32,0

	/* r4 is changed now.  If it's passed more chars, then
	   check for null again.  */
	cmpdi	cr7,r4,0
	beq	cr7,L(null_match)
	/* Now r4 has a doubleword of c bytes and r0 has
	   a doubleword of null bytes.  */

	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */

	/* Move the doublewords left and right to discard the bits that are
	   not part of the string and bring them back as zeros.  */
#ifdef __LITTLE_ENDIAN__
	srd	r10,r10,r6
	srd	r11,r11,r6
	sld	r10,r10,r6
	sld	r11,r11,r6
#else
	sld	r10,r10,r6
	sld	r11,r11,r6
	srd	r10,r10,r6
	srd	r11,r11,r6
#endif
	or	r5,r10,r11    /* OR the results to speed things up.  */
	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
				 have been found.  */
	bne	cr7,L(done)

L(align):
	andi.	r12, r8, 15

	/* Are we now aligned to a doubleword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */

	bne	cr0, L(loop)

	/* Handle WORD2 of pair.  */
	ldu	r12,8(r8)
	cmpb	r10,r12,r4
	cmpb	r11,r12,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	bne	cr7,L(done)
	b	L(loop)	      /* We branch here (rather than falling through)
				 to skip the nops due to heavy alignment
				 of the loop below.  */
	.p2align  5
L(loop):
	/* Load two doublewords, compare and merge in a
	   single register for speed.  This is an attempt
	   to speed up the null-checking process for bigger strings.  */
	ld	r12,8(r8)
	ldu	r7,16(r8)
	cmpb	r10,r12,r4
	cmpb	r11,r12,r0
	cmpb	r6,r7,r4
	cmpb	r7,r7,r0
	or	r12,r10,r11
	or	r5,r6,r7
	or	r5,r12,r5
	cmpdi	cr7,r5,0
	beq	cr7,L(vector)

	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a c/null byte.  */
	cmpdi	cr6,r12,0
	addi	r8,r8,-8
	bne	cr6,L(done)

	/* The c/null byte must be in the second doubleword.  Adjust the
	   address again and move the result of cmpb to r10 so we can calculate
	   the pointer.  */

	mr	r10,r6
	mr	r11,r7
	addi	r8,r8,8

	/* r10/r11 have the output of the cmpb instructions, that is,
	   0xff in the same position as the c/null byte in the original
	   doubleword from the string.  Use that to calculate the pointer.  */

L(done):
	/* If there are more than one 0xff in r11, find the first position of
	   0xff in r11 and fill r10 with 0 from that position.  */
	cmpdi	cr7,r11,0
	beq	cr7,L(no_null)
#ifdef __LITTLE_ENDIAN__
	addi	r3,r11,-1
	andc	r3,r3,r11
	popcntd r0,r3
#else
	cntlzd	r0,r11
#endif
	subfic	r0,r0,63
	li	r6,-1
#ifdef __LITTLE_ENDIAN__
	srd	r0,r6,r0
#else
	sld	r0,r6,r0
#endif
	and	r10,r0,r10
L(no_null):
#ifdef __LITTLE_ENDIAN__
	cntlzd	r0,r10		/* Count leading zeros before c matches.  */
	addi	r3,r10,-1
	andc	r3,r3,r10
	addi	r10,r11,-1
	andc	r10,r10,r11
	cmpld	cr7,r3,r10
	bgt	cr7,L(no_match)
#else
	addi	r3,r10,-1	/* Count trailing zeros before c matches.  */
	andc	r3,r3,r10
	popcntd	r0,r3
	cmpld	cr7,r11,r10
	bgt	cr7,L(no_match)
#endif
	srdi	r0,r0,3		/* Convert trailing zeros to bytes.  */
	subfic	r0,r0,7
	add	r9,r8,r0      /* Return address of the matching c byte
				 or null in case c was not found.  */
	li	r0,0
	cmpdi	cr7,r11,0     /* If r11 == 0, no null's have been found.  */
	beq	cr7,L(align)

	.align	4
L(no_match):
	mr	r3,r9
	blr

/* Check the first 32B in GPR's and move to vectorized loop.  */
	.p2align  5
L(vector):
	addi	r3, r8, 8
	/* Make sure 32B aligned.  */
	andi.	r10, r3, 31
	bne	cr0, L(loop)
	vspltisb	v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	mtvrd	v1, r4
	li	r5, 16
	vspltb	v1, v1, 7
	/* Compare 32 bytes in each loop.  */
L(continue):
	lvx	v4, 0, r3
	lvx	v5, r3, r5
	vcmpequb	v2, v0, v4
	vcmpequb	v3, v0, v5
	vcmpequb	v6, v1, v4
	vcmpequb	v7, v1, v5
	vor	v8, v2, v3
	vor	v9, v6, v7
	vor	v11, v8, v9
	vcmpequb.	v11, v0, v11
	addi	r3, r3, 32
	blt	cr6, L(continue)
	vcmpequb.	v8, v0, v8
	blt	cr6, L(match)

	/* One (or both) of the quadwords contains c/null.  */
	vspltisb	v8, 2
	vspltisb	v9, 5
	/* Precompute values used for comparison.  */
	vsl	v9, v8, v9	/* v9 = 0x4040404040404040.  */
	vaddubm	v8, v9, v9
	vsldoi	v8, v0, v8, 1	/* v8 = 0x80.  */

	/* Check if null is in second qw.  */
	vcmpequb.	v11, v0, v2
	blt	cr6, L(secondqw)

	/* Null found in first qw.  */
	addi	r8, r3, -32
	/* Calculate the null position.  */
	FIND_NULL_POS(v2)
	/* Check if null is in the first byte.  */
	vcmpequb.	v11, v0, v2
	blt	cr6, L(no_match)
	vsububm	v2, v8, v2
	/* Mask unwanted bytes after null.  */
#ifdef __LITTLE_ENDIAN__
	vslo	v6, v6, v2
	vsro	v6, v6, v2
#else
	vsro	v6, v6, v2
	vslo	v6, v6, v2
#endif
	vcmpequb.	v11, v0, v6
	blt	cr6, L(no_match)
	/* Found a match before null.  */
	CALCULATE_MATCH()
	add	r3, r8, r6
	blr

L(secondqw):
	addi	r8, r3, -16
	FIND_NULL_POS(v3)
	vcmpequb.	v11, v0, v2
	blt	cr6, L(no_match1)
	vsububm	v2, v8, v2
	/* Mask unwanted bytes after null.  */
#ifdef __LITTLE_ENDIAN__
	vslo	v7, v7, v2
	vsro	v7, v7, v2
#else
	vsro	v7, v7, v2
	vslo	v7, v7, v2
#endif
	vcmpequb.	v11, v0, v7
	blt	cr6, L(no_match1)
	addi	r8, r8, 16
	vor	v6, v0, v7
L(no_match1):
	addi	r8, r8, -16
	vcmpequb.	v11, v0, v6
	blt	cr6, L(no_match)
	/* Found a match before null.  */
	CALCULATE_MATCH()
	add	r3, r8, r6
	blr

L(match):
	/* One (or both) of the quadwords contains a match.  */
	mr	r8, r3
	vcmpequb.	v8, v0, v7
	blt	cr6, L(firstqw)
	/* Match found in second qw.  */
	addi	r8, r8, 16
	vor	v6, v0, v7
L(firstqw):
	addi	r8, r8, -32
	CALCULATE_MATCH()
	add	r9, r8, r6      /* Compute final length.  */
	b	L(continue)
/* We are here because strrchr was called with a null byte.  */
	.align	4
L(null_match):
	/* r0 has a doubleword of null bytes.  */

	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */

	/* Move the doublewords left and right to discard the bits that are
	   not part of the string and bring them back as zeros.  */
#ifdef __LITTLE_ENDIAN__
	srd	r5,r5,r6
	sld	r5,r5,r6
#else
	sld	r5,r5,r6
	srd	r5,r5,r6
#endif
	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
				 have been found.  */
	bne	cr7,L(done_null)

	andi.	r12, r8, 15

	/* Are we now aligned to a quadword boundary?  If so, skip to
	   the main loop.  Otherwise, go through the alignment code.  */

	bne	cr0, L(loop_null)

	/* Handle WORD2 of pair.  */
	ldu	r12,8(r8)
	cmpb	r5,r12,r0
	cmpdi	cr7,r5,0
	bne	cr7,L(done_null)
	b	L(loop_null)  /* We branch here (rather than falling through)
				 to skip the nops due to heavy alignment
				 of the loop below.  */

	/* Main loop to look for the end of the string.  Since it's a
	   small loop (< 8 instructions), align it to 32-bytes.  */
	.p2align  5
L(loop_null):
	/* Load two doublewords, compare and merge in a
	   single register for speed.  This is an attempt
	   to speed up the null-checking process for bigger strings.  */
	ld	r12,8(r8)
	ldu	r11,16(r8)
	cmpb	r5,r12,r0
	cmpb	r10,r11,r0
	or	r6,r5,r10
	cmpdi	cr7,r6,0
	beq	cr7,L(vector1)

	/* OK, one (or both) of the doublewords contains a null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a null byte.  */

	cmpdi	cr6,r5,0
	addi	r8,r8,-8
	bne	cr6,L(done_null)

	/* The null byte must be in the second doubleword.  Adjust the address
	   again and move the result of cmpb to r10 so we can calculate the
	   pointer.  */

	mr	r5,r10
	addi	r8,r8,8

	/* r5 has the output of the cmpb instruction, that is, it contains
	   0xff in the same position as the null byte in the original
	   doubleword from the string.  Use that to calculate the pointer.  */
L(done_null):
#ifdef __LITTLE_ENDIAN__
	addi	r0,r5,-1
	andc	r0,r0,r5
	popcntd	r0,r0
#else
	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
#endif
	srdi	r0,r0,3	      /* Convert trailing zeros to bytes.  */
	add	r3,r8,r0      /* Return address of the matching null byte.  */
	blr
/* Check the first 32B in GPR's and move to vectorized loop.  */
	.p2align  5
L(vector1):
	addi	r3, r8, 8
	/* Make sure 32B aligned.  */
	andi.	r10, r3, 31
	bne	cr0, L(loop_null)
	vspltisb	v0, 0
	/* Precompute vbpermq constant.  */
	vspltisb	v10, 3
	lvsl	v11, r0, r0
	vslb	v10, v11, v10
	li	r5, 16
	/* Compare 32 bytes in each loop.  */
L(continue1):
	lvx	v4, 0, r3
	lvx	v5, r3, r5
	vcmpequb	v2, v0, v4
	vcmpequb	v3, v0, v5
	vor	v8, v2, v3
	vcmpequb.	v11, v0, v8
	addi	r3, r3, 32
	blt	cr6, L(continue1)
	addi	r3, r3, -32
	vbpermq	v2, v2, v10
	vbpermq	v3, v3, v10
	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v3, v3, v3, 2
#else
	vsldoi	v2, v2, v2, 6
	vsldoi	v3, v3, v3, 4
#endif
	/* Merge the results and move to a GPR.  */
	vor	v4, v3, v2
	mfvrd	r5, v4
#ifdef __LITTLE_ENDIAN__
	addi	r6, r5, -1
	andc	r6, r6, r5
	popcntd	r6, r6
#else
	cntlzd	r6, r5  /* Count leading zeros before the match.  */
#endif
	add	r3, r3, r6      /* Compute final length.  */
	blr
END_GEN_TB (STRRCHR, TB_TOCLESS)
weak_alias (strrchr, rindex)
libc_hidden_builtin_def (strrchr)