1/* Optimized strlen implementation for POWER10 LE.
2   Copyright (C) 2021-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
/* To reuse the code for rawmemchr, we have some extra steps compared to the
   strlen implementation:
      - Sum the initial value of r3 with the position at which the char was
        found, to guarantee we return a pointer and not the length.
      - In the main loop, subtract each byte by the char we are looking for,
        so we can keep using vminub to quickly check 64B at once.  */
#ifdef USE_AS_RAWMEMCHR
# ifndef RAWMEMCHR
#  define FUNCNAME __rawmemchr
# else
#  define FUNCNAME RAWMEMCHR
# endif
/* rawmemchr (s, c) takes two arguments.  */
# define MCOUNT_NARGS 2
/* v18 holds the char we are searching for (set up in the prologue), so the
   all-zeros register has to live elsewhere.  */
# define VREG_ZERO v20
/* Number of bytes probed in 16B chunks before entering the 64B main loop.  */
# define OFF_START_LOOP 256
/* Subtract the searched char from every loaded byte: a matching byte becomes
   0x00, which lets the main loop keep merging results with vminub and
   comparing against zero.  */
# define RAWMEMCHR_SUBTRACT_VECTORS \
	vsububm   v4,v4,v18;	    \
	vsububm   v5,v5,v18;	    \
	vsububm   v6,v6,v18;	    \
	vsububm   v7,v7,v18;
/* Found a match in vreg at a 16B chunk starting 'increment' bytes past the
   aligned base in r5: return r5 + increment + (index of first match).  */
# define TAIL(vreg,increment)	   \
	vctzlsbb  r4,vreg;	   \
	addi	  r4,r4,increment; \
	add	  r3,r5,r4;	   \
	blr

#else /* strlen */

# ifndef STRLEN
#  define FUNCNAME __strlen
#  define DEFINE_STRLEN_HIDDEN_DEF 1
# else
#  define FUNCNAME STRLEN
# endif
/* strlen (s) takes a single argument.  */
# define MCOUNT_NARGS 1
/* v18 is free here (the sentinel is NUL), so it doubles as the zero VR.  */
# define VREG_ZERO v18
# define OFF_START_LOOP 192
/* Found a NUL in vreg: length = (r5 - r3) bytes handled before the aligned
   base, plus the chunk offset 'increment', plus the index of the NUL.  */
# define TAIL(vreg,increment)	   \
	vctzlsbb  r4,vreg;	   \
	subf	  r3,r3,r5;	   \
	addi	  r4,r4,increment; \
	add	  r3,r3,r4;	   \
	blr
#endif /* USE_AS_RAWMEMCHR */
65
/* TODO: Replace macros by the actual instructions when minimum binutils becomes
   >= 2.35.  This is used to keep compatibility with older versions.  */

/* Emit "vextractbm rt,vrb" (POWER10) as a raw instruction word so older
   assemblers accept it: extracts the first bit of each byte of vrb into the
   low 16 bits of GPR rt.  */
#define VEXTRACTBM(rt,vrb)	 \
	.long(((4)<<(32-6))	 \
	      | ((rt)<<(32-11))	 \
	      | ((8)<<(32-16))	 \
	      | ((vrb)<<(32-21)) \
	      | 1602)

/* Emit "lxvp xtp,dq(ra)" (POWER10) as a raw instruction word: loads a 32B
   VSR pair starting at VSR xtp from ra+dq.  */
#define LXVP(xtp,dq,ra)		   \
	.long(((6)<<(32-6))		   \
	      | ((((xtp)-32)>>1)<<(32-10)) \
	      | ((1)<<(32-11))		   \
	      | ((ra)<<(32-16))		   \
	      | dq)

/* Load 16B from addr+offset into vreg, compare every byte against the
   sentinel in v18, and branch to label when at least one byte matched
   (cr6 is set by the record-form vcmpequb).  */
#define CHECK16(vreg,offset,addr,label) \
	lxv	  vreg+32,offset(addr);	\
	vcmpequb. vreg,vreg,v18;	\
	bne	  cr6,L(label);

/* Load 4 quadwords, merge into one VR for speed and check for NULLs.  r6 has #
   of bytes already checked, i.e. the offset within the current 256B
   iteration, which L(tail_64b) adds back in.  */
#define CHECK64(offset,addr,label)	    \
	li	  r6,offset;		    \
	LXVP(v4+32,offset,addr);	    \
	LXVP(v6+32,offset+32,addr);	    \
	RAWMEMCHR_SUBTRACT_VECTORS;	    \
	vminub	  v14,v4,v5;		    \
	vminub	  v15,v6,v7;		    \
	vminub	  v16,v14,v15;		    \
	vcmpequb. v0,v16,VREG_ZERO;	    \
	bne	  cr6,L(label)
99
/* Implements the function

   size_t [r3] strlen (const void *s [r3])

   but when USE_AS_RAWMEMCHR is set, implements the function

   void* [r3] rawmemchr (const void *s [r3], int c [r4])

   The implementation can load bytes past a matching byte, but only
   up to the next 64B boundary, so it never crosses a page.  */
110
/* The POWER10-only instructions are emitted as raw words by the macros
   above, so power9 is sufficient for what the assembler sees directly.  */
.machine power9

ENTRY_TOCLESS (FUNCNAME, 4)
	CALL_MCOUNT MCOUNT_NARGS

#ifdef USE_AS_RAWMEMCHR
	/* r5 = c ^ 0xff differs from c in the low byte, so it is safe
	   padding for the bytes the unaligned prologue does not load.  */
	xori	r5,r4,0xff

	mtvsrd	v18+32,r4	/* matching char in v18  */
	mtvsrd	v19+32,r5	/* non matching char in v19  */

	vspltb	v18,v18,7	/* replicate  */
	vspltb	v19,v19,7	/* replicate  */
#else
	/* strlen matches NUL (v18 is zeroed just below as VREG_ZERO), so
	   pad with 0xff, which can never compare equal to zero.  */
	vspltisb  v19,-1
#endif
	vspltisb  VREG_ZERO,0

	/* Next 16B-aligned address. Prepare address for L(aligned).  */
	addi	  r5,r3,16
	clrrdi	  r5,r5,4

	/* Align data and fill bytes not loaded with non matching char.	 */
	lvx	  v0,0,r3
	lvsr	  v1,0,r3
	vperm	  v0,v19,v0,v1

	/* Check the bytes between s and the first 16B boundary.  */
	vcmpequb. v6,v0,v18
	beq	  cr6,L(aligned)

#ifdef USE_AS_RAWMEMCHR
	vctzlsbb  r6,v6		/* r6 = index of the first match.  */
	add	  r3,r3,r6	/* Return s + index (a pointer).  */
#else
	vctzlsbb  r3,v6		/* The index of the NUL is the length.  */
#endif
	blr

	/* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
	   optimized for longer strings, so checking the first bytes in 16B
	   chunks benefits small strings a lot.  */
	.p2align 5
L(aligned):
#ifdef USE_AS_RAWMEMCHR
	cmpdi	cr5,r4,0	/* Check if c == 0.  This will be useful to
				  choose how we will perform the main loop.  */
#endif
	/* Prepare address for the loop: first 64B boundary at or after
	   s + OFF_START_LOOP.  */
	addi	  r4,r3,OFF_START_LOOP
	clrrdi	  r4,r4,6

	CHECK16(v0,0,r5,tail1)
	CHECK16(v1,16,r5,tail2)
	CHECK16(v2,32,r5,tail3)
	CHECK16(v3,48,r5,tail4)
	CHECK16(v4,64,r5,tail5)
	CHECK16(v5,80,r5,tail6)
	CHECK16(v6,96,r5,tail7)
	CHECK16(v7,112,r5,tail8)
	CHECK16(v8,128,r5,tail9)
	CHECK16(v9,144,r5,tail10)
	CHECK16(v10,160,r5,tail11)
#ifdef USE_AS_RAWMEMCHR
	CHECK16(v0,176,r5,tail12)
	CHECK16(v1,192,r5,tail13)
	CHECK16(v2,208,r5,tail14)
	CHECK16(v3,224,r5,tail15)
#endif

	/* Second loop pointer, 128B past the first (see loop comment).  */
	addi	  r5,r4,128

#ifdef USE_AS_RAWMEMCHR
	/* If c == 0, use the same loop as strlen, without the vsububm.  */
	beq	cr5,L(loop)

	/* This is very similar to the block after L(loop), the difference is
	   that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
	   each byte loaded by the char we are looking for, this way we can keep
	   using vminub to merge the results and checking for nulls.  */
	.p2align 5
L(rawmemchr_loop):
	CHECK64(0,r4,pre_tail_64b)
	CHECK64(64,r4,pre_tail_64b)
	addi	  r4,r4,256

	CHECK64(0,r5,tail_64b)
	CHECK64(64,r5,tail_64b)
	addi	  r5,r5,256

	b	  L(rawmemchr_loop)
#endif
	/* Switch to a more aggressive approach checking 64B each time.  Use 2
	   pointers 128B apart and unroll the loop once to make the pointer
	   updates and usages separated enough to avoid stalls waiting for
	   address calculation.  */
	.p2align 5
L(loop):
#undef RAWMEMCHR_SUBTRACT_VECTORS
#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
	CHECK64(0,r4,pre_tail_64b)
	CHECK64(64,r4,pre_tail_64b)
	addi	  r4,r4,256

	CHECK64(0,r5,tail_64b)
	CHECK64(64,r5,tail_64b)
	addi	  r5,r5,256

	b	  L(loop)

	.p2align  5
L(pre_tail_64b):
	/* The match was found through r4; fall through with r5 = r4 so the
	   tail code below only has to deal with one base pointer.  */
	mr	r5,r4
L(tail_64b):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
	   low 16B bytes into vx+1, and the high into vx, so the order here is
	   v5, v4, v7, v6.  */
	vcmpequb  v1,v5,VREG_ZERO
	vcmpequb  v2,v4,VREG_ZERO
	vcmpequb  v3,v7,VREG_ZERO
	vcmpequb  v4,v6,VREG_ZERO

	/* Take into account the other 64B blocks we had already checked.
	   r6 was set by CHECK64 to the offset of the matching block.  */
	add	r5,r5,r6

	/* Extract first bit of each byte.  */
	VEXTRACTBM(r7,v1)
	VEXTRACTBM(r8,v2)
	VEXTRACTBM(r9,v3)
	VEXTRACTBM(r10,v4)

	/* Shift each value into their corresponding position.  */
	sldi	  r8,r8,16
	sldi	  r9,r9,32
	sldi	  r10,r10,48

	/* Merge the results.  */
	or	  r7,r7,r8
	or	  r8,r9,r10
	or	  r10,r8,r7

	cnttzd	  r0,r10	  /* r0 = byte index of the match within the
				     64B block (trailing zeros before the
				     first set mask bit).  */
#ifndef USE_AS_RAWMEMCHR
	subf	  r5,r3,r5	  /* strlen: convert pointer to offset.  */
#endif
	add	  r3,r5,r0	  /* Compute final length (or pointer for
				     rawmemchr).  */
	blr

	/* 16B-chunk exits: L(tailN) handles a match found by the Nth CHECK16
	   above; the TAIL immediate is that chunk's offset from the aligned
	   base kept in r5.  */
	.p2align  5
L(tail1):
	TAIL(v0,0)

	.p2align  5
L(tail2):
	TAIL(v1,16)

	.p2align  5
L(tail3):
	TAIL(v2,32)

	.p2align  5
L(tail4):
	TAIL(v3,48)

	.p2align  5
L(tail5):
	TAIL(v4,64)

	.p2align  5
L(tail6):
	TAIL(v5,80)

	.p2align  5
L(tail7):
	TAIL(v6,96)

	.p2align  5
L(tail8):
	TAIL(v7,112)

	.p2align  5
L(tail9):
	TAIL(v8,128)

	.p2align  5
L(tail10):
	TAIL(v9,144)

	.p2align  5
L(tail11):
	TAIL(v10,160)

#ifdef USE_AS_RAWMEMCHR
	.p2align  5
L(tail12):
	TAIL(v0,176)

	.p2align  5
L(tail13):
	TAIL(v1,192)

	.p2align  5
L(tail14):
	TAIL(v2,208)

	.p2align  5
L(tail15):
	TAIL(v3,224)
#endif

END (FUNCNAME)
322
/* Export the public name as a weak alias of the implementation and provide
   the hidden definition used by internal callers.  */
#ifdef USE_AS_RAWMEMCHR
weak_alias (__rawmemchr,rawmemchr)
libc_hidden_builtin_def (__rawmemchr)
#else
# ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
# endif
#endif
332