/* Optimized strnlen implementation for POWER8 using a vmx loop.

   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* The implementation uses the following heuristic:
	1. Case maxlen <= 32: align the pointer to 8 bytes and loop reading
	doublewords, using the POWER7 algorithm.
	2. Case maxlen > 32: check for null bytes in the first 16 bytes using
	unaligned accesses.  Return the length if one is found.  Otherwise:
		2.1 Case maxlen < 64: deduct the bytes previously read, align
		the pointer to 16 bytes and loop reading quadwords until null
		bytes are found or maxlen is reached.
		2.2 Case maxlen >= 64: deduct the bytes previously read, align
		the pointer to 64 bytes and set up a counter to loop reading
		in strides of 64 bytes.  If the loop finishes without finding
		null bytes, process the remaining bytes by switching to the
		16-byte loop of 2.1 (see the sketch below).  */
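
/* A rough C-level sketch of the heuristic above.  This is illustrative
   only: it is not the exact code emitted below, and the helper name
   power7_strnlen is a placeholder for the doubleword code at
   L(small_range), not a real routine.

     size_t strnlen (const char *s, size_t maxlen)
     {
       if (maxlen <= 32)
	 return power7_strnlen (s, maxlen);    - doubleword loop
       if (there is a null byte in the first 16 bytes of s)
	 return its offset;
       align s up to 16 bytes and deduct the bytes already read;
       if (maxlen >= 64)
	 {
	   align s up to 64 bytes, 16 bytes at a time;
	   scan 64 bytes per iteration until a null byte is found
	     or maxlen / 64 iterations have run;
	 }
       scan 16 bytes per iteration until a null byte is found or
	 maxlen is exhausted;
     }  */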

#include <sysdep.h>

/* Define default page size to 4KB.  */
#define PAGE_SIZE 4096


/* size_t [r3] strnlen (const char *s [r3], size_t maxlen [r4])  */
	.machine  power8
ENTRY_TOCLESS (__strnlen)
	CALL_MCOUNT 2
	dcbt	0,r3

	cmpldi	r4,32           /* Check if maxlen <= 32.  */
	ble	L(small_range)  /* If maxlen <= 32.  */

	/* The upcoming 16-byte unaligned accesses must not cross a page
	   boundary, otherwise the processor may raise a memory access
	   fault.  Use the following check to verify there is room for
	   such accesses:
	     ((size_t) s % PAGE_SIZE) > (PAGE_SIZE - 16)
	   If they are disallowed, switch to the code that handles the
	   string as if maxlen <= 32.  */
	clrldi	r10,r3,52
	cmpldi  cr7,r10,PAGE_SIZE-16
	bgt     cr7,L(small_range)	/* If within 16B of the page end.  */

	li	r7,0		/* Zero comparand for the cmpb checks below.  */
	/* Compute a bpermd permute constant in r8 to move bit 0 of each
	   byte into a halfword value, and count trailing zeros.  */
#ifdef __LITTLE_ENDIAN__
	li	r8,0x2820
	oris	r8,r8,0x3830
	sldi	r8,r8,32
	ori	r8,r8,0x0800
	oris	r8,r8,0x1810
#else
	li	r8,0x1018
	oris	r8,r8,0x0008
	sldi	r8,r8,32
	ori	r8,r8,0x3038
	oris	r8,r8,0x2028
#endif
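	/* Note: each byte of r8 is a bit index (0, 8, ..., 56), so the
	   bpermd in L(early_find) gathers bit 0 of every byte of the
	   cmpb result into an 8-bit group; the two doubleword groups
	   are then merged there into a 16-bit mask.  */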

	/* maxlen > 32. Optimistically check for null bytes in the first
	   16 bytes of the string using unaligned accesses.  */
	ld	r5,0(r3)
	ld	r6,8(r3)
	cmpb	r10,r7,r5		/* Check for null bytes in DWORD1.  */
	cmpb	r11,r7,r6		/* Check for null bytes in DWORD2.  */
	or.	r7,r10,r11
	bne	cr0, L(early_find)	/* If found null bytes.  */

	/* At this point maxlen > 32 and no null bytes were found in the
	   first 16 bytes.  Prepare for the loop using VMX.  */

	/* r3 == s, r4 == maxlen.  All other volatile regs are unused now.  */

	addi	r5,r3,16	/* Align up, or just add the 16B we
				   already checked.  */
	li	r0,15
	and	r7,r5,r0	/* Find the offset into 16B alignment.  */
	andc	r5,r5,r0	/* Align s up to the next quadword.  */
	li	r0,16
	subf	r0,r7,r0
	subf	r4,r0,r4	/* Deduct the unaligned bytes from maxlen.  */


	/* Compute offsets for vmx loads, and precompute the vbpermq
	   constants for both the 64B and 16B loops.  */
	li	r6,0
	vspltisb  v0,0
	vspltisb  v10,3
	lvsl	  v11,r6,r6
	vslb	  v10,v11,v10
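	/* With r6 == 0, lvsl yields the bytes {0, 1, ..., 15}; shifting
	   each left by 3 gives {0, 8, ..., 120}, the bit indices vbpermq
	   uses below to gather bit 0 of every byte into a 16-bit mask.
	   v0 is the all-zeros comparand for the null-byte compares.  */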

	cmpldi  r4,64		/* Check if maxlen < 64.  */
	blt	L(smaller)	/* If maxlen < 64.  */

	/* In order to enter the 64B loop, the address must be 64 bytes
	   aligned, so read quadwords until it is aligned or null bytes are
	   found.  In the worst case it becomes aligned after three reads,
	   so unroll the block to avoid checking a counter.  */
	andi.   r7,r5,63		/* Check if it is 64B aligned.  */
	beq     cr0,L(preloop_64B)	/* If it is already 64B aligned.  */
	lvx     v1,r5,r6
	vcmpequb.       v1,v1,v0
	addi    r5,r5,16
	addi    r4,r4,-16		/* Decrement maxlen by 16 bytes.  */
	bne     cr6,L(found_aligning64B) /* If found null bytes.  */

	/* Unroll the block above twice more, stopping if it becomes aligned
	   or null bytes are found.  */
	andi.   r7,r5,63
	beq     cr0,L(preloop_64B)
	lvx     v1,r5,r6
	vcmpequb.      v1,v1,v0
	addi    r5,r5,16
	addi    r4,r4,-16
	bne     cr6,L(found_aligning64B)

	andi.   r7,r5,63
	beq     cr0,L(preloop_64B)
	lvx     v1,r5,r6
	vcmpequb.      v1,v1,v0
	addi    r5,r5,16
	addi    r4,r4,-16
	bne     cr6,L(found_aligning64B)

	/* At this point the address is 64 bytes aligned.
	   Prepare for the 64B loop.  */
	.p2align 4
L(preloop_64B):
	/* Check if maxlen has become less than 64, which disallows the
	   64B loop.  If so, switch to the 16B loop code.  */
	cmpldi  r4,64		/* Check if maxlen < 64.  */
	blt     L(smaller)	/* If maxlen < 64.  */
	/* Set some constant values.  */
	li      r7,16
	li      r10,32
	li      r9,48

	/* Compute the number of 64-byte iterations needed.  */
	srdi	r11,r4,6	/* Compute loop count (maxlen / 64).  */
	andi.	r4,r4,63	/* Set maxlen to the remainder (maxlen % 64).  */
	mtctr	r11		/* Move loop count to the counter register.  */

	/* Handle maxlen >= 64.  Loop over the bytes in strides of 64B.  */
	.p2align 4
L(loop_64B):
	lvx	v1,r5,r6	/* r5 is the current pointer into s.  */
	lvx	v2,r5,r7
	lvx	v3,r5,r10
	lvx	v4,r5,r9
	/* Take the byte-wise minimum of the four 16B vectors so that any
	   null byte propagates into v7, then check v7 for null bytes.  */
	vminub	v5,v1,v2
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb. v7,v7,v0		/* Check for null bytes.  */
	addi	r5,r5,64		/* Advance the pointer to the next
					   iteration.  */
	bne	cr6,L(found_64B)	/* If found null bytes.  */
	bdnz	L(loop_64B)		/* Continue the loop if count > 0.  */

/* Hit the loop end without a null match; fall through to handle the
   remainder.  */

	/* Prepare a 16B loop to handle two cases:
		1. 32 < maxlen < 64 (the 64B loop was never entered).
		2. maxlen >= 64 and the 64B loop finished without finding
		null bytes; the remaining bytes are handled here.  */
	.p2align 4
L(smaller):
	cmpldi  r4,0            /* Check if maxlen is zero.  */
	beq     L(done)         /* If maxlen is zero.  */

	/* Place the remaining byte count (maxlen is below 64 at this
	   point) into a vmx register, and use some vector tricks to
	   minimize branching.  */
	mtvrd    v7,r4          /* Copy maxlen from GPR to vector register.  */
	vspltisb v5,1
	vspltisb v6,15
	vspltb   v2,v7,7        /* Splat the low byte of maxlen.  */
	vaddubs  v3,v5,v6       /* Form 16 in each byte.  */

#ifdef __LITTLE_ENDIAN__
	vspltish v5,1           /* Form 1 in each halfword, for the
				   trailing-zeros computation below.  */
#endif

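	/* Note on the loop below: v2 holds the remaining byte count and is
	   decremented by 16 (v3) with unsigned saturation each iteration.
	   Once it reaches zero, vminub forces zero bytes into v4, so the
	   null-byte compare also ends the loop when maxlen is exhausted;
	   the vsel after the loop then clamps the match offset to the
	   count saved in v7.  */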
	/* Loop in 16B aligned increments now.  */
	.p2align 4
L(loop_16B):
	lvx     v1,r5,r6        /* Load quadword into vector register.  */
	addi    r5,r5,16        /* Increment address to the next 16B block.  */
	vor     v7,v2,v2        /* Save loop count (v2) into v7.  */
	vsububs v2,v2,v3        /* Subtract 16B from count, saturate at 0.  */
	vminub  v4,v1,v2
	vcmpequb. v4,v4,v0      /* Check for null bytes.  */
	beq     cr6,L(loop_16B) /* If null bytes not found.  */

	vcmpequb  v1,v1,v0
	vbpermq   v1,v1,v10
#ifdef __LITTLE_ENDIAN__
	vsubuhm  v2,v1,v5       /* Form a mask of trailing zeros.  */
	vandc    v2,v2,v1
	vpopcnth v1,v2          /* Count of trailing zeros, 16 if none.  */
#else
	vclzh    v1,v1          /* Count the leading zeros, 16 if none.  */
#endif
	/* Truncate to the maximum allowable offset.  */
	vcmpgtub v2,v1,v7       /* Compare and truncate for matches beyond
				   maxlen.  */
	vsel     v1,v1,v7,v2    /* 0-16 is now in byte 7.  */

	mfvrd   r0,v1
	addi    r5,r5,-16       /* Undo the speculative bump.  */
	extsb   r0,r0           /* Clear whatever is in the high 56 bits.  */
	add     r5,r5,r0        /* Add the offset of whatever was found.  */
L(done):
	subf    r3,r3,r5        /* The length is the offset of the matched
				   null byte minus the pointer to s.  */
	blr                     /* Done.  */

	/* Handle the case of maxlen >= 64 where null bytes were found in
	   the last 64-byte block read.  */
	.p2align 4
L(found_64B):
	/* A zero was found.  Reduce the result.  */
	vcmpequb  v1,v1,v0
	vcmpequb  v2,v2,v0
	vcmpequb  v3,v3,v0
	vcmpequb  v4,v4,v0

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
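	/* Each bit of the merged result corresponds to one byte of the 64B
	   block just scanned, so the count of zeros before the first set
	   bit below is directly the byte offset of the null byte (no
	   bit-to-byte conversion is needed, unlike the doubleword code in
	   L(done_small)).  */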

	/* Adjust address to the start of the current 64B block.  */
	addi	r5,r5,-64

	mfvrd	r10,v4
#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1	/* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9		/* Count the bits in the mask.  */
#else
	cntlzd	r0,r10		/* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r5
	add	r3,r5,r0	/* Compute final length.  */
	blr                     /* Done.  */

	/* Handle the case where null bytes were found while aligning in
	   preparation for the 64B loop.  */
	.p2align 4
L(found_aligning64B):
	vbpermq v1,v1,v10
#ifdef __LITTLE_ENDIAN__
	mfvrd   r10,v1
	addi    r9,r10,-1       /* Form a mask from trailing zeros.  */
	andc    r9,r9,r10
	popcntd r0,r9           /* Count the bits in the mask.  */
#else
	vsldoi  v1,v1,v1,6
	mfvrd   r10,v1
	cntlzd  r0,r10          /* Count leading zeros before the match.  */
#endif
	addi    r5,r5,-16	/* Adjust the address back to the last 16
				   bytes read.  */
	/* Calculate the length: subtract the pointer to s from the offset
	   of the last 16 bytes read, then add the bytes before the match.  */
	subf    r5,r3,r5
	add     r3,r5,r0
	blr			/* Done.  */

	/* Handle the case of maxlen > 32 where a null byte was found within
	   the first 16 bytes of s.  */
	.p2align 4
L(early_find):
	bpermd	r5,r8,r10        /* r8 contains the bit permute constants.  */
	bpermd	r6,r8,r11
	sldi	r5,r5,8
	or	r5,r5,r6	/* r5 now holds a 16-bit mask marking
				   any null bytes.  */
	cntlzd	r5,r5		/* Count leading zeros.  */
	addi	r3,r5,-48	/* Deduct the 48 leading zeros always
				   present.  */
	blr			/* Done.  */

	/* Handle the case of maxlen <= 32.  Use the POWER7 algorithm.  */
	.p2align 4
L(small_range):
	clrrdi	r8,r3,3		/* Align the pointer to 8B.  */
	li	r0,0
	/* Registers' contents at this point:
	   r3 == pointer to s, r4 == maxlen, r8 == pointer to s aligned
	   to 8B, r7 == last acceptable address (set up below).  */
	cmpldi	r4,0                 /* Check if maxlen is zero.  */
	beq	L(end_max)	     /* If maxlen is zero.  */

	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   r7 = r3 + r4
	   r7 |= -(r7 < r3)  */
	add     r7,r3,r4
	subfc   r6,r3,r7
	subfe   r9,r9,r9
	extsw   r6,r9
	or      r7,r7,r6
	addi    r7,r7,-1

	clrrdi	r7,r7,3              /* Align the last acceptable address
					to 8B.  */

	rlwinm	r6,r3,3,26,28        /* Calculate padding.  */
	ld	r12,0(r8)            /* Load aligned doubleword.  */
	cmpb	r10,r12,r0           /* Check for null bytes.  */
	/* Discard any match bits that belong to bytes before s.  */
#ifdef __LITTLE_ENDIAN__
	srd	r10,r10,r6
	sld	r10,r10,r6
#else
	sld	r10,r10,r6
	srd	r10,r10,r6
#endif /* __LITTLE_ENDIAN__  */
	cmpldi	cr7,r10,0
	bne	cr7,L(done_small)    /* If found null byte.  */

	cmpld	r8,r7                /* Check if reached maxlen.  */
	beq	L(end_max)	     /* If reached maxlen.  */

	/* Still handling the case of maxlen <= 32.  Read aligned doublewords
	   until null bytes are found or maxlen is reached.  */
	.p2align 4
L(loop_small):
	ldu	r12,8(r8)         /* Load next doubleword and update r8.  */
	cmpb	r10,r12,r0        /* Check for null bytes.  */
	cmpldi	cr6,r10,0
	bne	cr6,L(done_small) /* If found null bytes.  */
	cmpld	r8,r7             /* Check if reached maxlen.  */
	bne	L(loop_small)	  /* If it has more bytes to read.  */
	mr	r3,r4             /* Reached maxlen with null bytes not found.
				     Length is equal to maxlen.  */
	blr			  /* Done.  */

	/* Still handling the case of maxlen <= 32.  Null bytes were found.
	   Registers: r10 == match bits within the doubleword, r8 == address
	   of the last doubleword read, r3 == pointer to s, r4 == maxlen.  */
	.p2align 4
L(done_small):
#ifdef __LITTLE_ENDIAN__
	/* Count trailing zeros.  */
	addi	r0,r10,-1
	andc	r0,r0,r10
	popcntd	r0,r0
#else
	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
#endif
	sub	r3,r8,r3      /* Calculate the bytes before the matched
				 doubleword.  */
	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
	add	r3,r3,r0      /* Length up to the match.  */
	cmpld	r3,r4         /* Check if the length exceeds maxlen.  */
	blelr
	mr	r3,r4	      /* If the length is greater than maxlen, return
				 maxlen.  */
	blr

	/* Handle the case where maxlen was reached without finding null
	   bytes.  */
	.p2align 4
L(end_max):
	mr	r3,r4	/* Length is equal to maxlen.  */
	blr		/* Done.  */


END (__strnlen)
libc_hidden_def (__strnlen)
weak_alias (__strnlen, strnlen)
libc_hidden_def (strnlen)