1/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
2   Copyright (C) 2015-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19#include <sysdep.h>
20
21#ifdef USE_AS_STPNCPY
22# ifdef STPNCPY
23#  define FUNC_NAME STPNCPY
24# else
25#  define FUNC_NAME __stpncpy
26# endif
27#else
28# ifdef STRNCPY
29#  define FUNC_NAME STRNCPY
30# else
31#  define FUNC_NAME strncpy
32# endif
33#endif  /* USE_AS_STPNCPY  */
34
35#ifndef MEMSET
36/* MEMSET was not defined beforehand.  For builds without IFUNC support,
37   local calls should be made to the internal GLIBC symbol (created by
   libc_hidden_builtin_def); MEMSET_is_local records that choice.  */
38# ifndef SHARED
39#  define MEMSET   memset
40# else
41#  define MEMSET   __GI_memset
42#  define MEMSET_is_local
43# endif
44#endif
45
/* Size of the stack frame created around the memset call: the minimum
   frame plus 48 bytes, which covers the six doublewords (r26-r31) that the
   entry code stores at -48(r1) .. -8(r1).  */
46#define FRAMESIZE (FRAME_MIN_SIZE + 48)
47
48/* Implements the function
49
50   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
51
52   or
53
54   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
55
56   if USE_AS_STPNCPY is defined.
57
58   The implementation uses unaligned doubleword accesses to avoid specialized
59   code paths depending on data alignment.  Although recent powerpc64 uses
60   64K pages as default, the page cross handling assumes a minimum page size
61   of 4K.  */
62
63	.machine  power8
/* Function entry.  In: r3 = dest, r4 = src, r5 = n.
   ENTRY_TOCLESS is selected when the zero-padding memset call is local
   (MEMSET_is_local); otherwise the regular ENTRY is used.  */
64#ifdef MEMSET_is_local
65ENTRY_TOCLESS (FUNC_NAME, 4)
66#else
67ENTRY (FUNC_NAME, 4)
68#endif
69	CALL_MCOUNT 3
70
71        /* Check if [src]+16 will cross a 4K page by checking if the bit
72           indicating the page size changes.  Basically:
73
74           uint64_t srcin = (uint64_t)src;
75           uint64_t ob = srcin & 4096UL;
76           uint64_t nb = (srcin+16UL) & 4096UL;
77           if (ob ^ nb)
78             goto pagecross;

           Note that the code adds 16, not 15, so a 16-byte read that ends
           exactly on the page boundary is also treated as crossing.  The
           test is interleaved with the register saves below.  r10
           (src + 16) is left untouched afterwards and is consumed by
           L(align_to_16b).  */
79
80	addi	r10,r4,16		/* r10 = src + 16.  */
81	rlwinm	r9,r4,0,19,19		/* r9 = src & 4096.  */
82
83	/* Save some non-volatile registers on the stack.  They are stored at
	   negative offsets from r1; no frame is allocated at this point.  */
84	std	r26,-48(r1)
85	std	r27,-40(r1)
86
87	rlwinm	r8,r10,0,19,19		/* r8 = (src + 16) & 4096.  */
88
89	std	r28,-32(r1)
90	std	r29,-24(r1)
91
92	cmpld	cr7,r9,r8		/* Equal: the 4K bit did not change.  */
93
94	std	r30,-16(r1)
95	std	r31,-8(r1)
96
97	/* Update CFI.  */
98	cfi_offset(r26, -48)
99	cfi_offset(r27, -40)
100	cfi_offset(r28, -32)
101	cfi_offset(r29, -24)
102	cfi_offset(r30, -16)
103	cfi_offset(r31, -8)
104
	/* No page crossing: take the fast doubleword path.  */
105	beq	cr7,L(unaligned_lt_16)
	/* Possible page crossing.  r9 = src & 7 and r8 = 8 - r9, i.e. the
	   number of bytes from src up to the next doubleword boundary.  When
	   n is larger than that, go to the masked doubleword checks in
	   L(pagecross) (which reuses r9 and r8); otherwise fall through to the
	   byte loop, which then reads at most r8 bytes.  */
106	rldicl	r9,r4,0,61
107	subfic	r8,r9,8
108	cmpld	cr7,r5,r8
109	bgt 	cr7,L(pagecross)
110
111	/* At this point there is 1 to 15 bytes to check and write.  Since it could
112	   be either from first unaligned 16 bytes access or from bulk copy, the code
113	   uses an unrolled byte read/write instead of trying to analyze the cmpb
114	   results.

	   Register roles on the byte path:
	     r4 = source cursor, r9 = destination cursor,
	     r5 = number of bytes still to be written,
	     r3 = dest (not modified here unless USE_AS_STPNCPY is defined).  */
115L(short_path):
	/* Nothing has been copied yet: start writing at dest.  */
116	mr	r9,r3
117L(short_path_1):
118	/* Return if there are no more bytes to be written.  */
119	cmpdi	cr7,r5,0
120	beq	cr7,L(short_path_loop_end_1)
	/* Entering at L(short_path_2) skips the zero-count test above.  */
121L(short_path_2):
122	/* Copy one char from src (r4) and write it to dest (r9).  If it is the
123	   end-of-string, start the null padding.  Continue, otherwise.  */
124	lbz	r10,0(r4)
125	cmpdi	cr7,r10,0
126	stb	r10,0(r9)
127	beq	cr7,L(zero_pad_start_1)
128	/* If there are no more bytes to be written, return.  r8 (r9 + 1) and
	   r6 (r5 - 1) are the cursor and count that
	   L(zero_pad_start_prepare_1) hands to the padding code if the
	   second byte turns out to be the terminator.  */
129	cmpdi	cr0,r5,1
130	addi	r8,r9,1
131	addi	r6,r5,-1
132	beq	cr0,L(short_path_loop_end_0)
133	/* Copy another char from src (r4) to dest (r9).  Check again if it is
134	   the end-of-string.  If so, start the null padding.  */
135	lbz	r10,1(r4)
136	cmpdi	cr7,r10,0
137	stb	r10,1(r9)
138	beq	cr7,L(zero_pad_start_prepare_1)
139	/* Eagerly decrement r5 by 3, which is the number of bytes already
140	   written, plus one write that will be performed later on.  */
141	addi	r10,r5,-3
142	b	L(short_path_loop_1)
143
144	.align	4
145L(short_path_loop):
146	/* At this point, the induction variable, r5, as well as the pointers
147	   to dest and src (r9 and r4, respectively) have been updated.
148
149	   Note: The registers r7 and r10 are induction variables derived from
150	   r5.  They are used to determine if the total number of writes has
151	   been reached at every other write.
152
153	   Copy one char from src (r4) and write it to dest (r9).  If it is the
154	   end-of-string, start the null padding.  Continue, otherwise.  */
155	lbz	r8,0(r4)
156	addi	r7,r10,-2
157	cmpdi	cr5,r8,0
158	stb	r8,0(r9)
159	beq	cr5,L(zero_pad_start_1)
	/* cr7 was set by the cmpdi on r10 in L(short_path_loop_1): equal
	   means the byte just stored was the last one allowed.  */
160	beq	cr7,L(short_path_loop_end_0)
161	/* Copy another char from src (r4) to dest (r9).  Check again if it is
162	   the end-of-string.  If so, start the null padding.  */
163	lbz	r8,1(r4)
164	cmpdi	cr7,r8,0
165	stb	r8,1(r9)
166	beq	cr7,L(zero_pad_start)
167	mr	r10,r7
168L(short_path_loop_1):
169	/* This block is reached after two chars have been already written to
170	   dest.  Nevertheless, r5 (the induction variable), r9 (the pointer to
171	   dest), and r4 (the pointer to src) have not yet been updated.
172
173	   At this point:
174	     r5 holds the count of bytes yet to be written plus 2.
175	     r9 points to the last two chars that were already written to dest.
176	     r4 points to the last two chars that were already copied from src.
177
178	   The algorithm continues by decrementing r5, the induction variable,
179	   so that it reflects the last two writes.  The pointers to dest (r9)
180	   and to src (r4) are incremented by two, for the same reason.
181
182	   Note: Register r10 is another induction variable, derived from r5,
183	   which determines if the total number of writes has been reached.
	   r6 (r9 + 1) is the padding start that L(zero_pad_start) uses when
	   the second byte of the next pair is the terminator.  */
184	addic.	r5,r5,-2
185	addi	r9,r9,2
186	cmpdi	cr7,r10,0 /* Eagerly check if the next write is the last.  */
187	addi	r4,r4,2
188	addi	r6,r9,1
189	bne	cr0,L(short_path_loop) /* Check if the total number of writes
190					  has been reached at every other
191					  write.  */
	/* r5 reached zero: all n bytes were written without meeting a
	   terminator.  For stpncpy return r9, which was already advanced past
	   the last byte written.  */
192#ifdef USE_AS_STPNCPY
193	mr	r3,r9
194	b	L(short_path_loop_end)
195#endif
196
	/* Reached right after storing at 0(r9) the last byte allowed by n:
	   stpncpy returns r9 + 1.  */
197L(short_path_loop_end_0):
198#ifdef USE_AS_STPNCPY
199	addi	r3,r9,1
200	b	L(short_path_loop_end)
201#endif
	/* Reached with r5 == 0 and nothing stored at r9: stpncpy returns r9.  */
202L(short_path_loop_end_1):
203#ifdef USE_AS_STPNCPY
204	mr	r3,r9
205#endif
	/* Return without padding.  No frame was created on this path, so the
	   registers are reloaded from the same negative offsets used at
	   entry.  */
206L(short_path_loop_end):
207	/* Restore non-volatile registers.  */
208	ld	r26,-48(r1)
209	ld	r27,-40(r1)
210	ld	r28,-32(r1)
211	ld	r29,-24(r1)
212	ld	r30,-16(r1)
213	ld	r31,-8(r1)
214	blr
215
216	/* This code pads the remainder of dest with NULL bytes.  The algorithm
217	   calculates the remaining size and calls memset.  */
218	.align	4
	/* Entry used by L(short_path_loop) when the second byte of a pair is
	   the terminator: r10 holds the remaining count and r6 the address of
	   that byte.  */
219L(zero_pad_start):
220	mr	r5,r10
221	mr	r9,r6
222L(zero_pad_start_1):
223	/* At this point:
224	     - r5 holds the number of bytes that still have to be written to
225	       dest.
226	     - r9 points to the position, in dest, where the first null byte
227	       will be written.
228	   The above statements are true both when control reaches this label
229	   from a branch or when falling through the previous lines.
	   r5 is left as is and therefore becomes the length argument of the
	   memset call.  */
230#ifndef USE_AS_STPNCPY
231	mr	r30,r3       /* Save the return value of strncpy.  */
232#endif
233	/* Prepare the call to memset.  */
234	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
235	li	r4,0         /* Byte to be written (zero).  */
236
237	/* We delayed the creation of the stack frame, as well as the saving of
238	   the link register, because only at this point, we are sure that
239	   doing so is actually needed.  */
240
241	/* Save the link register.  */
242	mflr	r0
243	std	r0,16(r1)
244
245	/* Create the stack frame.  The registers stored at entry below the
	   old r1 now sit at FRAMESIZE-48 .. FRAMESIZE-8 from the new r1.  */
246	stdu	r1,-FRAMESIZE(r1)
247	cfi_adjust_cfa_offset(FRAMESIZE)
248	cfi_offset(lr, 16)
249
250	bl	MEMSET
	/* NOTE(review): the nop is emitted only for a non-local memset;
	   presumably it is the slot the ABI reserves after such a call for
	   the TOC restore -- confirm against the ABI document.  */
251#ifndef MEMSET_is_local
252	nop
253#endif
254
	/* Reload the saved link register value from the caller frame.  */
255	ld	r0,FRAMESIZE+16(r1)
256
257#ifndef USE_AS_STPNCPY
258	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
259				dest.  For stpncpy, the return value is the
260				same as return value of memset.  */
261#endif
262
263	/* Restore non-volatile registers and return.  */
264	ld	r26,FRAMESIZE-48(r1)
265	ld	r27,FRAMESIZE-40(r1)
266	ld	r28,FRAMESIZE-32(r1)
267	ld	r29,FRAMESIZE-24(r1)
268	ld	r30,FRAMESIZE-16(r1)
269	ld	r31,FRAMESIZE-8(r1)
270	/* Restore the stack frame.  */
271	addi	r1,r1,FRAMESIZE
272	cfi_adjust_cfa_offset(-FRAMESIZE)
273	/* Restore the link register.  */
274	mtlr	r0
275	cfi_restore(lr)
276	blr
277
278	/* The common case where [src]+16 will not cross a 4K page boundary.
279	   In this case the code quickly checks the first 16 bytes by using
280	   doubleword read/compares and updates the destination if neither the
281	   total size is reached nor a null byte is found in the source.

	   In:  r3 = dest, r4 = src, r5 = n.
	   Out (towards L(align_to_16b)): 16 bytes copied, r29 = dest + 16,
	   r5 = n - 16.  */
282	.align	4
283L(unaligned_lt_16):
	/* With n <= 7 a whole doubleword must not be stored: use the byte
	   loop.  */
284	cmpldi	cr7,r5,7
285	ble	cr7,L(short_path)
286	ld	r7,0(r4)
287	li	r8,0
	/* cmpb against zero: r8 becomes nonzero iff some byte of the first
	   doubleword is null.  */
288	cmpb	r8,r7,r8
289	cmpdi	cr7,r8,0
290	bne	cr7,L(short_path_prepare_2)
	/* First doubleword is clean: store it.  r6 = n - 8, r9 = dest + 8 and
	   r7 = src + 8 describe the state after those 8 bytes, in the form
	   used by the L(short_path_prepare_*_1) stubs.  */
291	addi	r6,r5,-8
292	std	r7,0(r3)
293	addi	r9,r3,8
294	cmpldi	cr7,r6,7
295	addi	r7,r4,8
296	ble	cr7,L(short_path_prepare_1_1)
	/* Second doubleword.  r4 is overwritten with the loaded data; r7 still
	   holds src + 8.  r8 is zero here (the first cmpb found no null
	   byte), so it is again the comparison pattern.  */
297	ld	r4,8(r4)
298	cmpb	r8,r4,r8
299	cmpdi	cr7,r8,0
300	bne	cr7,L(short_path_prepare_2_1)
301	std	r4,8(r3)
302	addi	r29,r3,16
303	addi	r5,r5,-16
304	/* Neither the null byte was found nor the total length was reached,
305	   align to 16 bytes and issue a bulk copy/compare.  */
306	b	L(align_to_16b)
307
308	/* In the case of 4k page boundary cross, the algorithm first aligns
309	   the address, calculates a mask based on the alignment to ignore the
310	   bytes that precede the string and continues using doublewords.

	   In (from the entry code): r4 = src, r5 = n, r9 = src & 7,
	   r8 = 8 - (src & 7), and n > r8.
	   Every exit through L(short_path_prepare_2) restarts the copy from
	   the very beginning with the byte loop; nothing has been stored
	   before that.  */
311	.align	4
312L(pagecross):
	/* NOTE(review): a mask end of 59 clears the low four address bits, so
	   r11 is 16-byte aligned, while the shift count below comes from
	   src & 7.  When src & 8 is set, r11 is the doubleword before the one
	   holding src and the masked test looks at bytes preceding the
	   string.  That appears to be merely conservative (a stray null there
	   only sends the copy to the byte loop), and it matches the four
	   doublewords / 32 bytes examined below -- confirm.  */
313	rldicr	r11,r4,0,59	/* Round src down to a 16-byte boundary.  */
314	li	r6,-1		/* MASK = 0xffffffffffffffffUL.  */
315	sldi	r9,r9,3		/* Calculate padding.  */
316	ld	r7,0(r11)	/* Load doubleword from memory.  */
317#ifdef __LITTLE_ENDIAN__
318	sld	r9,r6,r9	/* MASK = MASK << padding.  */
319#else
320	srd	r9,r6,r9	/* MASK = MASK >> padding.  */
321#endif
322	orc	r9,r7,r9	/* Mask bits that are not part of the
323				   string.  */
324	li	r7,0
325	cmpb	r9,r9,r7	/* Check for null bytes in DWORD1.  */
326	cmpdi	cr7,r9,0
327	bne	cr7,L(short_path_prepare_2)
328	subf	r8,r8,r5	/* Adjust total length.  */
329	cmpldi	cr7,r8,8	/* Check if length was reached.  */
330	ble	cr7,L(short_path_prepare_2)
331
332	/* For the next checks we have an aligned address, so we check three
333	   more doublewords to make sure we can read 16 unaligned bytes
334	   to start the bulk copy with 16 aligned addresses.
	   r9 is zero whenever a cmpb below is reached (the previous cmpb
	   result was tested and found clean), so it doubles as the
	   comparison pattern.  Each load is preceded by a length check so
	   that no doubleword beyond the requested size is read.  */
335	ld	r7,8(r11)
336	cmpb	r9,r7,r9
337	cmpdi	cr7,r9,0
338	bne	cr7,L(short_path_prepare_2)
339	addi	r7,r8,-8
340	cmpldi	cr7,r7,8
341	ble	cr7,L(short_path_prepare_2)
342	ld	r7,16(r11)
343	cmpb	r9,r7,r9
344	cmpdi	cr7,r9,0
345	bne	cr7,L(short_path_prepare_2)
346	addi	r8,r8,-16
347	cmpldi	cr7,r8,8
348	ble	cr7,L(short_path_prepare_2)
349	ld	r8,24(r11)
350	cmpb	r9,r8,r9
351	cmpdi	cr7,r9,0
352	bne	cr7,L(short_path_prepare_2)
353
354	/* No null byte found in the 32 bytes read and length not reached,
355	   read source again using unaligned loads and store them.  This
	   leaves r29 = dest + 16 and r5 = n - 16 for the code that follows.  */
356	ld	r9,0(r4)
357	addi	r29,r3,16
358	addi	r5,r5,-16
359	std	r9,0(r3)
360	ld	r9,8(r4)
361	std	r9,8(r3)
362
363	/* Align source to 16 bytes and adjust destination and size.

	   In:  r10 = src + 16 (computed at function entry), r29 = dest + 16,
		r5 = n - 16; the first 16 bytes have already been copied.
	   Out: r28 = r10 rounded down to a 16-byte boundary,
		r29 = destination address that corresponds to r28,
		r12 = bytes still to be written, counted from r28.
	   The r9 bytes between r28 and src + 16 are simply copied again.  */
364L(align_to_16b):
365	rldicl	r9,r10,0,60	/* r9 = (src + 16) & 15.  */
366	rldicr	r28,r10,0,59	/* r28 = (src + 16) & ~15.  */
367	add	r12,r5,r9
368	subf	r29,r9,r29
369
370	/* The bulk read/compare/copy loads two doublewords, compare and merge
371	   in a single register for speed.  This is an attempt to speed up the
372	   null-checking process for bigger strings.  */
373
	/* Fewer than 16 bytes left: finish with the byte loop.  */
374	cmpldi	cr7,r12,15
375	ble	cr7,L(short_path_prepare_1_2)
376
377	/* Main loop for large sizes, unrolled 2 times to get better use of
378	   pipeline.  The first 16 bytes are handled here, ahead of the loop.
	   The base register is written as r28, like every other operand in
	   this file, instead of the bare number 28.  */
379	ld	r8,0(r28)
380	ld	r10,8(r28)
381	li	r9,0
382	cmpb	r7,r8,r9
383	cmpb	r9,r10,r9
384	or.	r6,r9,r7	/* Nonzero iff either doubleword has a null.  */
385	bne	cr0,L(short_path_prepare_2_3)
	/* Clean: store both doublewords.  r5/r4/r9 now describe the state
	   after these 16 bytes in the form the byte loop expects.  */
386	addi	r5,r12,-16
387	addi	r4,r28,16
388	std	r8,0(r29)
389	std	r10,8(r29)
390	cmpldi	cr7,r5,15
391	addi	r9,r29,16
392	ble	cr7,L(short_path_1)
	/* Loop setup: r11/r6 are the source/destination bases of the current
	   32-byte chunk and r30 is the zero pattern for cmpb.  r26 and r27
	   are bias values: L(loop_start) adds r26 and then r28 to r4 (and r27
	   then r29 to r9), which nets an advance of 32 bytes per iteration.  */
393	mr	r11,r28
394	mr	r6,r29
395	li	r30,0
396	subfic	r26,r4,48
397	subfic	r27,r9,48
398
399	b	L(loop_16b)
400
401	.align	4
	/* Bulk loop, 32 bytes per round trip, entered at L(loop_16b).
	     r11, r6     source/destination base of the current chunk.
	     r12         bytes left, counted from r11/r6 as they are at
			 L(loop_start).
	     r4, r9, r5  source, destination and count in the form expected
			 by the byte loop; they describe the position 16 bytes
			 past r11/r6 when L(loop_16b) is entered.
	     r30         zero, the cmpb pattern in L(loop_16b).
	     r7          zero on entry to L(loop_start) (it is the or. result
			 of a clean L(loop_16b) pass), so it serves as the
			 cmpb pattern there.  */
402L(loop_start):
403	ld	r31,0(r11)
404	ld	r10,8(r11)
405	cmpb	r0,r31,r7
406	cmpb	r8,r10,r7
407	or.	r7,r0,r8
408	addi	r5,r5,-32
409	cmpldi	cr7,r5,15
410	add	r4,r4,r26
411	add	r9,r9,r27
	/* A null byte was found in these 16 bytes: the stub reloads r5, r4
	   and r9 from r12, r11 and r6, so the partial updates above do not
	   matter.  */
412	bne	cr0,L(short_path_prepare_2_2)
413	add	r4,r28,r4
414	std	r31,0(r6)
415	add	r9,r29,r9
416	std	r10,8(r6)
	/* Fewer than 16 bytes left after this half: finish byte by byte.  */
417	ble	cr7,L(short_path_1)
418
419L(loop_16b):
	/* Second half of the chunk, at offsets 16 and 24 from r11/r6.  */
420	ld	r10,16(r11)
421	ld	r0,24(r11)
422	cmpb	r8,r10,r30
423	cmpb	r7,r0,r30
424	or.	r7,r8,r7
425	addi	r12,r12,-32
426	cmpldi	cr7,r12,15
427	addi	r11,r11,32
	/* A null byte was found: r4, r9 and r5 already describe these 16
	   bytes, so the byte loop is entered directly.  */
428	bne	cr0,L(short_path_2)
429	std	r10,16(r6)
430	addi	r6,r6,32
431	std	r0,-8(r6)	/* Offset 24 of the chunk; r6 was advanced.  */
432	bgt	cr7,L(loop_start)
433
	/* At most 15 bytes left: hand the chunk-based state over to the byte
	   loop.  */
434	mr	r5,r12
435	mr	r4,r11
436	mr	r9,r6
437	b	L(short_path_1)
438
439	.align	4
	/* Hand-over stubs.  Each one moves the count, source and destination
	   from the registers in which its caller kept them into r5, r4 and
	   r9, and then enters the byte loop: L(short_path_1) when the count
	   may be zero, L(short_path_2) when that test is skipped.  */

	/* From L(unaligned_lt_16) after the first doubleword was stored:
	   r6 = n - 8, r7 = src + 8; r9 already is dest + 8.  */
440L(short_path_prepare_1_1):
441	mr	r5,r6
442	mr	r4,r7
443	b	L(short_path_1)
	/* From L(align_to_16b) with fewer than 16 bytes left.  */
444L(short_path_prepare_1_2):
445	mr	r5,r12
446	mr	r4,r28
447	mr	r9,r29
448	b	L(short_path_1)
	/* Nothing stored yet; r4 and r5 still hold src and n.  */
449L(short_path_prepare_2):
450	mr	r9,r3
451	b	L(short_path_2)
	/* From L(unaligned_lt_16): null byte in the second doubleword.  r9
	   already is dest + 8.  */
452L(short_path_prepare_2_1):
453	mr	r5,r6
454	mr	r4,r7
455	b	L(short_path_2)
	/* From L(loop_start): null byte in the 16 bytes at r11.  */
456L(short_path_prepare_2_2):
457	mr	r5,r12
458	mr	r4,r11
459	mr	r9,r6
460	b	L(short_path_2)
	/* From the code after L(align_to_16b): null byte in the 16 bytes at
	   r28.  */
461L(short_path_prepare_2_3):
462	mr	r5,r12
463	mr	r4,r28
464	mr	r9,r29
465	b	L(short_path_2)
	/* From L(short_path_2): the second byte was the terminator.
	   r6 = count from that byte on, r8 = its address in dest.  */
466L(zero_pad_start_prepare_1):
467	mr	r5,r6
468	mr	r9,r8
469	b	L(zero_pad_start_1)
470END (FUNC_NAME)
471
/* Only the strncpy build (USE_AS_STPNCPY not defined) emits the hidden
   builtin definition for strncpy.  */
472#ifndef USE_AS_STPNCPY
473libc_hidden_builtin_def (strncpy)
474#endif
475