/* Optimized strcpy implementation for PowerPC64/POWER9.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Select the symbol name this file defines.

   With USE_AS_STPCPY defined the entry point is STPCPY if the includer
   defined it, otherwise __stpcpy.  Without USE_AS_STPCPY the entry
   point is STRCPY if the includer defined it, otherwise strcpy.  */
#ifdef USE_AS_STPCPY
# ifndef STPCPY
#   define FUNC_NAME __stpcpy
# else
#   define FUNC_NAME STPCPY
# endif
#else
# ifndef STRCPY
#  define FUNC_NAME strcpy
# else
#  define FUNC_NAME STRCPY
# endif
#endif  /* !USE_AS_STPCPY  */

/* Implements the function

   char * [r3] strcpy (char *dest [r3], const char *src [r4])

   or

   char * [r3] stpcpy (char *dest [r3], const char *src [r4])

   if USE_AS_STPCPY is defined.

   The implementation can load bytes past a null terminator, but only
   up to the next 16B boundary, so it never crosses a page.  */

/* Load the quadword at ADDR+OFFSET into VREG, check it for null bytes,
   and branch to L(LABEL) if any are found.

   VREG is named as a vector register (v0, v1, ...); the load uses the
   VSX register number VREG+32.  v18 must contain all-zero bytes.
   Clobbers v6 (receives the per-byte compare result, which the branch
   target reads) and cr6.  */
#define CHECK16(vreg,offset,addr,label) \
	lxv	vreg+32,offset(addr);	\
	vcmpequb. v6,vreg,v18;	\
	bne	cr6,L(label);

.machine power9
/* FUNC_NAME: copy the null-terminated string at src [r4] to dest [r3].

   Returns in r3 either dest (strcpy) or, when USE_AS_STPCPY is
   defined, the address of the terminator written to dest (stpcpy).

   Register roles:
     r3       dest; left untouched until the return value is formed.
     r4       src.
     r5       16B-aligned source cursor for L(loop).
     r8       before the loop: number of bytes from src up to r5;
	      once a null is found: number of bytes preceding the null
	      inside the quadword that holds it.
     r9/r10   byte count for stxvl and the same count shifted into the
	      top 8 bits.
     r11      destination cursor that corresponds to r5.
     v0-v5    data quadwords (v1 is first used as a permute control).
     v6       result of the compare against zero.
     v18/v19  all-zero bytes / all-0xFF bytes.  */
ENTRY_TOCLESS (FUNC_NAME, 4)
	CALL_MCOUNT 2

	vspltisb v18,0		/* Zeroes in v18  */
	vspltisb v19,-1 	/* 0xFF bytes in v19  */

	/* Next 16B-aligned address. Prepare address for L(loop).
	   r5 = (src + 16) rounded down to 16B, r8 = r5 - src (1..16),
	   r11 = dest + r8.  */
	addi	r5,r4,16
	clrrdi	r5,r5,4
	subf	r8,r4,r5
	add	r11,r3,r8

	/* Align data and fill bytes not loaded with non matching char
	   (0xFF from v19), so only bytes of the string can compare equal
	   to zero below.  */
	lvx	v0,0,r4
	lvsr	v1,0,r4
	vperm	v0,v19,v0,v1

	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
	beq	cr6,L(no_null)	/* Taken when no byte compared equal.  */

	/* There's a null byte in the first r8 bytes: the whole string fits
	   in this one partial store.  */
	vctzlsbb r8,v6		/* r8 = bytes before the null  */
	addi	r9,r8,1 	/* Add null byte.  */
	sldi	r10,r9,56	/* stxvl wants size in top 8 bits.  */
	stxvl	32+v0,r3,r10	/* Partial store  */

#ifdef USE_AS_STPCPY
	/* stpcpy returns the dest address plus the size not counting the
	   final '\0'.  */
	add	r3,r3,r8
#endif
	blr

L(no_null):
	/* No terminator before the 16B boundary: store the r8 head bytes
	   and continue from the aligned cursors r5/r11.  */
	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r3,r10	/* Partial store  */

	/* Main loop: 96 bytes (six quadwords) per iteration.  All six are
	   loaded and checked before anything is stored; if quadword N
	   holds a null, control leaves for L(tailN) with the earlier
	   quadwords still pending in v0..v(N-2).  */
	.p2align 4
L(loop):
	CHECK16(v0,0,r5,tail1)
	CHECK16(v1,16,r5,tail2)
	CHECK16(v2,32,r5,tail3)
	CHECK16(v3,48,r5,tail4)
	CHECK16(v4,64,r5,tail5)
	CHECK16(v5,80,r5,tail6)

	/* No null in these 96 bytes: store them all and advance.  */
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)

	addi	r5,r5,96
	addi	r11,r11,96

	b	L(loop)

	/* Null found in the 1st quadword (v0): nothing pending, store the
	   bytes up to and including the terminator.  */
	.p2align 4
L(tail1):
	vctzlsbb r8,v6		/* r8 = bytes before the null  */
	addi	r9,r8,1		/* Add null terminator  */
	sldi	r9,r9,56	/* stxvl wants size in top 8 bits  */
	stxvl	32+v0,r11,r9	/* Partial store  */
#ifdef USE_AS_STPCPY
	/* stpcpy returns the dest address plus the size not counting the
	   final '\0'.  */
	add	r3,r11,r8
#endif
	blr

	/* Null found in the 2nd quadword (v1): store v0 in full, then the
	   partial v1 at r11+16.  */
	.p2align 4
L(tail2):
	stxv	32+v0,0(r11)
	vctzlsbb r8,v6
	addi	r9,r8,1
	sldi	r9,r9,56
	addi	r11,r11,16
	stxvl	32+v1,r11,r9
#ifdef USE_AS_STPCPY
	/* r11 now addresses the last quadword written; r8 is the offset
	   of the terminator inside it.  */
	add	r3,r11,r8
#endif
	blr

	/* Null found in the 3rd quadword (v2): store v0-v1 in full, then
	   the partial v2 at r11+32.  */
	.p2align 4
L(tail3):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	vctzlsbb r8,v6
	addi	r9,r8,1
	sldi	r9,r9,56
	addi	r11,r11,32
	stxvl	32+v2,r11,r9
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	/* Null found in the 4th quadword (v3): store v0-v2 in full, then
	   the partial v3 at r11+48.  */
	.p2align 4
L(tail4):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	vctzlsbb r8,v6
	addi	r9,r8,1
	sldi	r9,r9,56
	addi	r11,r11,48
	stxvl	32+v3,r11,r9
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	/* Null found in the 5th quadword (v4): store v0-v3 in full, then
	   the partial v4 at r11+64.  */
	.p2align 4
L(tail5):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	vctzlsbb r8,v6
	addi	r9,r8,1
	sldi	r9,r9,56
	addi	r11,r11,64
	stxvl	32+v4,r11,r9
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

	/* Null found in the 6th quadword (v5): store v0-v4 in full, then
	   the partial v5 at r11+80.  */
	.p2align 4
L(tail6):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	vctzlsbb r8,v6
	addi	r9,r8,1
	sldi	r9,r9,56
	addi	r11,r11,80
	stxvl	32+v5,r11,r9
#ifdef USE_AS_STPCPY
	add	r3,r11,r8
#endif
	blr

END (FUNC_NAME)
/* The hidden builtin definition is emitted for the strcpy build only;
   it is skipped when this file is built as stpcpy.  */
#ifndef USE_AS_STPCPY
libc_hidden_builtin_def (strcpy)
#endif