1/* strcpy/stpcpy - copy a string returning pointer to start/end.
2   Copyright (C) 2013-2022 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library; if not, see
17   <https://www.gnu.org/licenses/>.  */
18
19/* To build as stpcpy, define BUILD_STPCPY before compiling this file.
20
21   To test the page crossing code path more thoroughly, compile with
22   -DSTRCPY_TEST_PAGE_CROSS - this will force all unaligned copies through
23   the slower entry path.  This option is not intended for production use.
   NOTE(review): nothing in this file tests STRCPY_TEST_PAGE_CROSS; confirm
   whether the option still exists or whether this paragraph is stale.  */
24
25#include <sysdep.h>
26
27/* Assumptions:
28 *
29 * ARMv8-a, AArch64, Advanced SIMD.
30 * MTE compatible.
31 */
32
33/* Arguments and results.  */
34#define dstin		x0
35#define srcin		x1
/* result is the same register as dstin (x0), so a plain strcpy build returns
   its first argument without needing an extra move.  */
36#define result		x0
37
/* Integer scratch registers.  Some names deliberately share a register and
   are never live at the same time in the routine: len and synd are both x4
   (the syndrome is converted in place into a length), tmp and shift are both
   x5.  dataw1/dataw2 are the 32-bit views of data1/data2.  */
38#define src		x2
39#define dst		x3
40#define len		x4
41#define synd		x4
42#define	tmp		x5
43#define shift		x5
44#define data1		x6
45#define dataw1		w6
46#define data2		x7
47#define dataw2		w7
48
/* SIMD registers.  dataq/vdata are two views of register 0 and vend/dend two
   views of register 2 (dend being its low 64 bits).  dataq2 (q1) shares
   register 1 with vhas_nul (v1), so loading dataq2 destroys the current
   NUL-compare result.  */
49#define dataq		q0
50#define vdata		v0
51#define vhas_nul	v1
52#define vend		v2
53#define dend		d2
54#define dataq2		q1
55
/* STRCPY names the entry point being built.  IFSTPCPY wraps an instruction
   that is only wanted in the stpcpy build: with BUILD_STPCPY defined it
   re-joins its comma-separated arguments into the original instruction text,
   otherwise it expands to nothing.  */
56#ifdef BUILD_STPCPY
57# define STRCPY __stpcpy
58# define IFSTPCPY(X,...) X,__VA_ARGS__
59#else
60# define STRCPY strcpy
61# define IFSTPCPY(X,...)
62#endif
63
64/*
65   Core algorithm:
66   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
67   per byte.  A shift-right-and-narrow-by-4 (SHRN) instruction takes 4 bits from
68   every comparison byte.  Since the bits in the nibble mask reflect the order in
69   which things occur in the original string, counting trailing zeros (RBIT then
70   CLZ; CLZ alone for big-endian Q loads) identifies exactly which byte matched.  */
71
/* STRCPY: copy the NUL-terminated string at srcin (x1) to dstin (x0),
   terminator included.

   Returns in x0:
     - strcpy build: dstin (x0 is never written, IFSTPCPY expands to nothing);
     - stpcpy build (BUILD_STPCPY): the address of the copied NUL.

   Scratch registers used by the visible body: x2-x7 and v0-v2.  The body
   contains no references to sp.

   Structure:
     1. Test the aligned 16-byte chunk containing srcin (bytes before srcin
	are masked out).  A NUL there means length 0..15 -> L(tail).
     2. Otherwise test the next aligned chunk.  A NUL there means length
	1..31, handled inline (>= 16) or via L(less16).
     3. Otherwise enter L(start_loop)/L(loop), copying one aligned source
	chunk per iteration until a chunk containing the NUL is loaded.

   All short/tail copies use two loads and two stores that may overlap: one
   anchored at the start of the string and one ending exactly on the NUL.
   Every unaligned load is issued only after the string is known to be long
   enough, so it never reads beyond the terminator.  */
72ENTRY (STRCPY)
	/* PTR_ARG is defined outside this file (sysdep.h); presumably it
	   normalises pointer argument N for the target ABI - verify there.  */
73	PTR_ARG (0)
74	PTR_ARG (1)
	/* Step 1: load the aligned chunk holding srcin.  ld1 with .16b lanes
	   places bytes in memory order, so the syndrome layout here does not
	   depend on endianness.  */
75	bic	src, srcin, 15
76	ld1	{vdata.16b}, [src]
	/* 0xff in every byte lane that holds a NUL.  */
77	cmeq	vhas_nul.16b, vdata.16b, 0
	/* 4 syndrome bits per byte.  The register-controlled lsr below only
	   uses the shift amount modulo 64, i.e. 4 * (srcin & 15).  */
78	lsl	shift, srcin, 2
79	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
80	fmov	synd, dend
	/* Drop the nibbles belonging to bytes that precede srcin.  */
81	lsr	synd, synd, shift
82	cbnz	synd, L(tail)
83
	/* Step 2: no NUL before the next 16-byte boundary.  Advance src to the
	   next aligned chunk (pre-index writeback) and test it.  */
84	ldr	dataq, [src, 16]!
85	cmeq	vhas_nul.16b, vdata.16b, 0
86	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
87	fmov	synd, dend
88	cbz	synd, L(start_loop)
89
	/* NUL is in the second chunk.  This chunk was loaded with ldr of a Q
	   register, whose lane order follows the data endianness, so only
	   little-endian needs the bit reversal before clz.  */
90#ifndef __AARCH64EB__
91	rbit	synd, synd
92#endif
	/* tmp = bytes taken from the first chunk (1..16).
	   len = tmp + index of the NUL in this chunk = string length (1..31).  */
93	sub	tmp, src, srcin
94	clz	len, synd
95	add	len, tmp, len, lsr 2
	/* Bit 4 clear <=> len < 16 (len cannot exceed 31).  */
96	tbz	len, 4, L(less16)
	/* len is 16..31: copy 16 bytes from the start and the 16 bytes that
	   end on the NUL (offset len - 15).  The two moves may overlap.  */
97	sub	tmp, len, 15
98	ldr	dataq, [srcin]
99	ldr	dataq2, [srcin, tmp]
100	str	dataq, [dstin]
101	str	dataq2, [dstin, tmp]
102	IFSTPCPY (add result, dstin, len)
103	ret
104
	/* Align to 16 bytes only if that costs at most 8 bytes of padding.  */
105	.p2align 4,,8
	/* NUL found in the first chunk.  synd came from the ld1 load, so the
	   rbit is unconditional.  len = offset of the NUL from srcin (0..15).
	   Falls through into L(less16).  */
106L(tail):
107	rbit	synd, synd
108	clz	len, synd
109	lsr	len, len, 2
110
111	.p2align 4
	/* len < 16 here.  */
112L(less16):
113	tbz	len, 3, L(less8)
	/* len is 8..15: two possibly-overlapping 8-byte moves, the second
	   ending on the NUL (offset len - 7).  */
114	sub	tmp, len, 7
115	ldr	data1, [srcin]
116	ldr	data2, [srcin, tmp]
117	str	data1, [dstin]
118	str	data2, [dstin, tmp]
119	IFSTPCPY (add result, dstin, len)
120	ret
121
122	.p2align 4
	/* len < 8 here.  */
123L(less8):
	/* tmp = len - 3; unsigned borrow (lo) means len < 3.  */
124	subs	tmp, len, 3
125	b.lo	L(less4)
	/* len is 3..7: two possibly-overlapping 4-byte moves, the second
	   ending on the NUL.  */
126	ldr	dataw1, [srcin]
127	ldr	dataw2, [srcin, tmp]
128	str	dataw1, [dstin]
129	str	dataw2, [dstin, tmp]
130	IFSTPCPY (add result, dstin, len)
131	ret
132
	/* len is 0..2.  */
133L(less4):
134	cbz	len, L(zerobyte)
	/* len is 1 or 2: the halfword covers the first two bytes (for len 1
	   that already includes the NUL); the strb below writes the NUL at
	   offset len in either case.  */
135	ldrh	dataw1, [srcin]
136	strh	dataw1, [dstin]
137L(zerobyte):
138	strb	wzr, [dstin, len]
139	IFSTPCPY (add result, dstin, len)
140	ret
141
142	.p2align 4
	/* Step 3: neither of the first two chunks holds a NUL.  dataq holds
	   the NUL-free aligned chunk at src.  len = bytes of the string that
	   precede src (1..16); dst = destination address matching src.
	   The first 16 bytes are copied unaligned from srcin; the loop's first
	   aligned store overlaps them when len < 16.  The dataq2 load
	   overwrites vhas_nul (same register), which the loop recomputes.  */
143L(start_loop):
144	sub	len, src, srcin
145	ldr	dataq2, [srcin]
146	add	dst, dstin, len
147	str	dataq2, [dstin]
148
149	.p2align 5
	/* Invariant at the top of the loop: dataq is the NUL-free chunk at
	   src and dst is its destination.  Each iteration stores it
	   (dst += 16) and then loads and tests the next chunk (src += 16), so
	   dst keeps matching src.  */
150L(loop):
151	str	dataq, [dst], 16
152	ldr	dataq, [src, 16]!
153	cmeq	vhas_nul.16b, vdata.16b, 0
	/* Pairwise max folds the 16 compare bytes into the low 64 bits:
	   synd != 0 iff the chunk contains a NUL.  The exact position is
	   worked out after the loop.  */
154	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
155	fmov	synd, dend
156	cbz	synd, L(loop)
157
	/* The chunk at src (not yet stored) contains the NUL.  Build the
	   nibble syndrome to locate it; the chunk was loaded with ldr of a Q
	   register, hence the endian-conditional rbit.  */
158	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
159	fmov	synd, dend
160#ifndef __AARCH64EB__
161	rbit	synd, synd
162#endif
	/* len = index of the NUL within this chunk (0..15).  */
163	clz	len, synd
164	lsr	len, len, 2
	/* Copy the 16 bytes that end on the NUL (tmp is -15..0).  This re-reads
	   string bytes before src and overlaps bytes already stored.  */
165	sub	tmp, len, 15
166	ldr	dataq, [src, tmp]
167	str	dataq, [dst, tmp]
168	IFSTPCPY (add result, dst, len)
169	ret
170
171END (STRCPY)
172
/* Symbol definitions for the entry point.  In the stpcpy build the routine is
   assembled as __stpcpy and the public name stpcpy is attached with
   weak_alias; in the strcpy build only strcpy is involved.  The weak_alias
   and libc_hidden_* macros are not defined in this file (presumably they come
   from the glibc build headers and set up internal hidden aliases - verify
   there).  */
173#ifdef BUILD_STPCPY
174weak_alias (__stpcpy, stpcpy)
175libc_hidden_def (__stpcpy)
176libc_hidden_builtin_def (stpcpy)
177#else
178libc_hidden_builtin_def (strcpy)
179#endif
180