1/* Copyright (C) 2000-2022 Free Software Foundation, Inc.
2   This file is part of the GNU C Library.
3
4   The GNU C Library is free software; you can redistribute it and/or
5   modify it under the terms of the GNU Lesser General Public
6   License as published by the Free Software Foundation; either
7   version 2.1 of the License, or (at your option) any later version.
8
9   The GNU C Library is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13
14   You should have received a copy of the GNU Lesser General Public
15   License along with the GNU C Library.  If not, see
16   <https://www.gnu.org/licenses/>.  */
17
18/* Copy a null-terminated string from SRC to DST.
19
20   This is an internal routine used by strcpy, stpcpy, and strcat.
21   As such, it uses special linkage conventions to make implementation
22   of these public functions more efficient.
23
24   On input:
25	t9 = return address
26	a0 = DST
27	a1 = SRC
28
29   On output:
30	t8  = bitmask (with one bit set) indicating the last byte written
31	a0  = unaligned address of the last *word* written
32
33   Furthermore, v0, a3-a5, t11, and t12 are untouched.
34*/
35
36
37#include <sysdep.h>
38
	/* Assemble for the EV6 instruction set and keep full manual control
	   of the output: the assembler may not use the AT register for its
	   own expansions and may not reorder the hand-scheduled instruction
	   groups below.  */
	.arch ev6
	.set noat
	.set noreorder

	/* __stxcpy is the only symbol exported here; stxcpy_aligned and the
	   $-prefixed labels are internal branch targets.  The routine is
	   declared as not using the procedure value register.  */
	.text
	.type	__stxcpy, @function
	.globl	__stxcpy
	.usepv	__stxcpy, no

	/* The return address arrives in t9 rather than in the usual return
	   address register, so the unwind information has to say so.  */
	cfi_startproc
	cfi_return_column (t9)
50
51	/* On entry to this basic block:
52	   t0 == the first destination word for masking back in
53	   t1 == the first source word.  */
54	.align 4
stxcpy_aligned:
	/* Create the 1st output word and detect 0's in the 1st input word.
	   The bytes below the starting offset (the low three bits of a1) are
	   not part of the string: for the null test they are forced to 0xff,
	   and in the output word they are taken from the destination word
	   in t0.  Falls through into the aligned copy loop when the first
	   word holds no null.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U : t3 = src word with the leading bytes cleared
	ornot	t1, t2, t2	# E : t2 = src word with the leading bytes 0xff (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t10	# E : bits set iff null found
	or	t0, t3, t1	# E : t1 = kept dst bytes | string bytes (stall)
	bne	t10, $a_eos	# U : null in the first word, go finish (stall)
66
67	/* On entry to this basic block:
68	   t0 == the first destination word for masking back in
69	   t1 == a source word not containing a null.  */
	/* The nops in the loop below separate the store quad from the load quad.  */
71
$a_loop:
	/* Co-aligned copy loop: store the null-free word in t1, fetch the
	   next source word and test it.  Both pointers advance by one word
	   per iteration.  Falls through to $a_eos once a word with a zero
	   byte has been loaded (that word is not stored here).  */
	stq_u	t1, 0(a0)	# L : store the null-free word
	addq	a0, 8, a0	# E : advance dst
	nop
	nop

	ldq_u	t1, 0(a1)	# L : Latency=3, fetch the next src word
	addq	a1, 8, a1	# E : advance src
	cmpbge	zero, t1, t10	# E : one bit per zero byte (3 cycle stall)
	beq	t10, $a_loop	# U : no null yet, keep copying (stall for t10)
82
83	/* Take care of the final (partial) word store.
84	   On entry to this basic block we have:
85	   t1 == the source word containing the null
86	   t10 == the cmpbge mask that found it.  */
$a_eos:
	/* Store the last word of the co-aligned copy and return through t9.
	   On exit t8 holds the single bit that marks the null byte and a0
	   still addresses the word just written.  */
	negq	t10, t6		# E : find low bit set
	and	t10, t6, t8	# E : t8 = bit of the first zero byte (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  */
	and	t8, 0x80, t6	# E : null in byte 7: the whole word is ours (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t8, 1, t6	# E : t6 = byte mask of the bytes below the null
	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
	or	t8, t6, t10	# E : t10 = byte mask of the bytes <= null (stall)

	zap	t0, t10, t0	# E : clear dst bytes <= null
	or	t0, t1, t1	# E : string bytes + untouched dst bytes (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L : write the final word
	ret	(t9)		# L0 : Latency=3
	nop
	nop
111
112	.align 4
__stxcpy:
	/* Entry point.  a0 = DST, a1 = SRC, t9 = return address.  Dispatch
	   on whether both pointers have the same offset within a quadword.  */
	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# E : address bits that differ
	unop			# E :
	and	t0, 7, t0	# E : only the offset within a word matters (stall)
	bne	t0, $unaligned	# U : offsets differ (stall)

	/* We are co-aligned; take care of a partial first word.
	   stxcpy_aligned expects t1 = first source word and t0 = first
	   destination word; t0 is left as 0 when DST is word aligned.  */
	ldq_u	t1, 0(a1)		# L : load first src word
	and	a0, 7, t0		# E : take care not to load a word ...
	addq	a1, 8, a1		# E : a1 now addresses the next src word
	beq	t0, stxcpy_aligned	# U : ... if we will not need it (stall)

	ldq_u	t0, 0(a0)	# L : dst word whose leading bytes must survive
	br	stxcpy_aligned	# L0 : Latency=3
	nop
	nop
130
131
132/* The source and destination are not co-aligned.  Align the destination
133   and cope.  We have to be very careful about not reading too much and
134   causing a SEGV.  */
135
136	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   a0 == DST (possibly unaligned)
	   a1 == SRC minus the destination misalignment, so that its low
		 three bits give the byte shift between source and dest
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the first source word as loaded from SRC
	   t6 == bytemask that is 0xff in every byte of the first dest word
		 that must be preserved, 0 if there is none.  */

	ldq_u	t2, 8(a1)	# L : word that supplies the high-order bytes
	addq	a1, 8, a1	# E : low three bits (the shift) are unchanged
	extql	t1, a1, t1	# U : shift t1 down by the byte offset (stall on a1)
	extqh	t2, a1, t4	# U : shift t2 up by the complement (stall on a1)

	mskql	t0, a0, t0	# U : keep only dst bytes below the dst offset
	or	t1, t4, t1	# E : source data lined up with the dst word
	mskqh	t1, a0, t1	# U : drop bytes below the dst offset (stall on t1)
	or	t0, t1, t1	# E : t1 = first output word (stall on t1)

	or	t1, t6, t6	# E : preserved dst bytes must not look like a null
	cmpbge	zero, t6, t10	# E : null among the string bytes? (stall)
	lda	t6, -1		# E : for masking just below
	bne	t10, $u_final	# U : yes: t1 is the last word (stall)

	mskql	t6, a1, t6		# U : mask out the bits we have
	or	t6, t2, t2		# E :   already extracted before (stall)
	cmpbge	zero, t2, t10		# E :   testing eos (stall)
	bne	t10, $u_late_head_exit	# U : null in the rest of t2 (stall)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# L : store first output word
	addq	a0, 8, a0	# E : advance dst
	extql	t2, a1, t0	# U : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# U : read next high-order source word

	addq	a1, 8, a1	# E : advance src to the word just loaded
	cmpbge	zero, t2, t10	# E : (stall for t2)
	nop			# E :
	bne	t10, $u_eos	# U : null already in this word (stall)
179
180	/* Unaligned copy main loop.  In order to avoid reading too much,
181	   the loop is structured to detect zeros in aligned source words.
182	   This has, unfortunately, effectively pulled half of a loop
183	   iteration out into the head and half into the tail, but it does
184	   prevent nastiness from accumulating in the very thing we want
185	   to run as fast as possible.
186
187	   On entry to this basic block:
188	   t0 == the shifted high-order bits from the previous source word
189	   t2 == the unshifted current source word
190
191	   We further know that t2 does not contain a null terminator.  */
192
193	.align 3
$u_loop:
	/* One output word per iteration: combine the leftover low bits in
	   t0 with the high bits of the current source word t2, store the
	   result, then load and test the next aligned source word.  Falls
	   through to $u_eos when the newly loaded word contains a null.  */
	extqh	t2, a1, t1	# U : extract high bits for current word
	addq	a1, 8, a1	# E : next src word, same byte shift (stall)
	extql	t2, a1, t3	# U : extract low bits for next time (stall)
	addq	a0, 8, a0	# E : advance dst early, hence the -8 below

	or	t0, t1, t1	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t1, -8(a0)	# L : save the current word (stall)
	mov	t3, t0		# E : carry the low bits into the next round

	cmpbge	zero, t2, t10	# E : test new word for eos
	beq	t10, $u_loop	# U : no null, go around again (stall)
	nop
	nop
209
210	/* We've found a zero somewhere in the source word we just read.
211	   If it resides in the lower half, we have one (probably partial)
212	   word to write out, and if it resides in the upper half, we
213	   have one full and one partial word left to write out.
214
215	   On entry to this basic block:
216	   t0 == the shifted high-order bits from the previous source word
217	   t2 == the unshifted current source word.  */
$u_eos:
	/* Build the output word that straddles the previous and the current
	   source word and see whether the null already lies inside it.  If
	   not, fall through and emit one more word.  */
	extqh	t2, a1, t1	# U : high bits supplied by the current word
	or	t0, t1, t1	# E : first (partial) source word complete (stall)
	cmpbge	zero, t1, t10	# E : is the null in this first bit? (stall)
	bne	t10, $u_final	# U : yes: t1 is the last word (stall)
223
$u_late_head_exit:
	/* t1 is a complete output word without a null; the terminator sits
	   in the remaining bytes of t2.  Store t1, move those bytes down to
	   form the last word and fall through into $u_final.  The bytes
	   that the shift fills with zero also show up in t10, but only the
	   lowest set bit is used below.  */
	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E : advance dst to the final word
	extql	t2, a1, t1	# U : remaining src bytes, moved to the low end
	cmpbge	zero, t1, t10	# E : locate the null in them (stall)
229
230	/* Take care of a final (probably partial) result word.
231	   On entry to this basic block:
232	   t1 == assembled source word
233	   t10 == cmpbge mask that found the null.  */
$u_final:
	/* Store the last word of the unaligned copy and return through t9.
	   On exit t8 holds the single bit that marks the null byte and a0
	   still addresses the word just written.  */
	negq	t10, t6		# E : isolate low bit set
	and	t6, t10, t8	# E : t8 = bit of the first zero byte (stall)
	and	t8, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : null in byte 7: store the whole word (stall)

	/* Partial store: keep the destination bytes above the null.  */
	ldq_u	t0, 0(a0)	# E : current contents of the final dst word
	subq	t8, 1, t6	# E : t6 = byte mask of the bytes below the null
	or	t6, t8, t10	# E : t10 = byte mask of the bytes <= null (stall)
	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)

	zap	t0, t10, t0	# U : kill dest bytes <= null (2 cycle data stall)
	or	t0, t1, t1	# E : string bytes + untouched dst bytes (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L : write the final word
	ret	(t9)		# L0 : Latency=3
	nop
	nop
254
255	/* Unaligned copy entry point.  */
256	.align 4
$unaligned:
	/* Set up t0, t1, t6 and a1 the way $u_head expects them, and handle
	   the case where the string ends inside the first source word.  */
	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E : default: no dst bytes to preserve

	mov	zero, t6	# E : default: empty preserve mask
	beq	t4, 1f		# U : dst is word aligned, skip the load
	ldq_u	t0, 0(a0)	# L : first dst word
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U : 0xff only in the bytes below the dst offset
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t8	# E : t8 = 1 iff src offset > dst offset
	beq	t8, $u_head	# U : no: the generic head is safe
	lda	t2, -1		# E : mask out leading garbage in source

	mskqh	t2, t5, t2	# U : t2 = 0xff in the bytes at or above the src offset
	ornot	t1, t2, t3	# E : bytes before the string forced to 0xff (stall)
	cmpbge	zero, t3, t10	# E : is there a zero? (stall)
	beq	t10, $u_head	# U : no null in the first word (stall)

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

	ldq_u	t0, 0(a0)	# L : (re)load the only dst word involved
	negq	t10, t6		# E : build bitmask of bytes <= zero
	and	t6, t10, t8	# E : t8 = bit of the null in the src word (stall)
	and	a1, 7, t5	# E : byte shift from src to dst position

	subq	t8, 1, t6	# E : t6 = byte mask of the bytes below the null
	or	t6, t8, t10	# E : t10 = byte mask of the bytes <= null (stall)
	srl	t8, t5, t8	# U : adjust final null return value
	zapnot	t2, t10, t2	# U : prepare source word; mirror changes (stall)

	and	t1, t2, t1	# E : to source validity mask
	extql	t2, a1, t2	# U : move the validity mask to its dst position
	extql	t1, a1, t1	# U : move the string bytes likewise (stall)
	andnot	t0, t2, t0	# .. e1 : zero place for source to reside (stall)

	or	t0, t1, t1	# e1    : and put it there
	stq_u	t1, 0(a0)	# .. e0 : (stall)
	ret	(t9)		# e1    :
311
312	cfi_endproc
313