1/* Copyright (C) 2000-2022 Free Software Foundation, Inc.
2   This file is part of the GNU C Library.
3   EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
4
5   The GNU C Library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License as published by the Free Software Foundation; either
8   version 2.1 of the License, or (at your option) any later version.
9
10   The GNU C Library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14
15   You should have received a copy of the GNU Lesser General Public
16   License along with the GNU C Library.  If not, see
17   <https://www.gnu.org/licenses/>.  */
18
19/*
20 * Much of the information about 21264 scheduling/coding comes from:
21 *	Compiler Writer's Guide for the Alpha 21264
22 *	abbreviated as 'CWG' in other comments here
23 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
24 * Scheduling notation:
25 *	E	- either cluster
26 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
27 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
28 *
29 * Temp usage notes:
30 *	$0		- destination address
31 *	$1-$7		- scratch ($7 holds the wh64 address; $4 is the dest pointer when source and dest are misaligned)
32 */
33
34#include <sysdep.h>
35
36	.arch ev6		# target the EV6 (21264) this file is scheduled for
37	.set noreorder		# ask the assembler to keep the hand-scheduled order
38	.set noat		# do not let the assembler use the $at temporary implicitly
39
40ENTRY(memcpy)
41	.prologue 0
42	/* In: $16 = dest, $17 = source, $18 = byte count.  Out: $0 = dest as passed in.  */
43	mov	$16, $0			# E : copy dest to return
44	ble	$18, $nomoredata	# U : count <= 0, nothing to copy
45	xor	$16, $17, $1		# E : are source and dest alignments the same?
46	and	$1, 7, $1		# E : are they the same mod 8?
47
48	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
49	/* source and dest have the same alignment mod 8 */
50	and	$16, 7, $1		# E : Are both 0mod8?
51	beq	$1, $both_0mod8		# U : Yes
52	nop				# E :
53
54	/*
55	 * Source and dest share the same misalignment.  Move a byte at a
56	 * time until a 0mod8 alignment for both is reached.
57	 * At least one more byte remains to be moved on entry.
58	 */
59
60$head_align:
61	ldbu	$1, 0($17)		# L : grab a byte
62	subq	$18, 1, $18		# E : count--
63	addq	$17, 1, $17		# E : src++
64	stb	$1, 0($16)		# L : store it at dest
65	addq	$16, 1, $16		# E : dest++
66	and	$16, 7, $1		# E : Are we at 0mod8 yet?
67	ble	$18, $nomoredata	# U : done with the copy?
68	bne	$1, $head_align		# U : not aligned yet, move another byte
69	/* Here source and dest are both 0mod8.  */
70$both_0mod8:
71	cmple	$18, 127, $1		# E : Can we unroll the loop?
72	bne	$1, $no_unroll		# U : no, 127 bytes or fewer remain
73	and	$16, 63, $1		# E : get mod64 alignment
74	beq	$1, $do_unroll		# U : no single quads to fiddle
75	/* Copy single quads until dest is 0mod64 (count is at least 128 on entry).  */
76$single_head_quad:
77	ldq	$1, 0($17)		# L : get 8 bytes
78	subq	$18, 8, $18		# E : count -= 8
79	addq	$17, 8, $17		# E : src += 8
80	nop				# E :
81
82	stq	$1, 0($16)		# L : store
83	addq	$16, 8, $16		# E : dest += 8
84	and	$16, 63, $1		# E : get mod64 alignment
85	bne	$1, $single_head_quad	# U : still not fully aligned
86
87$do_unroll:
88	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
89	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
90	bne	$1, $tail_quads		# U : Nope
91	nop				# E :
92	/* Each trip copies 64 bytes as two 32-byte halves.  */
93$unroll_body:
94	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
95					# ($7) are about to be over-written
96	ldq	$6, 0($17)		# L0 : bytes 0..7
97	nop				# E :
98	nop				# E :
99
100	ldq	$4, 8($17)		# L : bytes 8..15
101	ldq	$5, 16($17)		# L : bytes 16..23
102	addq	$7, 64, $7		# E : Update next wh64 address
103	nop				# E :
104
105	ldq	$3, 24($17)		# L : bytes 24..31
106	addq	$16, 64, $1		# E : fallback value for wh64 (dest of the next trip)
107	nop				# E :
108	nop				# E :
109
110	addq	$17, 32, $17		# E : src += 32 bytes
111	stq	$6, 0($16)		# L : bytes 0..7
112	nop				# E :
113	nop				# E :
114
115	stq	$4, 8($16)		# L : bytes 8..15
116	stq	$5, 16($16)		# L : bytes 16..23
117	subq	$18, 192, $2		# E : At least two more trips to go? ($2 < 0 if not)
118	nop				# E :
119
120	stq	$3, 24($16)		# L : bytes 24..31
121	addq	$16, 32, $16		# E : dest += 32 bytes
122	nop				# E :
123	nop				# E :
124
125	ldq	$6, 0($17)		# L : bytes 0..7 of the second half
126	ldq	$4, 8($17)		# L : bytes 8..15
127	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
128					# fallback wh64 address if < 2 more trips
129	nop				# E :
130
131	ldq	$5, 16($17)		# L : bytes 16..23
132	ldq	$3, 24($17)		# L : bytes 24..31
133	addq	$16, 32, $16		# E : dest += 32 (stores below use negative offsets)
134	subq	$18, 64, $18		# E : count -= 64
135
136	addq	$17, 32, $17		# E : src += 32
137	stq	$6, -32($16)		# L : bytes 0..7
138	stq	$4, -24($16)		# L : bytes 8..15
139	cmple	$18, 63, $1		# E : At least one more trip?
140
141	stq	$5, -16($16)		# L : bytes 16..23
142	stq	$3, -8($16)		# L : bytes 24..31
143	nop				# E :
144	beq	$1, $unroll_body	# U : yes, at least 64 bytes remain
145
146$tail_quads:
147$no_unroll:
148	.align 4
149	subq	$18, 8, $18		# E : At least a quad left? (count is biased by -8 from here)
150	blt	$18, $less_than_8	# U : Nope
151	nop				# E :
152	nop				# E :
153
154$move_a_quad:
155	ldq	$1, 0($17)		# L : fetch 8
156	subq	$18, 8, $18		# E : count -= 8
157	addq	$17, 8, $17		# E : src += 8
158	nop				# E :
159
160	stq	$1, 0($16)		# L : store 8
161	addq	$16, 8, $16		# E : dest += 8
162	bge	$18, $move_a_quad	# U : another full quad remains
163	nop				# E :
164
165$less_than_8:
166	.align 4
167	addq	$18, 8, $18		# E : add back for trailing bytes (undo the -8 bias)
168	ble	$18, $nomoredata	# U : All-done
169	nop				# E :
170	nop				# E :
171
172	/* Trailing bytes, one at a time */
173$tail_bytes:
174	subq	$18, 1, $18		# E : count--
175	ldbu	$1, 0($17)		# L : fetch a byte
176	addq	$17, 1, $17		# E : src++
177	nop				# E :
178
179	stb	$1, 0($16)		# L : store a byte
180	addq	$16, 1, $16		# E : dest++
181	bgt	$18, $tail_bytes	# U : more to be done?
182	nop				# E :
183
184	/* branching to exit takes 3 extra cycles, so replicate exit here */
185	ret	$31, ($26), 1		# L0 :
186	nop				# E :
187	nop				# E :
188	nop				# E :
189
190$misaligned:
191	mov	$0, $4			# E : dest pointer lives in $4 on this path ($16 is reused as scratch below)
192	and	$0, 7, $1		# E : dest alignment mod8
193	beq	$1, $dest_0mod8		# U : dest is already 0mod8, skip the byte loop
194	nop				# E :
195
196$aligndest:
197	ble	$18, $nomoredata	# U : ran out of bytes before dest got aligned
198	ldbu	$1, 0($17)		# L : fetch a byte
199	subq	$18, 1, $18		# E : count--
200	addq	$17, 1, $17		# E : src++
201
202	stb	$1, 0($4)		# L : store it
203	addq	$4, 1, $4		# E : dest++
204	and	$4, 7, $1		# E : dest 0mod8 yet?
205	bne	$1, $aligndest		# U : go until we are aligned.
206
207	/* Source has unknown alignment, but dest is known to be 0mod8 */
208$dest_0mod8:
209	subq	$18, 8, $18		# E : At least a quad left? (count is biased by -8 from here)
210	blt	$18, $misalign_tail	# U : Nope
211	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
212	nop				# E :
213
214$mis_quad:
215	ldq_u	$16, 8($17)		# L : Fetch next 8
216	extql	$3, $17, $3		# U : masking (part taken from the current source quad)
217	extqh	$16, $17, $1		# U : masking (part taken from the next source quad)
218	bis	$3, $1, $1		# E : merged bytes to store
219
220	subq	$18, 8, $18		# E : count -= 8
221	addq	$17, 8, $17		# E : src += 8
222	stq	$1, 0($4)		# L : store 8 (aligned)
223	mov	$16, $3			# E : "rotate" source data (next quad becomes current)
224
225	addq	$4, 8, $4		# E : dest += 8
226	bge	$18, $mis_quad		# U : More quads to move
227	nop				# E :
228	nop				# E :
229
230$misalign_tail:
231	addq	$18, 8, $18		# E : account for tail stuff (undo the -8 bias)
232	ble	$18, $nomoredata	# U : no tail bytes, done
233	nop				# E :
234	nop				# E :
235
236$misalign_byte:
237	ldbu	$1, 0($17)		# L : fetch 1
238	subq	$18, 1, $18		# E : count--
239	addq	$17, 1, $17		# E : src++
240	nop				# E :
241
242	stb	$1, 0($4)		# L : store
243	addq	$4, 1, $4		# E : dest++
244	bgt	$18, $misalign_byte	# U : more to go?
245	nop				# E :
246
247
248$nomoredata:
249	ret	$31, ($26), 1		# L0 : return, $0 still holds the original dest
250	nop				# E :
251	nop				# E :
252	nop				# E :
253
254END(memcpy)
255libc_hidden_builtin_def (memcpy)
256