/* Copy SIZE bytes from SRC to DEST.
   For UltraSPARC-III.
   Copyright (C) 2001-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
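
/* VISEntryHalf reads the current %fprs into %o5 and then enables the
   FPU by writing FPRS_FEF; VISExitHalf writes back only the FEF bit of
   the saved value.  %o5 must therefore stay untouched across the whole
   VIS region (see issue 1 in the block comment below).  */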

#ifndef XCC
#define USE_BPR
#define XCC xcc
#endif

#if IS_IN (libc)

	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

	.text

ENTRY(__mempcpy_ultra3)
	ba,pt		%XCC, 101f
	 add		%o0, %o2, %g5
END(__mempcpy_ultra3)
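
	/* __mempcpy_ultra3 and __memcpy_ultra3 share one body; only the
	 * value parked in %g5 for the common 'out' epilogue differs.  A
	 * minimal C sketch of the convention (illustrative only;
	 * 'copy_body' is a hypothetical name, not part of this file):
	 *
	 *	void *memcpy_ultra3(void *dst, const void *src, size_t n)
	 *	{ copy_body(dst, src, n); return dst; }
	 *
	 *	void *mempcpy_ultra3(void *dst, const void *src, size_t n)
	 *	{ copy_body(dst, src, n); return (char *) dst + n; }
	 */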

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 *
	 * The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */
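
	/* Dispatch overview, as a C sketch of the size classes used
	 * below (function names are illustrative only):
	 *
	 *	if (n == 0)		return;
	 *	else if (n <= 16)	small_copy();	// words or bytes
	 *	else if (n < 256)	medium_copy();	// 8-byte chunks
	 *	else			vis_copy();	// 64-byte VIS blocks
	 */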
	.align		32
ENTRY(__memcpy_ultra3)

100: /* %o0=dst, %o1=src, %o2=len */
	mov		%o0, %g5
101:
	cmp		%o2, 0
	be,pn		%XCC, out
67218:	 or		%o0, %o1, %o3
	cmp		%o2, 16
	bleu,a,pn	%XCC, small_copy
	 or		%o3, %o2, %o3

	cmp		%o2, 256
	blu,pt		%XCC, medium_copy
	 andcc		%o3, 0x7, %g0

	ba,pt		%xcc, enter
	 andcc		%o0, 0x3f, %g2

	/* Here len >= 256 and condition codes reflect execution
	 * of "andcc %o0, 0x3f, %g2", done in the branch delay slot above.
	 */
	.align		64
enter:
	/* Is 'dst' already aligned on a 64-byte boundary? */
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	 sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
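
	/* Equivalently, in C (dst is known to be unaligned here, so the
	 * head count lands in 1..63; illustrative only):
	 *
	 *	size_t head = 0x40 - ((uintptr_t) dst & 0x3f);
	 *	len -= head;
	 */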

	/* Copy %g2 bytes from src to dst, one byte at a time. */
1:	ldub		[%o1 + 0x00], %o3
	add		%o1, 0x1, %o1
	add		%o0, 0x1, %o0
	subcc		%g2, 0x1, %g2

	bg,pt		%XCC, 1b
	 stb		%o3, [%o0 + -1]

2:	VISEntryHalf
	and		%o1, 0x7, %g1
	ba,pt		%xcc, begin
	 alignaddr	%o1, %g0, %o1
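
	/* alignaddr rounds %o1 down to an 8-byte boundary and latches
	 * (src & 7) in %gsr; each faligndata below then behaves roughly
	 * like this C sketch (big-endian byte order, illustrative only):
	 *
	 *	uint64_t faligndata(uint64_t hi, uint64_t lo, unsigned s)
	 *	{
	 *		return s ? (hi << (8 * s)) | (lo >> (64 - 8 * s))
	 *			 : hi;
	 *	}
	 *
	 * turning pairs of aligned 8-byte loads into aligned 8-byte
	 * store values.
	 */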

	.align		64
begin:
	prefetch	[%o1 + 0x000], #one_read
	prefetch	[%o1 + 0x040], #one_read
	andn		%o2, (0x40 - 1), %o4
	prefetch	[%o1 + 0x080], #one_read
	prefetch	[%o1 + 0x0c0], #one_read
	ldd		[%o1 + 0x000], %f0
	prefetch	[%o1 + 0x100], #one_read
	ldd		[%o1 + 0x008], %f2
	prefetch	[%o1 + 0x140], #one_read
	ldd		[%o1 + 0x010], %f4
	prefetch	[%o1 + 0x180], #one_read
	faligndata	%f0, %f2, %f16
	ldd		[%o1 + 0x018], %f6
	faligndata	%f2, %f4, %f18
	ldd		[%o1 + 0x020], %f8
	faligndata	%f4, %f6, %f20
	ldd		[%o1 + 0x028], %f10
	faligndata	%f6, %f8, %f22

	ldd		[%o1 + 0x030], %f12
	faligndata	%f8, %f10, %f24
	ldd		[%o1 + 0x038], %f14
	faligndata	%f10, %f12, %f26
	ldd		[%o1 + 0x040], %f0

	sub		%o4, 0x80, %o4
	add		%o1, 0x40, %o1
	ba,pt		%xcc, loop
	 srl		%o4, 6, %o3
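
	/* Here %o4 = (len & ~0x3f) - 0x80 and %o3 = %o4 >> 6, the main
	 * loop's trip count: one 64-byte block is already in flight in
	 * %f0-%f14 and a final one is drained by 'loopfini'.  Roughly,
	 * in C (illustrative only):
	 *
	 *	size_t trips = (len >> 6) - 2;
	 */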

	.align		64
loop:
	ldd		[%o1 + 0x008], %f2
	faligndata	%f12, %f14, %f28
	ldd		[%o1 + 0x010], %f4
	faligndata	%f14, %f0, %f30
	stda		%f16, [%o0] ASI_BLK_P
	ldd		[%o1 + 0x018], %f6
	faligndata	%f0, %f2, %f16

	ldd		[%o1 + 0x020], %f8
	faligndata	%f2, %f4, %f18
	ldd		[%o1 + 0x028], %f10
	faligndata	%f4, %f6, %f20
	ldd		[%o1 + 0x030], %f12
	faligndata	%f6, %f8, %f22
	ldd		[%o1 + 0x038], %f14
	faligndata	%f8, %f10, %f24

	ldd		[%o1 + 0x040], %f0
	prefetch	[%o1 + 0x180], #one_read
	faligndata	%f10, %f12, %f26
	subcc		%o3, 0x01, %o3
	add		%o1, 0x40, %o1
	bg,pt		%XCC, loop
	 add		%o0, 0x40, %o0
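
	/* The loop is software-pipelined: while block i is stored from
	 * %f16-%f30 with stda, the loads for block i+1 stream into
	 * %f0-%f14, and the prefetch runs 0x180 bytes ahead.  A C-level
	 * sketch of the schedule ('store64' and 'merge' are illustrative
	 * names for the stda and faligndata steps):
	 *
	 *	for (i = 0; i < trips; i++) {
	 *		store64(dst + 64 * i, merged);		// block i
	 *		merged = merge(src + 64 * (i + 1));	// block i+1
	 *	}
	 */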

	/* Finally we copy the last full 64-byte block. */
loopfini:
	ldd		[%o1 + 0x008], %f2
	faligndata	%f12, %f14, %f28
	ldd		[%o1 + 0x010], %f4
	faligndata	%f14, %f0, %f30
	stda		%f16, [%o0] ASI_BLK_P
	ldd		[%o1 + 0x018], %f6
	faligndata	%f0, %f2, %f16
	ldd		[%o1 + 0x020], %f8
	faligndata	%f2, %f4, %f18
	ldd		[%o1 + 0x028], %f10
	faligndata	%f4, %f6, %f20
	ldd		[%o1 + 0x030], %f12
	faligndata	%f6, %f8, %f22
	ldd		[%o1 + 0x038], %f14
	faligndata	%f8, %f10, %f24
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	ldd		[%o1 + 0x040], %f0
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	stda		%f16, [%o0] ASI_BLK_P
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
	membar		#Sync

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
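
	/* In effect (C sketch; merge() again stands in for the
	 * faligndata step):
	 *
	 *	len &= 0x3f;
	 *	while (len >= 8) {
	 *		*(uint64_t *) dst = merge(src);
	 *		dst += 8; src += 8; len -= 8;
	 *	}
	 *
	 * The unrolled pair below alternates %f0/%f2 so the previous
	 * eight source bytes are always still in a register.
	 */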
loopend:
	and		%o2, 0x3f, %o2
	andcc		%o2, 0x38, %g2
	be,pn		%XCC, endcruft
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, endcruft
	 cmp		%g1, 0

	be,a,pt		%XCC, 1f
	 ldd		[%o1 + 0x00], %f0

1:	ldd		[%o1 + 0x08], %f2
	add		%o1, 0x8, %o1
	sub		%o2, 0x8, %o2
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	std		%f8, [%o0 + 0x00]
	be,pn		%XCC, endcruft
	 add		%o0, 0x8, %o0
	ldd		[%o1 + 0x08], %f0
	add		%o1, 0x8, %o1
	sub		%o2, 0x8, %o2
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	std		%f8, [%o0 + 0x00]
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
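	/* Once src is un-biased by %g1 below, an 8-byte-aligned
	 * remainder is moved with the widest loads possible, while an
	 * unaligned one takes the byte loop.  The aligned ladder, as a
	 * C sketch with illustrative helper names:
	 *
	 *	if (len & 8) copy8(dst, src);
	 *	if (len & 4) copy4(dst, src);
	 *	if (len & 2) copy2(dst, src);
	 *	if (len & 1) copy1(dst, src);
	 */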
endcruft:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%XCC, out
	 sub		%o0, %o1, %o3

	andcc		%g1, 0x7, %g0
	bne,pn		%icc, small_copy_unaligned
	 andcc		%o2, 0x8, %g0
	be,pt		%icc, 1f
	 nop
	ldx		[%o1], %o5
	stx		%o5, [%o1 + %o3]
	add		%o1, 0x8, %o1

1:	andcc		%o2, 0x4, %g0
	be,pt		%icc, 1f
	 nop
	lduw		[%o1], %o5
	stw		%o5, [%o1 + %o3]
	add		%o1, 0x4, %o1

1:	andcc		%o2, 0x2, %g0
	be,pt		%icc, 1f
	 nop
	lduh		[%o1], %o5
	sth		%o5, [%o1 + %o3]
	add		%o1, 0x2, %o1

1:	andcc		%o2, 0x1, %g0
	be,pt		%icc, out
	 nop
	ldub		[%o1], %o5
	ba,pt		%xcc, out
	 stb		%o5, [%o1 + %o3]

medium_copy: /* 16 < len < 256 */
	bne,pn		%XCC, small_copy_unaligned
	 sub		%o0, %o1, %o3

medium_copy_aligned:
	andn		%o2, 0x7, %o4
	and		%o2, 0x7, %o2
1:	subcc		%o4, 0x8, %o4
	ldx		[%o1], %o5
	stx		%o5, [%o1 + %o3]
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	lduw		[%o1], %o5
	stw		%o5, [%o1 + %o3]
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, out
	 nop
	ba,pt		%xcc, small_copy_unaligned
	 nop
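
	/* The aligned medium path in C (sketch; %o3 holds d = dst - src,
	 * so only one pointer is advanced):
	 *
	 *	do {			// len > 16: at least two chunks
	 *		*(uint64_t *) (src + d) = *(const uint64_t *) src;
	 *		src += 8;
	 *	} while ((chunks -= 8) != 0);
	 *	if (len & 4) { copy one 4-byte word; }
	 *	// any remaining 1-3 bytes fall through to the byte loop
	 */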

small_copy: /* 0 < len <= 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, small_copy_unaligned
	 sub		%o0, %o1, %o3

small_copy_aligned:
	subcc		%o2, 4, %o2
	lduw		[%o1], %g1
	stw		%g1, [%o1 + %o3]
	bgu,pt		%XCC, small_copy_aligned
	 add		%o1, 4, %o1
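
	/* At small_copy, %o3 is dst | src | len, so the 0x3 test
	 * guarantees src, dst and len are all multiples of 4 and whole
	 * words suffice.  With d = dst - src (kept in %o3), this is
	 * (C sketch):
	 *
	 *	do {
	 *		*(uint32_t *) (src + d) = *(const uint32_t *) src;
	 *		src += 4;
	 *	} while ((len -= 4) != 0);
	 */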

out:	retl
	 mov		%g5, %o0

	.align	32
small_copy_unaligned:
	subcc		%o2, 1, %o2
	ldub		[%o1], %g1
	stb		%g1, [%o1 + %o3]
	bgu,pt		%XCC, small_copy_unaligned
	 add		%o1, 1, %o1
	retl
	 mov		%g5, %o0
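
	/* Byte-at-a-time fallback, in C (sketch); %g5 was set at entry,
	 * so both returns hand back dst for memcpy and dst + n for
	 * mempcpy:
	 *
	 *	do {
	 *		src[d] = *src;	// d = dst - src, from %o3
	 *		src++;
	 *	} while (--len != 0);
	 *	return saved_result;	// %g5
	 */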

END(__memcpy_ultra3)

#endif