1/* Copy SIZE bytes from SRC to DEST.
2   For UltraSPARC.
3   Copyright (C) 1996-2022 Free Software Foundation, Inc.
4   This file is part of the GNU C Library.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, see
18   <https://www.gnu.org/licenses/>.  */
19
20#include <sysdep.h>
21#include <asm/asi.h>
/* Default configuration, used when XCC has not been defined before this
   point: define USE_BPR, declare %g2, %g3 and %g6 as scratch registers
   to the assembler, and define XCC as xcc.  When USE_BPR is not
   defined, memcpy zero-extends the length from 32 bits before using it.
   NOTE(review): XCC itself is not referenced by the code visible in
   this file, which names the condition-code fields literally.  */
22#ifndef XCC
23#define USE_BPR
24	.register	%g2, #scratch
25	.register	%g3, #scratch
26	.register	%g6, #scratch
27#define XCC	xcc
28#endif
/* Value written to %fprs on the return paths that follow code using the
   floating-point registers (labels 207 and 214).  */
29#define FPRS_FEF	4
30
/* Emit eight faligndata instructions over the nine source registers
   f1..f9, pairing each register with its successor and writing the
   results to %f48, %f50, ..., %f62.  The arguments are bare register
   names without the leading '%'.  */
31#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)		\
32	faligndata	%f1, %f2, %f48;				\
33	faligndata	%f2, %f3, %f50;				\
34	faligndata	%f3, %f4, %f52;				\
35	faligndata	%f4, %f5, %f54;				\
36	faligndata	%f5, %f6, %f56;				\
37	faligndata	%f6, %f7, %f58;				\
38	faligndata	%f7, %f8, %f60;				\
39	faligndata	%f8, %f9, %f62;
40
/* One step of the main copy loop (6 instructions):
     - ldda from [src] through %asi into fdest,
     - advance src and dest by 0x40,
     - subtract 0x40 from len, setting the condition codes,
     - branch to jmptgt when len has reached zero.
   The stda of fsrc to [dest - 0x40] sits in the branch delay slot and
   is not annulled, so it is executed whether or not the branch is
   taken.  Register arguments are given without the leading '%'.  */
41#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt)	\
42	ldda		[%src] %asi, %fdest;			\
43	add		%src, 0x40, %src;			\
44	add		%dest, 0x40, %dest;			\
45	subcc		%len, 0x40, %len;			\
46	be,pn		%xcc, jmptgt;				\
47	 stda		%fsrc, [%dest - 0x40] %asi;
48
/* MAIN_LOOP_CHUNK specialised for the three load destinations used by
   the loop variants below: %f0 (CHUNK1), %f16 (CHUNK2) and %f32
   (CHUNK3).  The store source is always %f48, the first register that
   FREG_FROB writes.  */
49#define LOOP_CHUNK1(src, dest, len, branch_dest)		\
50	MAIN_LOOP_CHUNK(src, dest, f0,  f48, len, branch_dest)
51#define LOOP_CHUNK2(src, dest, len, branch_dest)		\
52	MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest)
53#define LOOP_CHUNK3(src, dest, len, branch_dest)		\
54	MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)
55
/* Store fsrc to [dest] through %asi and advance dest by 0x40.  The
   macro itself contains no barrier; every use in this file is followed
   by an explicit membar #Sync.  */
56#define STORE_SYNC(dest, fsrc)					\
57	stda		%fsrc, [%dest] %asi;			\
58	add		%dest, 0x40, %dest;
59
/* Like STORE_SYNC, but finish with an unconditional branch to target.
   The macro ends on the branch, so the instruction written after it at
   the use site (membar #Sync throughout this file) fills the delay
   slot.  */
60#define STORE_JUMP(dest, fsrc, target)				\
61	stda		%fsrc, [%dest] %asi;			\
62	add		%dest, 0x40, %dest;			\
63	ba,pt		%xcc, target;
64
/* Fifteen nops placed after each of the eight loop variants below.  By
   instruction count each variant is 113 instructions long, so the
   padding brings it to 128 instructions (512 bytes), the stride the
   dispatch code at label 203 assumes when it shifts the variant index
   left by 9.  */
65#define VISLOOP_PAD nop; nop; nop; nop; 			\
66		    nop; nop; nop; nop; 			\
67		    nop; nop; nop; nop; 			\
68		    nop; nop; nop;
69
/* One entry of the 8-byte finishing table (5 instructions).  Subtract 8
   from the remaining count in 'left'; if the result is negative, branch
   to 205f.  Otherwise store the faligndata of f0 and f1, computed into
   %f48 in the delay slot, to [dest] and advance dest by 8.  Execution
   then falls through into whatever follows the macro.  */
70#define FINISH_VISCHUNK(dest, f0, f1, left)			\
71	subcc		%left, 8, %left;			\
72	bl,pn		%xcc, 205f;				\
73	 faligndata	%f0, %f1, %f48;				\
74	std		%f48, [%dest];				\
75	add		%dest, 8, %dest;
76
/* Table entry used for the last register of each register bank (%f14,
   %f30, %f46).  Subtract 8 from 'left' and branch to 205f if the result
   is negative; otherwise continue at 204f.  The fsrc2 in the delay slot
   of the first branch moves the f0 argument into the f1 argument; every
   use passes %f0 as f1, which is the register the loop at 204 uses as
   its first faligndata operand.  The second branch has the annul bit
   set.  The dest argument is not used.  */
77#define UNEVEN_VISCHUNK(dest, f0, f1, left)			\
78	subcc		%left, 8, %left;			\
79	bl,pn		%xcc, 205f;				\
80	 fsrc2		%f0, %f1;				\
81	ba,a,pt		%xcc, 204f;
82
83	/* Macros for non-VIS memcpy code. */
/* Copy 32 bytes from [src + offset] to [dst + offset] (16 instructions).
   The data is read with four 64-bit loads and written with eight 32-bit
   stores: for each register the low word goes to the higher address,
   then the register is shifted right by 32 and the high word goes to
   the lower address.  memcpy uses this variant when bit 2 of dst is
   set.  t0-t3 are clobbered.  */
84#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3)		\
85	ldx		[%src + offset + 0x00], %t0; 		\
86	ldx		[%src + offset + 0x08], %t1; 		\
87	ldx		[%src + offset + 0x10], %t2; 		\
88	ldx		[%src + offset + 0x18], %t3; 		\
89	stw		%t0, [%dst + offset + 0x04]; 		\
90	srlx		%t0, 32, %t0;				\
91	stw		%t0, [%dst + offset + 0x00]; 		\
92	stw		%t1, [%dst + offset + 0x0c]; 		\
93	srlx		%t1, 32, %t1;				\
94	stw		%t1, [%dst + offset + 0x08]; 		\
95	stw		%t2, [%dst + offset + 0x14]; 		\
96	srlx		%t2, 32, %t2;				\
97	stw		%t2, [%dst + offset + 0x10]; 		\
98	stw		%t3, [%dst + offset + 0x1c];		\
99	srlx		%t3, 32, %t3;				\
100	stw		%t3, [%dst + offset + 0x18];
101
/* Copy 64 bytes from [src + offset] to [dst + offset] with 64-bit loads
   and 64-bit stores, as two groups of four loads followed by four
   stores.  t0-t3 are clobbered (each is used twice).  */
102#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3)	\
103	ldx		[%src + offset + 0x00], %t0; 		\
104	ldx		[%src + offset + 0x08], %t1; 		\
105	ldx		[%src + offset + 0x10], %t2; 		\
106	ldx		[%src + offset + 0x18], %t3; 		\
107	stx		%t0, [%dst + offset + 0x00]; 		\
108	stx		%t1, [%dst + offset + 0x08]; 		\
109	stx		%t2, [%dst + offset + 0x10]; 		\
110	stx		%t3, [%dst + offset + 0x18]; 		\
111	ldx		[%src + offset + 0x20], %t0; 		\
112	ldx		[%src + offset + 0x28], %t1; 		\
113	ldx		[%src + offset + 0x30], %t2; 		\
114	ldx		[%src + offset + 0x38], %t3; 		\
115	stx		%t0, [%dst + offset + 0x20]; 		\
116	stx		%t1, [%dst + offset + 0x28]; 		\
117	stx		%t2, [%dst + offset + 0x30]; 		\
118	stx		%t3, [%dst + offset + 0x38];
119
/* Copy the 16 bytes that end 'offset' bytes below src/dst, i.e. from
   [src - offset - 0x10] to [dst - offset - 0x10], using two 64-bit
   loads and four 32-bit stores.  t2 and t3 receive the high words of
   t0 and t1; all four are clobbered.  The macro is exactly 8
   instructions long, which the computed jump at label 79 depends on.  */
120#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3)	\
121	ldx		[%src - offset - 0x10], %t0;		\
122	ldx		[%src - offset - 0x08], %t1; 		\
123	stw		%t0, [%dst - offset - 0x0c]; 		\
124	srlx		%t0, 32, %t2;				\
125	stw		%t2, [%dst - offset - 0x10]; 		\
126	stw		%t1, [%dst - offset - 0x04]; 		\
127	srlx		%t1, 32, %t3;				\
128	stw		%t3, [%dst - offset - 0x08];
129
/* Copy the 16 bytes that end 'offset' bytes below src/dst with two
   64-bit loads and two 64-bit stores.  t0 and t1 are clobbered.  The
   macro is exactly 4 instructions long, which the computed jump at
   label 83 depends on.  */
130#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1)		\
131	ldx		[%src - offset - 0x10], %t0; 		\
132	ldx		[%src - offset - 0x08], %t1; 		\
133	stx		%t0, [%dst - offset - 0x10]; 		\
134	stx		%t1, [%dst - offset - 0x08];
135
136	.text
137	.align		32
/* Large-copy path.  Label 200 is branched to from memcpy (and thus also
   from __mempcpy) when the length is at least 64 * 6 bytes.
   State at label 200: %o0 = dst, %o1 = src, %o2 = length, %g4 = value
   to return, %g2 = dst & 7, and the condition codes are still those of
   the andcc that computed %g2 (it is the delay slot of the branch that
   gets here).  */
138ENTRY(__memcpy_large)
/* dst already 8-byte aligned: skip the byte prologue.  The delay slot
   computes %g5 = dst & 0x38 for the test at label 201.  */
139200:	be,pt		%xcc, 201f			/* CTI				*/
140	 andcc		%o0, 0x38, %g5			/* IEU1		Group		*/
/* %g2 = 8 - (dst & 7) bytes are copied here and the length is reduced
   by that amount up front.  An odd dst gets a single byte first (label
   1); the rest is copied two bytes at a time (label 2).  */
141	mov		8, %g1				/* IEU0				*/
142	sub		%g1, %g2, %g2			/* IEU0		Group		*/
143	andcc		%o0, 1, %g0			/* IEU1				*/
144	be,pt		%icc, 2f			/* CTI				*/
145	 sub		%o2, %g2, %o2			/* IEU0		Group		*/
1461:	ldub		[%o1], %o5			/* Load		Group		*/
147	add		%o1, 1, %o1			/* IEU0				*/
148	add		%o0, 1, %o0			/* IEU1				*/
149	subcc		%g2, 1, %g2			/* IEU1		Group		*/
150	be,pn		%xcc, 3f			/* CTI				*/
151	 stb		%o5, [%o0 - 1]			/* Store			*/
1522:	ldub		[%o1], %o5			/* Load		Group		*/
153	add		%o0, 2, %o0			/* IEU0				*/
154	ldub		[%o1 + 1], %g3			/* Load		Group		*/
155	subcc		%g2, 2, %g2			/* IEU1		Group		*/
156	stb		%o5, [%o0 - 2]			/* Store			*/
157	add		%o1, 2, %o1			/* IEU0				*/
158	bne,pt		%xcc, 2b			/* CTI		Group		*/
159	 stb		%g3, [%o0 - 1]			/* Store			*/
/* Recompute %g5 = dst & 0x38 after the prologue.  A zero result at
   label 201 sends control straight to 202; the delay slot loads 64
   into %g1 for the subtraction below.  */
1603:	andcc		%o0, 0x38, %g5			/* IEU1		Group		*/
161201:	be,pt		%icc, 202f			/* CTI				*/
162	 mov		64, %g1				/* IEU0				*/
/* Copy %g5 = 64 - (dst & 0x38) bytes in 8-byte units and take them off
   the length.  alignaddr leaves its result for src in %g1, which is
   then used as the load address; each faligndata combines two
   neighbouring doublewords, with %f4 and %f6 alternating as the older
   one.  %o1 is advanced in step so it keeps tracking the source
   position.
   NOTE(review): the purpose of the fsrc2 %f0, %f2 is not evident from
   this file.  */
163	fsrc2		%f0, %f2			/* FPU				*/
164	sub		%g1, %g5, %g5			/* IEU0		Group		*/
165	alignaddr	%o1, %g0, %g1			/* GRU		Group		*/
166	ldd		[%g1], %f4			/* Load		Group		*/
167	sub		%o2, %g5, %o2			/* IEU0				*/
1681:	ldd		[%g1 + 0x8], %f6		/* Load		Group		*/
169	add		%g1, 0x8, %g1			/* IEU0		Group		*/
170	subcc		%g5, 8, %g5			/* IEU1				*/
171	faligndata	%f4, %f6, %f0			/* GRU		Group		*/
172	std		%f0, [%o0]			/* Store			*/
173	add		%o1, 8, %o1			/* IEU0		Group		*/
174	be,pn		%xcc, 202f			/* CTI				*/
175	 add		%o0, 8, %o0			/* IEU1				*/
176	ldd		[%g1 + 0x8], %f4		/* Load		Group		*/
177	add		%g1, 8, %g1			/* IEU0				*/
178	subcc		%g5, 8, %g5			/* IEU1				*/
179	faligndata	%f6, %f4, %f0			/* GRU		Group		*/
180	std		%f0, [%o0]			/* Store			*/
181	add		%o1, 8, %o1			/* IEU0				*/
182	bne,pt		%xcc, 1b			/* CTI		Group		*/
183	 add		%o0, 8, %o0			/* IEU0				*/
/* Label 202: set up the block-copy loop.  With len the value of %o2 and
   src the value of %o1 on arrival:
     %asi = ASI_BLK_P, used by the ldda/stda instructions that follow,
     %g6  = (len - 0x40) rounded down to a multiple of 0x40,
     %g2  = (src >> 3) & 7, the index of the loop variant to run,
     %g3  = ((len - %g6) & ~7) - 0x10, the count consumed 8 bytes at a
            time by the finishing table and the loop at label 204,
     %o2  = len - %g6 - %g3, the count for the byte loop at label 206,
     %g1  = src + %g6 + %g3, the source address that byte loop uses,
     %o1  = src rounded down to a multiple of 0x40.
   alignaddr is executed for its side effect only (result discarded in
   %g0).  Three loads from %o1 + 0x00, + 0x40 and + 0x80 fill the
   register banks starting at %f0, %f16 and %f32, and %g6 is reduced by
   0x80.
   NOTE(review): the purpose of the fsrc2 %f0, %f2 is not evident from
   this file.  */
184202:	membar	  #LoadStore | #StoreStore | #StoreLoad	/* LSU		Group		*/
185	wr		%g0, ASI_BLK_P, %asi		/* LSU		Group		*/
186	subcc		%o2, 0x40, %g6			/* IEU1		Group		*/
187	mov		%o1, %g1			/* IEU0				*/
188	andncc		%g6, (0x40 - 1), %g6		/* IEU1		Group		*/
189	srl		%g1, 3, %g2			/* IEU0				*/
190	sub		%o2, %g6, %g3			/* IEU0		Group		*/
191	andn		%o1, (0x40 - 1), %o1		/* IEU1				*/
192	and		%g2, 7, %g2			/* IEU0		Group		*/
193	andncc		%g3, 0x7, %g3			/* IEU1				*/
194	fsrc2		%f0, %f2			/* FPU				*/
195	sub		%g3, 0x10, %g3			/* IEU0		Group		*/
196	sub		%o2, %g6, %o2			/* IEU1				*/
197	alignaddr	%g1, %g0, %g0			/* GRU		Group		*/
198	add		%g1, %g6, %g1			/* IEU0		Group		*/
199	subcc		%o2, %g3, %o2			/* IEU1				*/
200	ldda		[%o1 + 0x00] %asi, %f0		/* LSU		Group		*/
201	add		%g1, %g3, %g1			/* IEU0				*/
202	ldda		[%o1 + 0x40] %asi, %f16		/* LSU		Group		*/
203	sub		%g6, 0x80, %g6			/* IEU0				*/
204	ldda		[%o1 + 0x80] %asi, %f32		/* LSU		Group		*/
205							/* Clk1		Group 8-(	*/
206							/* Clk2		Group 8-(	*/
207							/* Clk3		Group 8-(	*/
208							/* Clk4		Group 8-(	*/
/* Computed jump: read the PC, add %lo(300f - 203b) and %g2 shifted left
   by 9 (512 bytes per variant) to reach one of the eight loop variants
   that start at label 300.  The delay slot steps %o1 past the 0xc0
   bytes already loaded.  */
209203:	rd		%pc, %g5			/* PDU		Group 8-(	*/
210	addcc		%g5, %lo(300f - 203b), %g5	/* IEU1		Group		*/
211	sll		%g2, 9, %g2			/* IEU0				*/
212	jmpl		%g5 + %g2, %g0			/* CTI		Group brk forced*/
213	 addcc		%o1, 0xc0, %o1			/* IEU1		Group		*/
214
215	.align		512		/* OK, here comes the fun part... */
/* Eight variants (labels 300, 310, ..., 370) of the same loop, selected
   by (src >> 3) & 7; variant k starts its faligndata chain at register
   %f(2k).  One pass handles three chunks: FREG_FROB combines one bank
   of loaded data into %f48-%f62, and LOOP_CHUNKn stores from %f48 while
   loading new data into the bank whose first register is %f0, %f16 or
   %f32 respectively.  The closing "b,pt ...b+4" re-enters the variant
   at its second instruction; the first faligndata is repeated in that
   branch's delay slot.

   Labels 3k1, 3k2 and 3k3 are the exits taken when %g6 reaches zero in
   chunk 1, 2 or 3.  Each exit runs FREG_FROB and a store twice more,
   with a membar #Sync after every store, and then branches into the
   finishing table (labels 400-446) at the entry that continues the
   faligndata chain.  VISLOOP_PAD keeps the variants 512 bytes apart.  */
216300:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)	LOOP_CHUNK1(o1, o0, g6, 301f)
217	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)	LOOP_CHUNK2(o1, o0, g6, 302f)
218	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)	LOOP_CHUNK3(o1, o0, g6, 303f)
219	b,pt		%xcc, 300b+4; faligndata %f0, %f2, %f48
220301:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)	STORE_SYNC(o0, f48) membar #Sync
221	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)	STORE_JUMP(o0, f48, 400f) membar #Sync
222302:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)	STORE_SYNC(o0, f48) membar #Sync
223	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)	STORE_JUMP(o0, f48, 416f) membar #Sync
224303:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)	STORE_SYNC(o0, f48) membar #Sync
225	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)	STORE_JUMP(o0, f48, 432f) membar #Sync
226	VISLOOP_PAD
227310:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)	LOOP_CHUNK1(o1, o0, g6, 311f)
228	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)	LOOP_CHUNK2(o1, o0, g6, 312f)
229	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)	LOOP_CHUNK3(o1, o0, g6, 313f)
230	b,pt		%xcc, 310b+4; faligndata %f2, %f4, %f48
231311:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)	STORE_SYNC(o0, f48) membar #Sync
232	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)	STORE_JUMP(o0, f48, 402f) membar #Sync
233312:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)	STORE_SYNC(o0, f48) membar #Sync
234	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)	STORE_JUMP(o0, f48, 418f) membar #Sync
235313:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)	STORE_SYNC(o0, f48) membar #Sync
236	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)	STORE_JUMP(o0, f48, 434f) membar #Sync
237	VISLOOP_PAD
238320:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)	LOOP_CHUNK1(o1, o0, g6, 321f)
239	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)	LOOP_CHUNK2(o1, o0, g6, 322f)
240	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)	LOOP_CHUNK3(o1, o0, g6, 323f)
241	b,pt		%xcc, 320b+4; faligndata %f4, %f6, %f48
242321:	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)	STORE_SYNC(o0, f48) membar #Sync
243	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)	STORE_JUMP(o0, f48, 404f) membar #Sync
244322:	FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)	STORE_SYNC(o0, f48) membar #Sync
245	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)	STORE_JUMP(o0, f48, 420f) membar #Sync
246323:	FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)	STORE_SYNC(o0, f48) membar #Sync
247	FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)	STORE_JUMP(o0, f48, 436f) membar #Sync
248	VISLOOP_PAD
249330:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)	LOOP_CHUNK1(o1, o0, g6, 331f)
250	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)	LOOP_CHUNK2(o1, o0, g6, 332f)
251	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)	LOOP_CHUNK3(o1, o0, g6, 333f)
252	b,pt		%xcc, 330b+4; faligndata %f6, %f8, %f48
253331:	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)	STORE_SYNC(o0, f48) membar #Sync
254	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)	STORE_JUMP(o0, f48, 406f) membar #Sync
255332:	FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)	STORE_SYNC(o0, f48) membar #Sync
256	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)	STORE_JUMP(o0, f48, 422f) membar #Sync
257333:	FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)	STORE_SYNC(o0, f48) membar #Sync
258	FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)	STORE_JUMP(o0, f48, 438f) membar #Sync
259	VISLOOP_PAD
260340:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)	LOOP_CHUNK1(o1, o0, g6, 341f)
261	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)	LOOP_CHUNK2(o1, o0, g6, 342f)
262	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)	LOOP_CHUNK3(o1, o0, g6, 343f)
263	b,pt		%xcc, 340b+4; faligndata %f8, %f10, %f48
264341:	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)	STORE_SYNC(o0, f48) membar #Sync
265	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)	STORE_JUMP(o0, f48, 408f) membar #Sync
266342:	FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)	STORE_SYNC(o0, f48) membar #Sync
267	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)	STORE_JUMP(o0, f48, 424f) membar #Sync
268343:	FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)	STORE_SYNC(o0, f48) membar #Sync
269	FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)	STORE_JUMP(o0, f48, 440f) membar #Sync
270	VISLOOP_PAD
271350:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)	LOOP_CHUNK1(o1, o0, g6, 351f)
272	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)	LOOP_CHUNK2(o1, o0, g6, 352f)
273	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)	LOOP_CHUNK3(o1, o0, g6, 353f)
274	b,pt		%xcc, 350b+4; faligndata %f10, %f12, %f48
275351:	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)	STORE_SYNC(o0, f48) membar #Sync
276	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)	STORE_JUMP(o0, f48, 410f) membar #Sync
277352:	FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)	STORE_SYNC(o0, f48) membar #Sync
278	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)	STORE_JUMP(o0, f48, 426f) membar #Sync
279353:	FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)	STORE_SYNC(o0, f48) membar #Sync
280	FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)	STORE_JUMP(o0, f48, 442f) membar #Sync
281	VISLOOP_PAD
282360:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)	LOOP_CHUNK1(o1, o0, g6, 361f)
283	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)	LOOP_CHUNK2(o1, o0, g6, 362f)
284	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)	LOOP_CHUNK3(o1, o0, g6, 363f)
285	b,pt		%xcc, 360b+4; faligndata %f12, %f14, %f48
286361:	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)	STORE_SYNC(o0, f48) membar #Sync
287	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)	STORE_JUMP(o0, f48, 412f) membar #Sync
288362:	FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)	STORE_SYNC(o0, f48) membar #Sync
289	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)	STORE_JUMP(o0, f48, 428f) membar #Sync
290363:	FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)	STORE_SYNC(o0, f48) membar #Sync
291	FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)	STORE_JUMP(o0, f48, 444f) membar #Sync
292	VISLOOP_PAD
293370:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)	LOOP_CHUNK1(o1, o0, g6, 371f)
294	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)	LOOP_CHUNK2(o1, o0, g6, 372f)
295	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)	LOOP_CHUNK3(o1, o0, g6, 373f)
296	b,pt		%xcc, 370b+4; faligndata %f14, %f16, %f48
297371:	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)	STORE_SYNC(o0, f48) membar #Sync
298	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)	STORE_JUMP(o0, f48, 414f) membar #Sync
299372:	FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)	STORE_SYNC(o0, f48) membar #Sync
300	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)	STORE_JUMP(o0, f48, 430f) membar #Sync
301373:	FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)	STORE_SYNC(o0, f48) membar #Sync
302	FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)	STORE_JUMP(o0, f48, 446f) membar #Sync
303	VISLOOP_PAD
/* Finishing table.  Label 400 + N is the entry whose first faligndata
   operand is %fN.  Each FINISH_VISCHUNK entry writes 8 bytes while %g3
   lasts and falls through to the next entry; the UNEVEN_VISCHUNK entry
   that closes each bank (414, 430, 446) moves its register into %f0 and
   continues in the loop at label 204.  Every entry leaves for label 205
   as soon as %g3 goes negative.  */
304400:	FINISH_VISCHUNK(o0, f0,  f2,  g3)
305402:	FINISH_VISCHUNK(o0, f2,  f4,  g3)
306404:	FINISH_VISCHUNK(o0, f4,  f6,  g3)
307406:	FINISH_VISCHUNK(o0, f6,  f8,  g3)
308408:	FINISH_VISCHUNK(o0, f8,  f10, g3)
309410:	FINISH_VISCHUNK(o0, f10, f12, g3)
310412:	FINISH_VISCHUNK(o0, f12, f14, g3)
311414:	UNEVEN_VISCHUNK(o0, f14, f0,  g3)
312416:	FINISH_VISCHUNK(o0, f16, f18, g3)
313418:	FINISH_VISCHUNK(o0, f18, f20, g3)
314420:	FINISH_VISCHUNK(o0, f20, f22, g3)
315422:	FINISH_VISCHUNK(o0, f22, f24, g3)
316424:	FINISH_VISCHUNK(o0, f24, f26, g3)
317426:	FINISH_VISCHUNK(o0, f26, f28, g3)
318428:	FINISH_VISCHUNK(o0, f28, f30, g3)
319430:	UNEVEN_VISCHUNK(o0, f30, f0,  g3)
320432:	FINISH_VISCHUNK(o0, f32, f34, g3)
321434:	FINISH_VISCHUNK(o0, f34, f36, g3)
322436:	FINISH_VISCHUNK(o0, f36, f38, g3)
323438:	FINISH_VISCHUNK(o0, f38, f40, g3)
324440:	FINISH_VISCHUNK(o0, f40, f42, g3)
325442:	FINISH_VISCHUNK(o0, f42, f44, g3)
326444:	FINISH_VISCHUNK(o0, f44, f46, g3)
327446:	UNEVEN_VISCHUNK(o0, f46, f0,  g3)
/* Label 204: keep going in 8-byte steps, loading further doublewords
   from [%o1] into %f2 and %f0 alternately and storing the faligndata
   result, until %g3 goes negative.  */
328204:	ldd		[%o1], %f2			/* Load		Group		*/
329	add		%o1, 8, %o1			/* IEU0				*/
330	subcc		%g3, 8, %g3			/* IEU1				*/
331	faligndata	%f0, %f2, %f8			/* GRU		Group		*/
332	std		%f8, [%o0]			/* Store			*/
333	bl,pn		%xcc, 205f			/* CTI				*/
334	 add		%o0, 8, %o0			/* IEU0		Group		*/
335	ldd		[%o1], %f0			/* Load		Group		*/
336	add		%o1, 8, %o1			/* IEU0				*/
337	subcc		%g3, 8, %g3			/* IEU1				*/
338	faligndata	%f2, %f0, %f8			/* GRU		Group		*/
339	std		%f8, [%o0]			/* Store			*/
340	bge,pt		%xcc, 204b			/* CTI				*/
341	 add		%o0, 8, %o0			/* IEU0		Group		*/
/* Label 205: byte tail.  %o2 bytes remain; nothing to do if it is zero.
   The delay slot replaces %o1 with the source address kept in %g1.
   Label 206 is the byte loop itself (also used by memcpy's label 213).  */
342205:	brz,pt		%o2, 207f			/* CTI		Group		*/
343	 mov		%g1, %o1			/* IEU0				*/
344206:	ldub		[%o1], %g5			/* LOAD				*/
345	add		%o1, 1, %o1			/* IEU0				*/
346	add		%o0, 1, %o0			/* IEU1				*/
347	subcc		%o2, 1, %o2			/* IEU1				*/
348	bne,pt		%xcc, 206b			/* CTI				*/
349	 stb		%g5, [%o0 - 1]			/* Store	Group		*/
/* Label 207: exit.  Issue the store barrier, write FPRS_FEF to %fprs
   and return with %g4 moved to %o0 in the delay slot of retl.  */
350207:	membar		#StoreLoad | #StoreStore	/* LSU		Group		*/
351	wr		%g0, FPRS_FEF, %fprs
352	retl
353	 mov		%g4, %o0
354
/* Label 208: copy of 1 to 15 bytes (memcpy branches here only for a
   non-zero length of at most 15).  An even length enters the pair loop
   at 2f+4: the ldub at label 1 sits in the delay slot of that branch
   and does the work of the skipped first instruction of the loop.  An
   odd length falls through, copies one byte, and returns through 209
   if that was the only one.  Label 2 then copies two bytes per
   iteration until %o2 reaches zero.  */
355208:	andcc		%o2, 1, %g0			/* IEU1		Group		*/
356	be,pt		%icc, 2f+4			/* CTI				*/
3571:	 ldub		[%o1], %g5			/* LOAD		Group		*/
358	add		%o1, 1, %o1			/* IEU0				*/
359	add		%o0, 1, %o0			/* IEU1				*/
360	subcc		%o2, 1, %o2			/* IEU1		Group		*/
361	be,pn		%xcc, 209f			/* CTI				*/
362	 stb		%g5, [%o0 - 1]			/* Store			*/
3632:	ldub		[%o1], %g5			/* LOAD		Group		*/
364	add		%o0, 2, %o0			/* IEU0				*/
365	ldub		[%o1 + 1], %o5			/* LOAD		Group		*/
366	add		%o1, 2, %o1			/* IEU0				*/
367	subcc		%o2, 2, %o2			/* IEU1		Group		*/
368	stb		%g5, [%o0 - 2]			/* Store			*/
369	bne,pt		%xcc, 2b			/* CTI				*/
370	 stb		%o5, [%o0 - 1]			/* Store			*/
/* Label 209: return %g4.  Also the target for a zero length.  */
371209:	retl
372	 mov		%g4, %o0
373END(__memcpy_large)
374
/* __mempcpy: %o0 = dst, %o1 = src, %o2 = length.  Performs the same
   copy as memcpy but returns dst + length: the delay slot puts
   %o0 + %o2 into %g4, the register every exit path moves to %o0, and
   control joins memcpy at label 210.  */
375ENTRY(__mempcpy)
376	ba,pt		%xcc, 210f
377	 add		%o0, %o2, %g4
378END(__mempcpy)
379
380	.align		32
/* memcpy: %o0 = dst, %o1 = src, %o2 = length (n).  %g4 holds the return
   value (dst here) until an exit path moves it to %o0.
   Dispatch, each comparison being set up in the delay slot of the
   previous branch:
     n == 0                 -> 209 (return)
     n <= 15                -> 208 (byte/pair loop)
     n >= 64 * 6            -> 200 (__memcpy_large)
     ((dst - src) & 3) != 0 -> 212 (faligndata-based copy)
     otherwise              -> the integer copy below.
   The delay slot of the branch to 200 also leaves %g2 = dst & 7, which
   both label 200 and label 212 rely on.  */
381ENTRY(memcpy)
382	 mov		%o0, %g4			/* IEU0		Group		*/
383210:
384#ifndef USE_BPR
385	srl		%o2, 0, %o2			/* IEU1				*/
386#endif
387	brz,pn		%o2, 209b			/* CTI		Group		*/
388218:	 cmp		%o2, 15				/* IEU1				*/
389	bleu,pn		%xcc, 208b			/* CTI		Group		*/
390	 cmp		%o2, (64 * 6)			/* IEU1				*/
391	bgeu,pn		%xcc, 200b			/* CTI		Group		*/
392	 andcc		%o0, 7, %g2			/* IEU1				*/
393	sub		%o0, %o1, %g5			/* IEU0				*/
394	andcc		%g5, 3, %o5			/* IEU1		Group		*/
395	bne,pn		%xcc, 212f			/* CTI				*/
396	 andcc		%o1, 3, %g0			/* IEU1		Group		*/
397	be,a,pt		%xcc, 216f			/* CTI				*/
398	 andcc		%o1, 4, %g0			/* IEU1		Group		*/
/* src is not 4-byte aligned: copy one byte if src is odd and/or one
   halfword (label 4).  The "bne 5f" that decides whether the halfword
   is skipped uses the condition codes of the andcc %o1, 2 executed in
   the delay slot before src was incremented; the add and sub
   instructions in between do not modify them.  */
399	andcc		%o1, 1, %g0			/* IEU1		Group		*/
400	be,pn		%xcc, 4f			/* CTI				*/
401	 andcc		%o1, 2, %g0			/* IEU1		Group		*/
402	ldub		[%o1], %g2			/* Load		Group		*/
403	add		%o1, 1, %o1			/* IEU0				*/
404	add		%o0, 1, %o0			/* IEU1				*/
405	sub		%o2, 1, %o2			/* IEU0		Group		*/
406	bne,pn		%xcc, 5f			/* CTI		Group		*/
407	 stb		%g2, [%o0 - 1]			/* Store			*/
4084:	lduh		[%o1], %g2			/* Load		Group		*/
409	add		%o1, 2, %o1			/* IEU0				*/
410	add		%o0, 2, %o0			/* IEU1				*/
411	sub		%o2, 2, %o2			/* IEU0				*/
412	sth		%g2, [%o0 - 2]			/* Store	Group + bubble	*/
/* Test bit 2 of src.  Label 216 is also entered directly from above
   with the same test done in an annulled delay slot.  If the bit is
   set, one 32-bit word is copied.  On both paths %g6 = n & -128, the
   byte count for the 128-byte loops.  */
4135:	andcc		%o1, 4, %g0			/* IEU1				*/
414216:	be,a,pn		%xcc, 2f			/* CTI				*/
415	 andcc		%o2, -128, %g6			/* IEU1		Group		*/
416	lduw		[%o1], %g5			/* Load		Group		*/
417	add		%o1, 4, %o1			/* IEU0				*/
418	add		%o0, 4, %o0			/* IEU1				*/
419	sub		%o2, 4, %o2			/* IEU0		Group		*/
420	stw		%g5, [%o0 - 4]			/* Store			*/
421	andcc		%o2, -128, %g6			/* IEU1		Group		*/
/* Label 2: with no full 128-byte group go to 215.  Otherwise choose the
   loop by bit 2 of dst.  Bit clear: the loop at label 82 (64-bit
   stores), entered at 82f + 4 because the first ldx of the
   MOVE_BIGCHUNK below executes in the branch delay slot and is the same
   instruction as the first ldx at label 82.  Bit set: the loop at label
   5 (32-bit stores), 128 bytes per iteration until %g6 reaches zero.  */
4222:	be,pn		%xcc, 215f			/* CTI				*/
423	 andcc		%o0, 4, %g0			/* IEU1		Group		*/
424	be,pn		%xcc, 82f + 4			/* CTI		Group		*/
4255:	MOVE_BIGCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
426	MOVE_BIGCHUNK(o1, o0, 0x20, g1, g3, g5, o5)
427	MOVE_BIGCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
428	MOVE_BIGCHUNK(o1, o0, 0x60, g1, g3, g5, o5)
42935:	subcc		%g6, 128, %g6			/* IEU1		Group		*/
430	add		%o1, 128, %o1			/* IEU0				*/
431	bne,pt		%xcc, 5b			/* CTI				*/
432	 add		%o0, 128, %o0			/* IEU0		Group		*/
/* Label 215: %g6 = n & 0x70 bytes are copied by the MOVE_LASTCHUNK
   table; if that is zero go straight to label 80.  The delay slot tests
   bit 3 of n for label 80; no instruction between here and label 80
   modifies the condition codes.  */
433215:	andcc		%o2, 0x70, %g6			/* IEU1		Group		*/
43441:	be,pn		%xcc, 80f			/* CTI				*/
435	 andcc		%o2, 8, %g0			/* IEU1		Group		*/
436							/* Clk1 8-(			*/
437							/* Clk2 8-(			*/
438							/* Clk3 8-(			*/
439							/* Clk4 8-(			*/
/* Label 79: computed jump into the table, counted backwards from label
   80.  Each MOVE_LASTCHUNK copies 16 bytes and occupies 8 instructions
   (32 bytes of code), hence the offset %g6 * 2.  src and dst are
   advanced by %g6 first because the table addresses memory with
   negative offsets.  */
44079:	rd		%pc, %o5			/* PDU		Group		*/
441	sll		%g6, 1, %g5			/* IEU0		Group		*/
442	add		%o1, %g6, %o1			/* IEU1				*/
443	sub		%o5, %g5, %o5			/* IEU0  	Group		*/
444	jmpl		%o5 + %lo(80f - 79b), %g0	/* CTI		Group brk forced*/
445	 add		%o0, %g6, %o0			/* IEU0		Group		*/
44636:	MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g5, o5)
447	MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g5, o5)
448	MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g5, o5)
449	MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g5, o5)
450	MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g5, o5)
451	MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g5, o5)
452	MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g5, o5)
/* Labels 80, 81, 1, 1: copy the remaining 8, 4, 2 and 1 bytes according
   to bits 3..0 of n.  Each test is set up in the delay slot of the
   previous branch.  The 8-byte step loads 64 bits and stores them as
   two 32-bit words.  */
45380:	be,pt		%xcc, 81f			/* CTI				*/
454	 andcc		%o2, 4, %g0			/* IEU1				*/
455	ldx		[%o1], %g2			/* Load		Group		*/
456	add		%o0, 8, %o0			/* IEU0				*/
457	stw		%g2, [%o0 - 0x4]		/* Store	Group		*/
458	add		%o1, 8, %o1			/* IEU1				*/
459	srlx		%g2, 32, %g2			/* IEU0		Group		*/
460	stw		%g2, [%o0 - 0x8]		/* Store			*/
46181:	be,pt		%xcc, 1f			/* CTI				*/
462	 andcc		%o2, 2, %g0			/* IEU1		Group		*/
463	lduw		[%o1], %g2			/* Load		Group		*/
464	add		%o1, 4, %o1			/* IEU0				*/
465	stw		%g2, [%o0]			/* Store	Group		*/
466	add		%o0, 4, %o0			/* IEU0				*/
4671:	be,pt		%xcc, 1f			/* CTI				*/
468	 andcc		%o2, 1, %g0			/* IEU1		Group		*/
469	lduh		[%o1], %g2			/* Load		Group		*/
470	add		%o1, 2, %o1			/* IEU0				*/
471	sth		%g2, [%o0]			/* Store	Group		*/
472	add		%o0, 2, %o0			/* IEU0				*/
4731:	be,pt		%xcc, 211f			/* CTI				*/
474	 nop						/* IEU1				*/
475	ldub		[%o1], %g2			/* Load		Group		*/
476	stb		%g2, [%o0]			/* Store	Group + bubble	*/
/* Label 211: return %g4.  */
477211:	retl
478	 mov		%g4, %o0
479
/* Label 82: loop used when bit 2 of dst is clear.  Copies 128 bytes per
   iteration with 64-bit loads and stores until %g6 reaches zero.  It is
   entered at 82 + 4 from label 2, which has already executed an
   identical first ldx in its delay slot.  Afterwards %g6 = n & 0x70 for
   the MOVE_LASTALIGNCHUNK table, and the delay slot tests bit 3 of n
   for label 84.  */
48082:	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
481	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
48237:	subcc		%g6, 128, %g6			/* IEU1		Group		*/
483	add		%o1, 128, %o1			/* IEU0				*/
484	bne,pt		%xcc, 82b			/* CTI				*/
485	 add		%o0, 128, %o0			/* IEU0		Group		*/
486	andcc		%o2, 0x70, %g6			/* IEU1				*/
487	be,pn		%xcc, 84f			/* CTI				*/
488	 andcc		%o2, 8, %g0			/* IEU1		Group		*/
489							/* Clk1 8-(			*/
490							/* Clk2 8-(			*/
491							/* Clk3 8-(			*/
492							/* Clk4 8-(			*/
/* Label 83: computed jump into the table, counted backwards from label
   84.  Each MOVE_LASTALIGNCHUNK copies 16 bytes and occupies 4
   instructions (16 bytes of code), so %g6 is used unscaled.  src and
   dst are advanced by %g6 first because the table uses negative
   offsets.  */
49383:	rd		%pc, %o5			/* PDU		Group		*/
494	add		%o1, %g6, %o1			/* IEU0		Group		*/
495	sub		%o5, %g6, %o5			/* IEU1				*/
496	jmpl		%o5 + %lo(84f - 83b), %g0	/* CTI		Group brk forced*/
497	 add		%o0, %g6, %o0			/* IEU0		Group		*/
49838:	MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3)
499	MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3)
500	MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3)
501	MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3)
502	MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3)
503	MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3)
504	MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3)
/* Labels 84, 85, 1, 1, 1: copy the remaining 8, 4, 2 and 1 bytes
   according to bits 3..0 of n, each test being set up in the delay slot
   of the previous branch, then return %g4.  */
50584:	be,pt		%xcc, 85f			/* CTI		Group		*/
506	 andcc		%o2, 4, %g0			/* IEU1				*/
507	ldx		[%o1], %g2			/* Load		Group		*/
508	add		%o0, 8, %o0			/* IEU0				*/
509	add		%o1, 8, %o1			/* IEU0		Group		*/
510	stx		%g2, [%o0 - 0x8]		/* Store			*/
51185:	be,pt		%xcc, 1f			/* CTI				*/
512	 andcc		%o2, 2, %g0			/* IEU1		Group		*/
513	lduw		[%o1], %g2			/* Load		Group		*/
514	add		%o0, 4, %o0			/* IEU0				*/
515	add		%o1, 4, %o1			/* IEU0		Group		*/
516	stw		%g2, [%o0 - 0x4]		/* Store			*/
5171:	be,pt		%xcc, 1f			/* CTI				*/
518	 andcc		%o2, 1, %g0			/* IEU1		Group		*/
519	lduh		[%o1], %g2			/* Load		Group		*/
520	add		%o0, 2, %o0			/* IEU0				*/
521	add		%o1, 2, %o1			/* IEU0		Group		*/
522	sth		%g2, [%o0 - 0x2]		/* Store			*/
5231:	be,pt		%xcc, 1f			/* CTI				*/
524	 nop						/* IEU0		Group		*/
525	ldub		[%o1], %g2			/* Load		Group		*/
526	stb		%g2, [%o0]			/* Store	Group + bubble	*/
5271:	retl
528	 mov		%g4, %o0
529
/* Label 212: path for (dst - src) & 3 != 0 with a length between 16 and
   64 * 6 - 1.  %g2 = dst & 7 was computed in the delay slot of the
   branch to label 200.  Unless it is zero, 8 - %g2 bytes are copied one
   at a time (label 1) and taken off the length.  */
530212:	brz,pt		%g2, 2f				/* CTI		Group		*/
531	 mov		8, %g1				/* IEU0				*/
532	sub		%g1, %g2, %g2			/* IEU0		Group		*/
533	sub		%o2, %g2, %o2			/* IEU0		Group		*/
5341:	ldub		[%o1], %g5			/* Load		Group		*/
535	add		%o1, 1, %o1			/* IEU0				*/
536	add		%o0, 1, %o0			/* IEU1				*/
537	subcc		%g2, 1, %g2			/* IEU1		Group		*/
538	bne,pt		%xcc, 1b			/* CTI				*/
539	 stb		%g5, [%o0 - 1]			/* Store			*/
/* Label 2: split the length into %g5 = n & ~7 for the 8-byte loop and
   %o2 = n & 7 for the byte tail.  alignaddr leaves its result for src
   in %g1, which is used as the load address; each faligndata combines
   two neighbouring doublewords, with %f4 and %f6 alternating as the
   older one.
   NOTE(review): the purpose of the fsrc2 %f0, %f2 is not evident from
   this file.  */
5402:	andn		%o2, 7, %g5 			/* IEU0		Group		*/
541	and		%o2, 7, %o2			/* IEU1				*/
542	fsrc2		%f0, %f2			/* FPU				*/
543	alignaddr	%o1, %g0, %g1			/* GRU		Group		*/
544	ldd		[%g1], %f4			/* Load		Group		*/
5451:	ldd		[%g1 + 0x8], %f6		/* Load		Group		*/
546	add		%g1, 0x8, %g1			/* IEU0		Group		*/
547	subcc		%g5, 8, %g5			/* IEU1				*/
548	faligndata	%f4, %f6, %f0			/* GRU		Group		*/
549	std		%f0, [%o0]			/* Store			*/
550	add		%o1, 8, %o1			/* IEU0		Group		*/
551	be,pn		%xcc, 213f			/* CTI				*/
552	 add		%o0, 8, %o0			/* IEU1				*/
553	ldd		[%g1 + 0x8], %f4		/* Load		Group		*/
554	add		%g1, 8, %g1			/* IEU0				*/
555	subcc		%g5, 8, %g5			/* IEU1				*/
556	faligndata	%f6, %f4, %f0			/* GRU		Group		*/
557	std		%f0, [%o0]			/* Store			*/
558	add		%o1, 8, %o1			/* IEU0				*/
559	bne,pn		%xcc, 1b			/* CTI		Group		*/
560	 add		%o0, 8, %o0			/* IEU0				*/
/* Label 213: byte tail.  With %o2 == 0 go to 214.  Otherwise the first
   byte is copied here and any further bytes by the loop at label 206 in
   __memcpy_large, which returns through label 207.  */
561213:	brz,pn		%o2, 214f			/* CTI		Group		*/
562	 nop						/* IEU0				*/
563	ldub		[%o1], %g5			/* LOAD				*/
564	add		%o1, 1, %o1			/* IEU0				*/
565	add		%o0, 1, %o0			/* IEU1				*/
566	subcc		%o2, 1, %o2			/* IEU1				*/
567	bne,pt		%xcc, 206b			/* CTI				*/
568	 stb		%g5, [%o0 - 1]			/* Store	Group		*/
/* Label 214: write FPRS_FEF to %fprs and return %g4.  */
569214:	wr		%g0, FPRS_FEF, %fprs
570	retl
571	 mov		%g4, %o0
572END(memcpy)
573
574libc_hidden_builtin_def (memcpy)
575
576libc_hidden_def (__mempcpy)
577weak_alias (__mempcpy, mempcpy)
578libc_hidden_builtin_def (mempcpy)
579