/* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara-2.
   Copyright (C) 2007-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

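/* Alternate space identifiers used below: ASI_BLK_P selects 64-byte
   block loads and stores, while ASI_BLK_INIT_QUAD_LDD_P marks an
   initializing store, so the destination cache line can be allocated
   without first fetching its old contents from memory.  */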
#define ASI_BLK_INIT_QUAD_LDD_P	0xe2
#define ASI_BLK_P		0xf0
#define ASI_P			0x80
#define ASI_PNF			0x82

#define FPRS_FEF		0x04

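/* VISEntryHalf enables the FPU by setting FPRS.FEF, saving the previous
   %fprs value in %o5; VISExitHalf restores it.  %o5 must therefore stay
   untouched between the two.  */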
#define VISEntryHalf			\
	rd	%fprs, %o5;		\
	wr	%g0, FPRS_FEF, %fprs

#define VISExitHalf			\
	and	%o5, FPRS_FEF, %o5;	\
	wr	%o5, 0x0, %fprs

#define STORE_ASI		ASI_BLK_INIT_QUAD_LDD_P

#define LOAD(type,addr,dest)	type [addr], dest
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#define STORE(type,src,addr)	type src, [addr]
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI

#ifndef XCC
#define USE_BPR
#define XCC xcc
#endif

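/* Produce 64 aligned output bytes in %f0-%f14 from a nine-double window
   of source data using faligndata; the byte offset comes from the
   earlier alignaddr.  */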
#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
	faligndata	%x0, %x1, %f0; \
	faligndata	%x1, %x2, %f2; \
	faligndata	%x2, %x3, %f4; \
	faligndata	%x3, %x4, %f6; \
	faligndata	%x4, %x5, %f8; \
	faligndata	%x5, %x6, %f10; \
	faligndata	%x6, %x7, %f12; \
	faligndata	%x7, %x8, %f14;

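/* FREG_MOVE_N copies the last N doubles of the freshly loaded block down
   to %f0..., so they become the leading part of the next FREG_FROB
   window.  */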
#define FREG_MOVE_1(x0) \
	fsrc2		%x0, %f0;
#define FREG_MOVE_2(x0, x1) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2;
#define FREG_MOVE_3(x0, x1, x2) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4;
#define FREG_MOVE_4(x0, x1, x2, x3) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6;
#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6; \
	fsrc2		%x4, %f8;
#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6; \
	fsrc2		%x4, %f8; \
	fsrc2		%x5, %f10;
#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6; \
	fsrc2		%x4, %f8; \
	fsrc2		%x5, %f10; \
	fsrc2		%x6, %f12;
#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
	fsrc2		%x0, %f0; \
	fsrc2		%x1, %f2; \
	fsrc2		%x2, %f4; \
	fsrc2		%x3, %f6; \
	fsrc2		%x4, %f8; \
	fsrc2		%x5, %f10; \
	fsrc2		%x6, %f12; \
	fsrc2		%x7, %f14;
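/* FREG_LOAD_N preloads the N source doubles immediately below the first
   64-byte-aligned source address, priming the faligndata window.  */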
#define FREG_LOAD_1(base, x0) \
	LOAD(ldd, base + 0x00, %x0)
#define FREG_LOAD_2(base, x0, x1) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1);
#define FREG_LOAD_3(base, x0, x1, x2) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2);
#define FREG_LOAD_4(base, x0, x1, x2, x3) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2); \
	LOAD(ldd, base + 0x18, %x3);
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2); \
	LOAD(ldd, base + 0x18, %x3); \
	LOAD(ldd, base + 0x20, %x4);
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2); \
	LOAD(ldd, base + 0x18, %x3); \
	LOAD(ldd, base + 0x20, %x4); \
	LOAD(ldd, base + 0x28, %x5);
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
	LOAD(ldd, base + 0x00, %x0); \
	LOAD(ldd, base + 0x08, %x1); \
	LOAD(ldd, base + 0x10, %x2); \
	LOAD(ldd, base + 0x18, %x3); \
	LOAD(ldd, base + 0x20, %x4); \
	LOAD(ldd, base + 0x28, %x5); \
	LOAD(ldd, base + 0x30, %x6);

#if IS_IN (libc)

	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

	.text

ENTRY(__mempcpy_niagara2)
	ba,pt		%XCC, 101f
	 add		%o0, %o2, %g5
END(__mempcpy_niagara2)

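/* Both entry points stash their return value in %g5 (dst + len for
   mempcpy, dst for memcpy) and share the copy code from label 101
   onward.  */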
	.align		32
ENTRY(__memcpy_niagara2)
100:	/* %o0=dst, %o1=src, %o2=len */
	mov		%o0, %g5
101:
# ifndef USE_BPR
	srl		%o2, 0, %o2
# endif
	cmp		%o2, 0
	be,pn		%XCC, 85f
162218:	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 *
	 * However, the cut-off point, performance wise, is around
	 * 4 64-byte blocks.
	 */
	cmp		%o2, (4 * 64)
	blu,pt		%XCC, 75f
	 andcc		%o3, 0x7, %g0

	/* %o0:	dst
	 * %o1:	src
	 * %o2:	len  (known to be >= 128)
	 *
	 * The block copy loops can use %o4, %g2, %g3 as
	 * temporaries while copying the data.  %o5 must
	 * be preserved between VISEntryHalf and VISExitHalf
	 */

	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	LOAD(prefetch, %o1 + 0x080, #one_read)

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
1:	subcc		%o4, 1, %o4
	LOAD(ldub, %o1, %g1)
	STORE(stb, %g1, %o0)
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	add		%o0, 1, %o0

2:
	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

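	/* alignaddr records the source's low three bits for the
	 * faligndata sequences below.  %o4 becomes the source rounded
	 * up to a 64-byte boundary, %g1 the byte count handled by the
	 * block loops, %o2 the remainder, and %g3 holds dst - %o4 so
	 * that %o4 + %g3 always addresses the destination.  The low
	 * six bits of the source pick one of the loops at 110-180
	 * according to how many source doubles must be preloaded; a
	 * source that is already 64-byte aligned uses 190 instead.
	 */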
	membar		#Sync
	alignaddr	%o1, %g0, %g0

	add		%o1, (64 - 1), %o4
	andn		%o4, (64 - 1), %o4
	andn		%o2, (64 - 1), %g1
	sub		%o2, %g1, %o2

	and		%o1, (64 - 1), %g2
	add		%o1, %g1, %o1
	sub		%o0, %o4, %g3
	brz,pt		%g2, 190f
	 cmp		%g2, 32
	blu,a		5f
	 cmp		%g2, 16
	cmp		%g2, 48
	blu,a		4f
	 cmp		%g2, 40
	cmp		%g2, 56
	blu		170f
	 nop
	ba,a,pt		%xcc, 180f

4:	/* 32 <= low bits < 48 */
	blu		150f
	 nop
	ba,a,pt		%xcc, 160f
5:	/* 0 < low bits < 32 */
	blu,a		6f
	 cmp		%g2, 8
	cmp		%g2, 24
	blu		130f
	 nop
	ba,a,pt		%xcc, 140f
6:	/* 0 < low bits < 16 */
	bgeu		120f
	 nop
	/* fall through for 0 < low bits < 8 */
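	/* Each of the loops at 110-180 first issues STORE_INIT to the
	 * start of the destination line so it can be allocated without
	 * a read, block-loads the next 64 source bytes into %f16-%f30,
	 * realigns them into %f0-%f14 with FREG_FROB, block-stores the
	 * result, then slides the unconsumed doubles down with
	 * FREG_MOVE for the next iteration.
	 */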
110:	sub		%o4, 64, %g2
	LOAD_BLK(%g2, %f0)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

120:	sub		%o4, 56, %g2
	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

130:	sub		%o4, 48, %g2
	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

140:	sub		%o4, 40, %g2
	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_5(f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

150:	sub		%o4, 32, %g2
	FREG_LOAD_4(%g2, f0, f2, f4, f6)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_4(f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

160:	sub		%o4, 24, %g2
	FREG_LOAD_3(%g2, f0, f2, f4)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_3(f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

170:	sub		%o4, 16, %g2
	FREG_LOAD_2(%g2, f0, f2)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_2(f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

180:	sub		%o4, 8, %g2
	FREG_LOAD_1(%g2, f0)
1:	STORE_INIT(%g0, %o4 + %g3)
	LOAD_BLK(%o4, %f16)
	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
	STORE_BLK(%f0, %o4 + %g3)
	FREG_MOVE_1(f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

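	/* Source and destination are both 64-byte aligned: straight
	 * block copy, no realignment needed.
	 */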
190:
1:	STORE_INIT(%g0, %o4 + %g3)
	subcc		%g1, 64, %g1
	LOAD_BLK(%o4, %f0)
	STORE_BLK(%f0, %o4 + %g3)
	add		%o4, 64, %o4
	bne,pt		%XCC, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)

195:
	add		%o4, %g3, %o0
	membar		#Sync

	VISExitHalf

	/* %o2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	brz,pt		%o2, 85f
	 sub		%o0, %o1, %o3
	ba,a,pt		%XCC, 90f

	.align		64
75: /* 16 <= len < 256 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

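	/* dst and src are both 8-byte aligned.  %o3 holds dst - src, so
	 * stores go through %o1 + %o3 while only %o1 is advanced.  Copy
	 * 16 bytes per iteration, then finish with an 8-byte, a 4-byte
	 * and a byte-at-a-time tail as needed.
	 */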
72:
	andn		%o2, 0xf, %o4
	and		%o2, 0xf, %o2
1:	subcc		%o4, 0x10, %o4
	LOAD(ldx, %o1, %o5)
	add		%o1, 0x08, %o1
	LOAD(ldx, %o1, %g1)
	sub		%o1, 0x08, %o1
	STORE(stx, %o5, %o1 + %o3)
	add		%o1, 0x8, %o1
	STORE(stx, %g1, %o1 + %o3)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	LOAD(ldx, %o1, %o5)
	STORE(stx, %o5, %o1 + %o3)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	LOAD(lduw, %o1, %o5)
	STORE(stw, %o5, %o1 + %o3)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

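	/* dst or src is not 8-byte aligned.  Byte-copy until the
	 * destination is aligned; if the source then happens to be
	 * aligned as well, reuse the loops above, otherwise use the
	 * shifting copy at label 8 below.
	 */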
75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	LOAD(ldub, %o1, %o5)
	STORE(stb, %o5, %o1 + %o3)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

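	/* The source is misaligned relative to the 8-byte-aligned
	 * destination.  Read aligned doublewords from the source and
	 * merge adjacent pairs with sllx/srlx (%g1 is the left shift in
	 * bits, %o3 = 64 - %g1) to build each destination word.
	 */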
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	LOAD(ldx, %o1, %g2)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, %o4
	sllx		%g2, %g1, %g2
1:	add		%o1, 0x8, %o1
	LOAD(ldx, %o1, %g3)
	subcc		%o4, 0x8, %o4
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	STORE(stx, %o5, %o0)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
80: /* 0 < len < 16 */
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	LOAD(lduw, %o1, %g1)
	STORE(stw, %g1, %o1 + %o3)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		%g5, %o0

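	/* Final fallback: copy the remaining bytes one at a time, again
	 * addressing the destination as %o1 + %o3.
	 */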
	.align		32
90:
	subcc		%o2, 1, %o2
	LOAD(ldub, %o1, %g1)
	STORE(stb, %g1, %o1 + %o3)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		%g5, %o0

END(__memcpy_niagara2)

#endif