/* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara-4.
   Copyright (C) 2012-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#define ASI_BLK_INIT_QUAD_LDD_P	0xe2

#define FPRS_FEF		0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi; avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:
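/* be,a with the annul bit set means the wr in the delay slot
 * executes only when the branch is taken, i.e. only when
 * FPRS_FEF was clear; if the FPU is already enabled, the
 * expensive ASR write is skipped entirely.  %o5 keeps the
 * original %fprs so VISExitHalf can restore the FEF bit.
 */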

#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs

#define GLOBAL_SPARE	%g5

#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#define EX_LD(x)	x
#define EX_ST(x)	x
#define EX_RETVAL(x)	x
#define LOAD(type,addr,dest)	type [addr], dest
#define STORE(type,src,addr)	type src, [addr]
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
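/* EX_LD/EX_ST/EX_RETVAL are identity macros here; the layout
 * is inherited from the kernel version of this routine, where
 * they attach fault handlers.  STORE_INIT issues stxa through
 * the block-init ASI, which can allocate the destination cache
 * line without reading its old contents from memory; that is
 * why the large-copy path below first makes dest 64-byte
 * aligned and closes with a membar.
 */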

#if IS_IN (libc)

	.register	%g2,#scratch
	.register	%g3,#scratch
	.register	%g6,#scratch

	.text

ENTRY(__mempcpy_niagara4)
	ba,pt		%icc, 101f
	 add		%o0, %o2, %o3
END(__mempcpy_niagara4)
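/* mempcpy returns dst + len rather than dst.  Both entry
 * points stash their return value in %o3 (handed back by
 * .Lexit) and then share the copy body at 101 below.
 */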

	.align		32
ENTRY(__memcpy_niagara4)
100:	/* %o0=dst, %o1=src, %o2=len */
	mov		%o0, %o3
101:
#ifndef __arch64__
	srl		%o2, 0, %o2
#endif
	brz,pn		%o2, .Lexit
	 cmp		%o2, 3
	ble,pn		%icc, .Ltiny
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall
	 or		%o0, %o1, %g2
	cmp		%o2, 128
	bl,pn		%icc, .Lmedium
	 nop
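/* Size dispatch: 0 bytes exits immediately, <= 3 goes to
 * .Ltiny, <= 19 to .Lsmall, < 128 to .Lmedium, and larger
 * copies fall through to .Llarge.  %g2 = dst | src is computed
 * in a delay slot so the alignment of both pointers can later
 * be tested with a single mask.
 */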

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 51f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
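/* %g1 = -dst & 7 is exactly the byte count needed to reach an
 * 8-byte boundary; it was subtracted from the length up front
 * and the loop above copies that many single bytes.
 */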

51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
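/* The prefetches above prime the next 512 bytes of the
 * source; #n_reads_strong requests the strong variant of the
 * several-reads prefetch.  The main loops keep this window
 * open by prefetching 0x200 ahead on every iteration.
 */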

	/* Check if we can use the straight fully aligned
	 * loop, or whether we need the alignaddr/faligndata
	 * variant.
	 */
	andcc		%o1, 0x7, %o5
	bne,pn		%icc, .Llarge_src_unaligned
	 sub		%g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and		%g1, 0x3f, %g1
	brz,pt		%g1, .Llarge_aligned
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
	add		%o1, 8, %o1
	subcc		%g1, 8, %g1
	add		%o0, 8, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g2, %o0 - 0x08))

.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
	EX_ST(STORE_INIT(%g1, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
	EX_ST(STORE_INIT(%o5, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	membar		#StoreLoad | #StoreStore
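/* In the loop above the eight loads run well ahead of the
 * eight block-init stores that consume them.  Because init
 * stores are weakly ordered, the membar is needed to publish
 * the freshly allocated lines before the ordinary loads and
 * stores that finish off the copy.
 */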

	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov		EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
	VISEntryHalf
	alignaddr	%o1, %g0, %g1
	add		%o1, %o4, %o1
	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
	faligndata	%f2, %f4, %f18
	add		%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST(STORE(std, %f16, %o0 + 0x00))
	EX_ST(STORE(std, %f18, %o0 + 0x08))
	EX_ST(STORE(std, %f20, %o0 + 0x10))
	EX_ST(STORE(std, %f22, %o0 + 0x18))
	EX_ST(STORE(std, %f24, %o0 + 0x20))
	EX_ST(STORE(std, %f26, %o0 + 0x28))
	EX_ST(STORE(std, %f28, %o0 + 0x30))
	EX_ST(STORE(std, %f30, %o0 + 0x38))
	add		%o0, 0x40, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
	VISExitHalf
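/* Classic faligndata pipeline: alignaddr rounds the source
 * down to an 8-byte boundary in %g1 and latches the
 * misalignment in %gsr, after which each faligndata in the
 * loop above extracts one aligned output doubleword from two
 * adjacent input doublewords.  %f0 carries the final input
 * word across iterations, which is why it is reloaded in the
 * middle of the loop, before the stores.
 */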

	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_unaligned

.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	 nop
.Lmedium_noprefetch:
	andncc		%o2, 0x20 - 1, %o5
	be,pn		%icc, 2f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00))
	EX_ST(STORE(stx, %g2, %o0 + 0x08))
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
	EX_ST(STORE(stx, %o4, %o0 + 0x18))
	bne,pt		%icc, 1b
	 add		%o0, 0x20, %o0
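/* Medium tail: after the 32-byte chunks above, copy up to
 * three 8-byte words, then at most one 4-byte word, and leave
 * any final 1-3 bytes to .Ltiny.
 */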
2:	andcc		%o2, 0x18, %o5
	be,pt		%icc, 3f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:	brz,pt		%o2, .Lexit
	 cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	ba,a,pt		%icc, .Lexit
.Lmedium_unaligned:
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 2f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
	and		%o1, 0x7, %g1
	brz,pn		%g1, .Lmedium_noprefetch
	 sll		%g1, 3, %g1
	mov		64, %g2
	sub		%g2, %g1, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
	sllx		%o4, %g1, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 sllx		%g3, %g1, %o4
	srl		%g1, 3, %g1
	add		%o1, %g1, %o1
	brz,pn		%o2, .Lexit
	 nop
	ba,a,pt		%icc, .Lsmall_unaligned
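/* Shift-and-merge: with the source misaligned by k bytes,
 * %g1 = 8*k and %g2 = 64 - 8*k, and each output doubleword is
 * formed as (prev << %g1) | (next >> %g2) so that only aligned
 * 8-byte loads ever touch memory.  Afterwards %g1 is converted
 * back to bytes to restore the true, still misaligned source
 * pointer for the byte-copy tail.
 */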

.Ltiny:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
	ba,pt		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02))
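/* .Ltiny moves the last 1-3 bytes.  Each stb sits in the delay
 * slot of the preceding exit test, so it executes whether or
 * not that byte was the final one.
 */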

.Lsmall:
	andcc		%g2, 0x3, %g0
	bne,pn		%icc, .Lsmall_unaligned
	 andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	brz,pt		%o2, .Lexit
	 nop
	ba,a,pt		%icc, .Ltiny

.Lsmall_unaligned:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
	ba,a,pt		%icc, .Lexit
END(__memcpy_niagara4)

#endif